<a href="https://colab.research.google.com/github/tetherless-world/ontology-engineering/blob/when-to-go-where/oe2024/when-to-go-where/experiments/DataScrapingWhenToGoWhere.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gspread gspread_dataframe
!pip install rdflib

Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1


In [None]:
'''
Imports
'''
from bs4 import BeautifulSoup
import requests
import re
from google.colab import auth
from google.auth.transport.requests import Request
import gspread
from gspread_dataframe import get_as_dataframe
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, OWL, XSD

In [None]:
# Define namespaces
# OE_WTGW holds the ontology's classes and properties (e.g. NationalPark,
# hasHike); OE_WTGW_IND holds the individuals minted by this notebook.
OE_WTGW = Namespace("https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/")
OE_WTGW_IND = Namespace("https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/")
# OMG Commons vocabularies: Locations supplies PhysicalLocation and the
# latitude/longitude properties; the annotation vocabulary is only bound as a prefix.
LOCATIONS = Namespace("https://www.omg.org/spec/Commons/Locations/")
CMNS_AV = Namespace("https://www.omg.org/spec/Commons/AnnotationVocabulary/")
DCT = Namespace("http://purl.org/dc/terms/")
# SKOS and RDFS are declared by hand here; only RDF, OWL and XSD are imported
# from rdflib.namespace above.
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")


def create_graph():
  """Create the individuals graph with its prefixes and ontology header.

  The graph uses the WhenToGoWhere-individuals namespace as its base IRI,
  binds every prefix used in this notebook, and declares the ontology
  metadata: abstract, creators, license, the import of the WhenToGoWhere
  ontology, modification date, publisher, prior version and version IRI.

  Returns:
    The newly created rdflib ``Graph``. It contains only the ontology
    header; individuals are added by the ``add_*_individual`` helpers.
  """
  # The individuals namespace doubles as the graph base and the ontology IRI,
  # so derive both from the shared constant instead of repeating the literal.
  ontology_iri = str(OE_WTGW_IND)
  g = Graph(base=ontology_iri)

  # Bind prefixes
  prefix_bindings = (
      ("Locations", LOCATIONS),
      ("cmns-av", CMNS_AV),
      ("dct", DCT),
      ("oe-wtgw", OE_WTGW),
      ("oe-wtgw-ind", OE_WTGW_IND),
      ("owl", OWL),
      ("rdf", RDF),
      ("rdfs", RDFS),
      ("skos", SKOS),
      ("xsd", XSD),
  )
  for prefix, namespace in prefix_bindings:
    g.bind(prefix, namespace)

  # Define ontology metadata
  ontology_uri = URIRef(ontology_iri)
  g.add((ontology_uri, RDF.type, OWL.Ontology))
  # Exactly one abstract is emitted: the project description below.
  g.add((ontology_uri, DCT.abstract, Literal("""When to go where is a project that aims to promote outdoor exploration across the 63 National Parks in the United States. We would like to give people a tool that will allow them to find national parks that will fit any preferences that they may have. We want to give people as few excuses as possible for not pursing outdoor activities such as camping and hiking.

To achieve this we are creating a semantically enabled application that will leverage an OWL ontology to answer user questions. Our interface will give users options to input preferences and using semantics we will identify the best parks that match a users particular criteria.""")))
  for creator in ("Annabelle Choi", "Ben Rodgers", "Samyuth Sagi", "Tyler Layton"):
    g.add((ontology_uri, DCT.creator, Literal(creator)))
  g.add((ontology_uri, DCT.license, Literal("https://opensource.org/licenses/MIT", datatype=XSD.anyURI)))
  # OE_WTGW[""] is the bare namespace IRI, i.e. the imported ontology itself.
  g.add((ontology_uri, OWL.imports, OE_WTGW[""]))
  g.add((ontology_uri, DCT.modified, Literal("2024-11-22", datatype=XSD.date)))
  g.add((ontology_uri, DCT.publisher, Literal("WhenToGoWhere")))
  g.add((ontology_uri, OWL.priorVersion, URIRef("https://github.com/tetherless-world/ontology-engineering/blob/dac8dda75fb576d8551123076e10627d716549d9/oe2024/when-to-go-where/WhenToGoWhere-individuals.rdf")))
  g.add((ontology_uri, OWL.versionIRI, OE_WTGW_IND["4.0"]))

  return g

# Function to convert names to camel case
def convert_to_camel_case(name):
    """Turn a display name into the identifier used as an individual's local name.

    Accented letters are reduced to their unaccented base letter, every
    character other than ASCII letters, digits and spaces is dropped, and the
    remaining whitespace-separated words are joined with only their first
    letter upper-cased (``str.capitalize`` lower-cases the rest of each word).

    Args:
        name: Human-readable name, e.g. ``"The Watchman Trail"``.

    Returns:
        The joined identifier, e.g. ``"TheWatchmanTrail"``; an empty string
        when ``name`` contains no letters or digits.
    """
    import unicodedata  # local import: nothing else in the notebook needs it

    # Decompose accented letters (e.g. "ā" -> "a" + combining macron) so the
    # base letter survives the ASCII-only filter instead of disappearing.
    name = unicodedata.normalize('NFKD', name)
    # Remove non-alphanumeric characters
    name = re.sub(r'[^A-Za-z0-9 ]+', '', name)
    # Split by spaces, capitalize each word, and join without spaces
    return ''.join(word.capitalize() for word in name.split())

# Function to create park individual in RDF
def add_park_individual(g, name, state=""):
    """Add a NationalPark individual to ``g`` and optionally link it to a state.

    Args:
        g: Graph that receives the triples.
        name: Park display name; its camel-cased form is the individual's
            local name and its lower-cased form is the rdfs:label.
        state: Optional state display name. When non-empty, a hasState triple
            (park -> state) and a hasPark triple (state -> park) are added.
    """
    park_uri = OE_WTGW_IND[convert_to_camel_case(name)]

    # This one park is keyed by a punctuation-free name, but its label keeps
    # the punctuated spelling.
    if name == "Wrangell St Elias National Park":
        label_text = "Wrangell-St. Elias National Park".lower()
    else:
        label_text = name.lower()

    # Typing and label triples for the park itself
    park_triples = (
        (park_uri, RDF.type, OWL.NamedIndividual),
        (park_uri, RDF.type, OE_WTGW.NationalPark),
        (park_uri, RDFS.label, Literal(label_text)),
    )
    for triple in park_triples:
        g.add(triple)

    # Nothing more to do when no state was supplied
    if not state:
        return

    # Link park and state in both directions
    state_uri = OE_WTGW_IND[convert_to_camel_case(state)]
    g.add((park_uri, OE_WTGW.hasState, state_uri))
    g.add((state_uri, OE_WTGW.hasPark, park_uri))

def add_hike_individual(g, name, hike_difficulty, hike_len, park):
    """Add a Hike individual to ``g`` and link it to its difficulty and park.

    Args:
        g: Graph that receives the triples.
        name: Hike display name; camel-cased for the local name, lower-cased
            for the rdfs:label.
        hike_difficulty: Difficulty display name, resolved to an individual
            in the individuals namespace.
        hike_len: Hike distance, stored as an xsd:decimal literal.
        park: Display name of the park the hike belongs to.
    """
    # Resolve every display name to its individual URI
    hike_uri = OE_WTGW_IND[convert_to_camel_case(name)]
    difficulty_uri = OE_WTGW_IND[convert_to_camel_case(hike_difficulty)]
    park_uri = OE_WTGW_IND[convert_to_camel_case(park)]

    hike_triples = [
        (hike_uri, RDF.type, OWL.NamedIndividual),
        (hike_uri, RDF.type, OE_WTGW.Hike),
        (hike_uri, RDFS.label, Literal(name.lower())),
        (hike_uri, OE_WTGW.hasDifficulty, difficulty_uri),
        (hike_uri, OE_WTGW.hasDistance, Literal(hike_len, datatype=XSD.decimal)),
        (hike_uri, OE_WTGW.isHikeOf, park_uri),
        # Inverse link so the park also points at its hike
        (park_uri, OE_WTGW.hasHike, hike_uri),
    ]
    for triple in hike_triples:
        g.add(triple)


def add_location_individual(g, name, latitude, longitude):
    """Add a PhysicalLocation individual for a park and attach it to the park.

    Args:
        g: Graph that receives the triples.
        name: Park display name. The location's local name is the park's
            camel-cased name followed by ``"Location"``.
        latitude: Latitude value, stored as an xsd:decimal literal.
        longitude: Longitude value, stored as an xsd:decimal literal.
    """
    park_local_name = convert_to_camel_case(name)
    park_uri = OE_WTGW_IND[park_local_name]
    location_uri = OE_WTGW_IND[park_local_name + "Location"]

    location_triples = [
        (location_uri, RDF.type, OWL.NamedIndividual),
        (location_uri, RDF.type, LOCATIONS.PhysicalLocation),
        (location_uri, RDFS.label, Literal(name.lower() + " location")),
        (location_uri, LOCATIONS.hasLatitude, Literal(latitude, datatype=XSD.decimal)),
        (location_uri, LOCATIONS.hasLongitude, Literal(longitude, datatype=XSD.decimal)),
        # The park individual points at its location
        (park_uri, LOCATIONS.hasLocation, location_uri),
    ]
    for triple in location_triples:
        g.add(triple)


def add_state_individual(g, name):
    """Add a State individual to ``g``.

    Args:
        g: Graph that receives the triples.
        name: State display name; camel-cased for the local name and
            lower-cased for the rdfs:label.
    """
    state_uri = OE_WTGW_IND[convert_to_camel_case(name)]

    state_triples = (
        (state_uri, RDF.type, OWL.NamedIndividual),
        (state_uri, RDFS.label, Literal(name.lower())),
        (state_uri, RDF.type, OE_WTGW.State),
    )
    for triple in state_triples:
        g.add(triple)

def add_temperature_individual(g, name, season, temperature):
    """Add an AvgSeasonalTemperature individual for one park and season.

    Args:
        g: Graph that receives the triples.
        name: Park display name.
        season: Season name; used verbatim as the local name of the season
            individual and inside the temperature individual's local name.
        temperature: Temperature value, stored as an xsd:decimal literal.
    """
    park_local_name = convert_to_camel_case(name)
    # Local name pattern: <Park><Season>AvgTemperature
    temperature_uri = OE_WTGW_IND["".join([park_local_name, season, "AvgTemperature"])]
    label_text = f"{name.lower()} {season.lower()} average temperature"

    temperature_triples = [
        (temperature_uri, RDF.type, OWL.NamedIndividual),
        (temperature_uri, RDFS.label, Literal(label_text)),
        (temperature_uri, RDF.type, OE_WTGW['AvgSeasonalTemperature']),
        (temperature_uri, OE_WTGW.temperatureHasSeason, OE_WTGW_IND[season]),
        (temperature_uri, OE_WTGW.hasTemperature, Literal(temperature, datatype=XSD.decimal)),
        # Inverse link so the park points at its seasonal temperature
        (OE_WTGW_IND[park_local_name], OE_WTGW.hasAvgSeasonalTemperature, temperature_uri),
    ]
    for triple in temperature_triples:
        g.add(triple)

def add_generic_individual(g, name):
    """Add an untyped named individual with a lower-cased label to ``g``.

    Unlike the other helpers, ``name`` is used as the local name as-is (no
    camel-casing); callers add the specific class triple themselves.

    Args:
        g: Graph that receives the triples.
        name: Local name of the individual; its lower-cased form is the label.
    """
    subject = OE_WTGW_IND[name]
    for predicate, obj in (
        (RDF.type, OWL.NamedIndividual),
        (RDFS.label, Literal(name.lower())),
    ):
        g.add((subject, predicate, obj))

# Serialize the graph
def print_graph(g):
    """Print ``g`` as pretty RDF/XML, ensuring the individuals prefix is declared.

    The graph is serialized with the ``pretty-xml`` format as UTF-8 and
    decoded back to text. If the result carries no ``xmlns:oe-wtgw-ind``
    declaration, one is inserted right after the first ``<rdf:RDF`` opening.

    Args:
        g: Graph (or any object with a compatible ``serialize`` method).
    """
    xml_text = g.serialize(format="pretty-xml", encoding="utf-8").decode("utf-8")

    if 'xmlns:oe-wtgw-ind=' not in xml_text:
        # Only the first root opening is touched; text without one is left as-is.
        before, root_open, after = xml_text.partition('<rdf:RDF')
        if root_open:
            xml_text = (
                before
                + '<rdf:RDF\n  xmlns:oe-wtgw-ind="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/"\n '
                + after
            )

    # Print the output
    print(xml_text)

# Smoke test: build a throwaway graph, exercise each add_* helper once and
# print the resulting RDF/XML. Nothing here touches the real `graph`.
test_g = create_graph()

# test trail: a hike attached to a park that has not been declared yet
name = "The Watchman Trail"
hike_difficuty = "Moderate"
hike_len = "3.3"
park = "Zion National Park"
add_hike_individual(test_g, name, hike_difficuty, hike_len, park)

# test park: declare the park the hike above points at
add_park_individual(test_g, park)

# test park 2: a park with a location, a state link and one seasonal temperature
name = "Acadia National Park"
add_park_individual(test_g, name)
add_location_individual(test_g, name, 44.33, -68)
add_state_individual(test_g, "Maine")
# Second call for the same park only contributes the state links; the
# typing and label triples it repeats are identical to the first call's.
add_park_individual(test_g, name, state="Maine")
add_temperature_individual(test_g, name, "Spring", 60)

# Disabled: season/difficulty seeding for the test graph (the same seeding
# is done for the real graph in a later cell).
# # season
# seasons = ["Spring", "Summer", "Fall", "Winter"]
# for season in seasons:
#   add_generic_individual(test_g, season)
#   test_g.add((OE_WTGW_IND[season], RDF.type, OE_WTGW['Season']))

# # difficulties
# difficulties = ["Easy", "Moderate", "Challenging"]
# for difficulty in difficulties:
#   add_generic_individual(test_g, difficulty)
#   test_g.add((OE_WTGW_IND[difficulty], RDF.type, OE_WTGW['Difficulty']))

print_graph(test_g)

<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
  xmlns:oe-wtgw-ind="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/"
  xml:base="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/"
  xmlns:oe-wtgw="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/"
  xmlns:owl="http://www.w3.org/2002/07/owl#"
  xmlns:dct="http://purl.org/dc/terms/"
  xmlns:Locations="https://www.omg.org/spec/Commons/Locations/"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <owl:Ontology rdf:about="">
    <dct:abstract>When to go where is a project that aims to promote outdoor exploration across the 63 National Parks in the United States. We would like to give people a tool that will allow them to find national parks that will fit any preferences that they may have. We want to give people as few excuses as possible for not pursing outdoor ac

In [None]:
# Create the rdf graph that every later cell adds individuals to
graph = create_graph()

# Add some basic individuals
# season: each is a generic named individual additionally typed as Season
seasons = ["Spring", "Summer", "Fall", "Winter"]
for season in seasons:
  add_generic_individual(graph, season)
  graph.add((OE_WTGW_IND[season], RDF.type, OE_WTGW['Season']))

# difficulties: each is a generic named individual additionally typed as Difficulty
difficulties = ["Easy", "Moderate", "Challenging"]
for difficulty in difficulties:
  add_generic_individual(graph, difficulty)
  graph.add((OE_WTGW_IND[difficulty], RDF.type, OE_WTGW['Difficulty']))

# all 50 states plus the U.S. Virgin Islands (51 entries)
# NOTE(review): "American Samoa" is not in this list; if a later data source
# names it as a park's state, that hasState link would point at an individual
# never declared as a State -- confirm against the source data.
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
    "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
    "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
    "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma",
    "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming", "U.S. Virgin Islands"
]
for state in states:
  add_state_individual(graph, state)


In [None]:
'''
Webscraping Data for a Specific Link
'''

LINK = "https://www.cntraveler.com/gallery/the-best-hike-in-every-national-park"

# Bound the wait and fail loudly on an HTTP error, rather than hanging or
# parsing an error page into an empty result.
response = requests.get(LINK, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Each gallery slide is a <figure> whose class starts with "GallerySlideFigure"
figures = soup.find_all("figure", class_=re.compile(r"^GallerySlideFigure"))

# Park names actually found in the article (read again by a later cell)
parks = set()

# Reference list of the 63 parks, each listed once so the coverage counts
# printed at the end are meaningful.
national_parks = [
    "Acadia National Park", "American Samoa National Park", "Arches National Park",
    "Badlands National Park", "Big Bend National Park", "Biscayne National Park",
    "Black Canyon of the Gunnison National Park", "Bryce Canyon National Park",
    "Canyonlands National Park", "Capitol Reef National Park", "Carlsbad Caverns National Park",
    "Channel Islands National Park", "Congaree National Park", "Crater Lake National Park",
    "Death Valley National Park", "Denali National Park", "Dry Tortugas National Park",
    "Everglades National Park", "Gates of the Arctic National Park", "Glacier Bay National Park",
    "Glacier National Park", "Grand Canyon National Park", "Grand Teton National Park",
    "Great Basin National Park", "Great Sand Dunes National Park", "Great Smoky Mountains National Park",
    "Haleakala National Park", "Hawaii Volcanoes National Park", "Hot Springs National Park",
    "Isle Royale National Park", "Joshua Tree National Park", "Katmai National Park",
    "Kenai Fjords National Park", "Kings Canyon National Park", "Kobuk Valley National Park",
    "Lake Clark National Park", "Lassen Volcanic National Park", "Mammoth Cave National Park",
    "Mesa Verde National Park", "Mount Rainier National Park", "North Cascades National Park",
    "Olympic National Park", "Petrified Forest National Park", "Pinnacles National Park",
    "Redwood National Park", "Rocky Mountain National Park", "Saguaro National Park",
    "Sequoia National Park", "Shenandoah National Park", "Theodore Roosevelt National Park",
    "Wind Cave National Park", "Wrangell St Elias National Park", "Yellowstone National Park",
    "Yosemite National Park", "Zion National Park", "Gateway Arch National Park",
    "Cuyahoga Valley National Park", "US Virgin Islands National Park", "Voyageurs National Park",
    "White Sands National Park", "New River Gorge National Park", "Indiana Dunes National Park",
    "Guadalupe Mountains National Park",
]

# Article headings that differ from the names used in `national_parks`
PARK_NAME_ALIASES = {
    "National Park of American Samoa": "American Samoa National Park",
    "Wrangell-St.Elias National Park": "Wrangell St Elias National Park",
    "Virgin Islands National Park": "US Virgin Islands National Park",
    "Haleakalā National Park": "Haleakala National Park",
}

# A hike is only recorded when its length is a plain number such as "3" or "3.3"
number_pattern = r"^\d+(\.\d+)?$"

for figure in figures:
    park = figure.find("h2").get_text(strip=True)
    park = park.replace(" and Preserve", "")  # Remove "and Preserve" from park names
    park = PARK_NAME_ALIASES.get(park, park)

    # NOTE(review): the indices below assume the caption paragraph alternates
    # label elements and text nodes (name at 1, length at 3, difficulty at 5);
    # confirm against the live page if the site layout changes.
    paragraph = figure.find("figcaption").find("p")
    children = list(paragraph.children)

    name = children[1].strip()
    hike_len = children[3].split()[1].strip()
    hike_difficuty = children[5].split()[-1].strip()

    # The park is recorded even when its hike is skipped below
    add_park_individual(graph, park)
    parks.add(park)

    if re.match(number_pattern, hike_len):
        add_hike_individual(graph, name, hike_difficuty, hike_len, park)

# Coverage report: which reference parks were found in the article
tmp = 0  # number of reference parks found
for park in national_parks:
    if park in parks:
        print(park+" yes")
        tmp += 1
    else:
        print(park+" no     "+park)
print(tmp)
print(len(national_parks))
print(len(parks))


Acadia National Park yes
American Samoa National Park yes
Arches National Park yes
Badlands National Park yes
Big Bend National Park yes
Biscayne National Park yes
Black Canyon of the Gunnison National Park yes
Bryce Canyon National Park yes
Canyonlands National Park yes
Capitol Reef National Park yes
Carlsbad Caverns National Park yes
Channel Islands National Park yes
Congaree National Park yes
Crater Lake National Park yes
Death Valley National Park yes
Denali National Park yes
Dry Tortugas National Park yes
Everglades National Park yes
Gates of the Arctic National Park yes
Glacier Bay National Park yes
Glacier National Park yes
Grand Canyon National Park yes
Grand Teton National Park yes
Great Basin National Park yes
Great Sand Dunes National Park yes
Great Smoky Mountains National Park yes
Haleakala National Park yes
Hawaii Volcanoes National Park yes
Hot Springs National Park yes
Isle Royale National Park yes
Joshua Tree National Park yes
Katmai National Park yes
Kenai Fjords Nati

In [None]:
'''
latitude and longitude
'''
# US National Parks with Exact Coordinates (Latitude, Longitude)
# "name" omits the " National Park" suffix (the loop below appends it).
# "state" joins multi-state parks with "/" and is not read by this cell.
# Note: this rebinds `national_parks`, replacing the list of full park names
# defined in the scraping cell.
national_parks = [
    {"name": "Acadia", "state": "Maine", "latitude": 44.3386, "longitude": -68.2733},
    {"name": "American Samoa", "state": "American Samoa", "latitude": -14.2418, "longitude": -170.6819},
    {"name": "Arches", "state": "Utah", "latitude": 38.7331, "longitude": -109.5925},
    {"name": "Badlands", "state": "South Dakota", "latitude": 43.7554, "longitude": -102.4868},
    {"name": "Big Bend", "state": "Texas", "latitude": 29.2888, "longitude": -103.2634},
    {"name": "Biscayne", "state": "Florida", "latitude": 25.4652, "longitude": -80.2123},
    {"name": "Black Canyon of the Gunnison", "state": "Colorado", "latitude": 38.5704, "longitude": -107.7416},
    {"name": "Bryce Canyon", "state": "Utah", "latitude": 37.5930, "longitude": -112.1871},
    {"name": "Canyonlands", "state": "Utah", "latitude": 38.2186, "longitude": -109.9025},
    {"name": "Capitol Reef", "state": "Utah", "latitude": 38.3670, "longitude": -111.2615},
    {"name": "Carlsbad Caverns", "state": "New Mexico", "latitude": 32.1758, "longitude": -104.4468},
    {"name": "Channel Islands", "state": "California", "latitude": 34.0069, "longitude": -119.7030},
    {"name": "Congaree", "state": "South Carolina", "latitude": 33.8921, "longitude": -80.7747},
    {"name": "Crater Lake", "state": "Oregon", "latitude": 42.9405, "longitude": -122.1186},
    {"name": "Cuyahoga Valley", "state": "Ohio", "latitude": 41.2808, "longitude": -81.5578},
    {"name": "Death Valley", "state": "California/Nevada", "latitude": 36.4864, "longitude": -116.8361},
    {"name": "Denali", "state": "Alaska", "latitude": 63.1148, "longitude": -151.1926},
    {"name": "Dry Tortugas", "state": "Florida", "latitude": 24.6285, "longitude": -82.8732},
    {"name": "Everglades", "state": "Florida", "latitude": 25.3720, "longitude": -80.8995},
    {"name": "Gates of the Arctic", "state": "Alaska", "latitude": 67.4089, "longitude": -153.1423},
    {"name": "Gateway Arch", "state": "Missouri", "latitude": 38.6251, "longitude": -90.1868},
    {"name": "Glacier", "state": "Montana", "latitude": 48.7596, "longitude": -113.7870},
    {"name": "Glacier Bay", "state": "Alaska", "latitude": 58.6548, "longitude": -136.9075},
    {"name": "Grand Canyon", "state": "Arizona", "latitude": 36.0544, "longitude": -112.1401},
    {"name": "Grand Teton", "state": "Wyoming", "latitude": 43.7904, "longitude": -110.6818},
    {"name": "Great Basin", "state": "Nevada", "latitude": 38.7391, "longitude": -114.2384},
    {"name": "Great Sand Dunes", "state": "Colorado", "latitude": 37.7916, "longitude": -105.5943},
    {"name": "Great Smoky Mountains", "state": "Tennessee/North Carolina", "latitude": 35.6131, "longitude": -83.5532},
    {"name": "Guadalupe Mountains", "state": "Texas", "latitude": 31.9231, "longitude": -104.8619},
    {"name": "Haleakala", "state": "Hawaii", "latitude": 20.7204, "longitude": -156.1551},
    {"name": "Hawaii Volcanoes", "state": "Hawaii", "latitude": 19.4230, "longitude": -155.2866},
    {"name": "Hot Springs", "state": "Arkansas", "latitude": 34.5133, "longitude": -93.0631},
    {"name": "Indiana Dunes", "state": "Indiana", "latitude": 41.6533, "longitude": -87.0524},
    {"name": "Isle Royale", "state": "Michigan", "latitude": 48.0595, "longitude": -88.9093},
    {"name": "Joshua Tree", "state": "California", "latitude": 33.8734, "longitude": -115.9010},
    {"name": "Katmai", "state": "Alaska", "latitude": 58.5463, "longitude": -155.0626},
    {"name": "Kenai Fjords", "state": "Alaska", "latitude": 59.8521, "longitude": -149.6423},
    {"name": "Kings Canyon", "state": "California", "latitude": 36.8879, "longitude": -118.5551},
    {"name": "Kobuk Valley", "state": "Alaska", "latitude": 67.3351, "longitude": -159.1280},
    {"name": "Lake Clark", "state": "Alaska", "latitude": 60.4026, "longitude": -154.3230},
    {"name": "Lassen Volcanic", "state": "California", "latitude": 40.4978, "longitude": -121.4207},
    {"name": "Mammoth Cave", "state": "Kentucky", "latitude": 37.1866, "longitude": -86.1336},
    {"name": "Mesa Verde", "state": "Colorado", "latitude": 37.2309, "longitude": -108.4618},
    {"name": "Mount Rainier", "state": "Washington", "latitude": 46.8523, "longitude": -121.7603},
    {"name": "New River Gorge", "state": "West Virginia", "latitude": 38.0780, "longitude": -81.0817},
    {"name": "North Cascades", "state": "Washington", "latitude": 48.7718, "longitude": -121.2035},
    {"name": "Olympic", "state": "Washington", "latitude": 47.8021, "longitude": -123.6044},
    {"name": "Petrified Forest", "state": "Arizona", "latitude": 34.9097, "longitude": -109.8068},
    {"name": "Pinnacles", "state": "California", "latitude": 36.4908, "longitude": -121.1825},
    {"name": "Redwood", "state": "California", "latitude": 41.3134, "longitude": -124.0068},
    {"name": "Rocky Mountain", "state": "Colorado", "latitude": 40.3428, "longitude": -105.6836},
    {"name": "Saguaro", "state": "Arizona", "latitude": 32.2967, "longitude": -111.1665},
    {"name": "Sequoia", "state": "California", "latitude": 36.4863, "longitude": -118.5658},
    {"name": "Shenandoah", "state": "Virginia", "latitude": 38.5598, "longitude": -78.3732},
    {"name": "Theodore Roosevelt", "state": "North Dakota", "latitude": 46.9737, "longitude": -103.5387},
    {"name": "US Virgin Islands", "state": "U.S. Virgin Islands", "latitude": 18.3434, "longitude": -64.7081},
    {"name": "Voyageurs", "state": "Minnesota", "latitude": 48.4764, "longitude": -92.8261},
    {"name": "Wind Cave", "state": "South Dakota", "latitude": 43.6086, "longitude": -103.4850},
    {"name": "Wrangell St Elias", "state": "Alaska", "latitude": 61.7104, "longitude": -142.9865},
    {"name": "White Sands", "state": "New Mexico", "latitude": 32.7872, "longitude": -106.3257},
    {"name": "Yellowstone", "state": "Wyoming", "latitude": 44.4280, "longitude": -110.5885},
    {"name": "Yosemite", "state": "California", "latitude": 37.8651, "longitude": -119.5383},
    {"name": "Zion", "state": "Utah", "latitude": 37.2982, "longitude": -113.0263}
]


# List to store the parks with coordinates added
parks2 = []

for park in national_parks:
    # Rebuild the full display name so the location attaches to the same
    # individual the other cells create for this park.
    matching_park_name = f"{park['name']} National Park"
    # Coordinates are passed as strings; the helper stores them as xsd:decimal.
    latitude = str(park['latitude'])
    longitude = str(park['longitude'])

    # Guard against adding a second location if a name appears twice in the table
    if matching_park_name not in parks2:
        parks2.append(matching_park_name)
        add_location_individual(graph, matching_park_name, latitude, longitude)




In [None]:
'''
Authentication logic sourced from chatgpt
'''
# Authenticate the user
auth.authenticate_user()

# Authenticate with gspread using Google Drive credentials
from google.auth import default
creds, _ = default()
gc = gspread.authorize(creds)

# Open the spreadsheet by its URL
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1pMLCErMIz6XHOSjQQ5GmqtnM3jViEiDlIjKGQZIhmF0/edit?gid=0"
spreadsheet = gc.open_by_url(spreadsheet_url)

# Access the first sheet (or specify a sheet by name)
worksheet = spreadsheet.get_worksheet(0)

# Convert the worksheet data to a Pandas DataFrame
import pandas as pd
data = get_as_dataframe(worksheet)
# Keep the first 63 rows and 6 columns. By position, the columns are read as
# park, state, then one temperature per season in the order listed below.
data = data.iloc[:63, :6]

park_data = data.iloc[1:, 0]

# First (optionally negative, optionally fractional) number in a cell
temperature_pattern = re.compile(r"(-?\d+\.?\d*)")
season_columns = ["Winter", "Spring", "Summer", "Fall"]

for index, row in data.iterrows():
  # Use .iloc for positional access: integer keys on a label-indexed Series
  # (`row[0]`) are deprecated in pandas.
  park = row.iloc[0]
  state = row.iloc[1]

  for temp, season in zip(row.iloc[2:], season_columns):
    # str() lets numeric cells through and turns empty (NaN) cells into
    # "nan", which simply does not match.
    match = temperature_pattern.search(str(temp))

    if match:
      add_temperature_individual(graph, park, season, match.group(0))

  # Only parks found by the scraper get state links. A park spanning several
  # states lists them separated by "/"; an empty state cell is skipped.
  if park in parks and isinstance(state, str):
    for state_name in state.split('/'):
      add_park_individual(graph, park, state=state_name)


In [None]:
# Print the fully populated graph as RDF/XML
print_graph(graph)

<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
  xmlns:oe-wtgw-ind="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/"
  xml:base="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/"
  xmlns:oe-wtgw="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/"
  xmlns:owl="http://www.w3.org/2002/07/owl#"
  xmlns:dct="http://purl.org/dc/terms/"
  xmlns:Locations="https://www.omg.org/spec/Commons/Locations/"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <owl:NamedIndividual rdf:about="Oklahoma">
    <rdf:type rdf:resource="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/State"/>
    <rdfs:label>oklahoma</rdfs:label>
  </owl:NamedIndividual>
  <owl:NamedIndividual rdf:about="NewJersey">
    <rdf:type rdf:resource="https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/S

In [None]:
# Query 1: every hike that belongs to Arches National Park
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX oe-wtgw: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/>
PREFIX oe-wtgw-ind: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/>
SELECT ?hike
WHERE {
    ?hike oe-wtgw:isHikeOf oe-wtgw-ind:ArchesNationalPark
}
"""

# Each result row is a tuple holding the selected variables
for row in graph.query(query):
  print(row)

(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/DoubleOArch'),)


In [None]:
# Query 2: every subject with a location, with its latitude, highest latitude
# first. Only latitudes typed as xsd:decimal are kept.
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX oe-wtgw: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/>
PREFIX oe-wtgw-ind: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/>
PREFIX Locations: <https://www.omg.org/spec/Commons/Locations/>
SELECT ?park ?latitude
WHERE  {
    ?park Locations:hasLocation ?location .
    ?location Locations:hasLatitude ?latitude .
    FILTER(datatype(?latitude) = xsd:decimal)
}
ORDER BY DESC(?latitude)
"""

# Each result row is a tuple holding the selected variables
for row in graph.query(query):
  print(row)

(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/GatesOfTheArcticNationalPark'), rdflib.term.Literal('67.4089', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/KobukValleyNationalPark'), rdflib.term.Literal('67.3351', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/DenaliNationalPark'), rdflib.term.Literal('63.1148', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/WrangellStEliasNationalPark'), rdflib.term.Literal('61.7104', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('htt

In [None]:
# Query 3: every national park linked to the state of Maine
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX oe-wtgw-ind: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/>
PREFIX oe-wtgw: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/>
SELECT ?park
WHERE {
    ?park rdf:type oe-wtgw:NationalPark .
    ?park oe-wtgw:hasState oe-wtgw-ind:Maine .
}
"""

# Each result row is a tuple holding the selected variables
for row in graph.query(query):
  print(row)

(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/AcadiaNationalPark'),)


In [None]:
# Query 4: national parks whose longitude lies between -110 and -82
# (inclusive), with their average summer temperature, coolest first
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX oe-wtgw: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/>
PREFIX oe-wtgw-ind: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/>
PREFIX Locations: <https://www.omg.org/spec/Commons/Locations/>
SELECT ?park ?longitude ?summerTemp
WHERE {
    ?park rdf:type oe-wtgw:NationalPark .
    ?park Locations:hasLocation ?location .
    ?location Locations:hasLongitude ?longitude .
    ?park oe-wtgw:hasAvgSeasonalTemperature ?tempInd .
    ?tempInd rdf:type oe-wtgw:AvgSeasonalTemperature .
    ?tempInd oe-wtgw:temperatureHasSeason oe-wtgw-ind:Summer .
    ?tempInd oe-wtgw:hasTemperature ?summerTemp .
    FILTER(?longitude >= -110 && ?longitude <= -82)
}
ORDER BY ASC(?summerTemp)
"""

# Each result row is a tuple holding the selected variables
for row in graph.query(query):
  print(row)

(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/IsleRoyaleNationalPark'), rdflib.term.Literal('-88.9093', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')), rdflib.term.Literal('65', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/BlackCanyonOfTheGunnisonNationalPark'), rdflib.term.Literal('-107.7416', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')), rdflib.term.Literal('70', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/RockyMountainNationalPark'), rdflib.term.Literal('-105.6836', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')), rdflib.term.Literal('70', datatype=rdflib.term.URIRe

In [None]:
# Query 5: national parks that have a hike with distance <= 2, listed with
# that hike and the park's average summer temperature, coolest first
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX oe-wtgw: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/>
PREFIX oe-wtgw-ind: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/>
SELECT ?park ?summerTemp ?hike ?distance
WHERE {
    ?park rdf:type oe-wtgw:NationalPark .
    ?park oe-wtgw:hasAvgSeasonalTemperature ?tempInd .
    ?tempInd rdf:type oe-wtgw:AvgSeasonalTemperature .
    ?tempInd oe-wtgw:temperatureHasSeason oe-wtgw-ind:Summer .
    ?tempInd oe-wtgw:hasTemperature ?summerTemp .
    ?park oe-wtgw:hasHike ?hike .
    ?hike oe-wtgw:hasDistance ?distance .
    FILTER(?distance <= 2)
}
ORDER BY ASC(?summerTemp)
"""

# Each result row is a tuple holding the selected variables
for row in graph.query(query):
  print(row)

(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/CraterLakeNationalPark'), rdflib.term.Literal('65', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')), rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/WatchmanPeakTrail'), rdflib.term.Literal('1.7', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/CuyahogaValleyNationalPark'), rdflib.term.Literal('75', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')), rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/BrandywineGorgeTrail'), rdflib.term.Literal('1.5', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engine

In [None]:
# Query 6: hikes in national parks linked to California, longest first.
# Only distances typed as xsd:decimal are kept.
query="""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX oe-wtgw: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere/>
PREFIX oe-wtgw-ind: <https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/>
SELECT ?park ?hike ?distance
WHERE {
    ?park rdf:type oe-wtgw:NationalPark .
    ?park oe-wtgw:hasState oe-wtgw-ind:California .
    ?park oe-wtgw:hasHike ?hike .
    ?hike oe-wtgw:hasDistance ?distance .
    FILTER(datatype(?distance) = xsd:decimal)
}
ORDER BY DESC(?distance)
"""

# Each result row is a tuple holding the selected variables
for row in graph.query(query):
  print(row)

(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/SequoiaNationalPark'), rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/LakesTrail'), rdflib.term.Literal('12.2', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/YosemiteNationalPark'), rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/PanoramaTrail'), rdflib.term.Literal('9', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#decimal')))
(rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/KingsCanyonNationalPark'), rdflib.term.URIRef('https://tw.rpi.edu/ontology-engineering/oe2024/when-to-go-where/WhenToGoWhere-individuals/MistFalls'), rdflib.term.Litera

In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
from fuzzywuzzy import process

# First set of national park names; each of these is looked up in list2 below.
# NOTE(review): the origin of this list is not shown in this cell. Some entries
# carry an "and Preserve" suffix, and 'Wrangell-St.Elias' has no space after
# "St." (list2 spells it 'Wrangell-St. Elias').
list1 = ['Acadia National Park', 'Arches National Park', 'Badlands National Park', 'Big Bend National Park',
         'Biscayne National Park', 'Black Canyon of the Gunnison National Park', 'Bryce Canyon National Park',
         'Canyonlands National Park', 'Capitol Reef National Park', 'Carlsbad Caverns National Park',
         'Channel Islands National Park', 'Congaree National Park', 'Crater Lake National Park',
         'Cuyahoga Valley National Park', 'Death Valley National Park', 'Denali National Park and Preserve',
         'Dry Tortugas National Park', 'Everglades National Park', 'Gates of the Arctic National Park',
         'Gateway Arch National Park', 'Glacier National Park', 'Glacier Bay National Park',
         'Grand Canyon National Park', 'Grand Teton National Park', 'Great Basin National Park',
         'Great Sand Dunes National Park', 'Great Smoky Mountains National Park', 'Guadalupe Mountains National Park',
         'Haleakalā National Park', 'Hawaii Volcanoes National Park', 'Hot Springs National Park',
         'Indiana Dunes National Park', 'Isle Royale National Park', 'Joshua Tree National Park',
         'Katmai National Park and Preserve', 'Kenai Fjords National Park', 'Kings Canyon National Park',
         'Kobuk Valley National Park', 'Lake Clark National Park', 'Lassen Volcanic National Park',
         'Mammoth Cave National Park', 'Mesa Verde National Park', 'Mount Rainier National Park',
         'National Park of American Samoa', 'New River Gorge National Park', 'North Cascades National Park',
         'Olympic National Park', 'Petrified Forest National Park', 'Pinnacles National Park',
         'Redwood National Park', 'Rocky Mountain National Park', 'Saguaro National Park',
         'Sequoia National Park', 'Shenandoah National Park', 'Theodore Roosevelt National Park',
         'Virgin Islands National Park', 'Voyageurs National Park', 'White Sands National Park',
         'Wind Cave National Park', 'Wrangell-St.Elias National Park and Preserve',
         'Yellowstone National Park', 'Yosemite National Park', 'Zion National Park']

# Second set of park names; serves as the candidate pool for the fuzzy lookup.
# Spellings differ from list1 in places (e.g. 'American Samoa National Park',
# 'The Gateway Arch National Park', 'Redwood National and State Parks').
# The last few entries repeat names that already appear earlier in the list
# (e.g. 'Channel Islands National Park', 'Denali National Park').
# NOTE(review): the duplicates look like leftovers from concatenating two
# sources -- confirm whether they are intentional.
list2 = ['American Samoa National Park', 'Arches National Park', 'Badlands National Park', 'Big Bend National Park',
         'Biscayne National Park', 'Black Canyon of the Gunnison National Park', 'Bryce Canyon National Park',
         'Canyonlands National Park', 'Capitol Reef National Park', 'Carlsbad Caverns National Park',
         'Channel Islands National Park', 'Congaree National Park', 'Crater Lake National Park',
         'Death Valley National Park', 'Denali National Park', 'Dry Tortugas National Park',
         'Everglades National Park', 'Gates of the Arctic National Park', 'Glacier Bay National Park',
         'Glacier National Park', 'Grand Canyon National Park', 'Grand Teton National Park',
         'Great Basin National Park', 'Great Sand Dunes National Park', 'Great Smoky Mountains National Park',
         'Haleakalā National Park', 'Hawaii Volcanoes National Park', 'Hot Springs National Park',
         'Isle Royale National Park', 'Joshua Tree National Park', 'Katmai National Park',
         'Kenai Fjords National Park', 'Kings Canyon National Park', 'Kobuk Valley National Park',
         'Lake Clark National Park', 'Lassen Volcanic National Park', 'Mammoth Cave National Park',
         'Mesa Verde National Park', 'Mount Rainier National Park', 'North Cascades National Park',
         'Olympic National Park', 'Petrified Forest National Park', 'Pinnacles National Park',
         'Redwood National and State Parks', 'Rocky Mountain National Park', 'Saguaro National Park',
         'Sequoia National Park', 'Shenandoah National Park', 'Theodore Roosevelt National Park',
         'Virgin Islands National Park', 'Wind Cave National Park', 'Wrangell-St. Elias National Park',
         'Yellowstone National Park', 'Yosemite National Park', 'Zion National Park',
         'The Gateway Arch National Park', 'Channel Islands National Park',
         'Black Canyon of the Gunnison National Park', 'Cuyahoga Valley National Park',
         'Denali National Park', 'Kings Canyon National Park', 'Petrified Forest National Park']

# Match list1 names to list2.
# process.extractOne returns the single best (choice, score) pair for a query,
# with score on a 0-100 scale, or None when there is no candidate to return.
MATCH_THRESHOLD = 89  # A match is kept only if its score exceeds this; adjust as needed

matches = {}
for park in list1:
    result = process.extractOne(park, list2)
    if result is None:  # no candidates available (e.g. list2 is empty)
        continue
    best_match, score = result
    if score > MATCH_THRESHOLD:
        matches[park] = best_match

# Display results: one "list1 name -> list2 name" line per accepted match
for park, match in matches.items():
    print(f"{park} -> {match}")

# Number of list1 names that found a sufficiently close counterpart
print(len(matches))


Arches National Park -> Arches National Park
Badlands National Park -> Badlands National Park
Big Bend National Park -> Big Bend National Park
Biscayne National Park -> Biscayne National Park
Black Canyon of the Gunnison National Park -> Black Canyon of the Gunnison National Park
Bryce Canyon National Park -> Bryce Canyon National Park
Canyonlands National Park -> Canyonlands National Park
Capitol Reef National Park -> Capitol Reef National Park
Carlsbad Caverns National Park -> Carlsbad Caverns National Park
Channel Islands National Park -> Channel Islands National Park
Congaree National Park -> Congaree National Park
Crater Lake National Park -> Crater Lake National Park
Cuyahoga Valley National Park -> Cuyahoga Valley National Park
Death Valley National Park -> Death Valley National Park
Denali National Park and Preserve -> Denali National Park
Dry Tortugas National Park -> Dry Tortugas National Park
Everglades National Park -> Everglades National Park
Gates of the Arctic National P