oedb-backend/extractors/osm_cal.py

491 lines
17 KiB
Python
Raw Normal View History

2025-09-18 19:27:28 +02:00
#!/usr/bin/env python3
"""
OSM Calendar Extractor for the OpenEventDatabase.

This script fetches events from the OpenStreetMap Calendar RSS feed
and adds them to the OpenEventDatabase if they don't already exist.

RSS Feed URL: https://osmcal.org/events.rss

Environment Variables:
    DB_NAME: The name of the database (default: "oedb")
    DB_HOST: The hostname of the database server (default: "localhost")
    DB_USER: The username to connect to the database (default: "")
    POSTGRES_PASSWORD: The password to connect to the database (default: None)

These environment variables can be set in the system environment or in a .env file
in the project root directory.
"""
import json
import requests
import sys
import os
2025-09-18 19:27:28 +02:00
import xml.etree.ElementTree as ET
2025-09-18 19:27:28 +02:00
import re
2025-09-18 19:27:28 +02:00
import html
from datetime import datetime, timedelta
2025-09-18 19:27:28 +02:00
# Add the parent directory to the path so we can import from oedb
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
2025-09-18 22:18:25 +02:00
from oedb.utils.db import db_connect, load_env_from_file
2025-09-18 19:27:28 +02:00
from oedb.utils.logging import logger
2025-09-18 19:27:28 +02:00
# RSS Feed URL for OSM Calendar
2025-09-18 19:27:28 +02:00
RSS_URL = "https://osmcal.org/events.rss"
2025-09-18 19:27:28 +02:00
def fetch_osm_calendar_data(timeout=30):
    """
    Fetch events from the OSM Calendar RSS feed.

    Args:
        timeout (float or tuple): Passed to ``requests.get`` as ``timeout``.
            Without it a stalled server would block the extractor forever.

    Returns:
        list: The ``<item>`` elements found under the feed's ``<channel>``.
            An empty list is returned when the request fails or times out,
            the XML cannot be parsed, or the feed has no channel/items.
    """
    logger.info("Fetching data from OSM Calendar RSS feed")

    try:
        response = requests.get(RSS_URL, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # NOTE(review): xml.etree is not hardened against maliciously
        # constructed XML; acceptable only as long as the feed is trusted.
        root = ET.fromstring(response.content)

        # All events live as <item> children of the single <channel> element
        channel = root.find('channel')
        if channel is None:
            logger.error("No channel element found in RSS feed")
            return []

        items = channel.findall('item')
        if not items:
            logger.error("No items found in RSS feed")
            return []

        logger.success(f"Successfully fetched {len(items)} events from OSM Calendar RSS feed")
        return items

    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts
        logger.error(f"Error fetching data from OSM Calendar RSS feed: {e}")
        return []
    except ET.ParseError as e:
        logger.error(f"Error parsing XML response: {e}")
        return []
    except Exception as e:
        # Top-level boundary: the extractor must not crash on a bad feed
        logger.error(f"Unexpected error fetching OSM Calendar data: {e}")
        return []
2025-09-18 19:27:28 +02:00
# Month names accepted in feed dates, keyed in lower case.
_MONTH_NUMBERS = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4,
    'may': 5, 'june': 6, 'july': 7, 'august': 8,
    'september': 9, 'october': 10, 'november': 11, 'december': 12,
}

# The feed carries no year. A start date that would lie further in the past
# than this many days is assumed to belong to the following year.
_MAX_PAST_DAYS = 180


def _day_month_pattern(prefix):
    """Return a regex fragment for e.g. "19th September" with named groups."""
    return (
        r'\b(?P<%sday>\d{1,2})(?:st|nd|rd|th)\s+(?P<%smonth>%s)\b'
        % (prefix, prefix, '|'.join(_MONTH_NUMBERS))
    )


# Matches "19th September", "19th September 18:00 - 20:00 (Europe/Berlin)",
# "1st October - 3rd October" and combinations of those forms.
_EVENT_DATE_RE = re.compile(
    _day_month_pattern('')
    + r'(?:\s+(?P<hour>\d{1,2}):(?P<minute>\d{2}))?'   # optional start time
    + r'(?:\s*\([^)]*\))?'                             # optional "(timezone)"
    + r'(?:\s*[-\u2013\u2014]\s*'                      # hyphen, en dash or em dash
    + r'(?:' + _day_month_pattern('end_') + r')?'      # optional end day + month
    + r'(?:\s*(?P<end_hour>\d{1,2}):(?P<end_minute>\d{2}))?'  # optional end time
    + r')?',
    re.IGNORECASE,
)


def _parse_start(match, reference):
    """Build the start datetime from a date match, choosing a plausible year."""
    day = int(match.group('day'))
    month = _MONTH_NUMBERS[match.group('month').lower()]
    hour = int(match.group('hour') or 0)
    minute = int(match.group('minute') or 0)
    try:
        start = datetime(reference.year, month, day, hour, minute)
        # e.g. "5th January" read in December means next January
        if reference - start > timedelta(days=_MAX_PAST_DAYS):
            start = start.replace(year=start.year + 1)
    except ValueError:
        # Impossible dates (e.g. 30th February) fall back to the reference time
        logger.warning(
            f"Invalid date: {day} {match.group('month')} {reference.year}, using current date"
        )
        start = reference
    return start


def _parse_end(match, start):
    """Build the end datetime from a date match; defaults to start + 1 day."""
    end_day = match.group('end_day')
    has_end_time = match.group('end_hour') is not None
    hour = int(match.group('end_hour')) if has_end_time else 0
    minute = int(match.group('end_minute')) if has_end_time else 0

    try:
        if end_day is None:
            if not has_end_time:
                return start + timedelta(days=1)
            # Same-day time range; an end before the start crosses midnight
            end = start.replace(hour=hour, minute=minute, second=0, microsecond=0)
            if end < start:
                end += timedelta(days=1)
            return end

        end_month = _MONTH_NUMBERS[match.group('end_month').lower()]
        end = datetime(start.year, end_month, int(end_day), hour, minute)
        # e.g. "30th December - 2nd January" ends in the following year
        if end < start:
            end = end.replace(year=end.year + 1)
    except ValueError:
        logger.warning(
            f"Invalid end date/time in '{match.group(0)}', using start date + 1 day"
        )
        return start + timedelta(days=1)

    if not has_end_time:
        # A bare end date is inclusive: extend to the end of that day
        end += timedelta(days=1)
    return end


def parse_event_dates(description, now=None):
    """
    Parse event dates from the description.

    The first occurrence of a date such as ``19th September``, optionally
    followed by a start time, a ``(timezone)`` note and a dash-separated end
    time and/or end date, is used. HTML entities are decoded first so that
    ``&ndash;`` separators are recognised. The timezone note is ignored; the
    returned values are naive.

    Args:
        description (str): The event description HTML.
        now (datetime, optional): Naive reference time used to pick the year
            and as the fallback date. Defaults to ``datetime.now()``.

    Returns:
        tuple: A tuple containing (start_date, end_date) as ISO format strings.
            When no date can be parsed the result is (now, now + 1 day).
    """
    reference = now if now is not None else datetime.now()
    fallback = (reference.isoformat(), (reference + timedelta(days=1)).isoformat())

    try:
        match = _EVENT_DATE_RE.search(html.unescape(description))
        if match is None:
            logger.warning(
                f"Could not parse date from description, using current date: "
                f"{fallback[0]} to {fallback[1]}"
            )
            return fallback

        start = _parse_start(match, reference)
        end = _parse_end(match, start)
        return (start.isoformat(), end.isoformat())
    except Exception as e:
        # Best effort: a malformed description must not abort the import
        logger.error(f"Error parsing event dates: {e}")
        return fallback
def extract_location(description):
    """
    Extract location information from the event description.

    The text of the second ``<p>...</p>`` paragraph is used as the location
    when it contains a comma (e.g. "Venue, City, Country"). HTML entities in
    it are decoded so the stored name reads "Köln" rather than "K&ouml;ln".

    Args:
        description (str): The event description HTML.

    Returns:
        tuple: A tuple containing (location_name, coordinates). The name is
            "Unknown Location" when no location is found. Coordinates are
            always the placeholder [0, 0] because no geocoding is done here.
    """
    location_name = "Unknown Location"

    # Anything that is not text (e.g. None for an empty element) has no location
    if isinstance(description, str):
        # Only paragraphs made of plain text (no nested tags) are considered
        paragraphs = re.findall(r'<p>([^<]+)</p>', description)
        if len(paragraphs) > 1:
            candidate = html.unescape(paragraphs[1]).strip()
            if candidate and "," in candidate:
                location_name = candidate

    # A fresh list per call so callers may mutate it safely
    return (location_name, [0, 0])
2025-09-18 19:27:28 +02:00
2025-09-18 19:27:28 +02:00
def create_event(item):
    """
    Create an event object from an RSS item.

    Args:
        item: An item element from the RSS feed. Its ``title`` and
            ``description`` children are required; ``link`` and ``guid``
            are optional.

    Returns:
        dict: A GeoJSON Feature representing the event, or None when the
            item lacks a title/description or cannot be converted.
    """
    try:
        # findtext() yields None for a missing element and '' for an empty
        # one, so absent fields no longer surface as AttributeError.
        title = item.findtext('title')
        description = item.findtext('description')
        if not title or not description:
            logger.warning("Skipping RSS item without title or description")
            return None

        link = item.findtext('link')
        # <guid> is optional in RSS; the link is the next best stable id
        external_id = item.findtext('guid') or link

        # Plain-text description: drop tags, decode entities, collapse whitespace
        clean_description = re.sub(r'<[^>]+>', ' ', description)
        clean_description = html.unescape(clean_description)
        clean_description = re.sub(r'\s+', ' ', clean_description).strip()

        # Dates and location are parsed from the raw HTML description
        start_date, end_date = parse_event_dates(description)
        location_name, coordinates = extract_location(description)

        properties = {
            "type": "scheduled",
            "what": "community.osm.event",
            "what:series": "OpenStreetMap Calendar",
            "where": location_name,
            "label": title,
            "description": clean_description,
            "start": start_date,
            "stop": end_date,
            "url": link,
            "external_id": external_id,
            "source": "OSM Calendar"
        }
        if external_id is None:
            # Leave the key out entirely rather than storing a null id
            del properties["external_id"]

        return {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": coordinates
            },
            "properties": properties
        }

    except Exception as e:
        # Best effort: one broken item must not stop the whole import
        logger.error(f"Error creating event from item: {e}")
        return None
2025-09-18 19:27:28 +02:00
def event_exists(db, properties):
    """
    Check if an event with the same properties already exists in the database.

    An event counts as existing when a row in ``events`` has the same
    ``external_id`` tag, or the same ``label``, ``start`` and ``stop`` tags.

    Args:
        db: Database connection.
        properties: Event properties.

    Returns:
        bool: True if the event exists, False otherwise. False is also
            returned when the lookup itself fails (the error is logged).
    """
    cur = None
    try:
        cur = db.cursor()

        # Check if an event with the same external_id exists
        external_id = properties.get('external_id')
        if external_id is not None:
            cur.execute("""
                SELECT events_id FROM events
                WHERE events_tags->>'external_id' = %s;
            """, (external_id,))
            if cur.fetchone():
                logger.info(f"Event with external_id {external_id} already exists")
                return True

        # Check if an event with the same label, start, and stop exists
        cur.execute("""
            SELECT events_id FROM events
            WHERE events_tags->>'label' = %s
            AND events_tags->>'start' = %s
            AND events_tags->>'stop' = %s;
        """, (
            properties.get('label', ''),
            properties.get('start', ''),
            properties.get('stop', '')
        ))
        if cur.fetchone():
            logger.info(f"Event with label '{properties.get('label')}' and same dates already exists")
            return True

        return False
    except Exception as e:
        # NOTE(review): answering "does not exist" after a failed lookup lets
        # the caller attempt the insert anyway - confirm this is intended.
        logger.error(f"Error checking if event exists: {e}")
        return False
    finally:
        # The cursor was previously never released
        if cur is not None:
            cur.close()
2025-09-18 19:27:28 +02:00
def _resolve_geometry_hash(cur, geometry, label):
    """Return the ``geo`` table hash for a GeoJSON geometry, inserting it if needed.

    Returns None (after logging the reason) when the geometry is invalid or
    could not be stored.
    """
    # Insert the geometry into the geo table
    cur.execute("""
        INSERT INTO geo
        SELECT geom, md5(st_astext(geom)) as hash, st_centroid(geom) as geom_center FROM
        (SELECT st_setsrid(st_geomfromgeojson(%s),4326) as geom) as g
        WHERE ST_IsValid(geom)
        ON CONFLICT DO NOTHING RETURNING hash;
    """, (geometry,))
    inserted = cur.fetchone()
    if inserted is not None:
        return inserted[0]

    # Nothing was returned: either the geometry is already stored or it is invalid
    cur.execute("""
        SELECT hash FROM geo
        WHERE hash = md5(st_astext(st_setsrid(st_geomfromgeojson(%s),4326)));
    """, (geometry,))
    existing = cur.fetchone()
    if existing:
        logger.info(f"Using existing geometry with hash: {existing[0]}")
        return existing[0]

    # Not stored yet: compute the hash and validity explicitly
    cur.execute("""
        SELECT md5(st_astext(geom)) as hash,
        ST_IsValid(geom),
        ST_IsValidReason(geom) from (SELECT st_setsrid(st_geomfromgeojson(%s),4326) as geom) as g;
    """, (geometry,))
    checked = cur.fetchone()
    if checked is None or not checked[1]:
        logger.error(f"Invalid geometry for event: {label}")
        if checked and len(checked) > 2:
            logger.error(f"Reason: {checked[2]}")
        return None

    geo_hash = checked[0]

    # Now insert the geometry explicitly
    cur.execute("""
        INSERT INTO geo (geom, hash, geom_center)
        VALUES (
        st_setsrid(st_geomfromgeojson(%s),4326),
        %s,
        st_centroid(st_setsrid(st_geomfromgeojson(%s),4326))
        )
        ON CONFLICT (hash) DO NOTHING;
    """, (geometry, geo_hash, geometry))

    # Verify the geometry was inserted
    cur.execute("SELECT 1 FROM geo WHERE hash = %s", (geo_hash,))
    if cur.fetchone() is None:
        logger.error(f"Failed to insert geometry with hash: {geo_hash}")
        return None

    logger.info(f"Inserted new geometry with hash: {geo_hash}")
    return geo_hash


def submit_event(event):
    """
    Submit an event to the OpenEventDatabase.

    Opens its own database connection, skips events that already exist,
    stores the geometry (reusing an identical one when present) and inserts
    the event row. The connection is always closed, also on errors.

    Args:
        event: A GeoJSON Feature representing the event.

    Returns:
        bool: True if the event was successfully submitted, False otherwise
            (already present, invalid geometry, insert conflict or error).
    """
    db = None
    try:
        # Connect to the database
        db = db_connect()

        # Extract event properties
        properties = event['properties']

        # Check if the event already exists
        if event_exists(db, properties):
            logger.info(f"Skipping event '{properties.get('label')}' as it already exists")
            return False

        cur = db.cursor()
        geometry = json.dumps(event['geometry'])

        geo_hash = _resolve_geometry_hash(cur, geometry, properties.get('label'))
        if geo_hash is None:
            return False

        # A zero-length event needs an inclusive upper bound, otherwise the
        # resulting range would be empty
        bounds = '[]' if properties['start'] == properties['stop'] else '[)'

        # Insert the event into the database
        cur.execute("""
            INSERT INTO events (events_type, events_what, events_when, events_tags, events_geo)
            VALUES (%s, %s, tstzrange(%s, %s, %s), %s, %s)
            ON CONFLICT DO NOTHING RETURNING events_id;
        """, (
            properties['type'],
            properties['what'],
            properties['start'],
            properties['stop'],
            bounds,
            json.dumps(properties),
            geo_hash
        ))

        # No returned row means the insert hit a conflict and stored nothing
        event_id = cur.fetchone()
        if not event_id:
            logger.warning(f"Failed to create event: {properties.get('label')}")
            return False

        # Commit before reporting success so a failed commit is not logged as one
        db.commit()
        logger.success(f"Event created with ID: {event_id[0]}")
        return True

    except Exception as e:
        logger.error(f"Error submitting event: {e}")
        return False
    finally:
        # Previously the connection leaked whenever an exception was raised
        if db is not None:
            db.close()
def main():
    """
    Run the extractor: load the configuration, fetch the OSM Calendar feed
    and store every event that is not in the database yet.

    The process exits with status 1 when the .env file cannot be loaded,
    because it provides the database connection parameters.
    """
    logger.info("Starting OSM Calendar extractor")

    # The .env file is mandatory; without it there is nothing to connect to
    env_loaded = load_env_from_file()
    if not env_loaded:
        logger.error("Required .env file not found. Exiting.")
        sys.exit(1)
    logger.info("Environment variables loaded successfully from .env file")

    # Pull the raw feed items; an empty result ends the run early
    items = fetch_osm_calendar_data()
    if not items:
        logger.warning("No events found, exiting")
        return

    # Convert each item and count the ones that were actually stored
    success_count = 0
    for feed_item in items:
        candidate = create_event(feed_item)
        if candidate and submit_event(candidate):
            success_count += 1

    logger.success(f"Successfully added {success_count} out of {len(items)} events to the database")
2025-09-18 19:27:28 +02:00
# Run the extractor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()