907 lines
		
	
	
		
			No EOL
		
	
	
		
			36 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			907 lines
		
	
	
		
			No EOL
		
	
	
		
			36 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
| #!/usr/bin/env python3
 | ||
| """
 | ||
| OSM Calendar Extractor for the OpenEventDatabase.
 | ||
| 
 | ||
| This script fetches events from the OpenStreetMap Calendar RSS feed
 | ||
| and adds them to the OpenEventDatabase via the API.
 | ||
| 
 | ||
| For events that don't have geographic coordinates in the RSS feed but have a link
 | ||
| to an OSM Calendar event (https://osmcal.org/event/...), the script will fetch
 | ||
| the iCal version of the event and extract the coordinates and location from there.
 | ||
| 
 | ||
| RSS Feed URL: https://osmcal.org/events.rss
 | ||
| API Endpoint: https://api.openeventdatabase.org/event
 | ||
| 
 | ||
| Usage:
 | ||
|     python osm_cal.py [--max-events MAX_EVENTS] [--offset OFFSET]
 | ||
| 
 | ||
| Arguments:
 | ||
|     --max-events MAX_EVENTS  Maximum number of events to insert (default: 1)
 | ||
|     --offset OFFSET          Number of events to skip from the beginning of the RSS feed (default: 0)
 | ||
| 
 | ||
| Examples:
 | ||
|     # Insert the first event from the RSS feed
 | ||
|     python osm_cal.py
 | ||
|     
 | ||
|     # Insert up to 5 events from the RSS feed
 | ||
|     python osm_cal.py --max-events 5
 | ||
|     
 | ||
|     # Skip the first 3 events and insert the next 2
 | ||
|     python osm_cal.py --offset 3 --max-events 2
 | ||
| 
 | ||
| Environment Variables:
 | ||
|     These environment variables can be set in the system environment or in a .env file
 | ||
|     in the project root directory.
 | ||
| """
 | ||
| 
 | ||
| import json
 | ||
| import requests
 | ||
| import sys
 | ||
| import os
 | ||
| import xml.etree.ElementTree as ET
 | ||
| import re
 | ||
| import html
 | ||
| from datetime import datetime, timedelta
 | ||
| from bs4 import BeautifulSoup
 | ||
| import unicodedata
 | ||
| 
 | ||
| # Add the parent directory to the path so we can import from oedb
 | ||
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 | ||
| 
 | ||
| from oedb.utils.db import db_connect, load_env_from_file
 | ||
| from oedb.utils.logging import logger
 | ||
| 
 | ||
| # RSS Feed URL for OSM Calendar
 | ||
| RSS_URL = "https://osmcal.org/events.rss"
 | ||
| # Base URL for OSM Calendar events
 | ||
| OSMCAL_EVENT_BASE_URL = "https://osmcal.org/event/"
 | ||
| # Main OSM Calendar page
 | ||
| OSMCAL_MAIN_URL = "https://osmcal.org"
 | ||
| # Cache file for processed events
 | ||
| CACHE_FILE = os.path.join(os.path.dirname(__file__), 'osm_cal_cache.json')
 | ||
| 
 | ||
def fix_encoding(text):
    """
    Repair the most common UTF-8 "mojibake" in a string.

    The only corruption handled is UTF-8 bytes that were decoded as Latin-1
    (for example ``"CafÃ©"`` instead of ``"Café"``). It is detected by the
    presence of the marker character ``Ã``.

    Args:
        text (str): Possibly mis-decoded text. Falsy values (``None``, ``""``)
            are returned as-is.

    Returns:
        str: The repaired text, or the original text unchanged when no repair
        applies or the repair attempt fails.
    """
    if not text or 'Ã' not in text:
        return text

    try:
        # Undo the wrong Latin-1 decoding, then decode the bytes as UTF-8.
        corrected = text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The text was not double-encoded after all (e.g. a legitimate
        # "SÃO PAULO"). Leave it untouched rather than rewriting it into a
        # different Unicode normalisation form.
        return text

    logger.info(f"Encodage corrigé : '{text}' -> '{corrected}'")
    return corrected
 | ||
| 
 | ||
def load_event_cache():
    """
    Load the cache of already-processed events from the JSON cache file.

    Returns:
        dict: The decoded content of ``CACHE_FILE``; an empty dict when the
        file does not exist or cannot be read or decoded.
    """
    # Guard clause: nothing on disk yet means we start with an empty cache.
    if not os.path.exists(CACHE_FILE):
        logger.info("Aucun cache trouvé, création d'un nouveau cache")
        return {}

    try:
        with open(CACHE_FILE, 'r', encoding='utf-8') as cache_fh:
            cached_events = json.loads(cache_fh.read())
            logger.info(f"Cache chargé : {len(cached_events)} événements en cache")
            return cached_events
    except Exception as exc:
        # A corrupt or unreadable cache must not stop the run.
        logger.error(f"Erreur lors du chargement du cache : {exc}")
        return {}
 | ||
| 
 | ||
def save_event_cache(cache):
    """
    Persist the event cache to the JSON cache file.

    The data is serialised before any file is touched and then written to a
    temporary sibling file that atomically replaces ``CACHE_FILE``. A failure
    while serialising or writing therefore never truncates the existing cache.

    Args:
        cache (dict): Mapping of events to their processing status.

    Returns:
        None. Errors are logged, not raised.
    """
    try:
        # Serialise first: if the cache holds a non-JSON value we fail here,
        # before the previous cache file has been modified.
        payload = json.dumps(cache, indent=2, ensure_ascii=False)

        tmp_path = CACHE_FILE + '.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(payload)

        # Same directory => same filesystem, so the replace is atomic.
        os.replace(tmp_path, CACHE_FILE)
        logger.info(f"Cache sauvegardé : {len(cache)} événements")
    except Exception as e:
        logger.error(f"Erreur lors de la sauvegarde du cache : {e}")
 | ||
| 
 | ||
def scrape_osmcal_event_links():
    """
    Scrape the osmcal.org main page and collect the links to event pages.

    Several CSS selectors are tried in order; the first one that yields at
    least one event link wins. As a debugging aid the downloaded HTML is also
    written to ``osmcal_debug.html`` next to this script (best effort).

    Returns:
        list: Unique absolute event URLs, in page order. Empty when the page
        cannot be fetched or parsed, or when no event link is found.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    logger.info(f"Scraping de la page principale : {OSMCAL_MAIN_URL}")

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(OSMCAL_MAIN_URL, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Debugging aid: keep a copy of the HTML for inspection. Failing to
        # write it (read-only directory, full disk, ...) must not abort the
        # scrape itself.
        debug_file = os.path.join(os.path.dirname(__file__), 'osmcal_debug.html')
        try:
            with open(debug_file, 'w', encoding='utf-8') as f:
                f.write(response.text)
            logger.info(f"HTML de débogage sauvegardé dans : {debug_file}")
        except OSError as e:
            logger.warning(f"Impossible de sauvegarder le HTML de débogage : {e}")

        event_links = []
        seen_links = set()

        # Selectors are ordered from most specific to most generic.
        selectors_to_try = [
            'a.event-list-entry-box',  # main selector, based on the sample HTML
            'li.event-list-entry a',   # alternative based on the list structure
            '.event-list-entry a',     # same, without requiring the li tag
            'a[href*="/event/"]',      # any link containing "/event/"
            '.event-list-entry-box'    # in case the class is not on an <a>
        ]

        for selector in selectors_to_try:
            logger.info(f"Essai du sélecteur : {selector}")
            elements = soup.select(selector)
            logger.info(f"Trouvé {len(elements)} éléments avec le sélecteur {selector}")

            for element in elements:
                # The matched element is either the link itself or a
                # container holding the link.
                anchor = element if element.name == 'a' else element.find('a')
                href = anchor.get('href') if anchor is not None else None

                # Only links to event pages are of interest.
                if not href or '/event/' not in href:
                    continue

                # urljoin resolves relative, root-relative and
                # protocol-relative hrefs; absolute URLs pass through.
                full_url = urljoin(OSMCAL_MAIN_URL, href)
                if full_url not in seen_links:
                    seen_links.add(full_url)
                    event_links.append(full_url)
                    logger.info(f"Lien d'événement trouvé : {full_url}")

            # Stop at the first selector that produced links.
            if event_links:
                break

        # Nothing found: dump a sample of the page links to help debugging.
        if not event_links:
            logger.warning("Aucun lien d'événement trouvé. Listing de tous les liens pour débogage :")
            all_links = soup.find_all('a', href=True)
            logger.info(f"Total de liens trouvés sur la page : {len(all_links)}")

            for i, link in enumerate(all_links[:10]):
                logger.info(f"Lien {i+1}: {link.get('href')} (classes: {link.get('class', [])})")

            event_related_links = [link for link in all_links if 'event' in link.get('href', '').lower()]
            logger.info(f"Liens contenant 'event' : {len(event_related_links)}")
            for link in event_related_links[:5]:
                logger.info(f"Lien event: {link.get('href')}")

        logger.success(f"Trouvé {len(event_links)} liens d'événements uniques sur la page principale")
        return event_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Erreur lors du scraping de osmcal.org : {e}")
        return []
    except Exception as e:
        logger.error(f"Erreur inattendue lors du scraping : {e}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        return []
 | ||
| 
 | ||
def fetch_osm_calendar_data():
    """
    Fetch events from the OSM Calendar RSS feed.

    Returns:
        list: The ``<item>`` elements of the feed's ``<channel>``. Empty when
        the feed cannot be downloaded or parsed, has no channel, or has no
        items; the reason is logged.
    """
    logger.info("Fetching data from OSM Calendar RSS feed")

    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(RSS_URL, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse from bytes so the XML declaration decides the encoding.
        root = ET.fromstring(response.content)

        channel = root.find('channel')
        if channel is None:
            logger.error("No channel element found in RSS feed")
            return []

        items = channel.findall('item')
        if not items:
            logger.error("No items found in RSS feed")
            return []

        logger.success(f"Successfully fetched {len(items)} events from OSM Calendar RSS feed")
        return items

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from OSM Calendar RSS feed: {e}")
        return []
    except ET.ParseError as e:
        logger.error(f"Error parsing XML response: {e}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error fetching OSM Calendar data: {e}")
        return []
 | ||
| 
 | ||
def _month_number(month_name):
    """Return the 1-12 number of an English month name (any case), or None."""
    months = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    try:
        return months.index(month_name.lower()) + 1
    except ValueError:
        return None


def parse_event_dates(description):
    """
    Parse event dates from the description.

    Recognised forms are an ordinal day and an English month name, optionally
    followed by a start time, a time range, a parenthesised note, and an end
    date, e.g. ``5th June``, ``5th June 18:30 – 20:00`` or
    ``30th December – 2nd January``. The description carries no year, so the
    current year is assumed; an end date that would fall before the start is
    moved to the following year.

    Args:
        description (str): The event description HTML.

    Returns:
        tuple: ``(start_date, stop_date)`` as naive ISO 8601 strings. When no
        end time or end date is given, the stop is midnight after the
        start day. When nothing can be parsed, "now" and "now + 1 day" are
        returned.
    """
    try:
        date_pattern = r'(\d+)(?:st|nd|rd|th)\s+(\w+)(?:\s+(\d+):(\d+)(?:\s+–\s+(\d+):(\d+))?)?(?:\s+\(([^)]+)\))?(?:\s+–\s+(\d+)(?:st|nd|rd|th)\s+(\w+))?'
        candidates = list(re.finditer(date_pattern, description))

        # An ordinal followed by any word matches the pattern ("5th edition"),
        # so prefer the first candidate whose word really is a month name and
        # only fall back to the first raw match when there is none.
        date_match = next(
            (m for m in candidates if _month_number(m.group(2)) is not None),
            candidates[0] if candidates else None,
        )

        if date_match is None:
            # If no date pattern found, use current date as fallback
            now = datetime.now()
            start_iso = now.isoformat()
            end_iso = (now + timedelta(days=1)).isoformat()
            logger.warning(f"Could not parse date from description, using current date: {start_iso} to {end_iso}")
            return (start_iso, end_iso)

        day = int(date_match.group(1))
        month_name = date_match.group(2)

        month = _month_number(month_name)
        if month is None:
            # If month name not found, use current month
            month = datetime.now().month
            logger.warning(f"Could not parse month name: {month_name}, using current month")

        # The description carries no year; assume the current one.
        current_year = datetime.now().year

        try:
            day_start = datetime(current_year, month, day)
            start_is_valid = True
        except ValueError:
            # Handle invalid dates (e.g., February 30)
            logger.warning(f"Invalid date: {day} {month_name} {current_year}, using current date")
            day_start = datetime.now()
            start_is_valid = False

        start_date = day_start
        # Default stop: one day after the (midnight) start.
        end_date = day_start + timedelta(days=1)

        # Apply the captured clock times instead of discarding them.
        start_hour, start_minute, end_hour, end_minute = date_match.group(3, 4, 5, 6)
        if start_is_valid and start_hour is not None:
            try:
                start_date = day_start.replace(hour=int(start_hour), minute=int(start_minute))
                if end_hour is not None:
                    end_date = day_start.replace(hour=int(end_hour), minute=int(end_minute))
                    if end_date <= start_date:
                        # A range such as 22:00 – 02:00 ends on the next day.
                        end_date += timedelta(days=1)
            except ValueError:
                logger.warning(f"Invalid time in date '{date_match.group(0)}', ignoring time of day")

        # An explicit end date takes precedence over a same-day end time.
        if date_match.group(8):
            end_day = int(date_match.group(8))
            end_month_name = date_match.group(9)

            end_month = _month_number(end_month_name)
            if end_month is None:
                # If end month name not found, use start month
                end_month = month
                logger.warning(f"Could not parse end month name: {end_month_name}, using start month")

            try:
                # Add a day to include the full end day
                end_date = datetime(current_year, end_month, end_day) + timedelta(days=1)
                if start_is_valid and end_date <= start_date:
                    # The range crosses New Year (e.g. 30th December – 2nd January).
                    end_date = datetime(current_year + 1, end_month, end_day) + timedelta(days=1)
            except ValueError:
                # Handle invalid dates
                logger.warning(f"Invalid end date: {end_day} {end_month_name} {current_year}, using start date + 1 day")
                end_date = day_start + timedelta(days=1)

        return (start_date.isoformat(), end_date.isoformat())

    except Exception as e:
        logger.error(f"Error parsing event dates: {e}")
        # Return default dates (current date)
        now = datetime.now()
        return (now.isoformat(), (now + timedelta(days=1)).isoformat())
 | ||
| 
 | ||
def fetch_ical_data(event_url):
    """
    Fetch and parse iCal data for an OSM Calendar event.

    The event's ``.ics`` file is downloaded and its ``GEO`` and ``LOCATION``
    properties are extracted with regular expressions.

    Args:
        event_url (str): The URL of the OSM Calendar event.

    Returns:
        tuple: ``(location_name, coordinates)`` where ``coordinates`` is a
        GeoJSON-ordered ``[longitude, latitude]`` list. Missing pieces are
        reported as ``"Unknown Location"`` and ``[0, 0]``; any error yields
        ``("Unknown Location", [0, 0])``.
    """
    try:
        # Check if the URL is an OSM Calendar event URL
        if not event_url.startswith(OSMCAL_EVENT_BASE_URL):
            logger.warning(f"Not an OSM Calendar event URL: {event_url}")
            return ("Unknown Location", [0, 0])

        # Extract the event ID from the URL
        event_id_match = re.search(r'event/(\d+)', event_url)
        if not event_id_match:
            logger.warning(f"Could not extract event ID from URL: {event_url}")
            return ("Unknown Location", [0, 0])

        event_id = event_id_match.group(1)
        ical_url = f"{OSMCAL_EVENT_BASE_URL}{event_id}.ics"

        logger.info(f"Fetching iCal data from: {ical_url}")
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(ical_url, timeout=30)

        if not response.ok:
            logger.warning(f"Failed to fetch iCal data: {response.status_code}")
            return ("Unknown Location", [0, 0])

        # iCalendar is UTF-8 by default; only guess the encoding when the
        # payload is not valid UTF-8, since guessing can itself cause mojibake.
        try:
            ical_content = response.content.decode('utf-8')
        except UnicodeDecodeError:
            response.encoding = response.apparent_encoding or 'utf-8'
            ical_content = response.text

        # Unfold long lines: a line break followed by a space or tab continues
        # the previous line, so a long LOCATION is otherwise cut off.
        ical_content = re.sub(r'\r?\n[ \t]', '', ical_content)

        # Extract GEO information (decimal part optional)
        geo_match = re.search(r'GEO:([-+]?\d+(?:\.\d+)?);([-+]?\d+(?:\.\d+)?)', ical_content)
        if geo_match:
            # GEO format is latitude;longitude
            latitude = float(geo_match.group(1))
            longitude = float(geo_match.group(2))
            if abs(latitude) > 90 >= abs(longitude):
                # The first value cannot be a latitude: the producer emitted
                # the pair in the wrong order, so swap it back.
                logger.warning(f"GEO values look swapped for event {event_id}, reordering")
                latitude, longitude = longitude, latitude
            coordinates = [longitude, latitude]  # GeoJSON uses [longitude, latitude]
            logger.info(f"Extracted coordinates from iCal: {coordinates}")
        else:
            logger.warning(f"No GEO information found in iCal data for event: {event_id}")
            coordinates = [0, 0]

        # Extract LOCATION information
        location_match = re.search(r'LOCATION:(.+?)(?:\r\n|\n|\r)', ical_content)
        if location_match:
            location_name = location_match.group(1).strip()
            # Unescape iCal TEXT escapes: "\n"/"\N" is a line break (rendered
            # as a space here); any other escaped character stands for itself
            # (e.g. "\," becomes ",").
            location_name = re.sub(
                r'\\(.)',
                lambda m: ' ' if m.group(1) in 'nN' else m.group(1),
                location_name,
            )
            location_name = fix_encoding(location_name)
            logger.info(f"Extracted location from iCal: {location_name}")
        else:
            logger.warning(f"No LOCATION information found in iCal data for event: {event_id}")
            location_name = "Unknown Location"

        return (location_name, coordinates)

    except Exception as e:
        logger.error(f"Error fetching or parsing iCal data: {e}")
        return ("Unknown Location", [0, 0])
 | ||
| 
 | ||
def extract_location(description):
    """
    Pull a location name out of the event description.

    The text of the second ``<p>...</p>`` paragraph is used as the location
    when it is non-empty and contains a comma. Coordinates are never derived
    here and are always the ``[0, 0]`` placeholder.

    Args:
        description (str): The event description HTML.

    Returns:
        tuple: ``(location_name, coordinates)``; ``("Unknown Location",
        [0, 0])`` when no suitable paragraph is found or an error occurs.
    """
    try:
        paragraphs = re.findall(r'<p>([^<]+)</p>', description)

        # Guard: there must be a second paragraph to look at.
        if len(paragraphs) < 2:
            return ("Unknown Location", [0, 0])

        candidate = paragraphs[1].strip()
        looks_like_place = (
            bool(candidate)
            and "," in candidate
            and not candidate.startswith('<')
        )
        if not looks_like_place:
            return ("Unknown Location", [0, 0])

        # No geocoding is done here, so the coordinates stay a placeholder.
        return (fix_encoding(candidate), [0, 0])

    except Exception as e:
        logger.error(f"Error extracting location: {e}")
        return ("Unknown Location", [0, 0])
 | ||
| 
 | ||
def create_event(item):
    """
    Build an event object from one RSS item.

    Dates and a first location guess come from the item's description. When
    that yields no coordinates and the item links to an OSM Calendar event,
    the event's iCal data is fetched to fill in coordinates and location.

    Args:
        item: An item element from the RSS feed; it must have ``title``,
            ``link``, ``description`` and ``guid`` children.

    Returns:
        dict: A GeoJSON Feature representing the event, or ``None`` when the
        item cannot be converted (the error is logged).
    """
    try:
        # Read the four required child elements, in feed order.
        title, link, description, guid = (
            item.find(tag).text
            for tag in ('title', 'link', 'description', 'guid')
        )

        # Plain-text description: drop tags, decode entities, squeeze spaces.
        without_tags = re.sub(r'<[^>]+>', ' ', description)
        decoded = html.unescape(without_tags)
        clean_description = re.sub(r'\s+', ' ', decoded).strip()

        # Repair mis-encoded text in the title and the description.
        title = fix_encoding(title)
        clean_description = fix_encoding(clean_description)

        start_date, end_date = parse_event_dates(description)
        location_name, coordinates = extract_location(description)

        # Fall back to the iCal file when the description gave no coordinates.
        needs_ical = coordinates == [0, 0] and link and link.startswith(OSMCAL_EVENT_BASE_URL)
        if needs_ical:
            logger.info(f"No coordinates found in description, trying to get from iCal: {link}")
            ical_location_name, ical_coordinates = fetch_ical_data(link)

            if ical_coordinates != [0, 0]:
                coordinates = ical_coordinates
                logger.info(f"Using coordinates from iCal: {coordinates}")

            if ical_location_name != "Unknown Location":
                location_name = ical_location_name
                logger.info(f"Using location name from iCal: {location_name}")

        geometry = {
            "type": "Point",
            "coordinates": coordinates
        }
        properties = {
            "type": "scheduled",
            "what": "community.osm.event",
            "what:series": "OpenStreetMap Calendar",
            "where": location_name,
            "label": title,
            "description": clean_description,
            "start": start_date,
            "stop": end_date,
            "url": link,
            "external_id": guid,
            "source": "OSM Calendar"
        }

        return {
            "type": "Feature",
            "geometry": geometry,
            "properties": properties
        }

    except Exception as e:
        logger.error(f"Error creating event from item: {e}")
        return None
 | ||
| 
 | ||
def event_exists(db, properties):
    """
    Check if an event with the same properties already exists in the database.

    Two lookups are made against the ``events`` table: first by
    ``external_id`` (when present in ``properties``), then by the combination
    of ``label``, ``start`` and ``stop``.

    Args:
        db: Database connection; must provide ``cursor()``.
            NOTE(review): presumably a psycopg2 connection, given the ``%s``
            placeholders and ``->>`` JSON operator - confirm.
        properties (dict): Event properties.

    Returns:
        bool: True if a matching event exists, False otherwise. False is also
        returned when the lookup itself fails (the error is logged).
    """
    # Routed through the logger like the rest of the file (was a stray print).
    logger.info(f"event: {properties}")

    cur = None
    try:
        cur = db.cursor()

        # Check if an event with the same external_id exists
        if 'external_id' in properties:
            cur.execute("""
                SELECT events_id FROM events
                WHERE events_tags->>'external_id' = %s;
            """, (properties['external_id'],))

            if cur.fetchone():
                logger.info(f"Event with external_id {properties['external_id']} already exists")
                return True

        # Check if an event with the same label, start, and stop exists
        cur.execute("""
            SELECT events_id FROM events
            WHERE events_tags->>'label' = %s
            AND events_tags->>'start' = %s
            AND events_tags->>'stop' = %s;
        """, (
            properties.get('label', ''),
            properties.get('start', ''),
            properties.get('stop', '')
        ))

        if cur.fetchone():
            logger.info(f"Event with label '{properties.get('label')}' and same dates already exists")
            return True

        return False

    except Exception as e:
        logger.error(f"Error checking if event exists: {e}")
        return False
    finally:
        # Always release the cursor; a failure to close must not change the
        # result that was already decided above.
        if cur is not None:
            try:
                cur.close()
            except Exception as e:
                logger.warning(f"Error closing database cursor: {e}")
 | ||
| 
 | ||
def _response_event_id(response):
    """Return the ``id`` field of a JSON object response, or None if absent or unparsable."""
    try:
        payload = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return None
    return payload.get('id') if isinstance(payload, dict) else None


def submit_event(event):
    """
    Submit an event to the OpenEventDatabase using the API.

    HTTP 200/201 mean the event was created; HTTP 409 means it already exists
    and is also treated as success. A successful response with a missing or
    unparsable body still counts as success, so the event is not submitted
    again on a later run.

    Args:
        event: A GeoJSON Feature representing the event.

    Returns:
        tuple: A tuple containing (success: bool, event_id: str or None).
               success is True if the event was successfully submitted, False otherwise.
               event_id is the OEDB event ID if available, None otherwise.
    """
    try:
        # Extract event properties for logging
        properties = event['properties']

        # API endpoint for OpenEventDatabase
        api_url = "https://api.openeventdatabase.org/event"

        logger.info(f"Submitting event '{properties.get('label')}' to API")
        response = requests.post(
            api_url,
            headers={"Content-Type": "application/json"},
            data=json.dumps(event),
            # Without a timeout, requests may block forever on a stalled server.
            timeout=30,
        )

        if response.status_code in (200, 201):
            event_id = _response_event_id(response)

            if event_id:
                logger.success(f"Event created with ID: {event_id}")
                logger.info(f" https://api.openeventdatabase.org/event/{event_id}")
                return (True, event_id)

            logger.warning(f"Event created but no ID returned in response")
            return (True, None)

        if response.status_code == 409:
            # 409 Conflict: the event already exists, which counts as success.
            logger.success(f"Event already exists in database: {properties.get('label')} (HTTP 409)")
            # The response may carry the ID of the existing event.
            return (True, _response_event_id(response))

        logger.warning(f"Failed to create event: {properties.get('label')}. Status code: {response.status_code}")
        logger.warning(f"Response: {response.text}")
        return (False, None)

    except Exception as e:
        logger.error(f"Error submitting event: {e}")
        return (False, None)
 | ||
| 
 | ||
def main(max_events=1, offset=0):
    """
    Fetch OSM Calendar events and add them to the OpenEventDatabase API.

    Event links are scraped from the OSM Calendar main page and compared with
    the local event cache. Every link whose cache entry is not marked
    'success' is a candidate for insertion; ``offset`` and ``max_events``
    select which slice of those candidates is processed during this run.
    Details for each candidate are taken from the RSS feed when the link is
    present there, otherwise a minimal event is built from the link itself.

    Args:
        max_events (int): Maximum number of events to insert (default: 1)
        offset (int): Number of not-yet-inserted events to skip from the
            beginning of the candidate list (default: 0)

    The function exits the process with status 1 when the .env file cannot
    be loaded, as it is required for environment variables.
    """
    logger.info(f"Starting OSM Calendar extractor (max_events={max_events}, offset={offset})")

    # Load environment variables from .env file and check if it exists
    if not load_env_from_file():
        logger.error("Required .env file not found. Exiting.")
        sys.exit(1)

    logger.info("Environment variables loaded successfully from .env file")

    # Load the cache of already processed events
    event_cache = load_event_cache()

    # Scrape the main page to collect every event link
    event_links = scrape_osmcal_event_links()

    if not event_links:
        logger.warning("Aucun lien d'événement trouvé sur la page principale")
        return

    # Split the links into already-inserted events and events still to process
    new_events = []
    success_events = []

    for link in event_links:
        if link in event_cache and event_cache[link].get('status') == 'success':
            success_events.append(link)
            oedb_id = event_cache[link].get('oedb_event_id', 'ID non disponible')
            logger.info(f"Événement déjà traité avec succès (ID OEDB: {oedb_id}), ignoré : {link}")
            continue

        new_events.append(link)
        if link not in event_cache:
            # First time this link is seen: create its cache entry
            event_cache[link] = {
                'discovered_at': datetime.now().isoformat(),
                'status': 'pending',
                'attempts': 0
            }
        else:
            # Known link that has not succeeded yet: log its current state
            current_status = event_cache[link].get('status', 'unknown')
            attempts = event_cache[link].get('attempts', 0)
            oedb_id = event_cache[link].get('oedb_event_id', 'non disponible')
            logger.info(f"Événement à retraiter (statut: {current_status}, tentatives: {attempts}, ID OEDB: {oedb_id}) : {link}")

    logger.info(f"Liens d'événements trouvés : {len(event_links)}")
    logger.info(f"Événements déjà traités avec succès : {len(success_events)}")
    logger.info(f"Nouveaux événements à traiter : {len(new_events)}")

    if not new_events:
        logger.success("Aucun nouvel événement à traiter. Tous les événements ont déjà été insérés avec succès.")
        return

    # Apply the offset and the limit to the candidate events
    if offset >= len(new_events):
        logger.warning(f"Offset {offset} est supérieur ou égal au nombre de nouveaux événements {len(new_events)}")
        return

    events_to_process = new_events[offset:offset + max_events]
    logger.info(f"Traitement de {len(events_to_process)} nouveaux événements")

    # Fetch events from the OSM Calendar RSS feed to obtain their details
    rss_items = fetch_osm_calendar_data()

    if not rss_items:
        logger.warning("Aucun événement trouvé dans le flux RSS, mais continuons avec les liens scrapés")

    # Map RSS links to their items for quick lookup. ``or []`` keeps the
    # promise made by the warning above: an empty or missing feed must not
    # abort the run.
    rss_link_to_item = {}
    for item in rss_items or []:
        link_element = item.find('link')
        if link_element is not None:
            rss_link_to_item[link_element.text] = item

    # Process each candidate event
    success_count = 0
    try:
        for event_link in events_to_process:
            try:
                entry = event_cache[event_link]

                # Extra safety: skip events that are already marked as inserted
                if entry.get('status') == 'success':
                    logger.info(f"Événement déjà en succès, passage au suivant : {event_link}")
                    success_count += 1  # Counted as a success since it is already handled
                    continue

                # Entries written by an older cache format may lack 'attempts'
                entry['attempts'] = entry.get('attempts', 0) + 1
                entry['last_attempt'] = datetime.now().isoformat()

                # Look up the matching item in the RSS feed
                rss_item = rss_link_to_item.get(event_link)

                if rss_item is not None:
                    # Build the event from the RSS item
                    event = create_event(rss_item)
                else:
                    # Not in the RSS feed: try to build a minimal event from the link
                    logger.warning(f"Événement {event_link} non trouvé dans le flux RSS, tentative de création depuis le lien")
                    event = create_event_from_link(event_link)

                if not event:
                    entry['status'] = 'failed'
                    logger.error(f"Impossible de créer l'événement depuis : {event_link}")
                    continue

                # Try to submit the event to the API
                submit_success, oedb_event_id = submit_event(event)
                if not submit_success:
                    entry['status'] = 'failed'
                    logger.warning(f"Échec de l'insertion de l'événement : {event_link}")
                    continue

                success_count += 1
                entry['status'] = 'success'
                entry['inserted_at'] = datetime.now().isoformat()
                # Remember the OEDB event ID in the cache when the API returned one
                if oedb_event_id:
                    entry['oedb_event_id'] = oedb_event_id
                    logger.success(f"Événement inséré avec succès (ID OEDB: {oedb_event_id}) : {event_link}")
                else:
                    logger.success(f"Événement inséré avec succès (ID OEDB non disponible) : {event_link}")

            except Exception as e:
                logger.error(f"Erreur lors du traitement de l'événement {event_link} : {e}")
                event_cache[event_link]['status'] = 'error'
                event_cache[event_link]['error'] = str(e)
    finally:
        # Persist the updated cache even if the run is interrupted, so the
        # progress made so far is not lost
        save_event_cache(event_cache)

    # Compute the final cache statistics
    cache_stats = {
        'success': 0,
        'pending': 0,
        'failed': 0,
        'error': 0,
        'total': len(event_cache)
    }

    for data in event_cache.values():
        status = data.get('status', 'pending')
        if status in cache_stats:
            cache_stats[status] += 1

    # Events still awaiting insertion (everything except 'success')
    events_awaiting_insertion = cache_stats['pending'] + cache_stats['failed'] + cache_stats['error']

    logger.success(f"Traitement terminé : {success_count} événements insérés avec succès sur {len(events_to_process)} traités")
    logger.info("=== STATISTIQUES GLOBALES DU CACHE ===")
    logger.info(f"Total d'événements dans le cache : {cache_stats['total']}")
    logger.info(f"Événements traités avec succès : {cache_stats['success']}")
    logger.info(f"Événements en attente d'insertion : {events_awaiting_insertion}")
    logger.info(f"  - Statut 'pending' : {cache_stats['pending']}")
    logger.info(f"  - Statut 'failed' : {cache_stats['failed']}")
    logger.info(f"  - Statut 'error' : {cache_stats['error']}")

    if events_awaiting_insertion > 0:
        logger.info(f"🔄 Il reste {events_awaiting_insertion} événements à traiter lors de la prochaine exécution")
    else:
        logger.success("✅ Tous les événements découverts ont été traités avec succès")
 | ||
| 
 | ||
def create_event_from_link(event_link):
    """
    Build a minimal event from an osmcal.org link when it is absent from the RSS feed.

    The location name and coordinates come from ``fetch_ical_data``. No real
    schedule is read here: ``start`` is set to the current local time and
    ``stop`` to one day later, as placeholders.

    Args:
        event_link (str): URL of the osmcal.org event

    Returns:
        dict: A GeoJSON Feature describing the event, or None when the link is
        not an OSM Calendar event URL or any error occurs while building it
    """
    try:
        logger.info(f"Tentative de création d'événement depuis le lien : {event_link}")

        # Only links pointing at an OSM Calendar event can be handled
        if not event_link.startswith(OSMCAL_EVENT_BASE_URL):
            logger.warning(f"Lien non reconnu comme un événement OSM Calendar : {event_link}")
            return None

        location_name, coordinates = fetch_ical_data(event_link)

        # Derive the external ID from the numeric event ID in the URL; fall
        # back to the raw link (and an "unknown" label) when there is none
        event_id_match = re.search(r'event/(\d+)', event_link)
        if event_id_match:
            label_id = event_id_match.group(1)
            external_id = f"osmcal_{label_id}"
        else:
            label_id = 'inconnu'
            external_id = event_link

        # Assemble the event from the minimal information available
        now = datetime.now()
        return {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": coordinates
            },
            "properties": {
                "type": "scheduled",
                "what": "community.osm.event",
                "what:series": "OpenStreetMap Calendar",
                "where": location_name,
                "label": f"Événement OSM Calendar {label_id}",
                "description": f"Événement trouvé sur osmcal.org : {event_link}",
                "start": now.isoformat(),
                "stop": (now + timedelta(days=1)).isoformat(),
                "url": event_link,
                "external_id": external_id,
                "source": "OSM Calendar (scraped)"
            }
        }

    except Exception as e:
        logger.error(f"Erreur lors de la création d'événement depuis le lien {event_link} : {e}")
        return None
 | ||
| 
 | ||
if __name__ == "__main__":
    import argparse

    # Describe the command line interface: both options are optional integers
    cli = argparse.ArgumentParser(
        description='OSM Calendar Extractor for the OpenEventDatabase'
    )
    cli.add_argument(
        '--max-events',
        type=int,
        default=1,
        help='Maximum number of events to insert (default: 1)',
    )
    cli.add_argument(
        '--offset',
        type=int,
        default=0,
        help='Number of events to skip from the beginning of the RSS feed (default: 0)',
    )

    # Read the options and hand them straight to the extractor
    options = cli.parse_args()
    main(max_events=options.max_events, offset=options.offset)
