#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_osm_fr_groups.py

This script scrapes the OpenStreetMap wiki page for France/OSM-FR to extract
information about local groups and working groups. It specifically targets
links in the #Pages_des_groupes_locaux section.

Usage:
    python fetch_osm_fr_groups.py [--dry-run] [--force]

Options:
    --dry-run   Run the script without saving the results to a file
    --force     Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - osm_fr_groups.json: JSON file with information about OSM-FR local groups
    - Log messages about the scraping process and results
"""

import argparse
import json
import logging
import os
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "osm_fr_groups.json"
BASE_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour


def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old.

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        return (datetime.now() - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page.

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page, or None if the request failed
    """
    try:
        response = requests.get(url, timeout=30)  # timeout so a stalled connection doesn't hang the script
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_working_groups(html_content):
    """
    Extract working groups from the wiki page HTML.

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of working group dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    working_groups = []

    # Find the working groups section ("Groupes de travail"), matching either the
    # heading text or the section anchor id
    working_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes de travail' or heading.find(id='Groupes_de_travail'):
            working_groups_section = heading
            break

    if not working_groups_section:
        logger.warning("Could not find working groups section")
        return []

    # Walk the siblings following the heading until the next heading
    current = working_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href

                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None
                    description = description.strip(' :-,')

                    working_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
"category": "Général", "type": "working_group" }) current = current.next_sibling logger.info(f"Found {len(working_groups)} working groups") return working_groups def extract_local_groups(html_content): """ Extract local groups from the wiki page HTML Args: html_content (str): HTML content of the wiki page Returns: list: List of local group dictionaries """ if not html_content: return [] soup = BeautifulSoup(html_content, 'html.parser') local_groups = [] # Find the local groups section local_groups_section = None for heading in soup.find_all(['h2', 'h3']): if heading.get_text().strip() == 'Groupes locaux' or 'Pages des groupes locaux' in heading.get_text(): local_groups_section = heading break if not local_groups_section: logger.warning("Could not find local groups section") return [] # Get the content following the heading until the next heading current = local_groups_section.next_sibling while current and not current.name in ['h2', 'h3']: if current.name == 'ul': # Process list items for li in current.find_all('li', recursive=False): link = li.find('a') if link: name = link.get_text().strip() url = WIKI_BASE_URL + link.get('href') if link.get('href').startswith('/') else link.get('href') # Extract description (text after the link) description = "" next_node = link.next_sibling while next_node: if isinstance(next_node, str): description += next_node.strip() next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None description = description.strip(' :-,') local_groups.append({ "name": name, "url": url, "description": description, "type": "local_group" }) current = current.next_sibling logger.info(f"Found {len(local_groups)} local groups") return local_groups def extract_umap_url(html_content): """ Extract the uMap URL for OSM-FR local groups Args: html_content (str): HTML content of the wiki page Returns: str: uMap URL or None if not found """ if not html_content: return None soup = BeautifulSoup(html_content, 'html.parser') # Look for links to umap.openstreetmap.fr for link in soup.find_all('a'): href = link.get('href', '') if 'umap.openstreetmap.fr' in href and 'groupes-locaux' in href: return href return None def save_results(local_groups, working_groups, umap_url, dry_run=False): """ Save the results to a JSON file Args: local_groups (list): List of local group dictionaries working_groups (list): List of working group dictionaries umap_url (str): URL to the uMap for local groups dry_run (bool): If True, don't actually save to file Returns: bool: True if saving was successful or dry run, False otherwise """ if dry_run: logger.info("DRY RUN: Would have saved results to file") logger.info(f"Local groups: {len(local_groups)}") for group in local_groups: logger.info(f" - {group['name']}: {group['url']}") logger.info(f"Working groups: {len(working_groups)}") for group in working_groups: logger.info(f" - {group['name']}: {group['url']}") if umap_url: logger.info(f"uMap URL: {umap_url}") return True # Prepare the data structure data = { "last_updated": datetime.now().isoformat(), "local_groups": local_groups, "working_groups": working_groups, "umap_url": umap_url } try: with open(OUTPUT_FILE, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Successfully saved {len(local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}") return True except IOError as e: logger.error(f"Error saving results to {OUTPUT_FILE}: {e}") return False def main(): """Main function to execute the script""" parser = 
    parser = argparse.ArgumentParser(description="Scrape OSM-FR local groups from the wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_osm_fr_groups.py")

    # Check if the cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600:.0f} hour(s) old)")
        logger.info("Use --force to update anyway")
        return

    # Get the wiki page content
    html_content = get_page_content(BASE_URL)
    if not html_content:
        logger.error("Failed to get wiki page content")
        return

    # Extract local groups
    local_groups = extract_local_groups(html_content)
    if not local_groups:
        logger.warning("No local groups found")

    # Extract working groups
    working_groups = extract_working_groups(html_content)
    if not working_groups:
        logger.warning("No working groups found")
        # Keep an empty list so downstream consumers (the controller) always receive a list
        working_groups = []

    # Extract uMap URL
    umap_url = extract_umap_url(html_content)

    # Save results
    success = save_results(local_groups, working_groups, umap_url, args.dry_run)
    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
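
# ---------------------------------------------------------------------------
# Illustrative note (not executed): the JSON written by save_results() has the
# top-level keys "last_updated", "local_groups", "working_groups" and
# "umap_url", so a consumer could read it roughly as sketched below. The field
# names mirror the structure built in save_results(); the consumer snippet
# itself is a hypothetical example and not part of this script.
#
#   import json
#   with open("osm_fr_groups.json", encoding="utf-8") as f:
#       data = json.load(f)
#   for group in data["local_groups"]:
#       print(group["name"], group["url"])
# ---------------------------------------------------------------------------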