up wiki compare

commit 2f49ef6479 (parent ce508974c9)
23 changed files with 567403 additions and 5132 deletions

wiki_compare/fetch_osm_fr_groups.py (new executable file, 316 lines)
@@ -0,0 +1,316 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fetch_osm_fr_groups.py

This script scrapes the OpenStreetMap wiki page for France/OSM-FR to extract
information about local groups and working groups. It specifically targets
links in the #Pages_des_groupes_locaux section.

Usage:
    python fetch_osm_fr_groups.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - osm_fr_groups.json: JSON file with information about OSM-FR local groups
    - Log messages about the scraping process and results
"""

import argparse
import json
import logging
import os
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "osm_fr_groups.json"
BASE_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

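# Illustrative shape of the osm_fr_groups.json file written by save_results()
# below (values are made-up placeholders, not real wiki data):
#
# {
#   "last_updated": "2025-01-01T12:00:00",
#   "local_groups": [
#     {"name": "...", "url": "...", "description": "...", "type": "local_group"}
#   ],
#   "working_groups": [
#     {"name": "...", "url": "...", "description": "...",
#      "category": "Général", "type": "working_group"}
#   ],
#   "umap_url": "https://umap.openstreetmap.fr/..."
# }
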
def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # timeout so a stalled connection cannot hang the script indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

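# Both extractors below assume the wiki page lists groups as plain <ul> lists
# that directly follow an <h2>/<h3> heading, roughly like this (hypothetical
# markup, shown only to document what the parser expects):
#
#   <h3>Pages des groupes locaux</h3>
#   <ul>
#     <li><a href="/wiki/France/Lyon">Lyon</a> : rencontres mensuelles</li>
#     ...
#   </ul>
#
# If the page layout changes (for example, headings wrapped in extra
# containers), the section lookup or the next_sibling walk may need adjusting.
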
def extract_working_groups(html_content):
    """
    Extract working groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of working group dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    working_groups = []

    # Find the working groups section
    working_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes de travail' or 'Groupes_de_travail' in heading.get_text():
            working_groups_section = heading
            break

    if not working_groups_section:
        logger.warning("Could not find working groups section")
        # No working groups section found; return an empty list
        return []

    # Get the content following the heading until the next heading
    current = working_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href

                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None

                    description = description.strip(' :-,')

                    working_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
                        "category": "Général",
                        "type": "working_group"
                    })
        current = current.next_sibling

    logger.info(f"Found {len(working_groups)} working groups")
    return working_groups

def extract_local_groups(html_content):
    """
    Extract local groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of local group dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    local_groups = []

    # Find the local groups section
    local_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes locaux' or 'Pages des groupes locaux' in heading.get_text():
            local_groups_section = heading
            break

    if not local_groups_section:
        logger.warning("Could not find local groups section")
        return []

    # Get the content following the heading until the next heading
    current = local_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href

                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None

                    description = description.strip(' :-,')

                    local_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
                        "type": "local_group"
                    })
        current = current.next_sibling

    logger.info(f"Found {len(local_groups)} local groups")
    return local_groups

def extract_umap_url(html_content):
    """
    Extract the uMap URL for OSM-FR local groups

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: uMap URL or None if not found
    """
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Look for links to umap.openstreetmap.fr
    for link in soup.find_all('a'):
        href = link.get('href', '')
        if 'umap.openstreetmap.fr' in href and 'groupes-locaux' in href:
            return href

    return None

def save_results(local_groups, working_groups, umap_url, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        local_groups (list): List of local group dictionaries
        working_groups (list): List of working group dictionaries
        umap_url (str): URL to the uMap for local groups
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Local groups: {len(local_groups)}")
        for group in local_groups:
            logger.info(f"  - {group['name']}: {group['url']}")
        logger.info(f"Working groups: {len(working_groups)}")
        for group in working_groups:
            logger.info(f"  - {group['name']}: {group['url']}")
        if umap_url:
            logger.info(f"uMap URL: {umap_url}")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "local_groups": local_groups,
        "working_groups": working_groups,
        "umap_url": umap_url
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape OSM-FR local groups from the wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_osm_fr_groups.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the wiki page content
    html_content = get_page_content(BASE_URL)

    if not html_content:
        logger.error("Failed to get wiki page content")
        return

    # Extract local groups
    local_groups = extract_local_groups(html_content)

    if not local_groups:
        logger.warning("No local groups found")

    # Extract working groups
    working_groups = extract_working_groups(html_content)

    if not working_groups:
        logger.warning("No working groups found")
        # Initialize with an empty list to avoid errors in the controller
        working_groups = []

    # Extract uMap URL
    umap_url = extract_umap_url(html_content)

    # Save results
    success = save_results(local_groups, working_groups, umap_url, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()
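
# Example invocations (matching the usage described in the module docstring):
#   python fetch_osm_fr_groups.py             # scrape and write osm_fr_groups.json
#   python fetch_osm_fr_groups.py --dry-run   # log what would be written without saving
#   python fetch_osm_fr_groups.py --force     # refresh even if the cache is under an hour old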