# osm-labo/wiki_compare/fetch_osm_fr_groups.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fetch_osm_fr_groups.py

This script fetches information about OSM-FR local groups from two sources:
1. The OpenStreetMap wiki page for France/OSM-FR (specifically the #Pages_des_groupes_locaux section)
2. The Framacalc spreadsheet at https://framacalc.org/osm-groupes-locaux

It then verifies that each group from the Framacalc has a corresponding wiki page.

Usage:
    python fetch_osm_fr_groups.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force update even if the cache is still fresh (less than 1 hour old)

Output:
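    - osm_fr_groups.json: JSON file with the OSM-FR local groups (from both the
      wiki and Framacalc), the working groups, the uMap URL, the wiki links and
      a "last_updated" timestamp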
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
import csv
import io
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

# Logging setup: INFO level, one timestamped line per record
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Constants
WIKI_BASE_URL = "https://wiki.openstreetmap.org"  # prefixed to site-relative hrefs
BASE_URL = WIKI_BASE_URL + "/wiki/France/OSM-FR"  # the OSM-FR wiki page
WIKI_GROUPS_URL = BASE_URL + "#Groupes_locaux"  # same page, with a section fragment
FRAMACALC_URL = "https://framacalc.org/osm-groupes-locaux/export/csv"  # CSV export of the sheet
OUTPUT_FILE = "osm_fr_groups.json"  # result file, also used as the cache
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    The age is computed from the "last_updated" ISO timestamp stored inside
    OUTPUT_FILE. A missing file, an unreadable or malformed file, or a timestamp
    lying in the future are all treated as "not fresh".

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # No cache yet: not an error, simply nothing to reuse
        return False
    except (OSError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

    try:
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        # Compare like with like: subtracting a timezone-aware datetime from a
        # naive one raises TypeError
        now = datetime.now(last_updated.tzinfo)
        age = now - last_updated
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError: JSON root is not an object; TypeError: timestamp is
        # not a string; ValueError: timestamp is not ISO formatted
        logger.error(f"Error checking cache freshness: {e}")
        return False

    # A negative age (timestamp in the future, e.g. after a clock change) must
    # not keep the cache "fresh" indefinitely
    return timedelta(0) <= age < CACHE_DURATION

def get_page_content(url, timeout=30):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch
        timeout (float): Seconds to wait for the server before giving up;
            without it requests waits indefinitely on a stalled connection

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike
        logger.error(f"Error fetching {url}: {e}")
        return None
    return response.text

def _find_section_heading(soup, title, fragments=(), anchors=(), levels=('h2', 'h3')):
    """
    Find the first heading that introduces a given wiki section

    Args:
        soup (BeautifulSoup): Parsed wiki page
        title (str): Exact heading title to look for
        fragments (tuple): Substrings that also identify the heading title
        anchors (tuple): Section anchor ids that also identify the heading
        levels (tuple): Heading tag names to consider

    Returns:
        Tag: The first matching heading, or None if no heading matches
    """
    for heading in soup.find_all(list(levels)):
        # Older MediaWiki markup nests the title in a "mw-headline" span next to
        # the "[edit]" links, so the full heading text never equals the title.
        headline = heading.find(class_='mw-headline')
        text = (heading if headline is None else headline).get_text().strip()
        if text == title or any(fragment in text for fragment in fragments):
            return heading

        # The anchor lives in an id attribute (on the heading or inside it),
        # not in the visible text.
        ids = {heading.get('id')}
        ids.update(tag.get('id') for tag in heading.find_all(id=True))
        if ids.intersection(anchors):
            return heading
    return None


def _iter_section_nodes(heading, levels=('h2', 'h3')):
    """
    Yield the nodes following a section heading, up to the next heading

    Args:
        heading (Tag): Heading that opens the section
        levels (tuple): Heading tag names that close the section

    Yields:
        PageElement: Each sibling node (tag or string) inside the section
    """
    def wraps_heading(node):
        # Newer MediaWiki markup: <div class="mw-heading ..."><h2>...</h2>...</div>
        return (
            getattr(node, 'name', None) == 'div'
            and 'mw-heading' in (node.get('class') or [])
            and node.find(list(levels), recursive=False) is not None
        )

    # When the heading is wrapped, the section content follows the wrapper
    # rather than the heading itself.
    start = heading
    parent = heading.parent
    if parent is not None and wraps_heading(parent):
        start = parent

    node = start.next_sibling
    while node is not None:
        if getattr(node, 'name', None) in levels or wraps_heading(node):
            return
        yield node
        node = node.next_sibling


def _parse_group_item(li):
    """
    Read the name, URL and description of one group list item

    Args:
        li (Tag): A <li> element of the groups list

    Returns:
        tuple: (name, url, description), or None if the item has no link
    """
    # Only consider anchors that carry an href; a bare <a> would otherwise
    # crash on href.startswith().
    link = li.find('a', href=True)
    if link is None:
        return None

    href = link['href']
    url = WIKI_BASE_URL + href if href.startswith('/') else href

    # Description: the plain text that follows the link inside the item.
    # Fragments are joined with a space so that words separated by inline
    # markup are not glued together.
    pieces = []
    for node in link.next_siblings:
        if isinstance(node, str):
            piece = node.strip()
            if piece:
                pieces.append(piece)
    description = ' '.join(pieces).strip(' :-,')

    return link.get_text().strip(), url, description


def _collect_section_groups(section_heading, extra_fields):
    """
    Build one dictionary per linked list item found in a wiki section

    Args:
        section_heading (Tag): Heading that opens the section
        extra_fields (dict): Constant entries appended to every group

    Returns:
        list: List of group dictionaries
    """
    groups = []
    for node in _iter_section_nodes(section_heading):
        if getattr(node, 'name', None) != 'ul':
            continue
        # Direct children only: nested sub-lists are not separate groups
        for li in node.find_all('li', recursive=False):
            parsed = _parse_group_item(li)
            if parsed is None:
                continue
            name, url, description = parsed
            groups.append({
                "name": name,
                "url": url,
                "description": description,
                **extra_fields,
            })
    return groups


def extract_working_groups(html_content):
    """
    Extract working groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of working group dictionaries with the keys "name", "url",
            "description", "category" and "type"; empty if the content is
            empty or the section cannot be found
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    heading = _find_section_heading(
        soup,
        'Groupes de travail',
        fragments=('Groupes_de_travail',),
        anchors=('Groupes_de_travail',),
    )
    if heading is None:
        logger.warning("Could not find working groups section")
        return []

    working_groups = _collect_section_groups(
        heading, {"category": "Général", "type": "working_group"}
    )
    logger.info(f"Found {len(working_groups)} working groups")
    return working_groups


def extract_local_groups_from_wiki(html_content):
    """
    Extract local groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of local group dictionaries with the keys "name", "url",
            "description", "type" and "source"; empty if the content is empty
            or the section cannot be found
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    heading = _find_section_heading(
        soup,
        'Groupes locaux',
        fragments=('Pages des groupes locaux',),
        anchors=('Groupes_locaux', 'Pages_des_groupes_locaux'),
    )
    if heading is None:
        logger.warning("Could not find local groups section")
        return []

    local_groups = _collect_section_groups(
        heading, {"type": "local_group", "source": "wiki"}
    )
    logger.info(f"Found {len(local_groups)} local groups from wiki")
    return local_groups

def _locate_framacalc_columns(headers):
    """
    Find the name, contact and website columns from the CSV header row

    Each header is assigned to the first still-unassigned column whose keywords
    it contains, so a later header such as "Contact du groupe" can no longer
    overwrite a name column that was already found.

    Args:
        headers (list): Header cells of the CSV

    Returns:
        tuple: (name_idx, contact_idx, website_idx), -1 for a missing column
    """
    keywords = (
        ('name', ('nom', 'groupe')),
        ('contact', ('contact', 'email')),
        ('website', ('site', 'web')),
    )
    found = {column: -1 for column, _ in keywords}

    for index, header in enumerate(headers):
        header_lower = header.lower()
        for column, needles in keywords:
            if found[column] == -1 and any(needle in header_lower for needle in needles):
                found[column] = index
                break

    return found['name'], found['contact'], found['website']


def _parse_framacalc_rows(rows):
    """
    Turn the parsed CSV rows into local group dictionaries

    Args:
        rows (list): CSV rows, the first one being the header row

    Returns:
        list: List of local group dictionaries (empty if there is no data row
            or no name column)
    """
    # A header row alone is not data
    if len(rows) < 2:
        logger.warning("No data found in Framacalc CSV")
        return []

    name_idx, contact_idx, website_idx = _locate_framacalc_columns(rows[0])
    if name_idx == -1:
        logger.warning("Could not find name column in Framacalc CSV")
        return []

    def cell(row, index):
        # Rows may be shorter than the header row
        return row[index].strip() if index != -1 and index < len(row) else ""

    local_groups = []
    for row in rows[1:]:  # Skip header row
        name = cell(row, name_idx)
        if not name:
            continue  # Skip empty rows

        local_groups.append({
            "name": name,
            "contact": cell(row, contact_idx),
            "website": cell(row, website_idx),
            "type": "local_group",
            "source": "framacalc",
            "has_wiki_page": False,  # Will be updated later
            "wiki_url": ""  # Will be updated later
        })

    logger.info(f"Found {len(local_groups)} local groups from Framacalc")
    return local_groups


def fetch_framacalc_data():
    """
    Fetch local groups data from Framacalc

    Downloads the CSV export at FRAMACALC_URL and converts every row that has a
    name into a group dictionary. Any failure is logged and yields an empty
    list.

    Returns:
        list: List of local group dictionaries from Framacalc
    """
    try:
        # Without a timeout a stalled server would hang the script forever
        response = requests.get(FRAMACALC_URL, timeout=30)
        response.raise_for_status()

        # requests falls back to ISO-8859-1 for text/* responses that declare
        # no charset, which would garble accented names.
        # NOTE(review): assumes the export is UTF-8 in that case - confirm.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = 'utf-8'

        # Parse CSV data
        rows = list(csv.reader(io.StringIO(response.text)))
        return _parse_framacalc_rows(rows)

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Framacalc data: {e}")
        return []
    except Exception as e:
        # Best-effort boundary: a malformed sheet must not abort the whole run
        logger.error(f"Error processing Framacalc data: {e}")
        return []

def extract_wiki_group_links():
    """
    Extract links to local group wiki pages from the OSM-FR wiki page

    Fetches WIKI_GROUPS_URL, locates the "Pages des groupes locaux" heading and
    reads every top-level list item that follows it, up to the next heading.
    The group name is the item text before its first comma; the URL comes from
    the first link of the item that has an href.

    Returns:
        dict: Dictionary mapping group names to wiki URLs (empty on any error)
    """
    heading_levels = ['h2', 'h3', 'h4']

    def is_heading_wrapper(node):
        # Newer MediaWiki markup: <div class="mw-heading ..."><h3>...</h3>...</div>
        return (
            getattr(node, 'name', None) == 'div'
            and 'mw-heading' in (node.get('class') or [])
            and node.find(heading_levels, recursive=False) is not None
        )

    try:
        # Get the wiki page content; the timeout prevents an endless wait
        response = requests.get(WIKI_GROUPS_URL, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        wiki_links = {}

        # Find the "Pages des groupes locaux" section
        pages_section = None
        for heading in soup.find_all(heading_levels):
            if 'Pages des groupes locaux' in heading.get_text():
                pages_section = heading
                break

        if pages_section is None:
            logger.warning("Could not find 'Pages des groupes locaux' section")
            return {}

        # When the heading is wrapped in a div, the section content follows the
        # wrapper rather than the heading itself
        start = pages_section
        wrapper = pages_section.parent
        if wrapper is not None and is_heading_wrapper(wrapper):
            start = wrapper

        # Walk the content following the heading until the next heading
        current = start.next_sibling
        while current is not None:
            tag_name = getattr(current, 'name', None)
            if tag_name in heading_levels or is_heading_wrapper(current):
                break

            if tag_name == 'ul':
                # Process list items
                for li in current.find_all('li', recursive=False):
                    text = li.get_text().strip()
                    # href=True skips bare anchors, which would otherwise raise
                    # and make the broad handler below discard every link
                    link = li.find('a', href=True)
                    if link is None or not text:
                        continue

                    # Extract group name (before the comma)
                    group_name = text.split(',', 1)[0].strip()
                    if not group_name:
                        # An empty name is a substring of every other name
                        continue

                    href = link['href']
                    wiki_links[group_name] = WIKI_BASE_URL + href if href.startswith('/') else href

            current = current.next_sibling

        logger.info(f"Found {len(wiki_links)} wiki links for local groups")
        return wiki_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki group links: {e}")
        return {}
    except Exception as e:
        # Best-effort boundary: unexpected markup must not abort the whole run
        logger.error(f"Error processing wiki group links: {e}")
        return {}

def verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links):
    """
    Verify that each group from Framacalc has a corresponding wiki page

    Names are compared case-insensitively. An exact name match is preferred;
    otherwise the first wiki name that contains the group name, or is contained
    in it, is used. Blank names never match. The dictionaries are updated in
    place: "has_wiki_page" and "wiki_url" are set on every group.

    Args:
        framacalc_groups (list): List of local group dictionaries from Framacalc
        wiki_links (dict): Dictionary mapping group names to wiki URLs

    Returns:
        list: Updated list of local group dictionaries with wiki verification
            (the same list object that was passed in)
    """
    # Normalise the wiki names once instead of once per comparison. Blank names
    # are dropped: an empty string is a substring of every group name.
    candidates = [
        (wiki_name.strip().casefold(), wiki_url)
        for wiki_name, wiki_url in wiki_links.items()
        if wiki_name and wiki_name.strip()
    ]

    # First entry wins when two wiki names differ only by case
    exact_matches = {}
    for wiki_key, wiki_url in candidates:
        exact_matches.setdefault(wiki_key, wiki_url)

    for group in framacalc_groups:
        group_key = group['name'].strip().casefold()

        match = None
        if group_key:
            # Exact match first, so "Lyon" is not captured by "Grand Lyon Est"
            match = exact_matches.get(group_key)
            if match is None:
                match = next(
                    (
                        wiki_url
                        for wiki_key, wiki_url in candidates
                        if group_key in wiki_key or wiki_key in group_key
                    ),
                    None,
                )

        group['has_wiki_page'] = match is not None
        group['wiki_url'] = match if match is not None else ""

    return framacalc_groups

def extract_umap_url(html_content):
    """
    Locate the uMap link for the OSM-FR local groups in the wiki page

    The first anchor whose href contains both "umap.openstreetmap.fr" and
    "groupes-locaux" wins; the href is returned as found in the page.

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: uMap URL or None if not found
    """
    if not html_content:
        return None

    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    # Anchors without an href yield '' and therefore never match
    targets = (anchor.get('href', '') for anchor in anchors)
    return next(
        (
            target
            for target in targets
            if 'umap.openstreetmap.fr' in target and 'groupes-locaux' in target
        ),
        None,
    )

def save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, dry_run=False):
    """
    Save the results to a JSON file

    The data is first written to a temporary file next to OUTPUT_FILE and then
    moved over it, so a reader never sees a half-written file and a failed run
    leaves the previous results untouched.

    Args:
        wiki_local_groups (list): List of local group dictionaries from wiki
        framacalc_groups (list): List of local group dictionaries from Framacalc
        working_groups (list): List of working group dictionaries
        umap_url (str): URL to the uMap for local groups
        wiki_links (dict): Dictionary mapping group names to wiki URLs
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Wiki local groups: {len(wiki_local_groups)}")
        for group in wiki_local_groups[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {group['name']}: {group['url']}")

        logger.info(f"Framacalc groups: {len(framacalc_groups)}")
        for group in framacalc_groups[:5]:  # Show only first 5 for brevity
            wiki_status = "Has wiki page" if group.get('has_wiki_page') else "No wiki page"
            logger.info(f"  - {group['name']}: {wiki_status}")

        logger.info(f"Working groups: {len(working_groups)}")
        for group in working_groups[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {group['name']}: {group['url']}")

        if umap_url:
            logger.info(f"uMap URL: {umap_url}")

        logger.info(f"Wiki links: {len(wiki_links)}")
        return True

    # Combine all local groups
    all_local_groups = wiki_local_groups + framacalc_groups

    # Prepare the data structure; "last_updated" is what the cache check reads
    data = {
        "last_updated": datetime.now().isoformat(),
        "local_groups": all_local_groups,
        "working_groups": working_groups,
        "umap_url": umap_url,
        "wiki_links": wiki_links
    }

    tmp_path = f"{OUTPUT_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        # Atomic swap: the old file stays valid until the new one is complete
        os.replace(tmp_path, OUTPUT_FILE)
    except (OSError, TypeError, ValueError) as e:
        # OSError: disk/permission problems; TypeError/ValueError: data that
        # json cannot serialise
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup: the temporary file may not exist at all
            pass
        return False

    logger.info(f"Successfully saved {len(all_local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
    return True

def main():
    """
    Main function to execute the script

    Parses the command line, skips the run when the cached results are still
    fresh (unless --force is given), then fetches the wiki page and the
    Framacalc sheet, cross-checks them and saves the results.

    Returns:
        int: Process exit status, 0 on success (including the "cache is fresh"
            case) and 1 when the wiki page could not be fetched or the results
            could not be saved
    """
    parser = argparse.ArgumentParser(description="Fetch OSM-FR local groups from wiki and Framacalc")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_osm_fr_groups.py")

    # Check if cache is fresh; --force skips reading the cache file altogether
    if not args.force and is_cache_fresh():
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return 0

    # Get the wiki page content
    html_content = get_page_content(BASE_URL)

    if not html_content:
        logger.error("Failed to get wiki page content")
        return 1

    # Extract local groups from wiki
    wiki_local_groups = extract_local_groups_from_wiki(html_content)

    if not wiki_local_groups:
        logger.warning("No local groups found in wiki")

    # Extract working groups
    working_groups = extract_working_groups(html_content)

    if not working_groups:
        logger.warning("No working groups found")

    # Extract uMap URL
    umap_url = extract_umap_url(html_content)

    # Fetch local groups from Framacalc
    framacalc_groups = fetch_framacalc_data()

    if not framacalc_groups:
        logger.warning("No local groups found in Framacalc")

    # Extract wiki group links
    wiki_links = extract_wiki_group_links()

    if not wiki_links:
        logger.warning("No wiki links found for local groups")

    # Verify Framacalc groups have wiki pages
    if framacalc_groups and wiki_links:
        framacalc_groups = verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links)

        # Count groups with and without wiki pages
        groups_with_wiki = sum(1 for group in framacalc_groups if group.get('has_wiki_page'))
        groups_without_wiki = len(framacalc_groups) - groups_with_wiki

        logger.info(f"Framacalc groups with wiki pages: {groups_with_wiki}")
        logger.info(f"Framacalc groups without wiki pages: {groups_without_wiki}")

    # Save results
    success = save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, args.dry_run)

    if success:
        logger.info("Script completed successfully")
        return 0

    logger.error("Script completed with errors")
    return 1


if __name__ == "__main__":
    # Propagate the status so that cron jobs and callers can detect failures
    raise SystemExit(main())