#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ fetch_osm_fr_groups.py This script fetches information about OSM-FR local groups from two sources: 1. The OpenStreetMap wiki page for France/OSM-FR (specifically the #Pages_des_groupes_locaux section) 2. The Framacalc spreadsheet at https://framacalc.org/osm-groupes-locaux It then verifies that each group from the Framacalc has a corresponding wiki page. Usage: python fetch_osm_fr_groups.py [--dry-run] [--force] Options: --dry-run Run the script without saving the results to a file --force Force update even if the cache is still fresh (less than 1 hour old) Output: - osm_fr_groups.json: JSON file with information about OSM-FR local groups - Log messages about the scraping process and results """ import json import argparse import logging import os import csv import io from datetime import datetime, timedelta import requests from bs4 import BeautifulSoup # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logger = logging.getLogger(__name__) # Constants OUTPUT_FILE = "osm_fr_groups.json" BASE_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR" WIKI_BASE_URL = "https://wiki.openstreetmap.org" FRAMACALC_URL = "https://framacalc.org/osm-groupes-locaux/export/csv" WIKI_GROUPS_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR#Groupes_locaux" CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour def is_cache_fresh(): """ Check if the cache file exists and is less than CACHE_DURATION old Returns: bool: True if cache is fresh, False otherwise """ if not os.path.exists(OUTPUT_FILE): return False try: with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: data = json.load(f) last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00')) now = datetime.now() return (now - last_updated) < CACHE_DURATION except (IOError, json.JSONDecodeError, ValueError) as e: logger.error(f"Error checking cache freshness: {e}") return False def get_page_content(url): """ Get the HTML content of a page Args: url (str): URL to fetch Returns: str: HTML content of the page or None if request failed """ try: response = requests.get(url) response.raise_for_status() return response.text except requests.exceptions.RequestException as e: logger.error(f"Error fetching {url}: {e}") return None def extract_working_groups(html_content): """ Extract working groups from the wiki page HTML Args: html_content (str): HTML content of the wiki page Returns: list: List of working group dictionaries """ if not html_content: return [] soup = BeautifulSoup(html_content, 'html.parser') working_groups = [] # Find the working groups section working_groups_section = None for heading in soup.find_all(['h2', 'h3']): if heading.get_text().strip() == 'Groupes de travail' or 'Groupes_de_travail' in heading.get_text(): working_groups_section = heading break if not working_groups_section: logger.warning("Could not find working groups section") # Return an empty list but with a default category return [] # Get the content following the heading until the next heading current = working_groups_section.next_sibling while current and not current.name in ['h2', 'h3']: if current.name == 'ul': # Process list items for li in current.find_all('li', recursive=False): link = li.find('a') if link: name = link.get_text().strip() url = WIKI_BASE_URL + link.get('href') if link.get('href').startswith('/') else link.get('href') # Extract description (text after the link) description = "" next_node = link.next_sibling while next_node: if isinstance(next_node, str): description += next_node.strip() next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None description = description.strip(' :-,') working_groups.append({ "name": name, "url": url, "description": description, "category": "Général", "type": "working_group" }) current = current.next_sibling logger.info(f"Found {len(working_groups)} working groups") return working_groups def extract_local_groups_from_wiki(html_content): """ Extract local groups from the wiki page HTML Args: html_content (str): HTML content of the wiki page Returns: list: List of local group dictionaries """ if not html_content: return [] soup = BeautifulSoup(html_content, 'html.parser') local_groups = [] # Find the local groups section local_groups_section = None for heading in soup.find_all(['h2', 'h3']): if heading.get_text().strip() == 'Groupes locaux' or 'Pages des groupes locaux' in heading.get_text(): local_groups_section = heading break if not local_groups_section: logger.warning("Could not find local groups section") return [] # Get the content following the heading until the next heading current = local_groups_section.next_sibling while current and not current.name in ['h2', 'h3']: if current.name == 'ul': # Process list items for li in current.find_all('li', recursive=False): link = li.find('a') if link: name = link.get_text().strip() url = WIKI_BASE_URL + link.get('href') if link.get('href').startswith('/') else link.get('href') # Extract description (text after the link) description = "" next_node = link.next_sibling while next_node: if isinstance(next_node, str): description += next_node.strip() next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None description = description.strip(' :-,') local_groups.append({ "name": name, "url": url, "description": description, "type": "local_group", "source": "wiki" }) current = current.next_sibling logger.info(f"Found {len(local_groups)} local groups from wiki") return local_groups def fetch_framacalc_data(): """ Fetch local groups data from Framacalc Returns: list: List of local group dictionaries from Framacalc """ try: response = requests.get(FRAMACALC_URL) response.raise_for_status() # Parse CSV data csv_data = csv.reader(io.StringIO(response.text)) rows = list(csv_data) # Check if we have data if len(rows) < 2: logger.warning("No data found in Framacalc CSV") return [] # Extract headers (first row) headers = rows[0] # Find the indices of important columns name_idx = -1 contact_idx = -1 website_idx = -1 for i, header in enumerate(headers): header_lower = header.lower() if 'nom' in header_lower or 'groupe' in header_lower: name_idx = i elif 'contact' in header_lower or 'email' in header_lower: contact_idx = i elif 'site' in header_lower or 'web' in header_lower: website_idx = i if name_idx == -1: logger.warning("Could not find name column in Framacalc CSV") return [] # Process data rows local_groups = [] for row in rows[1:]: # Skip header row if len(row) <= name_idx or not row[name_idx].strip(): continue # Skip empty rows name = row[name_idx].strip() contact = row[contact_idx].strip() if contact_idx != -1 and contact_idx < len(row) else "" website = row[website_idx].strip() if website_idx != -1 and website_idx < len(row) else "" local_groups.append({ "name": name, "contact": contact, "website": website, "type": "local_group", "source": "framacalc", "has_wiki_page": False, # Will be updated later "wiki_url": "" # Will be updated later }) logger.info(f"Found {len(local_groups)} local groups from Framacalc") return local_groups except requests.exceptions.RequestException as e: logger.error(f"Error fetching Framacalc data: {e}") return [] except Exception as e: logger.error(f"Error processing Framacalc data: {e}") return [] def extract_wiki_group_links(): """ Extract links to local group wiki pages from the OSM-FR wiki page Returns: dict: Dictionary mapping group names to wiki URLs """ try: # Get the wiki page content response = requests.get(WIKI_GROUPS_URL) response.raise_for_status() soup = BeautifulSoup(response.text, 'html.parser') wiki_links = {} # Find the "Pages des groupes locaux" section pages_section = None for heading in soup.find_all(['h2', 'h3', 'h4']): if 'Pages des groupes locaux' in heading.get_text(): pages_section = heading break if not pages_section: logger.warning("Could not find 'Pages des groupes locaux' section") return {} # Get the content following the heading until the next heading current = pages_section.next_sibling while current and not current.name in ['h2', 'h3', 'h4']: if current.name == 'ul': # Process list items for li in current.find_all('li', recursive=False): text = li.get_text().strip() link = li.find('a') if link and text: # Extract group name (before the comma) parts = text.split(',', 1) group_name = parts[0].strip() url = WIKI_BASE_URL + link.get('href') if link.get('href').startswith('/') else link.get('href') wiki_links[group_name] = url current = current.next_sibling logger.info(f"Found {len(wiki_links)} wiki links for local groups") return wiki_links except requests.exceptions.RequestException as e: logger.error(f"Error fetching wiki group links: {e}") return {} except Exception as e: logger.error(f"Error processing wiki group links: {e}") return {} def verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links): """ Verify that each group from Framacalc has a corresponding wiki page Args: framacalc_groups (list): List of local group dictionaries from Framacalc wiki_links (dict): Dictionary mapping group names to wiki URLs Returns: list: Updated list of local group dictionaries with wiki verification """ for group in framacalc_groups: group_name = group['name'] # Try to find a matching wiki link found = False for wiki_name, wiki_url in wiki_links.items(): # Check if the group name is similar to the wiki name if group_name.lower() in wiki_name.lower() or wiki_name.lower() in group_name.lower(): group['has_wiki_page'] = True group['wiki_url'] = wiki_url found = True break if not found: group['has_wiki_page'] = False group['wiki_url'] = "" return framacalc_groups def extract_umap_url(html_content): """ Extract the uMap URL for OSM-FR local groups Args: html_content (str): HTML content of the wiki page Returns: str: uMap URL or None if not found """ if not html_content: return None soup = BeautifulSoup(html_content, 'html.parser') # Look for links to umap.openstreetmap.fr for link in soup.find_all('a'): href = link.get('href', '') if 'umap.openstreetmap.fr' in href and 'groupes-locaux' in href: return href return None def save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, dry_run=False): """ Save the results to a JSON file Args: wiki_local_groups (list): List of local group dictionaries from wiki framacalc_groups (list): List of local group dictionaries from Framacalc working_groups (list): List of working group dictionaries umap_url (str): URL to the uMap for local groups wiki_links (dict): Dictionary mapping group names to wiki URLs dry_run (bool): If True, don't actually save to file Returns: bool: True if saving was successful or dry run, False otherwise """ if dry_run: logger.info("DRY RUN: Would have saved results to file") logger.info(f"Wiki local groups: {len(wiki_local_groups)}") for group in wiki_local_groups[:5]: # Show only first 5 for brevity logger.info(f" - {group['name']}: {group['url']}") logger.info(f"Framacalc groups: {len(framacalc_groups)}") for group in framacalc_groups[:5]: # Show only first 5 for brevity wiki_status = "Has wiki page" if group.get('has_wiki_page') else "No wiki page" logger.info(f" - {group['name']}: {wiki_status}") logger.info(f"Working groups: {len(working_groups)}") for group in working_groups[:5]: # Show only first 5 for brevity logger.info(f" - {group['name']}: {group['url']}") if umap_url: logger.info(f"uMap URL: {umap_url}") logger.info(f"Wiki links: {len(wiki_links)}") return True # Combine all local groups all_local_groups = wiki_local_groups + framacalc_groups # Prepare the data structure data = { "last_updated": datetime.now().isoformat(), "local_groups": all_local_groups, "working_groups": working_groups, "umap_url": umap_url, "wiki_links": wiki_links } try: with open(OUTPUT_FILE, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Successfully saved {len(all_local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}") return True except IOError as e: logger.error(f"Error saving results to {OUTPUT_FILE}: {e}") return False def main(): """Main function to execute the script""" parser = argparse.ArgumentParser(description="Fetch OSM-FR local groups from wiki and Framacalc") parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file") parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh") args = parser.parse_args() logger.info("Starting fetch_osm_fr_groups.py") # Check if cache is fresh if is_cache_fresh() and not args.force: logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)") logger.info(f"Use --force to update anyway") return # Get the wiki page content html_content = get_page_content(BASE_URL) if not html_content: logger.error("Failed to get wiki page content") return # Extract local groups from wiki wiki_local_groups = extract_local_groups_from_wiki(html_content) if not wiki_local_groups: logger.warning("No local groups found in wiki") # Extract working groups working_groups = extract_working_groups(html_content) if not working_groups: logger.warning("No working groups found") # Initialize with an empty list to avoid errors in the controller working_groups = [] # Extract uMap URL umap_url = extract_umap_url(html_content) # Fetch local groups from Framacalc framacalc_groups = fetch_framacalc_data() if not framacalc_groups: logger.warning("No local groups found in Framacalc") # Extract wiki group links wiki_links = extract_wiki_group_links() if not wiki_links: logger.warning("No wiki links found for local groups") # Verify Framacalc groups have wiki pages if framacalc_groups and wiki_links: framacalc_groups = verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links) # Count groups with and without wiki pages groups_with_wiki = sum(1 for group in framacalc_groups if group.get('has_wiki_page')) groups_without_wiki = sum(1 for group in framacalc_groups if not group.get('has_wiki_page')) logger.info(f"Framacalc groups with wiki pages: {groups_with_wiki}") logger.info(f"Framacalc groups without wiki pages: {groups_without_wiki}") # Save results success = save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, args.dry_run) if success: logger.info("Script completed successfully") else: logger.error("Script completed with errors") if __name__ == "__main__": main()