#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_osm_fr_groups.py
This script fetches information about OSM-FR local groups from two sources:
1. The OpenStreetMap wiki page for France/OSM-FR (specifically the #Pages_des_groupes_locaux section)
2. The Framacalc spreadsheet at https://framacalc.org/osm-groupes-locaux
It then verifies that each group from the Framacalc has a corresponding wiki page.
Usage:
python fetch_osm_fr_groups.py [--dry-run] [--force]
Options:
--dry-run Run the script without saving the results to a file
--force Force update even if the cache is still fresh (less than 1 hour old)
Output:
- osm_fr_groups.json: JSON file with information about OSM-FR local groups
- Log messages about the scraping process and results
"""
import json
import argparse
import logging
import os
import csv
import io
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
# Constants
OUTPUT_FILE = "osm_fr_groups.json"
BASE_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
FRAMACALC_URL = "https://framacalc.org/osm-groupes-locaux/export/csv"
WIKI_GROUPS_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR#Groupes_locaux"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False
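
# Illustrative fragment of the cache file read above (hypothetical values):
# only the top-level "last_updated" ISO-8601 timestamp matters for freshness.
#
#   {"last_updated": "2025-01-01T12:00:00", "local_groups": [...], ...}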

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # Use a timeout so a stalled server cannot hang the script indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_working_groups(html_content):
    """
    Extract working groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of working group dictionaries
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    working_groups = []
    # Find the working groups section
    working_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes de travail' or 'Groupes_de_travail' in heading.get_text():
            working_groups_section = heading
            break
    if not working_groups_section:
        logger.warning("Could not find working groups section")
        # No section found: return an empty list
        return []
    # Walk the siblings following the heading until the next heading
    current = working_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href
                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None
                    description = description.strip(' :-,')
                    working_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
                        "category": "Général",
                        "type": "working_group"
                    })
        current = current.next_sibling
    logger.info(f"Found {len(working_groups)} working groups")
    return working_groups

def extract_local_groups_from_wiki(html_content):
    """
    Extract local groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of local group dictionaries
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    local_groups = []
    # Find the local groups section
    local_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes locaux' or 'Pages des groupes locaux' in heading.get_text():
            local_groups_section = heading
            break
    if not local_groups_section:
        logger.warning("Could not find local groups section")
        return []
    # Walk the siblings following the heading until the next heading
    current = local_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href
                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None
                    description = description.strip(' :-,')
                    local_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
                        "type": "local_group",
                        "source": "wiki"
                    })
        current = current.next_sibling
    logger.info(f"Found {len(local_groups)} local groups from wiki")
    return local_groups
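
# Both extractors above assume section markup shaped roughly like the
# following sketch (illustrative, not a verbatim copy of the wiki page):
#
#   <h3><span class="mw-headline">Groupes locaux</span></h3>
#   <ul>
#     <li><a href="/wiki/Rennes">Rennes</a> : courte description</li>
#   </ul>
#
# Any sibling between the matched heading and the next h2/h3 that is not a
# <ul> is ignored.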

def fetch_framacalc_data():
    """
    Fetch local groups data from Framacalc

    Returns:
        list: List of local group dictionaries from Framacalc
    """
    try:
        response = requests.get(FRAMACALC_URL, timeout=30)
        response.raise_for_status()
        # Parse CSV data
        csv_data = csv.reader(io.StringIO(response.text))
        rows = list(csv_data)
        # Check if we have data
        if len(rows) < 2:
            logger.warning("No data found in Framacalc CSV")
            return []
        # Extract headers (first row)
        headers = rows[0]
        # Find the indices of the important columns
        name_idx = -1
        contact_idx = -1
        website_idx = -1
        for i, header in enumerate(headers):
            header_lower = header.lower()
            if 'nom' in header_lower or 'groupe' in header_lower:
                name_idx = i
            elif 'contact' in header_lower or 'email' in header_lower:
                contact_idx = i
            elif 'site' in header_lower or 'web' in header_lower:
                website_idx = i
        if name_idx == -1:
            logger.warning("Could not find name column in Framacalc CSV")
            return []
        # Process data rows
        local_groups = []
        for row in rows[1:]:  # Skip header row
            if len(row) <= name_idx or not row[name_idx].strip():
                continue  # Skip empty rows
            name = row[name_idx].strip()
            contact = row[contact_idx].strip() if contact_idx != -1 and contact_idx < len(row) else ""
            website = row[website_idx].strip() if website_idx != -1 and website_idx < len(row) else ""
            local_groups.append({
                "name": name,
                "contact": contact,
                "website": website,
                "type": "local_group",
                "source": "framacalc",
                "has_wiki_page": False,  # Updated later by verify_framacalc_groups_have_wiki()
                "wiki_url": ""  # Updated later by verify_framacalc_groups_have_wiki()
            })
        logger.info(f"Found {len(local_groups)} local groups from Framacalc")
        return local_groups
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Framacalc data: {e}")
        return []
    except Exception as e:
        logger.error(f"Error processing Framacalc data: {e}")
        return []
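
# The column detection above is heuristic: it expects a header row whose cells
# contain words such as "nom"/"groupe", "contact"/"email" and "site"/"web".
# An illustrative CSV shape (hypothetical, the live spreadsheet may differ):
#
#   Nom du groupe,Contact,Site web
#   OSM Rennes,rennes@example.org,https://example.org/osm-rennes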

def extract_wiki_group_links():
    """
    Extract links to local group wiki pages from the OSM-FR wiki page

    Returns:
        dict: Dictionary mapping group names to wiki URLs
    """
    try:
        # Get the wiki page content
        response = requests.get(WIKI_GROUPS_URL, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        wiki_links = {}
        # Find the "Pages des groupes locaux" section
        pages_section = None
        for heading in soup.find_all(['h2', 'h3', 'h4']):
            if 'Pages des groupes locaux' in heading.get_text():
                pages_section = heading
                break
        if not pages_section:
            logger.warning("Could not find 'Pages des groupes locaux' section")
            return {}
        # Walk the siblings following the heading until the next heading
        current = pages_section.next_sibling
        while current and current.name not in ['h2', 'h3', 'h4']:
            if current.name == 'ul':
                # Process list items
                for li in current.find_all('li', recursive=False):
                    text = li.get_text().strip()
                    link = li.find('a')
                    if link and text:
                        # Extract the group name (the part before the first comma)
                        parts = text.split(',', 1)
                        group_name = parts[0].strip()
                        href = link.get('href', '')
                        url = WIKI_BASE_URL + href if href.startswith('/') else href
                        wiki_links[group_name] = url
            current = current.next_sibling
        logger.info(f"Found {len(wiki_links)} wiki links for local groups")
        return wiki_links
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki group links: {e}")
        return {}
    except Exception as e:
        logger.error(f"Error processing wiki group links: {e}")
        return {}

def verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links):
    """
    Verify that each group from Framacalc has a corresponding wiki page

    Args:
        framacalc_groups (list): List of local group dictionaries from Framacalc
        wiki_links (dict): Dictionary mapping group names to wiki URLs

    Returns:
        list: Updated list of local group dictionaries with wiki verification
    """
    for group in framacalc_groups:
        group_name = group['name']
        # Try to find a matching wiki link
        found = False
        for wiki_name, wiki_url in wiki_links.items():
            # Consider the names a match if either contains the other
            if group_name.lower() in wiki_name.lower() or wiki_name.lower() in group_name.lower():
                group['has_wiki_page'] = True
                group['wiki_url'] = wiki_url
                found = True
                break
        if not found:
            group['has_wiki_page'] = False
            group['wiki_url'] = ""
    return framacalc_groups
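
# Note that the match above is a loose substring heuristic: a Framacalc entry
# "Rennes" matches a wiki entry "OSM Rennes" and vice versa, but nested names
# (e.g. "Lyon" also matching "Villeurbanne/Lyon") can be conflated, and the
# first match wins.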

def extract_umap_url(html_content):
    """
    Extract the uMap URL for OSM-FR local groups

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: uMap URL or None if not found
    """
    if not html_content:
        return None
    soup = BeautifulSoup(html_content, 'html.parser')
    # Look for links to umap.openstreetmap.fr
    for link in soup.find_all('a'):
        href = link.get('href', '')
        if 'umap.openstreetmap.fr' in href and 'groupes-locaux' in href:
            return href
    return None
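
# The uMap detection relies only on the hostname plus a "groupes-locaux"
# substring in the href; if the wiki page links to the map differently,
# this function returns None.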

def save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        wiki_local_groups (list): List of local group dictionaries from wiki
        framacalc_groups (list): List of local group dictionaries from Framacalc
        working_groups (list): List of working group dictionaries
        umap_url (str): URL to the uMap for local groups
        wiki_links (dict): Dictionary mapping group names to wiki URLs
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Wiki local groups: {len(wiki_local_groups)}")
        for group in wiki_local_groups[:5]:  # Show only the first 5 for brevity
            logger.info(f" - {group['name']}: {group['url']}")
        logger.info(f"Framacalc groups: {len(framacalc_groups)}")
        for group in framacalc_groups[:5]:  # Show only the first 5 for brevity
            wiki_status = "Has wiki page" if group.get('has_wiki_page') else "No wiki page"
            logger.info(f" - {group['name']}: {wiki_status}")
        logger.info(f"Working groups: {len(working_groups)}")
        for group in working_groups[:5]:  # Show only the first 5 for brevity
            logger.info(f" - {group['name']}: {group['url']}")
        if umap_url:
            logger.info(f"uMap URL: {umap_url}")
        logger.info(f"Wiki links: {len(wiki_links)}")
        return True
    # Combine all local groups
    all_local_groups = wiki_local_groups + framacalc_groups
    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "local_groups": all_local_groups,
        "working_groups": working_groups,
        "umap_url": umap_url,
        "wiki_links": wiki_links
    }
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(all_local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False
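
# The output file written above looks roughly like this (illustrative,
# fields abridged):
#
#   {
#     "last_updated": "2025-01-01T12:00:00",
#     "local_groups": [{"name": "...", "source": "wiki", ...}],
#     "working_groups": [{"name": "...", "category": "Général", ...}],
#     "umap_url": "https://umap.openstreetmap.fr/...",
#     "wiki_links": {"Rennes": "https://wiki.openstreetmap.org/wiki/Rennes"}
#   }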

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch OSM-FR local groups from wiki and Framacalc")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()
    logger.info("Starting fetch_osm_fr_groups.py")
    # Check if the cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600:g} hours old)")
        logger.info("Use --force to update anyway")
        return
    # Get the wiki page content
    html_content = get_page_content(BASE_URL)
    if not html_content:
        logger.error("Failed to get wiki page content")
        return
    # Extract local groups from the wiki
    wiki_local_groups = extract_local_groups_from_wiki(html_content)
    if not wiki_local_groups:
        logger.warning("No local groups found in wiki")
    # Extract working groups
    working_groups = extract_working_groups(html_content)
    if not working_groups:
        logger.warning("No working groups found")
        # Fall back to an empty list so downstream consumers still get a list
        working_groups = []
    # Extract uMap URL
    umap_url = extract_umap_url(html_content)
    # Fetch local groups from Framacalc
    framacalc_groups = fetch_framacalc_data()
    if not framacalc_groups:
        logger.warning("No local groups found in Framacalc")
    # Extract wiki group links
    wiki_links = extract_wiki_group_links()
    if not wiki_links:
        logger.warning("No wiki links found for local groups")
    # Verify that Framacalc groups have wiki pages
    if framacalc_groups and wiki_links:
        framacalc_groups = verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links)
        # Count groups with and without wiki pages
        groups_with_wiki = sum(1 for group in framacalc_groups if group.get('has_wiki_page'))
        groups_without_wiki = sum(1 for group in framacalc_groups if not group.get('has_wiki_page'))
        logger.info(f"Framacalc groups with wiki pages: {groups_with_wiki}")
        logger.info(f"Framacalc groups without wiki pages: {groups_without_wiki}")
    # Save results
    success = save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, args.dry_run)
    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()