up wiki land

This commit is contained in:
Tykayn 2025-08-22 18:19:20 +02:00 committed by tykayn
parent 391a212034
commit e533c273b2
10 changed files with 1116 additions and 182 deletions

View file

@ -4,9 +4,11 @@
"""
fetch_osm_fr_groups.py
This script scrapes the OpenStreetMap wiki page for France/OSM-FR to extract
information about local working groups. It specifically targets links in the
#Pages_des_groupes_locaux section.
This script fetches information about OSM-FR local groups from two sources:
1. The OpenStreetMap wiki page for France/OSM-FR (specifically the #Pages_des_groupes_locaux section)
2. The Framacalc spreadsheet at https://framacalc.org/osm-groupes-locaux
It then verifies that each group from the Framacalc has a corresponding wiki page.
Usage:
python fetch_osm_fr_groups.py [--dry-run] [--force]
@ -24,6 +26,8 @@ import json
import argparse
import logging
import os
import csv
import io
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@ -40,6 +44,8 @@ logger = logging.getLogger(__name__)
OUTPUT_FILE = "osm_fr_groups.json"
BASE_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
FRAMACALC_URL = "https://framacalc.org/osm-groupes-locaux/export/csv"
WIKI_GROUPS_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR#Groupes_locaux"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
def is_cache_fresh():
@ -141,7 +147,7 @@ def extract_working_groups(html_content):
logger.info(f"Found {len(working_groups)} working groups")
return working_groups
def extract_local_groups(html_content):
def extract_local_groups_from_wiki(html_content):
"""
Extract local groups from the wiki page HTML
@ -193,13 +199,170 @@ def extract_local_groups(html_content):
"name": name,
"url": url,
"description": description,
"type": "local_group"
"type": "local_group",
"source": "wiki"
})
current = current.next_sibling
logger.info(f"Found {len(local_groups)} local groups")
logger.info(f"Found {len(local_groups)} local groups from wiki")
return local_groups
def fetch_framacalc_data():
    """
    Fetch local groups data from the Framacalc CSV export.

    Downloads the CSV at FRAMACALC_URL and delegates parsing to
    _parse_framacalc_csv.

    Returns:
        list: List of local group dictionaries from Framacalc,
              empty on any fetch or parse error.
    """
    try:
        # Explicit timeout so a hung server cannot block the script forever.
        response = requests.get(FRAMACALC_URL, timeout=30)
        response.raise_for_status()
        return _parse_framacalc_csv(response.text)
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Framacalc data: {e}")
        return []
    except Exception as e:
        # Broad catch at this boundary: a malformed spreadsheet must not
        # abort the whole script; we log and fall back to an empty list.
        logger.error(f"Error processing Framacalc data: {e}")
        return []


def _parse_framacalc_csv(csv_text):
    """
    Parse the Framacalc CSV export into local group dictionaries.

    Args:
        csv_text (str): Raw CSV content of the spreadsheet export.

    Returns:
        list: Local group dictionaries; empty when the CSV has no data
              rows or no recognizable name column.
    """
    rows = list(csv.reader(io.StringIO(csv_text)))

    # Need at least a header row and one data row.
    if len(rows) < 2:
        logger.warning("No data found in Framacalc CSV")
        return []

    headers = rows[0]

    # Locate the relevant columns by fuzzy, case-insensitive header match
    # (the spreadsheet uses French headings; last matching column wins,
    # mirroring the original scan order).
    name_idx = -1
    contact_idx = -1
    website_idx = -1
    for i, header in enumerate(headers):
        header_lower = header.lower()
        if 'nom' in header_lower or 'groupe' in header_lower:
            name_idx = i
        elif 'contact' in header_lower or 'email' in header_lower:
            contact_idx = i
        elif 'site' in header_lower or 'web' in header_lower:
            website_idx = i

    if name_idx == -1:
        logger.warning("Could not find name column in Framacalc CSV")
        return []

    local_groups = []
    for row in rows[1:]:  # skip header row
        # Skip rows that are too short or have an empty group name.
        if len(row) <= name_idx or not row[name_idx].strip():
            continue

        contact = row[contact_idx].strip() if 0 <= contact_idx < len(row) else ""
        website = row[website_idx].strip() if 0 <= website_idx < len(row) else ""

        local_groups.append({
            "name": row[name_idx].strip(),
            "contact": contact,
            "website": website,
            "type": "local_group",
            "source": "framacalc",
            "has_wiki_page": False,  # updated later by wiki verification
            "wiki_url": ""           # updated later by wiki verification
        })

    logger.info(f"Found {len(local_groups)} local groups from Framacalc")
    return local_groups
def extract_wiki_group_links():
    """
    Extract links to local group wiki pages from the OSM-FR wiki page.

    Fetches WIKI_GROUPS_URL, finds the "Pages des groupes locaux" heading,
    and collects every list-item link between that heading and the next one.

    Returns:
        dict: Dictionary mapping group names to wiki URLs,
              empty on any fetch or parse error.
    """
    try:
        # Explicit timeout so a hung server cannot block the script forever.
        response = requests.get(WIKI_GROUPS_URL, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the "Pages des groupes locaux" heading.
        pages_section = None
        for heading in soup.find_all(['h2', 'h3', 'h4']):
            if 'Pages des groupes locaux' in heading.get_text():
                pages_section = heading
                break

        if not pages_section:
            logger.warning("Could not find 'Pages des groupes locaux' section")
            return {}

        wiki_links = {}

        # Walk the siblings following the heading until the next heading,
        # harvesting links from any <ul> encountered on the way.
        current = pages_section.next_sibling
        while current and current.name not in ['h2', 'h3', 'h4']:
            if current.name == 'ul':
                for li in current.find_all('li', recursive=False):
                    text = li.get_text().strip()
                    link = li.find('a')
                    if link and text:
                        # The group name is the part before the first comma.
                        group_name = text.split(',', 1)[0].strip()
                        href = link.get('href')
                        # Guard: an <a> without href would otherwise crash
                        # on .startswith(); skip such anchors.
                        if href:
                            url = WIKI_BASE_URL + href if href.startswith('/') else href
                            wiki_links[group_name] = url
            current = current.next_sibling

        logger.info(f"Found {len(wiki_links)} wiki links for local groups")
        return wiki_links
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki group links: {e}")
        return {}
    except Exception as e:
        # Broad catch at this boundary: wiki markup changes must not abort
        # the whole script; we log and fall back to an empty mapping.
        logger.error(f"Error processing wiki group links: {e}")
        return {}
def verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links):
    """
    Verify that each group from Framacalc has a corresponding wiki page.

    A Framacalc group matches a wiki entry when either name contains the
    other, compared case-insensitively. The first matching wiki entry
    (in dictionary order) wins.

    Args:
        framacalc_groups (list): List of local group dictionaries from Framacalc
        wiki_links (dict): Dictionary mapping group names to wiki URLs

    Returns:
        list: The same list, with 'has_wiki_page' and 'wiki_url' filled in
              on every entry.
    """
    for group in framacalc_groups:
        needle = group['name'].lower()
        matched_url = None

        for wiki_name, wiki_url in wiki_links.items():
            candidate = wiki_name.lower()
            # Substring match in either direction counts as a hit.
            if needle in candidate or candidate in needle:
                matched_url = wiki_url
                break

        group['has_wiki_page'] = matched_url is not None
        group['wiki_url'] = matched_url if matched_url is not None else ""

    return framacalc_groups
def extract_umap_url(html_content):
"""
Extract the uMap URL for OSM-FR local groups
@ -223,14 +386,16 @@ def extract_umap_url(html_content):
return None
def save_results(local_groups, working_groups, umap_url, dry_run=False):
def save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, dry_run=False):
"""
Save the results to a JSON file
Args:
local_groups (list): List of local group dictionaries
wiki_local_groups (list): List of local group dictionaries from wiki
framacalc_groups (list): List of local group dictionaries from Framacalc
working_groups (list): List of working group dictionaries
umap_url (str): URL to the uMap for local groups
wiki_links (dict): Dictionary mapping group names to wiki URLs
dry_run (bool): If True, don't actually save to file
Returns:
@ -238,28 +403,41 @@ def save_results(local_groups, working_groups, umap_url, dry_run=False):
"""
if dry_run:
logger.info("DRY RUN: Would have saved results to file")
logger.info(f"Local groups: {len(local_groups)}")
for group in local_groups:
logger.info(f"Wiki local groups: {len(wiki_local_groups)}")
for group in wiki_local_groups[:5]: # Show only first 5 for brevity
logger.info(f" - {group['name']}: {group['url']}")
logger.info(f"Framacalc groups: {len(framacalc_groups)}")
for group in framacalc_groups[:5]: # Show only first 5 for brevity
wiki_status = "Has wiki page" if group.get('has_wiki_page') else "No wiki page"
logger.info(f" - {group['name']}: {wiki_status}")
logger.info(f"Working groups: {len(working_groups)}")
for group in working_groups:
for group in working_groups[:5]: # Show only first 5 for brevity
logger.info(f" - {group['name']}: {group['url']}")
if umap_url:
logger.info(f"uMap URL: {umap_url}")
logger.info(f"Wiki links: {len(wiki_links)}")
return True
# Combine all local groups
all_local_groups = wiki_local_groups + framacalc_groups
# Prepare the data structure
data = {
"last_updated": datetime.now().isoformat(),
"local_groups": local_groups,
"local_groups": all_local_groups,
"working_groups": working_groups,
"umap_url": umap_url
"umap_url": umap_url,
"wiki_links": wiki_links
}
try:
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Successfully saved {len(local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
logger.info(f"Successfully saved {len(all_local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
return True
except IOError as e:
logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
@ -267,7 +445,7 @@ def save_results(local_groups, working_groups, umap_url, dry_run=False):
def main():
"""Main function to execute the script"""
parser = argparse.ArgumentParser(description="Scrape OSM-FR local groups from the wiki")
parser = argparse.ArgumentParser(description="Fetch OSM-FR local groups from wiki and Framacalc")
parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
args = parser.parse_args()
@ -287,11 +465,11 @@ def main():
logger.error("Failed to get wiki page content")
return
# Extract local groups
local_groups = extract_local_groups(html_content)
# Extract local groups from wiki
wiki_local_groups = extract_local_groups_from_wiki(html_content)
if not local_groups:
logger.warning("No local groups found")
if not wiki_local_groups:
logger.warning("No local groups found in wiki")
# Extract working groups
working_groups = extract_working_groups(html_content)
@ -304,8 +482,31 @@ def main():
# Extract uMap URL
umap_url = extract_umap_url(html_content)
# Fetch local groups from Framacalc
framacalc_groups = fetch_framacalc_data()
if not framacalc_groups:
logger.warning("No local groups found in Framacalc")
# Extract wiki group links
wiki_links = extract_wiki_group_links()
if not wiki_links:
logger.warning("No wiki links found for local groups")
# Verify Framacalc groups have wiki pages
if framacalc_groups and wiki_links:
framacalc_groups = verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links)
# Count groups with and without wiki pages
groups_with_wiki = sum(1 for group in framacalc_groups if group.get('has_wiki_page'))
groups_without_wiki = sum(1 for group in framacalc_groups if not group.get('has_wiki_page'))
logger.info(f"Framacalc groups with wiki pages: {groups_with_wiki}")
logger.info(f"Framacalc groups without wiki pages: {groups_without_wiki}")
# Save results
success = save_results(local_groups, working_groups, umap_url, args.dry_run)
success = save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, args.dry_run)
if success:
logger.info("Script completed successfully")