517 lines
No EOL
18 KiB
Python
Executable file
517 lines
No EOL
18 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
fetch_osm_fr_groups.py
|
|
|
|
This script fetches information about OSM-FR local groups from two sources:
|
|
1. The OpenStreetMap wiki page for France/OSM-FR (specifically the #Pages_des_groupes_locaux section)
|
|
2. The Framacalc spreadsheet at https://framacalc.org/osm-groupes-locaux
|
|
|
|
It then verifies that each group from the Framacalc has a corresponding wiki page.
|
|
|
|
Usage:
|
|
python fetch_osm_fr_groups.py [--dry-run] [--force]
|
|
|
|
Options:
|
|
--dry-run Run the script without saving the results to a file
|
|
--force Force update even if the cache is still fresh (less than 1 hour old)
|
|
|
|
Output:
|
|
- osm_fr_groups.json: JSON file with information about OSM-FR local groups
|
|
- Log messages about the scraping process and results
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import csv
|
|
import io
|
|
from datetime import datetime, timedelta
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Logging: INFO and above, every record prefixed with a
# "YYYY-MM-DD HH:MM:SS" timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# JSON file that receives the results; it also serves as the freshness cache.
OUTPUT_FILE = "osm_fr_groups.json"

# OSM wiki endpoints, all derived from the wiki root so they cannot drift apart.
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
BASE_URL = f"{WIKI_BASE_URL}/wiki/France/OSM-FR"
WIKI_GROUPS_URL = f"{BASE_URL}#Groupes_locaux"

# CSV export of the Framacalc spreadsheet listing the local groups.
FRAMACALC_URL = "https://framacalc.org/osm-groupes-locaux/export/csv"

# Results younger than this are considered fresh (one hour).
CACHE_DURATION = timedelta(minutes=60)
|
|
|
|
def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    The age is computed from the 'last_updated' ISO timestamp stored in the
    JSON document at OUTPUT_FILE; a document without that key is treated as
    dating from 2000-01-01, i.e. stale. A cache file that cannot be read,
    is not valid JSON, is not a JSON object, or carries an unusable
    timestamp is logged and reported as stale instead of raising.

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        # Take "now" in the timestamp's own timezone (None -> naive local
        # time, as written by this script) so that an offset-aware timestamp
        # can be subtracted without a naive/aware TypeError.
        now = datetime.now(last_updated.tzinfo)
        return (now - last_updated) < CACHE_DURATION
    except (OSError, json.JSONDecodeError, ValueError, TypeError, AttributeError) as e:
        # AttributeError: the JSON root is not an object (no .get);
        # TypeError: 'last_updated' is not a string.
        logger.error(f"Error checking cache freshness: {e}")
        return False
|
|
|
|
def get_page_content(url, timeout=30):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout requests waits indefinitely, so a stalled
            server would hang the whole script.

    Returns:
        str: HTML content of the page or None if request failed
            (connection error, timeout or HTTP error status)
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
|
|
|
|
def extract_working_groups(html_content):
    """
    Extract working groups from the wiki page HTML

    The section is located through its h2/h3 heading, matched either on the
    visible title 'Groupes de travail' or on the 'Groupes_de_travail' anchor
    id carried by the heading or one of its child elements. Each top-level
    item of the <ul> lists that follow the heading (up to the next h2/h3)
    yields one group, built from the first link of the item.

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of working group dictionaries with the keys 'name',
            'url', 'description', 'category' (always "Général") and 'type'
            (always "working_group"). Empty when there is no content or the
            section cannot be found.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    working_groups = []

    # Find the working groups section
    working_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        heading_text = heading.get_text()
        # The underscore spelling is the anchor form of the title: it lives in
        # id attributes rather than in the visible text, so check ids as well.
        anchor_ids = [heading.get('id')]
        anchor_ids.extend(tag.get('id') for tag in heading.find_all(id=True))
        if (heading_text.strip() == 'Groupes de travail'
                or 'Groupes_de_travail' in heading_text
                or 'Groupes_de_travail' in anchor_ids):
            working_groups_section = heading
            break

    if working_groups_section is None:
        logger.warning("Could not find working groups section")
        return []

    # Walk the siblings following the heading until the next heading.
    # Compare against None explicitly: an empty text node is falsy and must
    # not end the walk early.
    current = working_groups_section.next_sibling
    while current is not None and getattr(current, 'name', None) not in ('h2', 'h3'):
        if getattr(current, 'name', None) == 'ul':
            # Process list items (direct children only, nested lists are ignored)
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link is None:
                    continue

                name = link.get_text().strip()
                # An anchor may have no href at all; keep the group with an
                # empty URL instead of crashing on None.startswith().
                href = link.get('href') or ""
                url = WIKI_BASE_URL + href if href.startswith('/') else href

                # Description: the bare text nodes that follow the link
                fragments = []
                next_node = link.next_sibling
                while next_node is not None:
                    if isinstance(next_node, str):
                        fragments.append(next_node.strip())
                    next_node = getattr(next_node, 'next_sibling', None)
                description = "".join(fragments).strip(' :-,')

                working_groups.append({
                    "name": name,
                    "url": url,
                    "description": description,
                    "category": "Général",
                    "type": "working_group"
                })
        current = current.next_sibling

    logger.info(f"Found {len(working_groups)} working groups")
    return working_groups
|
|
|
|
def extract_local_groups_from_wiki(html_content):
    """
    Extract local groups from the wiki page HTML

    The section is the first h2/h3 heading whose text is exactly
    'Groupes locaux' or contains 'Pages des groupes locaux'. Each top-level
    item of the <ul> lists that follow the heading (up to the next h2/h3)
    yields one group, built from the first link of the item.

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of local group dictionaries with the keys 'name', 'url',
            'description', 'type' (always "local_group") and 'source'
            (always "wiki"). Empty when there is no content or the section
            cannot be found.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    local_groups = []

    # Find the local groups section
    local_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        heading_text = heading.get_text()
        if heading_text.strip() == 'Groupes locaux' or 'Pages des groupes locaux' in heading_text:
            local_groups_section = heading
            break

    if local_groups_section is None:
        logger.warning("Could not find local groups section")
        return []

    # Walk the siblings following the heading until the next heading.
    # Compare against None explicitly: an empty text node is falsy and must
    # not end the walk early.
    current = local_groups_section.next_sibling
    while current is not None and getattr(current, 'name', None) not in ('h2', 'h3'):
        if getattr(current, 'name', None) == 'ul':
            # Process list items (direct children only, nested lists are ignored)
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link is None:
                    continue

                name = link.get_text().strip()
                # An anchor may have no href at all; keep the group with an
                # empty URL instead of crashing on None.startswith().
                href = link.get('href') or ""
                url = WIKI_BASE_URL + href if href.startswith('/') else href

                # Description: the bare text nodes that follow the link
                fragments = []
                next_node = link.next_sibling
                while next_node is not None:
                    if isinstance(next_node, str):
                        fragments.append(next_node.strip())
                    next_node = getattr(next_node, 'next_sibling', None)
                description = "".join(fragments).strip(' :-,')

                local_groups.append({
                    "name": name,
                    "url": url,
                    "description": description,
                    "type": "local_group",
                    "source": "wiki"
                })
        current = current.next_sibling

    logger.info(f"Found {len(local_groups)} local groups from wiki")
    return local_groups
|
|
|
|
def fetch_framacalc_data(timeout=30):
    """
    Fetch local groups data from Framacalc

    Downloads the CSV export at FRAMACALC_URL, locates the name, contact and
    website columns from keywords in the header row, and turns every row
    with a non-empty name into a group dictionary.

    Args:
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout requests waits indefinitely.

    Returns:
        list: List of local group dictionaries from Framacalc with the keys
            'name', 'contact', 'website', 'type', 'source', 'has_wiki_page'
            and 'wiki_url'. Empty on any download or parsing failure, or
            when no name column can be identified.
    """
    try:
        response = requests.get(FRAMACALC_URL, timeout=timeout)
        response.raise_for_status()

        # Parse CSV data
        rows = list(csv.reader(io.StringIO(response.text)))

        # Check if we have data (a header row plus at least one data row)
        if len(rows) < 2:
            logger.warning("No data found in Framacalc CSV")
            return []

        # Extract headers (first row)
        headers = rows[0]

        # Find the indices of important columns. Fields are tried in this
        # order and each keeps the FIRST header that matches it, so a later
        # column such as "Site du groupe" can no longer steal the name
        # column and is still free to be recognised as the website.
        column_keywords = (
            ('name', ('nom', 'groupe')),
            ('contact', ('contact', 'email')),
            ('website', ('site', 'web')),
        )
        column_idx = {field: -1 for field, _ in column_keywords}
        for i, header in enumerate(headers):
            header_lower = header.lower()
            for field, keywords in column_keywords:
                if column_idx[field] == -1 and any(word in header_lower for word in keywords):
                    column_idx[field] = i
                    break

        name_idx = column_idx['name']
        contact_idx = column_idx['contact']
        website_idx = column_idx['website']

        if name_idx == -1:
            logger.warning("Could not find name column in Framacalc CSV")
            return []

        # Process data rows
        local_groups = []
        for row in rows[1:]:  # Skip header row
            if len(row) <= name_idx or not row[name_idx].strip():
                continue  # Skip empty rows

            # Optional columns may be absent from the sheet or from short rows
            contact = row[contact_idx].strip() if contact_idx != -1 and contact_idx < len(row) else ""
            website = row[website_idx].strip() if website_idx != -1 and website_idx < len(row) else ""

            local_groups.append({
                "name": row[name_idx].strip(),
                "contact": contact,
                "website": website,
                "type": "local_group",
                "source": "framacalc",
                "has_wiki_page": False,  # Will be updated later
                "wiki_url": ""  # Will be updated later
            })

        logger.info(f"Found {len(local_groups)} local groups from Framacalc")
        return local_groups

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Framacalc data: {e}")
        return []
    except Exception as e:
        # Best-effort source: any parsing problem is reported, never raised
        logger.error(f"Error processing Framacalc data: {e}")
        return []
|
|
|
|
def extract_wiki_group_links(timeout=30):
    """
    Extract links to local group wiki pages from the OSM-FR wiki page

    Downloads WIKI_GROUPS_URL, finds the h2/h3/h4 heading containing
    'Pages des groupes locaux' and reads the top-level items of the <ul>
    lists that follow it (up to the next h2/h3/h4). The group name is the
    item text before its first comma; the URL comes from the item's first
    link.

    Args:
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout requests waits indefinitely.

    Returns:
        dict: Dictionary mapping group names to wiki URLs; empty on any
            download or parsing failure, or when the section is missing
    """
    try:
        # Get the wiki page content
        response = requests.get(WIKI_GROUPS_URL, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        wiki_links = {}

        # Find the "Pages des groupes locaux" section
        pages_section = None
        for heading in soup.find_all(['h2', 'h3', 'h4']):
            if 'Pages des groupes locaux' in heading.get_text():
                pages_section = heading
                break

        if pages_section is None:
            logger.warning("Could not find 'Pages des groupes locaux' section")
            return {}

        # Walk the siblings following the heading until the next heading.
        # Compare against None explicitly: an empty text node is falsy and
        # must not end the walk early.
        current = pages_section.next_sibling
        while current is not None and getattr(current, 'name', None) not in ('h2', 'h3', 'h4'):
            if getattr(current, 'name', None) == 'ul':
                # Process list items (direct children only)
                for li in current.find_all('li', recursive=False):
                    text = li.get_text().strip()
                    link = li.find('a')
                    if link is None or not text:
                        continue

                    # Extract group name (before the comma)
                    group_name = text.split(',', 1)[0].strip()

                    # An anchor may have no href at all; record an empty URL
                    # rather than raise and lose every link collected so far.
                    href = link.get('href') or ""
                    wiki_links[group_name] = WIKI_BASE_URL + href if href.startswith('/') else href

            current = current.next_sibling

        logger.info(f"Found {len(wiki_links)} wiki links for local groups")
        return wiki_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki group links: {e}")
        return {}
    except Exception as e:
        # Best-effort source: any parsing problem is reported, never raised
        logger.error(f"Error processing wiki group links: {e}")
        return {}
|
|
|
|
def verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links):
    """
    Verify that each group from Framacalc has a corresponding wiki page

    Names are compared case-insensitively and ignoring surrounding
    whitespace. An exact name match is preferred; otherwise the first wiki
    name (in dictionary order) that contains the group name, or is contained
    in it, is used. Blank names never match: the empty string is a substring
    of every string and would otherwise pair with everything.

    Args:
        framacalc_groups (list): List of local group dictionaries from Framacalc;
            each must have a 'name' key and is updated in place
        wiki_links (dict): Dictionary mapping group names to wiki URLs

    Returns:
        list: Updated list of local group dictionaries with wiki verification
            (the same list object, with 'has_wiki_page' and 'wiki_url' set
            on every group)
    """
    # Normalise the wiki names once instead of once per group comparison
    candidates = []
    exact_matches = {}
    for wiki_name, wiki_url in wiki_links.items():
        normalized = wiki_name.strip().lower()
        if not normalized:
            continue  # a blank name would match every group
        candidates.append((normalized, wiki_url))
        # On duplicate normalised names the first entry wins
        exact_matches.setdefault(normalized, wiki_url)

    for group in framacalc_groups:
        group_name = group['name'].strip().lower()

        matched_url = None
        if group_name:
            # Exact match first, so "Paris" is not captured by "Paris-Saclay"
            matched_url = exact_matches.get(group_name)
            if matched_url is None:
                matched_url = next(
                    (url for name, url in candidates
                     if group_name in name or name in group_name),
                    None,
                )

        found = matched_url is not None
        group['has_wiki_page'] = found
        group['wiki_url'] = matched_url if found else ""

    return framacalc_groups
|
|
|
|
def extract_umap_url(html_content):
    """
    Find the uMap link for the OSM-FR local groups in a wiki page

    The first <a> element, in document order, whose href contains both
    'umap.openstreetmap.fr' and 'groupes-locaux' is selected.

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: The href of the matching link, or None when the content is
            empty or no link matches
    """
    if not html_content:
        return None

    anchors = BeautifulSoup(html_content, 'html.parser').find_all('a')
    # Anchors without an href count as an empty string and never match
    hrefs = (anchor.get('href', '') for anchor in anchors)
    return next(
        (
            href for href in hrefs
            if 'umap.openstreetmap.fr' in href and 'groupes-locaux' in href
        ),
        None,
    )
|
|
|
|
def save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, dry_run=False):
    """
    Write the collected data to OUTPUT_FILE, or only log a preview

    In a normal run the wiki and Framacalc local groups are concatenated and
    stored, together with the working groups, the uMap URL, the wiki links
    and the current timestamp ('last_updated'), as indented UTF-8 JSON.
    In a dry run nothing is written: the size of each collection and its
    first five entries are logged instead.

    Args:
        wiki_local_groups (list): Local group dictionaries scraped from the wiki
        framacalc_groups (list): Local group dictionaries read from Framacalc
        working_groups (list): Working group dictionaries
        umap_url (str): URL to the uMap for local groups (may be None)
        wiki_links (dict): Mapping of group names to wiki URLs
        dry_run (bool): If True, log a preview and leave the file untouched

    Returns:
        bool: True after a dry run or a successful write, False if the file
            could not be written
    """
    if not dry_run:
        # Wiki entries first, then the Framacalc ones
        merged_groups = wiki_local_groups + framacalc_groups

        payload = {
            "last_updated": datetime.now().isoformat(),
            "local_groups": merged_groups,
            "working_groups": working_groups,
            "umap_url": umap_url,
            "wiki_links": wiki_links
        }

        try:
            with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
                json.dump(payload, out, indent=2, ensure_ascii=False)
            logger.info(f"Successfully saved {len(merged_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
            return True
        except IOError as err:
            logger.error(f"Error saving results to {OUTPUT_FILE}: {err}")
            return False

    # Dry run: report what would have been saved (first five of each list)
    preview_size = 5
    logger.info("DRY RUN: Would have saved results to file")

    logger.info(f"Wiki local groups: {len(wiki_local_groups)}")
    for entry in wiki_local_groups[:preview_size]:
        logger.info(f" - {entry['name']}: {entry['url']}")

    logger.info(f"Framacalc groups: {len(framacalc_groups)}")
    for entry in framacalc_groups[:preview_size]:
        if entry.get('has_wiki_page'):
            wiki_status = "Has wiki page"
        else:
            wiki_status = "No wiki page"
        logger.info(f" - {entry['name']}: {wiki_status}")

    logger.info(f"Working groups: {len(working_groups)}")
    for entry in working_groups[:preview_size]:
        logger.info(f" - {entry['name']}: {entry['url']}")

    if umap_url:
        logger.info(f"uMap URL: {umap_url}")

    logger.info(f"Wiki links: {len(wiki_links)}")
    return True
|
|
|
|
def main():
    """
    Command-line entry point

    Parses --dry-run / --force, stops early when the cached output is still
    fresh (unless forced), then scrapes the wiki page and the Framacalc
    sheet, cross-checks the Framacalc groups against the wiki links and
    hands everything to save_results().
    """
    parser = argparse.ArgumentParser(description="Fetch OSM-FR local groups from wiki and Framacalc")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_osm_fr_groups.py")

    # The cache is inspected even when --force is given; --force only
    # overrides the outcome.
    cache_is_fresh = is_cache_fresh()
    if cache_is_fresh and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Everything wiki-related is parsed from this single page download
    html_content = get_page_content(BASE_URL)
    if not html_content:
        logger.error("Failed to get wiki page content")
        return

    # Local groups listed on the wiki
    wiki_local_groups = extract_local_groups_from_wiki(html_content)
    if not wiki_local_groups:
        logger.warning("No local groups found in wiki")

    # Working groups; always a list, so downstream consumers never see None
    working_groups = extract_working_groups(html_content)
    if not working_groups:
        logger.warning("No working groups found")
        working_groups = []

    # Link to the uMap of local groups (may be None)
    umap_url = extract_umap_url(html_content)

    # Local groups listed in the Framacalc spreadsheet
    framacalc_groups = fetch_framacalc_data()
    if not framacalc_groups:
        logger.warning("No local groups found in Framacalc")

    # Names and URLs of the local group wiki pages
    wiki_links = extract_wiki_group_links()
    if not wiki_links:
        logger.warning("No wiki links found for local groups")

    # Cross-check only when both sides delivered data
    if framacalc_groups and wiki_links:
        framacalc_groups = verify_framacalc_groups_have_wiki(framacalc_groups, wiki_links)

        wiki_flags = [bool(group.get('has_wiki_page')) for group in framacalc_groups]
        groups_with_wiki = wiki_flags.count(True)
        groups_without_wiki = wiki_flags.count(False)

        logger.info(f"Framacalc groups with wiki pages: {groups_with_wiki}")
        logger.info(f"Framacalc groups without wiki pages: {groups_without_wiki}")

    # Persist (or preview, in a dry run) and report the outcome
    success = save_results(wiki_local_groups, framacalc_groups, working_groups, umap_url, wiki_links, args.dry_run)
    if not success:
        logger.error("Script completed with errors")
    else:
        logger.info("Script completed successfully")


if __name__ == "__main__":
    main()