up wiki compare

commit 2f49ef6479 (parent ce508974c9)
23 changed files with 567403 additions and 5132 deletions

wiki_compare/fetch_osm_fr_groups.py (new executable file, 316 lines)
@@ -0,0 +1,316 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fetch_osm_fr_groups.py

This script scrapes the OpenStreetMap wiki page for France/OSM-FR to extract
information about local groups and working groups. It specifically targets
links in the #Pages_des_groupes_locaux section.

Usage:
    python fetch_osm_fr_groups.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - osm_fr_groups.json: JSON file with information about OSM-FR local groups
    - Log messages about the scraping process and results
"""

import argparse
import json
import logging
import os
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "osm_fr_groups.json"
BASE_URL = "https://wiki.openstreetmap.org/wiki/France/OSM-FR"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

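# Illustrative shape of the osm_fr_groups.json file written by save_results()
# below (values are made-up placeholders, not real wiki data):
#
# {
#   "last_updated": "2025-01-01T12:00:00",
#   "local_groups": [
#     {"name": "...", "url": "...", "description": "...", "type": "local_group"}
#   ],
#   "working_groups": [
#     {"name": "...", "url": "...", "description": "...",
#      "category": "Général", "type": "working_group"}
#   ],
#   "umap_url": "https://umap.openstreetmap.fr/..."
# }
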
def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # timeout so a stalled connection cannot hang the script indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

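# Both extractors below assume the wiki page lists groups as plain <ul> lists
# that directly follow an <h2>/<h3> heading, roughly like this (hypothetical
# markup, shown only to document what the parser expects):
#
#   <h3>Pages des groupes locaux</h3>
#   <ul>
#     <li><a href="/wiki/France/Lyon">Lyon</a> : rencontres mensuelles</li>
#     ...
#   </ul>
#
# If the page layout changes (for example, headings wrapped in extra
# containers), the section lookup or the next_sibling walk may need adjusting.
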
def extract_working_groups(html_content):
    """
    Extract working groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of working group dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    working_groups = []

    # Find the working groups section
    working_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes de travail' or 'Groupes_de_travail' in heading.get_text():
            working_groups_section = heading
            break

    if not working_groups_section:
        logger.warning("Could not find working groups section")
        # No working groups section found; return an empty list
        return []

    # Get the content following the heading until the next heading
    current = working_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href

                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None

                    description = description.strip(' :-,')

                    working_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
                        "category": "Général",
                        "type": "working_group"
                    })
        current = current.next_sibling

    logger.info(f"Found {len(working_groups)} working groups")
    return working_groups

def extract_local_groups(html_content):
    """
    Extract local groups from the wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        list: List of local group dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    local_groups = []

    # Find the local groups section
    local_groups_section = None
    for heading in soup.find_all(['h2', 'h3']):
        if heading.get_text().strip() == 'Groupes locaux' or 'Pages des groupes locaux' in heading.get_text():
            local_groups_section = heading
            break

    if not local_groups_section:
        logger.warning("Could not find local groups section")
        return []

    # Get the content following the heading until the next heading
    current = local_groups_section.next_sibling
    while current and current.name not in ['h2', 'h3']:
        if current.name == 'ul':
            # Process list items
            for li in current.find_all('li', recursive=False):
                link = li.find('a')
                if link:
                    name = link.get_text().strip()
                    href = link.get('href', '')
                    url = WIKI_BASE_URL + href if href.startswith('/') else href

                    # Extract description (text after the link)
                    description = ""
                    next_node = link.next_sibling
                    while next_node:
                        if isinstance(next_node, str):
                            description += next_node.strip()
                        next_node = next_node.next_sibling if hasattr(next_node, 'next_sibling') else None

                    description = description.strip(' :-,')

                    local_groups.append({
                        "name": name,
                        "url": url,
                        "description": description,
                        "type": "local_group"
                    })
        current = current.next_sibling

    logger.info(f"Found {len(local_groups)} local groups")
    return local_groups

def extract_umap_url(html_content):
    """
    Extract the uMap URL for OSM-FR local groups

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: uMap URL or None if not found
    """
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Look for links to umap.openstreetmap.fr
    for link in soup.find_all('a'):
        href = link.get('href', '')
        if 'umap.openstreetmap.fr' in href and 'groupes-locaux' in href:
            return href

    return None

def save_results(local_groups, working_groups, umap_url, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        local_groups (list): List of local group dictionaries
        working_groups (list): List of working group dictionaries
        umap_url (str): URL to the uMap for local groups
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Local groups: {len(local_groups)}")
        for group in local_groups:
            logger.info(f"  - {group['name']}: {group['url']}")
        logger.info(f"Working groups: {len(working_groups)}")
        for group in working_groups:
            logger.info(f"  - {group['name']}: {group['url']}")
        if umap_url:
            logger.info(f"uMap URL: {umap_url}")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "local_groups": local_groups,
        "working_groups": working_groups,
        "umap_url": umap_url
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(local_groups)} local groups and {len(working_groups)} working groups to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape OSM-FR local groups from the wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_osm_fr_groups.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the wiki page content
    html_content = get_page_content(BASE_URL)

    if not html_content:
        logger.error("Failed to get wiki page content")
        return

    # Extract local groups
    local_groups = extract_local_groups(html_content)

    if not local_groups:
        logger.warning("No local groups found")

    # Extract working groups
    working_groups = extract_working_groups(html_content)

    if not working_groups:
        logger.warning("No working groups found")
        # Initialize with an empty list to avoid errors in the controller
        working_groups = []

    # Extract uMap URL
    umap_url = extract_umap_url(html_content)

    # Save results
    success = save_results(local_groups, working_groups, umap_url, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()
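
# Example invocations (matching the usage described in the module docstring):
#   python fetch_osm_fr_groups.py             # scrape and write osm_fr_groups.json
#   python fetch_osm_fr_groups.py --dry-run   # log what would be written without saving
#   python fetch_osm_fr_groups.py --force     # refresh even if the cache is under an hour old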