osm-commerces/wiki_compare/fetch_recent_changes.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_recent_changes.py
This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2
Usage:
python fetch_recent_changes.py [--dry-run] [--force]
Options:
--dry-run Run the script without saving the results to a file
--force Force update even if the cache is still fresh (less than 1 hour old)
Output:
- recent_changes.json: JSON file with information about recent changes in the French namespace
- Log messages about the scraping process and results
"""
import json
import argparse
import logging
import os
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "recent_changes.json"
# Namespace 202 on the OSM wiki is the FR: (French-language) namespace
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour


def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old.

    Returns:
        bool: True if the cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A missing 'last_updated' field falls back to an old date, so it counts as stale
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False
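
# A quick illustration of the freshness check, assuming the cache file was
# last written 30 minutes ago (hypothetical values):
#   last_updated = datetime.fromisoformat("2025-08-22T17:49:20")
#   datetime.now() - last_updated  ->  timedelta(minutes=30) < CACHE_DURATION  ->  fresh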


def get_page_content(url):
    """
    Get the HTML content of a page.

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page, or None if the request failed
    """
    try:
        # Use a timeout so a stalled connection cannot hang the script indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML.

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []
    # Find the changes list
    changes_list = soup.find('ul', class_='special')
    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []
    # Process each list item (each change)
    for li in changes_list.find_all('li'):
        # Extract the page link
        page_link = li.find('a', class_='mw-changeslist-title')
        if not page_link:
            continue
        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')
        # Extract the timestamp
        timestamp_span = li.find('span', class_='mw-changeslist-date')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"
        # Extract the user
        user_link = li.find('a', class_='mw-userlink')
        user = user_link.get_text().strip() if user_link else "Unknown"
        # Extract the comment
        comment_span = li.find('span', class_='comment')
        comment = comment_span.get_text().strip() if comment_span else ""
        # Extract the change size: the node right after the separator span.
        # Guard against a missing separator, which would otherwise raise
        # AttributeError on None.
        separator = li.find('span', class_='mw-changeslist-separator')
        change_size_node = separator.next_sibling if separator else None
        change_size = change_size_node.get_text().strip() if change_size_node else "0"
        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "comment": comment,
            "change_size": change_size
        })
    logger.info(f"Found {len(recent_changes)} recent changes")
    return recent_changes
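
# Simplified sketch of the Special:RecentChanges markup that
# extract_recent_changes() walks (illustrative only; the real page contains
# more attributes and nesting, and the exact structure may change with
# MediaWiki versions):
#
# <ul class="special">
#   <li>
#     <span class="mw-changeslist-date">18:19</span>
#     <a class="mw-changeslist-title" href="/wiki/FR:Tag:amenity%3Dcafe">FR:Tag:amenity=cafe</a>
#     <a class="mw-userlink" href="/wiki/User:SomeUser">SomeUser</a>
#     <span class="comment">(edit summary)</span>
#     <span class="mw-changeslist-separator">. .</span> (+120)
#   </li>
# </ul>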


def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file.

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or this was a dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only the first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True
    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def main():
    """Main function to execute the script."""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()
    logger.info("Starting fetch_recent_changes.py")
    # Check if the cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600:g} hour(s) old)")
        logger.info("Use --force to update anyway")
        return
    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)
    if not html_content:
        logger.error("Failed to get recent changes page content")
        return
    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)
    if not recent_changes:
        logger.warning("No recent changes found")
    # Save results (an empty list is still saved so the cache timestamp is refreshed)
    success = save_results(recent_changes, args.dry_run)
    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
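
# Typical invocations (assuming the requests and beautifulsoup4 packages are
# installed, e.g. via `pip install requests beautifulsoup4`):
#   python fetch_recent_changes.py --dry-run   # preview without writing the file
#   python fetch_recent_changes.py --force     # refresh even if the cache is fresh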