#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_recent_changes.py

This script fetches recent changes from the OpenStreetMap wiki for the French
namespace (202) and stores the URLs of these pages. It targets the recent
changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=0&title=Special:RecentChanges&urlversion=2

Usage:
    python fetch_recent_changes.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - recent_changes.json: JSON file with information about recent changes in
      the French namespace
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "recent_changes.json"
# enhanced=0 so the page renders as a flat <ul class="special"> list; the
# enhanced view (enhanced=1) groups changes into tables that the parser
# below does not handle.
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=0&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour


def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old.

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page.

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page, or None if the request failed
    """
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML.

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Find the changes list
    changes_list = soup.find('ul', class_='special')
    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []

    # Process each list item (each change)
    for li in changes_list.find_all('li'):
        # Extract the page link
        page_link = li.find('a', class_='mw-changeslist-title')
        if not page_link:
            continue

        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')

        # Extract the timestamp
        timestamp_span = li.find('span', class_='mw-changeslist-date')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"

        # Extract the user
        user_link = li.find('a', class_='mw-userlink')
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the comment
        comment_span = li.find('span', class_='comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size; guard against a missing separator span,
        # which would otherwise raise AttributeError on .next_sibling
        separator = li.find('span', class_='mw-changeslist-separator')
        change_size_span = separator.next_sibling if separator else None
        change_size = change_size_span.get_text().strip() if change_size_span else "0"

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "comment": comment,
            "change_size": change_size
        })

    logger.info(f"Found {len(recent_changes)} recent changes")
    return recent_changes
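
# For reference, MediaWiki also exposes recent changes through its Action API
# (action=query&list=recentchanges), which is more robust than scraping HTML.
# Below is a minimal sketch of the equivalent request; it is not used by this
# script. The endpoint and parameters are standard MediaWiki, but the exact
# response fields should be verified against the live API before relying on it.
def fetch_recent_changes_via_api():
    """Sketch: fetch the same data from the MediaWiki Action API (unused)."""
    params = {
        "action": "query",
        "list": "recentchanges",
        "rcnamespace": 202,  # French namespace, as in RECENT_CHANGES_URL
        "rclimit": 500,
        "rcprop": "title|timestamp|user|comment|sizes",
        "format": "json",
    }
    response = requests.get(WIKI_BASE_URL + "/w/api.php", params=params, timeout=30)
    response.raise_for_status()
    return response.json().get("query", {}).get("recentchanges", [])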
def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file.

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or this is a dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def main():
    """Main function to execute the script."""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600:g} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)
    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)
    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)
    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
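
# Example of the structure written to recent_changes.json, matching the dict
# built in save_results() (values are illustrative, not real data):
#
# {
#   "last_updated": "2024-05-01T12:00:00",
#   "recent_changes": [
#     {
#       "page_name": "FR:Tag:amenity=parking",
#       "page_url": "https://wiki.openstreetmap.org/wiki/FR:Tag:amenity%3Dparking",
#       "timestamp": "12:34",
#       "user": "ExampleUser",
#       "comment": "Typo fix",
#       "change_size": "(+42)"
#     }
#   ]
# }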