#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_recent_changes.py

This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2

Usage:
    python fetch_recent_changes.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - recent_changes.json: JSON file with information about recent changes in the French
      namespace (see the commented example at the end of this file for how it can be read back)
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
import re
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "recent_changes.json"
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
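
# Note on RECENT_CHANGES_URL: the query string asks Special:RecentChanges for the French
# namespace (namespace=202), up to 500 entries (limit=500) covering the last 30 days
# (days=30). Bot edits, categorization entries, Wikibase edits and log entries are hidden
# (hidebots, hidecategorization, hideWikibase, hidelog), only the latest revision of each
# page is listed (hidepreviousrevisions), and enhanced=1 requests the grouped "enhanced"
# view. The exact HTML of this view varies, which is why extract_recent_changes() below
# tries several selectors before giving up.
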
def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # A timeout keeps the script from hanging if the wiki does not respond
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Try different selectors for the changes list
    # First try the old selector
    changes_list = soup.find('ul', class_='special')

    # If not found, try the new selector
    if not changes_list:
        changes_list = soup.find('div', class_='mw-changeslist')

    # If still not found, try another common selector
    if not changes_list:
        changes_list = soup.find('ul', class_='mw-changeslist')

    # If still not found, look for any list inside the content area
    if not changes_list:
        content_div = soup.find('div', id='mw-content-text')
        if content_div:
            changes_list = content_div.find('ul')

    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []

    # Process each list item (each change)
    # Try both li elements and div elements with appropriate classes
    change_items = changes_list.find_all('li')
    if not change_items:
        change_items = changes_list.find_all('div', class_='mw-changeslist-line')

    for item in change_items:
        # Extract the page link - try different selectors
        page_link = item.find('a', class_='mw-changeslist-title')
        if not page_link:
            page_link = item.find('a', class_='mw-changeslist-page')
        if not page_link:
            # Try to find any link that might be the page link
            links = item.find_all('a')
            for link in links:
                if '/wiki/' in link.get('href', ''):
                    page_link = link
                    break

        if not page_link:
            continue

        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')

        # Extract the timestamp - try different selectors
        timestamp_span = item.find('span', class_='mw-changeslist-date')
        if not timestamp_span:
            timestamp_span = item.find('span', class_='mw-changeslist-time')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"

        # Extract the user - try different selectors
        user_link = item.find('a', class_='mw-userlink')
        if not user_link:
            user_link = item.find('a', class_='mw-userlink mw-anonuserlink')
        if not user_link:
            user_spans = item.find_all('span', class_='mw-userlink')
            if user_spans:
                user_link = user_spans[0]
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the comment - try different selectors
        comment_span = item.find('span', class_='comment')
        if not comment_span:
            comment_span = item.find('span', class_='changeslist-comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size - try different approaches
        change_size = "0"
        # Try to find spans with specific classes
        size_spans = item.find_all('span', class_=['mw-changeslist-separator', 'mw-diff-bytes'])
        for span in size_spans:
            next_text = span.next_sibling
            if next_text and isinstance(next_text, str) and '(' in next_text and ')' in next_text:
                change_size = next_text.strip()
                break

        # If not found, look for parentheses with numbers (uses the module-level re import)
        if change_size == "0":
            text = item.get_text()
            size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
            if size_matches:
                change_size = size_matches[0]

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "comment": comment,
            "change_size": change_size
        })

    logger.info(f"Found {len(recent_changes)} recent changes")
    return recent_changes


def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only first 5 for brevity
            logger.info(f" - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f" ... and {len(recent_changes) - 5} more")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False
def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)
    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)
    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
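
# Illustrative example (kept as a comment so it is never executed by this script):
# a downstream consumer could read recent_changes.json back with the standard json
# module. The keys match the structure written by save_results() above; the file
# name is the OUTPUT_FILE constant.
#
#     import json
#
#     with open("recent_changes.json", encoding="utf-8") as f:
#         cache = json.load(f)
#
#     print("last updated:", cache["last_updated"])
#     for change in cache["recent_changes"]:
#         print(change["timestamp"], change["user"], change["page_name"], change["page_url"])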