mirror of https://forge.chapril.org/tykayn/osm-commerces, synced 2025-10-04 17:04:53 +02:00
up wiki land
This commit is contained in: parent 391a212034, commit e533c273b2
10 changed files with 1116 additions and 182 deletions
wiki_compare/fetch_recent_changes.py (new file, 216 lines)
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fetch_recent_changes.py

This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2

Usage:
    python fetch_recent_changes.py [--dry-run] [--force]

Options:
    --dry-run   Run the script without saving the results to a file
    --force     Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - recent_changes.json: JSON file with information about recent changes in the French namespace
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "recent_changes.json"
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
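# namespace=202 in RECENT_CHANGES_URL selects the wiki's French ("FR:")
# namespace, while limit=500 and days=30 bound the query to at most 500
# changes from the last month; the remaining hide* flags filter out bots,
# categorization edits and log entries.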


def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False
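# Note: freshness is judged solely from the top-level "last_updated" ISO
# timestamp that save_results() writes to recent_changes.json, e.g.
# {"last_updated": "2025-10-04T17:04:53", "recent_changes": [...]}
# (the timestamp value here is illustrative).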


def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if the request failed
    """
    try:
        # A timeout keeps a stalled request from hanging the script indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Find the changes list
    changes_list = soup.find('ul', class_='special')

    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []

    # Process each list item (each change)
    for li in changes_list.find_all('li'):
        # Extract the page link
        page_link = li.find('a', class_='mw-changeslist-title')
        if not page_link:
            continue

        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')

        # Extract the timestamp
        timestamp_span = li.find('span', class_='mw-changeslist-date')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"

        # Extract the user
        user_link = li.find('a', class_='mw-userlink')
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the comment
        comment_span = li.find('span', class_='comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size; guard against a missing separator so an
        # unexpected layout cannot crash the whole scrape with an AttributeError
        separator = li.find('span', class_='mw-changeslist-separator')
        change_size_span = separator.next_sibling if separator else None
        change_size = change_size_span.get_text().strip() if change_size_span else "0"

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "comment": comment,
            "change_size": change_size
        })

    logger.info(f"Found {len(recent_changes)} recent changes")
    return recent_changes
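# For reference, the selectors above assume the classic (non-grouped) list
# markup of Special:RecentChanges, roughly (simplified, illustrative):
#   <ul class="special">
#     <li>
#       <a class="mw-changeslist-title" href="/wiki/FR:...">FR:Page</a>
#       <span class="mw-changeslist-date">17:04, 4 October 2025</span>
#       <span class="mw-changeslist-separator">. .</span> (+42)
#       <a class="mw-userlink">User</a>
#       <span class="comment">(edit summary)</span>
#     </li>
#   </ul>
# With enhanced=1 in the URL, MediaWiki may instead render grouped tables, in
# which case ul.special will not be found and the warning above fires.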


def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or this was a dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only the first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)

    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)

    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
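A quick smoke test of the new script, with the output shape taken from save_results(); the page, user and sizes below are illustrative, not real wiki data:

    $ python wiki_compare/fetch_recent_changes.py --dry-run
    2025-10-04 17:04:53 - INFO - Starting fetch_recent_changes.py
    2025-10-04 17:04:55 - INFO - Found 42 recent changes
    2025-10-04 17:04:55 - INFO - DRY RUN: Would have saved results to file

Without --dry-run, recent_changes.json is written in this shape:

    {
      "last_updated": "2025-10-04T17:04:55",
      "recent_changes": [
        {
          "page_name": "FR:Tag:amenity=cafe",
          "page_url": "https://wiki.openstreetmap.org/wiki/FR:Tag:amenity%3Dcafe",
          "timestamp": "17:04, 4 October 2025",
          "user": "ExampleUser",
          "comment": "typo fix",
          "change_size": "(+12)"
        }
      ]
    }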