osm-labo/wiki_compare/fetch_recent_changes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fetch_recent_changes.py

This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2

Usage:
    python fetch_recent_changes.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - recent_changes.json: JSON file with information about recent changes in the French namespace
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
import re
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "recent_changes.json"
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Try different selectors for the changes list
    # First try the old selector
    changes_list = soup.find('ul', class_='special')

    # If not found, try the new selector
    if not changes_list:
        changes_list = soup.find('div', class_='mw-changeslist')

    # If still not found, try another common selector
    if not changes_list:
        changes_list = soup.find('ul', class_='mw-changeslist')

    # If still not found, look for any list inside the content area
    if not changes_list:
        content_div = soup.find('div', id='mw-content-text')
        if content_div:
            changes_list = content_div.find('ul')

    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []

    # Process each list item (each change)
    # Try both li elements and div elements with appropriate classes
    change_items = changes_list.find_all('li')
    if not change_items:
        change_items = changes_list.find_all('div', class_='mw-changeslist-line')

    for item in change_items:
        # Extract the page link - try different selectors
        page_link = item.find('a', class_='mw-changeslist-title')
        if not page_link:
            page_link = item.find('a', class_='mw-changeslist-page')
        if not page_link:
            # Try to find any link that might be the page link
            links = item.find_all('a')
            for link in links:
                if '/wiki/' in link.get('href', ''):
                    page_link = link
                    break

        if not page_link:
            continue

        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')

        # Extract the timestamp - try different selectors
        timestamp_span = item.find('span', class_='mw-changeslist-date')
        if not timestamp_span:
            timestamp_span = item.find('span', class_='mw-changeslist-time')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"

        # Extract the user - try different selectors
        user_link = item.find('a', class_='mw-userlink')
        if not user_link:
            user_link = item.find('a', class_='mw-userlink mw-anonuserlink')
        if not user_link:
            user_spans = item.find_all('span', class_='mw-userlink')
            if user_spans:
                user_link = user_spans[0]
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the comment - try different selectors
        comment_span = item.find('span', class_='comment')
        if not comment_span:
            comment_span = item.find('span', class_='changeslist-comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size - try different approaches
        change_size = "0"
        # Try to find spans with specific classes
        size_spans = item.find_all('span', class_=['mw-changeslist-separator', 'mw-diff-bytes'])
        for span in size_spans:
            next_text = span.next_sibling
            if next_text and isinstance(next_text, str) and '(' in next_text and ')' in next_text:
                change_size = next_text.strip()
                break

        # If not found, try another approach
        if change_size == "0":
            # Look for parentheses with numbers
            import re
            text = item.get_text()
            size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
            if size_matches:
                change_size = size_matches[0]

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "comment": comment,
            "change_size": change_size
        })

    logger.info(f"Found {len(recent_changes)} recent changes")
    return recent_changes

def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info(f"Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)

    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)

    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()