osm-labo/wiki_compare/fetch_recent_changes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_recent_changes.py
This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=10000&days=365&enhanced=1&title=Special:RecentChanges&urlversion=2
Usage:
python fetch_recent_changes.py [--dry-run] [--force]
Options:
--dry-run Run the script without saving the results to a file
--force Force update even if the cache is still fresh (less than 1 hour old)
Output:
- recent_changes.json: JSON file with information about recent changes in the French namespace
- Log messages about the scraping process and results
"""
import json
import argparse
import logging
import os
import re
import shutil
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
# Constants
# Use the directory of this script to determine the output file path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
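# Special:RecentChanges filtered to the FR: namespace (202), with bots, previous
# revisions, categorization, Wikibase and log entries hidden; up to 500 entries
# over the last 30 days, in the "enhanced" (grouped) layout.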
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # Use a timeout so a stalled connection does not hang the script
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Find the main changeslist container
    # According to the issue description, we should look for .mw-changeslist
    changes_list = soup.find('div', class_='mw-changeslist')
    if not changes_list:
        # If not found directly, look inside the content area
        content_div = soup.find('div', id='mw-content-text')
        if content_div:
            # Try to find the changeslist div
            changes_list = content_div.find('div', class_='mw-changeslist')
    if not changes_list:
        # Log the HTML structure to help debug
        logger.warning("Could not find recent changes list. HTML structure:")
        body = soup.find('body')
        if body:
            content_area = body.find('div', id='content')
            if content_area:
                logger.warning(f"Content area classes: {content_area.get('class', [])}")
                main_content = content_area.find('div', id='mw-content-text')
                if main_content:
                    logger.warning(f"Main content first child: {main_content.find().name if main_content.find() else 'None'}")
        return []

    logger.info(f"Found changes list with tag: {changes_list.name}, classes: {changes_list.get('class', [])}")

    # Process each change item - based on the actual HTML structure
    # According to the debug output, the changes are in tr elements
    change_items = changes_list.find_all('tr')
    # If no tr elements found directly, look for tables with class mw-changeslist-line
    if not change_items:
        tables = changes_list.find_all('table', class_='mw-changeslist-line')
        for table in tables:
            trs = table.find_all('tr')
            change_items.extend(trs)
    logger.info(f"Found {len(change_items)} change items")

    for item in change_items:
        # Extract the page link from the mw-changeslist-title class
        page_link = item.find('a', class_='mw-changeslist-title')
        if not page_link:
            # If not found with the specific class, try to find any link that might be the page link
            inner_td = item.find('td', class_='mw-changeslist-line-inner')
            if inner_td:
                links = inner_td.find_all('a')
                for link in links:
                    href = link.get('href', '')
                    if '/wiki/' in href and 'action=history' not in href and 'diff=' not in href:
                        page_link = link
                        break
        if not page_link:
            # Skip items without a page link (might be headers or other elements)
            continue

        page_name = page_link.get_text().strip()
        page_url = page_link.get('href')
        if not page_url.startswith('http'):
            page_url = WIKI_BASE_URL + page_url

        # Extract the timestamp from the mw-enhanced-rc class
        timestamp_td = item.find('td', class_='mw-enhanced-rc')
        timestamp = timestamp_td.get_text().strip() if timestamp_td else "Unknown"

        # Extract the user from the mw-userlink class
        user_link = item.find('a', class_='mw-userlink')
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the user profile URL
        user_url = ""
        if user_link and user_link.get('href'):
            user_url = user_link.get('href')
            if not user_url.startswith('http'):
                user_url = WIKI_BASE_URL + user_url

        # Extract the diff link
        diff_url = ""
        diff_link = item.find('a', class_='mw-changeslist-diff') or item.find('a', string='diff')
        if diff_link and diff_link.get('href'):
            diff_url = diff_link.get('href')
            if not diff_url.startswith('http'):
                diff_url = WIKI_BASE_URL + diff_url

        # Extract the comment from the comment class
        comment_span = item.find('span', class_='comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size from the mw-diff-bytes class
        size_span = item.find('span', class_='mw-diff-bytes')
        if size_span:
            change_size = size_span.get_text().strip()
        else:
            # If not found, try to extract it from the text
            change_size = "0"
            text = item.get_text()
            size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
            if size_matches:
                change_size = size_matches[0]

        # Extract text differences if diff_url is available
        added_text = ""
        removed_text = ""
        if diff_url:
            try:
                # Fetch the diff page
                diff_html = get_page_content(diff_url)
                if diff_html:
                    diff_soup = BeautifulSoup(diff_html, 'html.parser')
                    # Find added text (ins elements)
                    added_elements = diff_soup.find_all('ins', class_='diffchange')
                    if added_elements:
                        added_text = ' '.join([el.get_text().strip() for el in added_elements])
                    # Find removed text (del elements)
                    removed_elements = diff_soup.find_all('del', class_='diffchange')
                    if removed_elements:
                        removed_text = ' '.join([el.get_text().strip() for el in removed_elements])
            except Exception as e:
                logger.error(f"Error fetching diff page {diff_url}: {e}")

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "user_url": user_url,
            "comment": comment,
            "change_size": change_size,
            "diff_url": diff_url,
            "added_text": added_text,
            "removed_text": removed_text
        })
        logger.debug(f"Extracted change: {page_name} by {user}")

    logger.info(f"Extracted {len(recent_changes)} recent changes")
    return recent_changes


def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True

    # Log some details about the recent changes
    logger.info(f"Preparing to save {len(recent_changes)} recent changes")
    if recent_changes:
        logger.info(f"First change: {recent_changes[0]['page_name']} by {recent_changes[0]['user']}")

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    # Get the file's last modified time before saving
    before_mtime = None
    if os.path.exists(OUTPUT_FILE):
        before_mtime = os.path.getmtime(OUTPUT_FILE)
        logger.info(f"File {OUTPUT_FILE} exists, last modified at {datetime.fromtimestamp(before_mtime)}")

    try:
        # Log the JSON data that we're trying to save
        json_data = json.dumps(data, indent=2, ensure_ascii=False)
        logger.info(f"JSON data to save (first 500 chars): {json_data[:500]}...")

        # Save the data to a temporary file first
        temp_file = OUTPUT_FILE + ".tmp"
        logger.info(f"Writing data to temporary file {temp_file}")
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.write(json_data)

        # Check if the temporary file was created and has content
        if os.path.exists(temp_file):
            temp_size = os.path.getsize(temp_file)
            logger.info(f"Temporary file {temp_file} created, size: {temp_size} bytes")
            # Read the content of the temporary file to verify
            with open(temp_file, 'r', encoding='utf-8') as f:
                temp_content = f.read(500)  # Read first 500 chars
            logger.info(f"Temporary file content (first 500 chars): {temp_content}...")
            # Move the temporary file to the final location
            logger.info(f"Moving temporary file to {OUTPUT_FILE}")
            shutil.move(temp_file, OUTPUT_FILE)
        else:
            logger.error(f"Failed to create temporary file {temp_file}")

        # Check if the file was actually updated
        if os.path.exists(OUTPUT_FILE):
            after_mtime = os.path.getmtime(OUTPUT_FILE)
            file_size = os.path.getsize(OUTPUT_FILE)
            logger.info(f"File {OUTPUT_FILE} exists, size: {file_size} bytes, mtime: {datetime.fromtimestamp(after_mtime)}")
            # Read the content of the file to verify
            with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
                file_content = f.read(500)  # Read first 500 chars
            logger.info(f"File content (first 500 chars): {file_content}...")
            if before_mtime and after_mtime <= before_mtime:
                logger.warning(f"File {OUTPUT_FILE} was not updated (mtime did not change)")
        else:
            logger.error(f"File {OUTPUT_FILE} does not exist after saving")

        # Copy the file to the public directory
        public_file = os.path.join(os.path.dirname(os.path.dirname(OUTPUT_FILE)), 'public', os.path.basename(OUTPUT_FILE))
        logger.info(f"Copying {OUTPUT_FILE} to {public_file}")
        shutil.copy2(OUTPUT_FILE, public_file)

        # Check if the public file was created
        if os.path.exists(public_file):
            public_size = os.path.getsize(public_file)
            logger.info(f"Public file {public_file} created, size: {public_size} bytes")
        else:
            logger.error(f"Failed to create public file {public_file}")

        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def load_unavailable_pages():
    """
    Load the list of pages unavailable in French

    Returns:
        tuple: (all_pages, grouped_pages, last_updated)
    """
    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
        return [], {}, None
    try:
        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        all_pages = data.get('all_pages', [])
        grouped_pages = data.get('grouped_pages', {})
        last_updated = data.get('last_updated')
        return all_pages, grouped_pages, last_updated
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading unavailable pages file: {e}")
        return [], {}, None


def load_created_pages():
    """
    Load the list of newly created French pages

    Returns:
        tuple: (created_pages, last_updated)
    """
    if not os.path.exists(CREATED_PAGES_FILE):
        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
        return [], None
    try:
        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        created_pages = data.get('created_pages', [])
        last_updated = data.get('last_updated')
        return created_pages, last_updated
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading created pages file: {e}")
        return [], None


def save_created_pages(created_pages, dry_run=False):
    """
    Save the list of newly created French pages

    Args:
        created_pages (list): List of newly created French pages
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved created pages to file")
        return True

    data = {
        "last_updated": datetime.now().isoformat(),
        "created_pages": created_pages
    }
    try:
        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")

        # Copy the file to the public directory
        public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
        shutil.copy2(CREATED_PAGES_FILE, public_file)
        return True
    except IOError as e:
        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
        return False


def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
    """
    Save the updated list of pages unavailable in French

    Args:
        all_pages (list): List of all unavailable pages
        grouped_pages (dict): Dictionary of pages grouped by language prefix
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
        return True

    data = {
        "last_updated": datetime.now().isoformat(),
        "all_pages": all_pages,
        "grouped_pages": grouped_pages
    }
    try:
        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")

        # Copy the file to the public directory
        public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
        return True
    except IOError as e:
        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
        return False


def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
    """
    Check if any of the recent changes are newly created French pages that were
    previously in the list of pages unavailable in French

    Args:
        recent_changes (list): List of recent change dictionaries
        all_pages (list): List of all unavailable pages
        grouped_pages (dict): Dictionary of pages grouped by language prefix

    Returns:
        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
    """
    newly_created_pages = []
    updated_all_pages = all_pages.copy()
    updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}

    # Check each recent change
    for change in recent_changes:
        page_name = change['page_name']
        page_url = change['page_url']
        comment = change['comment'].lower()

        # Check if this is a new page creation
        is_new_page = "page created" in comment or "nouvelle page" in comment
        if is_new_page and page_name.startswith("FR:"):
            logger.info(f"Found newly created French page: {page_name}")

            # Check if this page was previously in the list of unavailable pages.
            # We need to check if the English version of this page is in the list.
            en_page_name = page_name.replace("FR:", "")

            # Find the English page in the list of unavailable pages
            found_en_page = None
            for page in all_pages:
                if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
                    found_en_page = page
                    break

            if found_en_page:
                logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")

                # Remove the English page from the list of unavailable pages
                updated_all_pages.remove(found_en_page)

                # Remove the English page from the grouped pages
                lang_prefix = found_en_page['language_prefix']
                if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
                    updated_grouped_pages[lang_prefix].remove(found_en_page)
                    # If the group is now empty, remove it
                    if not updated_grouped_pages[lang_prefix]:
                        del updated_grouped_pages[lang_prefix]

                # Add the newly created page to the list
                newly_created_pages.append({
                    "title": page_name,
                    "url": page_url,
                    "en_title": found_en_page['title'],
                    "en_url": found_en_page['url'],
                    "created_at": change['timestamp'],
                    "created_by": change['user'],
                    "comment": change['comment']
                })

    return updated_all_pages, updated_grouped_pages, newly_created_pages


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    parser.add_argument("--debug", action="store_true", help="Save HTML content to a file for debugging")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)
    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Save HTML content to a file for debugging
    if args.debug:
        debug_file = "recent_changes_debug.html"
        try:
            with open(debug_file, 'w', encoding='utf-8') as f:
                f.write(html_content)
            logger.info(f"Saved HTML content to {debug_file} for debugging")
        except IOError as e:
            logger.error(f"Error saving HTML content to {debug_file}: {e}")

    # Parse the HTML to inspect the structure
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content area
    content_div = soup.find('div', id='mw-content-text')
    if content_div:
        logger.info("Found content div with id 'mw-content-text'")

        # Look for elements with the mw-changeslist class
        changeslist_elements = content_div.find_all(class_='mw-changeslist')
        logger.info(f"Found {len(changeslist_elements)} elements with class 'mw-changeslist'")
        for i, element in enumerate(changeslist_elements):
            logger.info(f"Element {i+1} tag: {element.name}, classes: {element.get('class', [])}")

            # Look for table rows or other elements that might contain changes
            rows = element.find_all('tr')
            divs = element.find_all('div', class_='mw-changeslist-line')
            lis = element.find_all('li')
            logger.info(f"  - Contains {len(rows)} tr elements")
            logger.info(f"  - Contains {len(divs)} div.mw-changeslist-line elements")
            logger.info(f"  - Contains {len(lis)} li elements")

            # Check direct children
            children = list(element.children)
            logger.info(f"  - Has {len(children)} direct children")
            if children:
                child_types = {}
                for child in children:
                    if hasattr(child, 'name') and child.name:
                        child_type = child.name
                        child_types[child_type] = child_types.get(child_type, 0) + 1
                logger.info(f"  - Direct children types: {child_types}")

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)
    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)

    # Check for newly created French pages
    logger.info("Checking for newly created French pages...")
    all_pages, grouped_pages, last_updated = load_unavailable_pages()
    created_pages, created_last_updated = load_created_pages()

    if all_pages and grouped_pages:
        # Check for newly created pages
        updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)

        # If we found newly created pages, update both files
        if newly_created:
            logger.info(f"Found {len(newly_created)} newly created French pages")
            # Add the newly created pages to the existing list
            created_pages.extend(newly_created)
            # Save the updated files
            save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
            save_created_pages(created_pages, args.dry_run)
        else:
            logger.info("No newly created French pages found")
    else:
        logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()