#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_untranslated_french_pages.py

This script scrapes the OSM wiki to find French pages that don't have translations
in other languages. It caches the results and only performs the scraping
at most once per hour.

Usage:
    python find_untranslated_french_pages.py [--force] [--dry-run]

Options:
    --force     Force update even if cache is fresh
    --dry-run   Print results without saving to file

Output:
    - untranslated_french_pages.json: JSON file containing information about
      French pages without translations
"""

import requests
from bs4 import BeautifulSoup
import json
import logging
import argparse
import os
from datetime import datetime, timedelta
import re

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'untranslated_french_pages.json')
CACHE_TIMEOUT = 1  # hours
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
FRENCH_PAGES_URL = "https://wiki.openstreetmap.org/wiki/Special:AllPages?from=&to=&namespace=202&hideredirects=1&prefix=FR:"
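# Note on the query above: prefix=FR: asks for titles beginning with "FR:",
# hideredirects=1 excludes redirect pages, and namespace=202 restricts the
# listing to one wiki namespace (whichever the original author intended).
# Special:AllPages paginates its output, so fetch_french_pages() below
# follows the "next page" link until it runs out.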


def should_update_cache():
    """
    Check if the cache file exists and if it's older than the cache timeout.

    Returns:
        bool: True if the cache should be updated, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        logger.info("Cache file doesn't exist, creating it")
        return True

    # Check file modification time
    file_mtime = datetime.fromtimestamp(os.path.getmtime(OUTPUT_FILE))
    now = datetime.now()

    # If the file is older than the cache timeout, update it
    if now - file_mtime > timedelta(hours=CACHE_TIMEOUT):
        logger.info(f"Cache is older than {CACHE_TIMEOUT} hour(s), updating")
        return True

    logger.info(f"Cache is still fresh (less than {CACHE_TIMEOUT} hour(s) old)")
    return False


def fetch_french_pages():
    """
    Fetch all French pages from the OSM wiki.

    Returns:
        list: List of dictionaries containing French page information
    """
    logger.info(f"Fetching French pages from {FRENCH_PAGES_URL}")
    french_pages = []
    next_page_url = FRENCH_PAGES_URL

    while next_page_url:
        try:
            # A timeout keeps the script from hanging forever on a stalled
            # connection (requests has no default timeout)
            response = requests.get(next_page_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all links in the mw-allpages-body section
            links_container = soup.select_one('.mw-allpages-body')
            if links_container:
                links = links_container.select('li a')

                for link in links:
                    page_title = link.text.strip()
                    page_url = WIKI_BASE_URL + link.get('href', '')

                    # Extract the key name (remove the FR: prefix)
                    key_match = re.match(r'FR:(.*)', page_title)
                    if key_match:
                        key_name = key_match.group(1)

                        french_pages.append({
                            'title': page_title,
                            'key': key_name,
                            'url': page_url,
                            'has_translation': False  # Will be updated later
                        })

            # Check if there's a next page
            next_link = soup.select_one('a.mw-nextlink')
            next_page_url = WIKI_BASE_URL + next_link.get('href') if next_link else None

        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching French pages: {e}")
            break

    logger.info(f"Found {len(french_pages)} French pages")
    return french_pages


def check_translations(french_pages):
    """
    Check if each French page has translations in other languages.

    Args:
        french_pages (list): List of dictionaries containing French page information

    Returns:
        list: Updated list with translation information
    """
    logger.info("Checking for translations of French pages")

    for i, page in enumerate(french_pages):
        if i % 10 == 0:  # Log progress every 10 pages
            logger.info(f"Checking page {i+1}/{len(french_pages)}: {page['title']}")

        try:
            # Construct the English page URL by removing the FR: prefix
            en_url = page['url'].replace('/wiki/FR:', '/wiki/')

            # Check if the English page exists. Follow redirects explicitly:
            # requests.head() does not follow them by default, and MediaWiki
            # can answer a redirect page with a 301 rather than a 200
            response = requests.head(en_url, allow_redirects=True, timeout=30)

            # If the page returns a 200 status code, it exists
            if response.status_code == 200:
                page['has_translation'] = True
                page['en_url'] = en_url
            else:
                page['has_translation'] = False

        except requests.exceptions.RequestException as e:
            logger.error(f"Error checking translation for {page['title']}: {e}")
            # Assume no translation in case of error
            page['has_translation'] = False

    # Filter to only include pages without translations
    untranslated_pages = [page for page in french_pages if not page['has_translation']]
    logger.info(f"Found {len(untranslated_pages)} French pages without translations")

    return untranslated_pages


def save_untranslated_pages(untranslated_pages):
    """
    Save the untranslated pages to a JSON file.

    Args:
        untranslated_pages (list): List of dictionaries containing untranslated page information

    Returns:
        str: Path to the output file
    """
    data = {
        'last_updated': datetime.now().isoformat(),
        'untranslated_pages': untranslated_pages
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    logger.info(f"Saved {len(untranslated_pages)} untranslated pages to {OUTPUT_FILE}")
    return OUTPUT_FILE


def main():
    """Main function to execute the script."""
    parser = argparse.ArgumentParser(description="Find French OSM wiki pages without translations")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    parser.add_argument("--dry-run", action="store_true", help="Print results without saving to file")
    args = parser.parse_args()

    logger.info("Starting find_untranslated_french_pages.py")

    # Check if we should update the cache
    if args.force or should_update_cache() or args.dry_run:
        # Fetch all French pages
        french_pages = fetch_french_pages()

        # Check which ones don't have translations
        untranslated_pages = check_translations(french_pages)

        if args.dry_run:
            logger.info(f"Found {len(untranslated_pages)} French pages without translations:")
            for page in untranslated_pages[:10]:  # Show only the first 10 in a dry run
                logger.info(f"- {page['title']} ({page['url']})")
            if len(untranslated_pages) > 10:
                logger.info(f"... and {len(untranslated_pages) - 10} more")
        else:
            # Save the results
            output_file = save_untranslated_pages(untranslated_pages)
            logger.info(f"Results saved to {output_file}")
    else:
        logger.info("Using cached untranslated pages data")

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
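
# A minimal sketch of consuming the cached results from another script
# (assumes a previous run has already written untranslated_french_pages.json):
#
#   import json
#   with open('untranslated_french_pages.json', encoding='utf-8') as f:
#       data = json.load(f)
#   print(data['last_updated'], len(data['untranslated_pages']))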