qualiwiki/wiki_compare/find_pages_unavailable_in_french.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_pages_unavailable_in_french.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in French"
to identify pages that need translation. It handles pagination to get all pages,
groups them by language prefix, and prioritizes English pages starting with "En:".

Usage:
    python find_pages_unavailable_in_french.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_french.json: JSON file with pages that need translation
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
import re
import random
import hashlib
import csv
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_french.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}

    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']

        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page for resolving relative links

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Skip pages with "FR:User:" or "FR:Réunions"
        if "FR:User:" in title or "FR:Réunions" in title:
            logger.info(f"Skipping excluded page: {title}")
            continue

        # Extract language prefix (e.g., "En:", "De:", etc.)
        language_prefix = "Other"
        match = re.match(r'^([A-Za-z]{2}):', title)
        if match:
            language_prefix = match.group(1)

        # Check if it's an English page
        is_english = language_prefix.lower() == "en"

        # Set priority (English pages have higher priority)
        priority = 1 if is_english else 0

        # Calculate outdatedness score
        outdatedness_score = calculate_outdatedness_score(title, is_english)

        pages.append({
            "title": title,
            "url": url,
            "language_prefix": language_prefix,
            "is_english": is_english,
            "priority": priority,
            "outdatedness_score": outdatedness_score
        })

    # Find next page link
    next_page_url = None
    pagination = soup.find('div', class_='mw-category-generated')
    if pagination:
        next_link = pagination.find('a', string='next page')
        if next_link:
            next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url

def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)

        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} pages on page {page_num}")

        all_pages.extend(pages)
        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total pages scraped: {len(all_pages)}")
    return all_pages

def calculate_outdatedness_score(title, is_english):
    """
    Calculate an outdatedness score for a page based on its title

    Args:
        title (str): The page title
        is_english (bool): Whether the page is in English

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)

    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1

    # English pages get a higher base score
    if is_english:
        base_score = min(base_score + 20, 100)

    return base_score

def group_pages_by_language(pages):
    """
    Group pages by language prefix

    Args:
        pages (list): List of page dictionaries

    Returns:
        dict: Dictionary with language prefixes as keys and lists of pages as values
    """
    grouped = {}

    for page in pages:
        prefix = page["language_prefix"]
        if prefix not in grouped:
            grouped[prefix] = []
        grouped[prefix].append(page)

    # Sort each group by priority (English pages first) and then by title
    for prefix in grouped:
        grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))

    return grouped

def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Group pages by language prefix
    grouped_pages = group_pages_by_language(pages)

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "grouped_pages": grouped_pages,
        "all_pages": pages
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape pages unavailable in French from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_french.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info(f"Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()

    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()