#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_pages_unavailable_in_english.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in English"
to identify French pages that need translation into English. It handles pagination to
collect all pages, filters for pages with "FR:" in the title, and saves them to a JSON file.

Usage:
    python find_pages_unavailable_in_english.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_english.json: JSON file with French pages that need
      translation into English (see the illustrative structure below)
    - Log messages about the scraping process and results
"""

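# For orientation, the generated pages_unavailable_in_english.json looks roughly like the
# sketch below (field names taken from save_results() and extract_pages_from_category();
# the title, URL, timestamp, and score shown are illustrative, not real data):
#
#   {
#     "last_updated": "2024-01-01T12:00:00",
#     "pages": [
#       {
#         "title": "FR:Key:highway",
#         "url": "https://wiki.openstreetmap.org/wiki/FR:Key:highway",
#         "language_prefix": "FR",
#         "priority": 1,
#         "outdatedness_score": 42,
#         "description_img_url": "https://wiki.openstreetmap.org/w/images/example.jpg"
#       }
#     ],
#     "count": 1
#   }
#
# description_img_url is only present for pages that also appear in wiki_pages.csv.
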
import json
import argparse
import logging
import os
import hashlib
import csv
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_english.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_English"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

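# read_wiki_pages_csv() below relies on exactly two columns of wiki_pages.csv: "url" and
# "description_img_url". A minimal, purely illustrative layout (the real file may carry
# additional columns, which are ignored here):
#
#   url,description_img_url
#   https://wiki.openstreetmap.org/wiki/FR:Key:highway,https://wiki.openstreetmap.org/w/images/example.jpg
#
# Rows with an empty description_img_url are skipped when building the mapping.
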
def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values.

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}

    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']

        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old.

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page.

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page, or None if the request failed
    """
    try:
        # A timeout keeps the scraper from hanging indefinitely on a stalled connection
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

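# For reference, extract_pages_from_category() below assumes the category listing is
# rendered roughly like the simplified, illustrative MediaWiki markup sketched here;
# the exact markup may differ between MediaWiki versions:
#
#   <div class="mw-category-generated">
#     <a href="/wiki/FR:Key:highway">FR:Key:highway</a>
#     ...
#     <a href="/w/index.php?title=Category:...&pagefrom=...">next page</a>
#   </div>
#
# Page links are made absolute by prefixing WIKI_BASE_URL, and the "next page" link
# (when present) drives the pagination loop in scrape_all_pages().
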
def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML, filtering for pages with "FR:" in the title.

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page (currently unused; links are made
            absolute by prefixing WIKI_BASE_URL)

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Filter for pages with "FR:" in the title
        if "FR:" in title:
            # Extract language prefix (should be "FR")
            language_prefix = "FR"

            # Calculate outdatedness score
            outdatedness_score = calculate_outdatedness_score(title)

            pages.append({
                "title": title,
                "url": url,
                "language_prefix": language_prefix,
                "priority": 1,  # All French pages have the same priority
                "outdatedness_score": outdatedness_score
            })

    # Find the "next page" link within the same generated category block
    next_page_url = None
    next_link = category_content.find('a', string='next page')
    if next_link:
        next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url

def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination.

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)

        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} French pages on page {page_num}")

        all_pages.extend(pages)
        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total French pages scraped: {len(all_pages)}")
    return all_pages

def calculate_outdatedness_score(title):
    """
    Calculate an outdatedness score for a page based on its title.

    Args:
        title (str): The page title

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)

    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1

    return base_score

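# Illustrative behaviour of calculate_outdatedness_score() above: the score is derived
# from an MD5 digest, so it is deterministic per title and always falls in 1..100.
# (The title used here is purely an example.)
#
#     >>> score = calculate_outdatedness_score("FR:Key:highway")
#     >>> 1 <= score <= 100
#     True
#     >>> score == calculate_outdatedness_score("FR:Key:highway")
#     True
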
def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file.

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "pages": pages,
        "count": len(pages)
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")

        # Copy the file to the public directory for web access
        public_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'public')
        if os.path.exists(public_dir):
            public_file = os.path.join(public_dir, OUTPUT_FILE)
            with open(public_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Copied {OUTPUT_FILE} to public directory")

        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

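# The copy step in save_results() above assumes this script lives one level below the
# project root, with a public/ directory next to its parent folder, e.g. (illustrative
# layout; the actual directory names may differ):
#
#   project/
#   ├── scripts/
#   │   └── find_pages_unavailable_in_english.py
#   └── public/
#       └── pages_unavailable_in_english.json   (copy for web access)
#
# If no public/ directory exists, the copy step is skipped.
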
def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape French pages unavailable in English from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_english.py")

    # Check if the cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()

    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()