add set of pages to watch
parent 77ad76cc7e
commit 7a7704bc01
22 changed files with 216839 additions and 6049 deletions
wiki_compare/find_pages_unavailable_in_english.py  +293  (Normal file)
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_pages_unavailable_in_english.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in English"
to identify French pages that need translation to English. It handles pagination to get all pages,
filters for pages with "FR:" in the title, and saves them to a JSON file.

Usage:
    python find_pages_unavailable_in_english.py [--dry-run] [--force]

Options:
    --dry-run   Run the script without saving the results to a file
    --force     Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_english.json: JSON file with French pages that need translation to English
    - Log messages about the scraping process and results
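
    Example of the JSON structure written by save_results() (values below are
    illustrative placeholders, not real data):
        {
          "last_updated": "<ISO timestamp>",
          "count": 1,
          "pages": [
            {
              "title": "FR:<page title>",
              "url": "https://wiki.openstreetmap.org/wiki/FR:<page title>",
              "language_prefix": "FR",
              "priority": 1,
              "outdatedness_score": 42,
              "description_img_url": "<optional, taken from wiki_pages.csv>"
            }
          ]
        }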
"""

import json
import argparse
import logging
import os
import re
import random
import hashlib
import csv
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_english.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_English"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}

    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']

        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML, filtering for pages with "FR:" in the title

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page for resolving relative links

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Filter for pages with "FR:" in the title
        if "FR:" in title:
            # Extract language prefix (should be "FR")
            language_prefix = "FR"

            # Calculate outdatedness score
            outdatedness_score = calculate_outdatedness_score(title)

            pages.append({
                "title": title,
                "url": url,
                "language_prefix": language_prefix,
                "priority": 1,  # All French pages have the same priority
                "outdatedness_score": outdatedness_score
            })

    # Find next page link
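    # MediaWiki splits large categories across multiple listing pages; the pagination
    # link is matched below by its literal English label "next page" as rendered on the wiki.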
    next_page_url = None
    pagination = soup.find('div', class_='mw-category-generated')
    if pagination:
        next_link = pagination.find('a', string='next page')
        if next_link:
            next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url

def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)

        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} French pages on page {page_num}")

        all_pages.extend(pages)
        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total French pages scraped: {len(all_pages)}")
    return all_pages

def calculate_outdatedness_score(title):
    """
    Calculate an outdatedness score for a page based on its title

    Args:
        title (str): The page title

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)

    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1
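    # Note: the score depends only on the title hash, so it is stable across runs
    # but acts as a placeholder rather than a measure of how stale the page actually is.
    # Example (illustrative arithmetic): if hash_value % 100 == 41, base_score == 42.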

    return base_score

def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "pages": pages,
        "count": len(pages)
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")

        # Copy the file to the public directory for web access
        public_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'public')
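        # The two dirname() calls walk up from .../wiki_compare/find_pages_unavailable_in_english.py
        # to the directory containing wiki_compare/, so 'public/' is assumed to be a sibling of wiki_compare/.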
        if os.path.exists(public_dir):
            public_file = os.path.join(public_dir, OUTPUT_FILE)
            with open(public_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Copied {OUTPUT_FILE} to public directory")

        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape French pages unavailable in English from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_english.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()

    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()