#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
find_pages_unavailable_in_french.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in French"
to identify pages that need translation. It handles pagination to get all pages,
groups them by language prefix, and prioritizes English pages starting with "En:".

Usage:
    python find_pages_unavailable_in_french.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_french.json: JSON file with pages that need translation
    - Log messages about the scraping process and results
"""

import json
import argparse
import logging
import os
import re
import hashlib
import csv
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_french.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour


def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}
    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']
        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}


def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # Bound the request so a slow or unresponsive wiki cannot hang the script
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page for resolving relative links

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Skip pages with "FR:User:" or "FR:Réunions"
        if "FR:User:" in title or "FR:Réunions" in title:
            logger.info(f"Skipping excluded page: {title}")
            continue

        # Extract language prefix (e.g., "En:", "De:", etc.)
        language_prefix = "Other"
        match = re.match(r'^([A-Za-z]{2}):', title)
        if match:
            language_prefix = match.group(1)

        # Check if it's an English page
        is_english = language_prefix.lower() == "en"

        # Set priority (English pages have higher priority)
        priority = 1 if is_english else 0

        # Calculate outdatedness score
        outdatedness_score = calculate_outdatedness_score(title, is_english)

        pages.append({
            "title": title,
            "url": url,
            "language_prefix": language_prefix,
            "is_english": is_english,
            "priority": priority,
            "outdatedness_score": outdatedness_score
        })

    # Find next page link
    next_page_url = None
    pagination = soup.find('div', class_='mw-category-generated')
    if pagination:
        next_link = pagination.find('a', string='next page')
        if next_link:
            next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url


def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)
        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} pages on page {page_num}")
        all_pages.extend(pages)

        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total pages scraped: {len(all_pages)}")
    return all_pages


def calculate_outdatedness_score(title, is_english):
    """
    Calculate an outdatedness score for a page based on its title

    Args:
        title (str): The page title
        is_english (bool): Whether the page is in English

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)

    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1

    # English pages get a higher base score
    if is_english:
        base_score = min(base_score + 20, 100)

    return base_score


def group_pages_by_language(pages):
    """
    Group pages by language prefix

    Args:
        pages (list): List of page dictionaries

    Returns:
        dict: Dictionary with language prefixes as keys and lists of pages as values
    """
    grouped = {}
    for page in pages:
        prefix = page["language_prefix"]
        if prefix not in grouped:
            grouped[prefix] = []
        grouped[prefix].append(page)

    # Sort each group by priority (English pages first) and then by title
    for prefix in grouped:
        grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))

    return grouped


def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Group pages by language prefix
    grouped_pages = group_pages_by_language(pages)

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "grouped_pages": grouped_pages,
        "all_pages": pages
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape pages unavailable in French from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_french.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()
    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)
    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
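
# Illustrative sketch of the JSON written to pages_unavailable_in_french.json,
# derived from the `data` dict built in save_results() above. The page title, URL,
# timestamp, and score values below are hypothetical placeholders, not real scraped
# data; "description_img_url" appears only for pages matched in wiki_pages.csv.
#
# {
#   "last_updated": "2025-01-01T12:00:00",
#   "grouped_pages": {
#     "En": [
#       {
#         "title": "En:Example_page",
#         "url": "https://wiki.openstreetmap.org/wiki/En:Example_page",
#         "language_prefix": "En",
#         "is_english": true,
#         "priority": 1,
#         "outdatedness_score": 42,
#         "description_img_url": "https://wiki.openstreetmap.org/w/images/example.png"
#       }
#     ],
#     "Other": [ ... ]
#   },
#   "all_pages": [ ... ]
# }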