#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
find_untranslated_french_pages.py

This script scrapes the OSM wiki to find French pages that don't have
translations in other languages. It caches the results and only performs
the scraping at most once per hour.

Usage:
    python find_untranslated_french_pages.py [--force] [--dry-run]

Options:
    --force     Force update even if cache is fresh
    --dry-run   Print results without saving to file

Output:
    - untranslated_french_pages.json: JSON file containing information about
      French pages without translations
"""

import argparse
import json
import logging
import os
import re
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'untranslated_french_pages.json'
)
CACHE_TIMEOUT = 1  # hours
# Per-request timeout (seconds). Without one, a stalled connection would
# hang the script indefinitely.
REQUEST_TIMEOUT = 30
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
FRENCH_PAGES_URL = "https://wiki.openstreetmap.org/wiki/Special:AllPages?from=&to=&namespace=202&hideredirects=1&prefix=FR:"

# One shared session so the many sequential HTTP requests reuse TCP
# connections instead of opening a new one per call.
_session = requests.Session()


def should_update_cache():
    """
    Check if the cache file exists and if it's older than the cache timeout

    Returns:
        bool: True if cache should be updated, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        logger.info("Cache file doesn't exist, creating it")
        return True

    # Check file modification time
    file_mtime = datetime.fromtimestamp(os.path.getmtime(OUTPUT_FILE))
    now = datetime.now()

    # If file is older than cache timeout, update it
    if now - file_mtime > timedelta(hours=CACHE_TIMEOUT):
        logger.info(f"Cache is older than {CACHE_TIMEOUT} hour(s), updating")
        return True

    logger.info(f"Cache is still fresh (less than {CACHE_TIMEOUT} hour(s) old)")
    return False


def fetch_french_pages():
    """
    Fetch all French pages from the OSM wiki

    Follows the Special:AllPages pagination links until exhausted.

    Returns:
        list: List of dictionaries containing French page information
              (keys: 'title', 'key', 'url', 'has_translation')
    """
    logger.info(f"Fetching French pages from {FRENCH_PAGES_URL}")

    french_pages = []
    next_page_url = FRENCH_PAGES_URL

    while next_page_url:
        try:
            response = _session.get(next_page_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all links in the mw-allpages-body section
            links_container = soup.select_one('.mw-allpages-body')
            if links_container:
                links = links_container.select('li a')
                for link in links:
                    page_title = link.text.strip()
                    page_url = WIKI_BASE_URL + link.get('href', '')

                    # Extract the key name (remove the FR: prefix)
                    key_match = re.match(r'FR:(.*)', page_title)
                    if key_match:
                        key_name = key_match.group(1)
                        french_pages.append({
                            'title': page_title,
                            'key': key_name,
                            'url': page_url,
                            'has_translation': False  # Will be updated later
                        })

            # Check if there's a next page; guard against an anchor with a
            # missing href attribute (would otherwise raise TypeError).
            next_link = soup.select_one('a.mw-nextlink')
            next_href = next_link.get('href') if next_link else None
            next_page_url = WIKI_BASE_URL + next_href if next_href else None

        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching French pages: {e}")
            break

    logger.info(f"Found {len(french_pages)} French pages")
    return french_pages


def check_translations(french_pages):
    """
    Check if each French page has translations in other languages

    Args:
        french_pages (list): List of dictionaries containing French page
            information

    Returns:
        list: Filtered list containing only pages without translations
    """
    logger.info("Checking for translations of French pages")

    for i, page in enumerate(french_pages):
        if i % 10 == 0:  # Log progress every 10 pages
            logger.info(f"Checking page {i+1}/{len(french_pages)}: {page['title']}")

        try:
            # Construct the English page URL by removing the FR: prefix
            en_url = page['url'].replace('/wiki/FR:', '/wiki/')

            # HEAD is enough to probe existence; follow redirects, since
            # requests.head() defaults to allow_redirects=False and a
            # redirected (moved) English page would otherwise be
            # misclassified as missing.
            response = _session.head(
                en_url, timeout=REQUEST_TIMEOUT, allow_redirects=True
            )

            # If the page returns a 200 status code, it exists
            if response.status_code == 200:
                page['has_translation'] = True
                page['en_url'] = en_url
            else:
                page['has_translation'] = False

        except requests.exceptions.RequestException as e:
            logger.error(f"Error checking translation for {page['title']}: {e}")
            # Assume no translation in case of error
            page['has_translation'] = False

    # Filter to only include pages without translations
    untranslated_pages = [page for page in french_pages if not page['has_translation']]

    logger.info(f"Found {len(untranslated_pages)} French pages without translations")
    return untranslated_pages


def save_untranslated_pages(untranslated_pages):
    """
    Save the untranslated pages to a JSON file

    Args:
        untranslated_pages (list): List of dictionaries containing
            untranslated page information

    Returns:
        str: Path to the output file
    """
    data = {
        'last_updated': datetime.now().isoformat(),
        'untranslated_pages': untranslated_pages
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    logger.info(f"Saved {len(untranslated_pages)} untranslated pages to {OUTPUT_FILE}")
    return OUTPUT_FILE


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Find French OSM wiki pages without translations")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    parser.add_argument("--dry-run", action="store_true", help="Print results without saving to file")
    args = parser.parse_args()

    logger.info("Starting find_untranslated_french_pages.py")

    # Check if we should update the cache
    if args.force or should_update_cache() or args.dry_run:
        # Fetch all French pages
        french_pages = fetch_french_pages()

        # Check which ones don't have translations
        untranslated_pages = check_translations(french_pages)

        if args.dry_run:
            logger.info(f"Found {len(untranslated_pages)} French pages without translations:")
            for page in untranslated_pages[:10]:  # Show only the first 10 in dry run
                logger.info(f"- {page['title']} ({page['url']})")
            if len(untranslated_pages) > 10:
                logger.info(f"... and {len(untranslated_pages) - 10} more")
        else:
            # Save the results
            output_file = save_untranslated_pages(untranslated_pages)
            logger.info(f"Results saved to {output_file}")
    else:
        logger.info("Using cached untranslated pages data")

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()