osm-labo/wiki_compare/find_untranslated_french_pages.py
2025-08-22 17:58:04 +02:00

212 lines
No EOL
7.5 KiB
Python
Executable file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
find_untranslated_french_pages.py
This script scrapes the OSM wiki to find French pages that don't have translations
in other languages. It caches the results and only performs the scraping
at most once per hour.
Usage:
python find_untranslated_french_pages.py [--force] [--dry-run]
Options:
--force Force update even if cache is fresh
--dry-run Print results without saving to file
Output:
- untranslated_french_pages.json: JSON file containing information about French pages without translations
"""
import requests
from bs4 import BeautifulSoup
import json
import logging
import argparse
import os
from datetime import datetime, timedelta
import re
# Configure logging: timestamped INFO-level messages to the default handler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
# Cache file is written next to this script, regardless of the CWD it runs from
OUTPUT_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'untranslated_french_pages.json')
CACHE_TIMEOUT = 1  # hours before the cached results are considered stale
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
# Special:AllPages listing filtered to the FR: prefix (namespace 202, redirects hidden)
FRENCH_PAGES_URL = "https://wiki.openstreetmap.org/wiki/Special:AllPages?from=&to=&namespace=202&hideredirects=1&prefix=FR:"
def should_update_cache():
    """
    Decide whether the cached output file needs to be regenerated.

    Returns:
        bool: True when the cache file is missing or older than
              CACHE_TIMEOUT hours, False when it is still fresh
    """
    if not os.path.exists(OUTPUT_FILE):
        logger.info("Cache file doesn't exist, creating it")
        return True

    # Age of the cache file relative to now
    cache_age = datetime.now() - datetime.fromtimestamp(os.path.getmtime(OUTPUT_FILE))

    if cache_age > timedelta(hours=CACHE_TIMEOUT):
        logger.info(f"Cache is older than {CACHE_TIMEOUT} hour(s), updating")
        return True

    logger.info(f"Cache is still fresh (less than {CACHE_TIMEOUT} hour(s) old)")
    return False
def fetch_french_pages():
    """
    Fetch all French (FR:-prefixed) pages from the OSM wiki's AllPages listing.

    Follows the listing's "next page" links until pagination is exhausted.
    Network errors abort pagination but return whatever was collected so far.

    Returns:
        list: List of dicts with keys 'title', 'key' (title without the FR:
              prefix), 'url', and 'has_translation' (initialised to False,
              updated later by check_translations)
    """
    logger.info(f"Fetching French pages from {FRENCH_PAGES_URL}")
    french_pages = []
    next_page_url = FRENCH_PAGES_URL

    while next_page_url:
        try:
            # Timeout prevents one hung connection from stalling the whole run
            response = requests.get(next_page_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all links in the mw-allpages-body section
            links_container = soup.select_one('.mw-allpages-body')
            if links_container:
                for link in links_container.select('li a'):
                    page_title = link.text.strip()
                    page_url = WIKI_BASE_URL + link.get('href', '')
                    # Extract the key name (remove the FR: prefix)
                    key_match = re.match(r'FR:(.*)', page_title)
                    if key_match:
                        french_pages.append({
                            'title': page_title,
                            'key': key_match.group(1),
                            'url': page_url,
                            'has_translation': False  # Will be updated later
                        })

            # Follow pagination; default '' guards against a next link with no
            # href attribute (WIKI_BASE_URL + None would raise TypeError)
            next_link = soup.select_one('a.mw-nextlink')
            next_page_url = WIKI_BASE_URL + next_link.get('href', '') if next_link else None
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching French pages: {e}")
            break

    logger.info(f"Found {len(french_pages)} French pages")
    return french_pages
def check_translations(french_pages):
    """
    Check whether each French page has an English counterpart on the wiki.

    Mutates each dict in place, setting 'has_translation' (and 'en_url' when
    a translation exists), then filters down to the untranslated ones.

    Args:
        french_pages (list): Dicts produced by fetch_french_pages()

    Returns:
        list: Only the pages that have no English translation
    """
    logger.info("Checking for translations of French pages")

    for i, page in enumerate(french_pages):
        if i % 10 == 0:  # Log progress every 10 pages
            logger.info(f"Checking page {i+1}/{len(french_pages)}: {page['title']}")
        try:
            # Construct the English page URL by removing the FR: prefix
            en_url = page['url'].replace('/wiki/FR:', '/wiki/')
            # requests.head() does NOT follow redirects by default; without
            # allow_redirects=True a renamed/moved English page answers 3xx
            # and would be miscounted as untranslated. The timeout keeps one
            # slow page from stalling the whole run.
            response = requests.head(en_url, allow_redirects=True, timeout=30)
            # A final 200 status means the English page exists
            if response.status_code == 200:
                page['has_translation'] = True
                page['en_url'] = en_url
            else:
                page['has_translation'] = False
        except requests.exceptions.RequestException as e:
            logger.error(f"Error checking translation for {page['title']}: {e}")
            # Assume no translation in case of error
            page['has_translation'] = False

    # Filter to only include pages without translations
    untranslated_pages = [page for page in french_pages if not page['has_translation']]
    logger.info(f"Found {len(untranslated_pages)} French pages without translations")
    return untranslated_pages
def save_untranslated_pages(untranslated_pages):
    """
    Persist the untranslated pages, with an update timestamp, as JSON.

    Args:
        untranslated_pages (list): Dicts describing the untranslated pages

    Returns:
        str: Path to the output file
    """
    payload = {
        'last_updated': datetime.now().isoformat(),
        'untranslated_pages': untranslated_pages,
    }

    # ensure_ascii=False keeps accented French titles human-readable on disk
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

    logger.info(f"Saved {len(untranslated_pages)} untranslated pages to {OUTPUT_FILE}")
    return OUTPUT_FILE
def main():
    """Entry point: parse CLI options, refresh stale data, report results."""
    parser = argparse.ArgumentParser(description="Find French OSM wiki pages without translations")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    parser.add_argument("--dry-run", action="store_true", help="Print results without saving to file")
    args = parser.parse_args()

    logger.info("Starting find_untranslated_french_pages.py")

    # Short-circuit order matters: should_update_cache() is only consulted
    # when --force is absent, and --dry-run always triggers a fresh scrape.
    if not (args.force or should_update_cache() or args.dry_run):
        logger.info("Using cached untranslated pages data")
    else:
        # Scrape the full FR: page list, then keep only the untranslated ones
        untranslated_pages = check_translations(fetch_french_pages())

        if args.dry_run:
            logger.info(f"Found {len(untranslated_pages)} French pages without translations:")
            # Show only the first 10 in dry run
            for page in untranslated_pages[:10]:
                logger.info(f"- {page['title']} ({page['url']})")
            if len(untranslated_pages) > 10:
                logger.info(f"... and {len(untranslated_pages) - 10} more")
        else:
            output_file = save_untranslated_pages(untranslated_pages)
            logger.info(f"Results saved to {output_file}")

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()