#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_untranslated_french_pages.py

This script scrapes the OSM wiki to find French pages that don't have translations
in other languages. It caches the results and only performs the scraping
at most once per hour.

Usage:
    python find_untranslated_french_pages.py [--force] [--dry-run]

Options:
    --force     Force update even if cache is fresh
    --dry-run   Print results without saving to file

Output:
    - untranslated_french_pages.json: JSON file containing information about
      French pages without translations
"""

import requests
from bs4 import BeautifulSoup
import json
import logging
import argparse
import os
from datetime import datetime, timedelta
import re

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'untranslated_french_pages.json')
CACHE_TIMEOUT = 1  # hours
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
FRENCH_PAGES_URL = "https://wiki.openstreetmap.org/wiki/Special:AllPages?from=&to=&namespace=202&hideredirects=1&prefix=FR:"
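# Note on the query above: prefix=FR: asks for titles beginning with "FR:",
# hideredirects=1 excludes redirect pages, and namespace=202 restricts the
# listing to one wiki namespace (whichever the original author intended).
# Special:AllPages paginates its output, so fetch_french_pages() below
# follows the "next page" link until it runs out.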


def should_update_cache():
    """
    Check if the cache file exists and if it's older than the cache timeout.

    Returns:
        bool: True if the cache should be updated, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        logger.info("Cache file doesn't exist, creating it")
        return True

    # Check file modification time
    file_mtime = datetime.fromtimestamp(os.path.getmtime(OUTPUT_FILE))
    now = datetime.now()

    # If the file is older than the cache timeout, update it
    if now - file_mtime > timedelta(hours=CACHE_TIMEOUT):
        logger.info(f"Cache is older than {CACHE_TIMEOUT} hour(s), updating")
        return True

    logger.info(f"Cache is still fresh (less than {CACHE_TIMEOUT} hour(s) old)")
    return False


def fetch_french_pages():
    """
    Fetch all French pages from the OSM wiki.

    Returns:
        list: List of dictionaries containing French page information
    """
    logger.info(f"Fetching French pages from {FRENCH_PAGES_URL}")
    french_pages = []
    next_page_url = FRENCH_PAGES_URL

    while next_page_url:
        try:
            # A timeout keeps the script from hanging forever on a stalled
            # connection (requests has no default timeout)
            response = requests.get(next_page_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all links in the mw-allpages-body section
            links_container = soup.select_one('.mw-allpages-body')
            if links_container:
                links = links_container.select('li a')

                for link in links:
                    page_title = link.text.strip()
                    page_url = WIKI_BASE_URL + link.get('href', '')

                    # Extract the key name (remove the FR: prefix)
                    key_match = re.match(r'FR:(.*)', page_title)
                    if key_match:
                        key_name = key_match.group(1)

                        french_pages.append({
                            'title': page_title,
                            'key': key_name,
                            'url': page_url,
                            'has_translation': False  # Will be updated later
                        })

            # Check if there's a next page
            next_link = soup.select_one('a.mw-nextlink')
            next_page_url = WIKI_BASE_URL + next_link.get('href') if next_link else None

        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching French pages: {e}")
            break

    logger.info(f"Found {len(french_pages)} French pages")
    return french_pages


def check_translations(french_pages):
    """
    Check if each French page has translations in other languages.

    Args:
        french_pages (list): List of dictionaries containing French page information

    Returns:
        list: Updated list with translation information
    """
    logger.info("Checking for translations of French pages")

    for i, page in enumerate(french_pages):
        if i % 10 == 0:  # Log progress every 10 pages
            logger.info(f"Checking page {i+1}/{len(french_pages)}: {page['title']}")

        try:
            # Construct the English page URL by removing the FR: prefix
            en_url = page['url'].replace('/wiki/FR:', '/wiki/')

            # Check if the English page exists. Follow redirects explicitly:
            # requests.head() does not follow them by default, and MediaWiki
            # can answer a redirect page with a 301 rather than a 200
            response = requests.head(en_url, allow_redirects=True, timeout=30)

            # If the page returns a 200 status code, it exists
            if response.status_code == 200:
                page['has_translation'] = True
                page['en_url'] = en_url
            else:
                page['has_translation'] = False

        except requests.exceptions.RequestException as e:
            logger.error(f"Error checking translation for {page['title']}: {e}")
            # Assume no translation in case of error
            page['has_translation'] = False

    # Filter to only include pages without translations
    untranslated_pages = [page for page in french_pages if not page['has_translation']]
    logger.info(f"Found {len(untranslated_pages)} French pages without translations")

    return untranslated_pages


def save_untranslated_pages(untranslated_pages):
    """
    Save the untranslated pages to a JSON file.

    Args:
        untranslated_pages (list): List of dictionaries containing untranslated page information

    Returns:
        str: Path to the output file
    """
    data = {
        'last_updated': datetime.now().isoformat(),
        'untranslated_pages': untranslated_pages
    }

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    logger.info(f"Saved {len(untranslated_pages)} untranslated pages to {OUTPUT_FILE}")
    return OUTPUT_FILE


def main():
    """Main function to execute the script."""
    parser = argparse.ArgumentParser(description="Find French OSM wiki pages without translations")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    parser.add_argument("--dry-run", action="store_true", help="Print results without saving to file")
    args = parser.parse_args()

    logger.info("Starting find_untranslated_french_pages.py")

    # Check if we should update the cache
    if args.force or should_update_cache() or args.dry_run:
        # Fetch all French pages
        french_pages = fetch_french_pages()

        # Check which ones don't have translations
        untranslated_pages = check_translations(french_pages)

        if args.dry_run:
            logger.info(f"Found {len(untranslated_pages)} French pages without translations:")
            for page in untranslated_pages[:10]:  # Show only the first 10 in a dry run
                logger.info(f"- {page['title']} ({page['url']})")
            if len(untranslated_pages) > 10:
                logger.info(f"... and {len(untranslated_pages) - 10} more")
        else:
            # Save the results
            output_file = save_untranslated_pages(untranslated_pages)
            logger.info(f"Results saved to {output_file}")
    else:
        logger.info("Using cached untranslated pages data")

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
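
# A minimal sketch of consuming the cached results from another script
# (assumes a previous run has already written untranslated_french_pages.json):
#
#   import json
#   with open('untranslated_french_pages.json', encoding='utf-8') as f:
#       data = json.load(f)
#   print(data['last_updated'], len(data['untranslated_pages']))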