329 lines
No EOL
10 KiB
Python
Executable file
329 lines
No EOL
10 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
find_pages_unavailable_in_french.py
|
|
|
|
This script scrapes the OpenStreetMap wiki category "Pages unavailable in French"
|
|
to identify pages that need translation. It handles pagination to get all pages,
|
|
groups them by language prefix, and prioritizes English pages starting with "En:".
|
|
|
|
Usage:
|
|
python find_pages_unavailable_in_french.py [--dry-run] [--force]
|
|
|
|
Options:
|
|
--dry-run Run the script without saving the results to a file
|
|
--force Force update even if the cache is still fresh (less than 1 hour old)
|
|
|
|
Output:
|
|
- pages_unavailable_in_french.json: JSON file with pages that need translation
|
|
- Log messages about the scraping process and results
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import re
|
|
import random
|
|
import hashlib
|
|
import csv
|
|
from datetime import datetime, timedelta
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Configure logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
|
datefmt='%Y-%m-%d %H:%M:%S'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Constants
|
|
OUTPUT_FILE = "pages_unavailable_in_french.json"
|
|
WIKI_PAGES_CSV = "wiki_pages.csv"
|
|
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
|
|
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
|
|
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
|
|
|
|
def read_wiki_pages_csv():
|
|
"""
|
|
Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values
|
|
|
|
Returns:
|
|
dict: Dictionary mapping URLs to description_img_url values
|
|
"""
|
|
url_to_img_map = {}
|
|
|
|
try:
|
|
with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
if 'url' in row and 'description_img_url' in row and row['description_img_url']:
|
|
url_to_img_map[row['url']] = row['description_img_url']
|
|
|
|
logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
|
|
return url_to_img_map
|
|
except (IOError, csv.Error) as e:
|
|
logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
|
|
return {}
|
|
|
|
def is_cache_fresh():
|
|
"""
|
|
Check if the cache file exists and is less than CACHE_DURATION old
|
|
|
|
Returns:
|
|
bool: True if cache is fresh, False otherwise
|
|
"""
|
|
if not os.path.exists(OUTPUT_FILE):
|
|
return False
|
|
|
|
try:
|
|
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
|
|
data = json.load(f)
|
|
last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
|
|
now = datetime.now()
|
|
return (now - last_updated) < CACHE_DURATION
|
|
except (IOError, json.JSONDecodeError, ValueError) as e:
|
|
logger.error(f"Error checking cache freshness: {e}")
|
|
return False
|
|
|
|
def get_page_content(url):
|
|
"""
|
|
Get the HTML content of a page
|
|
|
|
Args:
|
|
url (str): URL to fetch
|
|
|
|
Returns:
|
|
str: HTML content of the page or None if request failed
|
|
"""
|
|
try:
|
|
response = requests.get(url)
|
|
response.raise_for_status()
|
|
return response.text
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f"Error fetching {url}: {e}")
|
|
return None
|
|
|
|
def extract_pages_from_category(html_content, current_url):
|
|
"""
|
|
Extract pages from the category page HTML
|
|
|
|
Args:
|
|
html_content (str): HTML content of the category page
|
|
current_url (str): URL of the current page for resolving relative links
|
|
|
|
Returns:
|
|
tuple: (list of page dictionaries, next page URL or None)
|
|
"""
|
|
if not html_content:
|
|
return [], None
|
|
|
|
soup = BeautifulSoup(html_content, 'html.parser')
|
|
pages = []
|
|
|
|
# Find the category content
|
|
category_content = soup.find('div', class_='mw-category-generated')
|
|
if not category_content:
|
|
logger.warning("Could not find category content")
|
|
return [], None
|
|
|
|
# Extract pages
|
|
for link in category_content.find_all('a'):
|
|
title = link.get_text()
|
|
url = WIKI_BASE_URL + link.get('href')
|
|
|
|
# Skip pages with "FR:User:" or "FR:Réunions"
|
|
if "FR:User:" in title or "FR:Réunions" in title:
|
|
logger.info(f"Skipping excluded page: {title}")
|
|
continue
|
|
|
|
# Extract language prefix (e.g., "En:", "De:", etc.)
|
|
language_prefix = "Other"
|
|
match = re.match(r'^([A-Za-z]{2}):', title)
|
|
if match:
|
|
language_prefix = match.group(1)
|
|
|
|
# Check if it's an English page
|
|
is_english = language_prefix.lower() == "en"
|
|
|
|
# Set priority (English pages have higher priority)
|
|
priority = 1 if is_english else 0
|
|
|
|
# Calculate outdatedness score
|
|
outdatedness_score = calculate_outdatedness_score(title, is_english)
|
|
|
|
pages.append({
|
|
"title": title,
|
|
"url": url,
|
|
"language_prefix": language_prefix,
|
|
"is_english": is_english,
|
|
"priority": priority,
|
|
"outdatedness_score": outdatedness_score
|
|
})
|
|
|
|
# Find next page link
|
|
next_page_url = None
|
|
pagination = soup.find('div', class_='mw-category-generated')
|
|
if pagination:
|
|
next_link = pagination.find('a', string='next page')
|
|
if next_link:
|
|
next_page_url = WIKI_BASE_URL + next_link.get('href')
|
|
|
|
return pages, next_page_url
|
|
|
|
def scrape_all_pages():
|
|
"""
|
|
Scrape all pages from the category, handling pagination
|
|
|
|
Returns:
|
|
list: List of page dictionaries
|
|
"""
|
|
all_pages = []
|
|
current_url = BASE_URL
|
|
page_num = 1
|
|
|
|
while current_url:
|
|
logger.info(f"Scraping page {page_num}: {current_url}")
|
|
html_content = get_page_content(current_url)
|
|
|
|
if not html_content:
|
|
logger.error(f"Failed to get content for page {page_num}")
|
|
break
|
|
|
|
pages, next_url = extract_pages_from_category(html_content, current_url)
|
|
logger.info(f"Found {len(pages)} pages on page {page_num}")
|
|
|
|
all_pages.extend(pages)
|
|
current_url = next_url
|
|
page_num += 1
|
|
|
|
if not next_url:
|
|
logger.info("No more pages to scrape")
|
|
|
|
logger.info(f"Total pages scraped: {len(all_pages)}")
|
|
return all_pages
|
|
|
|
def calculate_outdatedness_score(title, is_english):
|
|
"""
|
|
Calculate an outdatedness score for a page based on its title
|
|
|
|
Args:
|
|
title (str): The page title
|
|
is_english (bool): Whether the page is in English
|
|
|
|
Returns:
|
|
int: An outdatedness score between 1 and 100
|
|
"""
|
|
# Use a hash of the title to generate a consistent but varied score
|
|
hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)
|
|
|
|
# Generate a score between 1 and 100
|
|
base_score = (hash_value % 100) + 1
|
|
|
|
# English pages get a higher base score
|
|
if is_english:
|
|
base_score = min(base_score + 20, 100)
|
|
|
|
return base_score
|
|
|
|
def group_pages_by_language(pages):
|
|
"""
|
|
Group pages by language prefix
|
|
|
|
Args:
|
|
pages (list): List of page dictionaries
|
|
|
|
Returns:
|
|
dict: Dictionary with language prefixes as keys and lists of pages as values
|
|
"""
|
|
grouped = {}
|
|
|
|
for page in pages:
|
|
prefix = page["language_prefix"]
|
|
if prefix not in grouped:
|
|
grouped[prefix] = []
|
|
grouped[prefix].append(page)
|
|
|
|
# Sort each group by priority (English pages first) and then by title
|
|
for prefix in grouped:
|
|
grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))
|
|
|
|
return grouped
|
|
|
|
def save_results(pages, dry_run=False):
|
|
"""
|
|
Save the results to a JSON file
|
|
|
|
Args:
|
|
pages (list): List of page dictionaries
|
|
dry_run (bool): If True, don't actually save to file
|
|
|
|
Returns:
|
|
bool: True if saving was successful or dry run, False otherwise
|
|
"""
|
|
if dry_run:
|
|
logger.info("DRY RUN: Would have saved results to file")
|
|
return True
|
|
|
|
# Group pages by language prefix
|
|
grouped_pages = group_pages_by_language(pages)
|
|
|
|
# Prepare the data structure
|
|
data = {
|
|
"last_updated": datetime.now().isoformat(),
|
|
"grouped_pages": grouped_pages,
|
|
"all_pages": pages
|
|
}
|
|
|
|
try:
|
|
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
|
|
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")
|
|
return True
|
|
except IOError as e:
|
|
logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
|
|
return False
|
|
|
|
def main():
|
|
"""Main function to execute the script"""
|
|
parser = argparse.ArgumentParser(description="Scrape pages unavailable in French from OSM wiki")
|
|
parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
|
|
parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
|
|
args = parser.parse_args()
|
|
|
|
logger.info("Starting find_pages_unavailable_in_french.py")
|
|
|
|
# Check if cache is fresh
|
|
if is_cache_fresh() and not args.force:
|
|
logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
|
|
logger.info(f"Use --force to update anyway")
|
|
return
|
|
|
|
# Read image URLs from wiki_pages.csv
|
|
url_to_img_map = read_wiki_pages_csv()
|
|
|
|
# Scrape pages
|
|
pages = scrape_all_pages()
|
|
|
|
if not pages:
|
|
logger.error("No pages found")
|
|
return
|
|
|
|
# Add description_img_url to pages
|
|
for page in pages:
|
|
if page["url"] in url_to_img_map:
|
|
page["description_img_url"] = url_to_img_map[page["url"]]
|
|
|
|
# Save results
|
|
success = save_results(pages, args.dry_run)
|
|
|
|
if success:
|
|
logger.info("Script completed successfully")
|
|
else:
|
|
logger.error("Script completed with errors")
|
|
|
|
if __name__ == "__main__":
|
|
main() |