#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_pages_unavailable_in_english.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in English"
to identify French pages that need translation into English. It handles pagination to
collect all pages, filters for pages with "FR:" in the title, and saves them to a JSON file.

Usage:
    python find_pages_unavailable_in_english.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force an update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_english.json: JSON file with French pages that need
      translation into English (see the illustrative structure below)
    - Log messages about the scraping process and results
"""

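# For orientation, the generated pages_unavailable_in_english.json looks roughly like the
# sketch below (field names taken from save_results() and extract_pages_from_category();
# the title, URL, timestamp, and score shown are illustrative, not real data):
#
#   {
#     "last_updated": "2024-01-01T12:00:00",
#     "pages": [
#       {
#         "title": "FR:Key:highway",
#         "url": "https://wiki.openstreetmap.org/wiki/FR:Key:highway",
#         "language_prefix": "FR",
#         "priority": 1,
#         "outdatedness_score": 42,
#         "description_img_url": "https://wiki.openstreetmap.org/w/images/example.jpg"
#       }
#     ],
#     "count": 1
#   }
#
# description_img_url is only present for pages that also appear in wiki_pages.csv.
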
import json
import argparse
import logging
import os
import hashlib
import csv
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_english.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_English"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

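# read_wiki_pages_csv() below relies on exactly two columns of wiki_pages.csv: "url" and
# "description_img_url". A minimal, purely illustrative layout (the real file may carry
# additional columns, which are ignored here):
#
#   url,description_img_url
#   https://wiki.openstreetmap.org/wiki/FR:Key:highway,https://wiki.openstreetmap.org/w/images/example.jpg
#
# Rows with an empty description_img_url are skipped when building the mapping.
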
def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values.

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}

    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']

        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old.

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
            last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
            now = datetime.now()
            return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page.

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page, or None if the request failed
    """
    try:
        # A timeout keeps the scraper from hanging indefinitely on a stalled connection
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

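# For reference, extract_pages_from_category() below assumes the category listing is
# rendered roughly like the simplified, illustrative MediaWiki markup sketched here;
# the exact markup may differ between MediaWiki versions:
#
#   <div class="mw-category-generated">
#     <a href="/wiki/FR:Key:highway">FR:Key:highway</a>
#     ...
#     <a href="/w/index.php?title=Category:...&pagefrom=...">next page</a>
#   </div>
#
# Page links are made absolute by prefixing WIKI_BASE_URL, and the "next page" link
# (when present) drives the pagination loop in scrape_all_pages().
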
def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML, filtering for pages with "FR:" in the title.

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page (currently unused; links are made
            absolute by prefixing WIKI_BASE_URL)

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Filter for pages with "FR:" in the title
        if "FR:" in title:
            # Extract language prefix (should be "FR")
            language_prefix = "FR"

            # Calculate outdatedness score
            outdatedness_score = calculate_outdatedness_score(title)

            pages.append({
                "title": title,
                "url": url,
                "language_prefix": language_prefix,
                "priority": 1,  # All French pages have the same priority
                "outdatedness_score": outdatedness_score
            })

    # Find the "next page" link within the same generated category block
    next_page_url = None
    next_link = category_content.find('a', string='next page')
    if next_link:
        next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url

def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination.

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)

        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} French pages on page {page_num}")

        all_pages.extend(pages)
        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total French pages scraped: {len(all_pages)}")
    return all_pages

def calculate_outdatedness_score(title):
    """
    Calculate an outdatedness score for a page based on its title.

    Args:
        title (str): The page title

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)

    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1

    return base_score

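# Illustrative behaviour of calculate_outdatedness_score() above: the score is derived
# from an MD5 digest, so it is deterministic per title and always falls in 1..100.
# (The title used here is purely an example.)
#
#     >>> score = calculate_outdatedness_score("FR:Key:highway")
#     >>> 1 <= score <= 100
#     True
#     >>> score == calculate_outdatedness_score("FR:Key:highway")
#     True
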
def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file.

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "pages": pages,
        "count": len(pages)
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")

        # Copy the file to the public directory for web access
        public_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'public')
        if os.path.exists(public_dir):
            public_file = os.path.join(public_dir, OUTPUT_FILE)
            with open(public_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Copied {OUTPUT_FILE} to public directory")

        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

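# The copy step in save_results() above assumes this script lives one level below the
# project root, with a public/ directory next to its parent folder, e.g. (illustrative
# layout; the actual directory names may differ):
#
#   project/
#   ├── scripts/
#   │   └── find_pages_unavailable_in_english.py
#   └── public/
#       └── pages_unavailable_in_english.json   (copy for web access)
#
# If no public/ directory exists, the copy step is skipped.
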
def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape French pages unavailable in English from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_english.py")

    # Check if the cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()

    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()