add set of pages to watch
parent 77ad76cc7e
commit 7a7704bc01
22 changed files with 216839 additions and 6049 deletions
wiki_compare/find_pages_unavailable_in_english.py  +293  (Normal file)
@@ -0,0 +1,293 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_pages_unavailable_in_english.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in English"
to identify French pages that need translation to English. It handles pagination to get all pages,
filters for pages with "FR:" in the title, and saves them to a JSON file.

Usage:
    python find_pages_unavailable_in_english.py [--dry-run] [--force]

Options:
    --dry-run   Run the script without saving the results to a file
    --force     Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_english.json: JSON file with French pages that need translation to English
    - Log messages about the scraping process and results
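
    Example of the JSON structure written by save_results() (values below are
    illustrative placeholders, not real data):
        {
          "last_updated": "<ISO timestamp>",
          "count": 1,
          "pages": [
            {
              "title": "FR:<page title>",
              "url": "https://wiki.openstreetmap.org/wiki/FR:<page title>",
              "language_prefix": "FR",
              "priority": 1,
              "outdatedness_score": 42,
              "description_img_url": "<optional, taken from wiki_pages.csv>"
            }
          ]
        }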
"""

import json
import argparse
import logging
import os
import re
import random
import hashlib
import csv
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_english.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_English"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

def read_wiki_pages_csv():
    """
    Read the wiki_pages.csv file and create a mapping of URLs to description_img_url values

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
    """
    url_to_img_map = {}

    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if 'url' in row and 'description_img_url' in row and row['description_img_url']:
                    url_to_img_map[row['url']] = row['description_img_url']

        logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
        return url_to_img_map
    except (IOError, csv.Error) as e:
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML, filtering for pages with "FR:" in the title

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page for resolving relative links

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Filter for pages with "FR:" in the title
        if "FR:" in title:
            # Extract language prefix (should be "FR")
            language_prefix = "FR"

            # Calculate outdatedness score
            outdatedness_score = calculate_outdatedness_score(title)

            pages.append({
                "title": title,
                "url": url,
                "language_prefix": language_prefix,
                "priority": 1,  # All French pages have the same priority
                "outdatedness_score": outdatedness_score
            })

    # Find next page link
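    # MediaWiki splits large categories across multiple listing pages; the pagination
    # link is matched below by its literal English label "next page" as rendered on the wiki.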
    next_page_url = None
    pagination = soup.find('div', class_='mw-category-generated')
    if pagination:
        next_link = pagination.find('a', string='next page')
        if next_link:
            next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url

def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)

        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} French pages on page {page_num}")

        all_pages.extend(pages)
        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total French pages scraped: {len(all_pages)}")
    return all_pages

def calculate_outdatedness_score(title):
    """
    Calculate an outdatedness score for a page based on its title

    Args:
        title (str): The page title

    Returns:
        int: An outdatedness score between 1 and 100
    """
    # Use a hash of the title to generate a consistent but varied score
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)

    # Generate a score between 1 and 100
    base_score = (hash_value % 100) + 1
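    # Note: the score depends only on the title hash, so it is stable across runs
    # but acts as a placeholder rather than a measure of how stale the page actually is.
    # Example (illustrative arithmetic): if hash_value % 100 == 41, base_score == 42.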

    return base_score

def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "pages": pages,
        "count": len(pages)
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")

        # Copy the file to the public directory for web access
        public_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'public')
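        # The two dirname() calls walk up from .../wiki_compare/find_pages_unavailable_in_english.py
        # to the directory containing wiki_compare/, so 'public/' is assumed to be a sibling of wiki_compare/.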
        if os.path.exists(public_dir):
            public_file = os.path.join(public_dir, OUTPUT_FILE)
            with open(public_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info(f"Copied {OUTPUT_FILE} to public directory")

        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape French pages unavailable in English from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_english.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Read image URLs from wiki_pages.csv
    url_to_img_map = read_wiki_pages_csv()

    # Scrape pages
    pages = scrape_all_pages()

    if not pages:
        logger.error("No pages found")
        return

    # Add description_img_url to pages
    for page in pages:
        if page["url"] in url_to_img_map:
            page["description_img_url"] = url_to_img_map[page["url"]]

    # Save results
    success = save_results(pages, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")

if __name__ == "__main__":
    main()