mirror of https://forge.chapril.org/tykayn/osm-commerces, synced 2025-10-04 17:04:53 +02:00
up wiki compare
This commit is contained in:
parent ce508974c9
commit 2f49ef6479
23 changed files with 567403 additions and 5132 deletions
263  wiki_compare/find_pages_unavailable_in_french.py  (Executable file)
@@ -0,0 +1,263 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
find_pages_unavailable_in_french.py

This script scrapes the OpenStreetMap wiki category "Pages unavailable in French"
to identify pages that need translation. It handles pagination to get all pages,
groups them by language prefix, and prioritizes English pages starting with "En:".

Usage:
    python find_pages_unavailable_in_french.py [--dry-run] [--force]

Options:
    --dry-run  Run the script without saving the results to a file
    --force    Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - pages_unavailable_in_french.json: JSON file with pages that need translation
    - Log messages about the scraping process and results
"""

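# For orientation, the JSON written by save_results() below should look roughly like
# the following sketch (field values are illustrative, not real wiki data):
#
#   {
#     "last_updated": "2025-10-04T17:04:53",
#     "grouped_pages": {
#       "En": [
#         {"title": "En:Tag:amenity=cafe",
#          "url": "https://wiki.openstreetmap.org/wiki/En:Tag:amenity%3Dcafe",
#          "language_prefix": "En", "is_english": true, "priority": 1}
#       ],
#       "De": [ ... ],
#       "Other": [ ... ]
#     },
#     "all_pages": [ ... the same page objects, ungrouped ... ]
#   }
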
import json
import argparse
import logging
import os
import re
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "pages_unavailable_in_french.json"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # A timeout keeps a hung request from stalling the whole scrape
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_pages_from_category(html_content, current_url):
    """
    Extract pages from the category page HTML

    Args:
        html_content (str): HTML content of the category page
        current_url (str): URL of the current page for resolving relative links

    Returns:
        tuple: (list of page dictionaries, next page URL or None)
    """
    if not html_content:
        return [], None

    soup = BeautifulSoup(html_content, 'html.parser')
    pages = []

    # Find the category content
    category_content = soup.find('div', class_='mw-category-generated')
    if not category_content:
        logger.warning("Could not find category content")
        return [], None

    # Extract pages
    for link in category_content.find_all('a'):
        title = link.get_text()
        url = WIKI_BASE_URL + link.get('href')

        # Extract language prefix (e.g., "En", "De"); anything else is grouped as "Other"
        language_prefix = "Other"
        match = re.match(r'^([A-Za-z]{2}):', title)
        if match:
            language_prefix = match.group(1)

        # Check if it's an English page
        is_english = language_prefix.lower() == "en"

        # Set priority (English pages have higher priority)
        priority = 1 if is_english else 0

        pages.append({
            "title": title,
            "url": url,
            "language_prefix": language_prefix,
            "is_english": is_english,
            "priority": priority
        })

    # Find next page link
    next_page_url = None
    pagination = soup.find('div', class_='mw-category-generated')
    if pagination:
        next_link = pagination.find('a', string='next page')
        if next_link:
            next_page_url = WIKI_BASE_URL + next_link.get('href')

    return pages, next_page_url

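# Illustrative examples of the title-to-page mapping above (not taken from live data):
#   "En:Key:shop" -> language_prefix "En",    is_english True,  priority 1
#   "De:Key:shop" -> language_prefix "De",    is_english False, priority 0
#   "Key:shop"    -> language_prefix "Other", is_english False, priority 0
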
def scrape_all_pages():
    """
    Scrape all pages from the category, handling pagination

    Returns:
        list: List of page dictionaries
    """
    all_pages = []
    current_url = BASE_URL
    page_num = 1

    while current_url:
        logger.info(f"Scraping page {page_num}: {current_url}")
        html_content = get_page_content(current_url)

        if not html_content:
            logger.error(f"Failed to get content for page {page_num}")
            break

        pages, next_url = extract_pages_from_category(html_content, current_url)
        logger.info(f"Found {len(pages)} pages on page {page_num}")

        all_pages.extend(pages)
        current_url = next_url
        page_num += 1

        if not next_url:
            logger.info("No more pages to scrape")

    logger.info(f"Total pages scraped: {len(all_pages)}")
    return all_pages

def group_pages_by_language(pages):
    """
    Group pages by language prefix

    Args:
        pages (list): List of page dictionaries

    Returns:
        dict: Dictionary with language prefixes as keys and lists of pages as values
    """
    grouped = {}

    for page in pages:
        prefix = page["language_prefix"]
        if prefix not in grouped:
            grouped[prefix] = []
        grouped[prefix].append(page)

    # Sort each group by priority (English pages first)
    for prefix in grouped:
        grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))

    return grouped

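# Note on the sort key above: (-priority, title) would put English pages first if a
# group ever mixed priorities; since priority is derived from the prefix and groups
# are keyed by prefix, in practice each group simply ends up sorted by title.
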
def save_results(pages, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        pages (list): List of page dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        return True

    # Group pages by language prefix
    grouped_pages = group_pages_by_language(pages)

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "grouped_pages": grouped_pages,
        "all_pages": pages
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Scrape pages unavailable in French from OSM wiki")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting find_pages_unavailable_in_french.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Scrape pages
    pages = scrape_all_pages()

    if not pages:
        logger.error("No pages found")
        return

    # Save results
    success = save_results(pages, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
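# A minimal sketch of how a consumer might read the generated file (assuming the
# script has already produced pages_unavailable_in_french.json as described above):
#
#   import json
#   with open("pages_unavailable_in_french.json", encoding="utf-8") as f:
#       data = json.load(f)
#   english_first = data["grouped_pages"].get("En", [])
#   print(len(english_first), "English pages waiting for a French translation")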