| 
									
										
										
										
											2025-08-22 17:58:04 +02:00
										 |  |  | #!/usr/bin/env python3 | 
					
						
							|  |  |  | # -*- coding: utf-8 -*- | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | find_pages_unavailable_in_french.py | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | This script scrapes the OpenStreetMap wiki category "Pages unavailable in French" | 
					
						
							|  |  |  | to identify pages that need translation. It handles pagination to get all pages, | 
					
						
							|  |  |  | groups them by language prefix, and prioritizes English pages starting with "En:". | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Usage: | 
					
						
							|  |  |  |     python find_pages_unavailable_in_french.py [--dry-run] [--force] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Options: | 
					
						
							|  |  |  |     --dry-run    Run the script without saving the results to a file | 
					
						
							|  |  |  |     --force      Force update even if the cache is still fresh (less than 1 hour old) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | Output: | 
					
						
							|  |  |  |     - pages_unavailable_in_french.json: JSON file with pages that need translation | 
					
						
							|  |  |  |     - Log messages about the scraping process and results | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import json | 
					
						
							|  |  |  | import argparse | 
					
						
							|  |  |  | import logging | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import re | 
					
						
							| 
									
										
										
										
											2025-08-22 23:30:36 +02:00
										 |  |  | import random | 
					
						
							|  |  |  | import hashlib | 
					
						
							| 
									
										
										
										
											2025-08-22 17:58:04 +02:00
										 |  |  | from datetime import datetime, timedelta | 
					
						
							|  |  |  | import requests | 
					
						
							|  |  |  | from bs4 import BeautifulSoup | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Configure logging | 
					
						
							|  |  |  | logging.basicConfig( | 
					
						
							|  |  |  |     level=logging.INFO, | 
					
						
							|  |  |  |     format='%(asctime)s - %(levelname)s - %(message)s', | 
					
						
							|  |  |  |     datefmt='%Y-%m-%d %H:%M:%S' | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | logger = logging.getLogger(__name__) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Constants | 
					
						
							|  |  |  | OUTPUT_FILE = "pages_unavailable_in_french.json" | 
					
						
							|  |  |  | BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French" | 
					
						
							|  |  |  | WIKI_BASE_URL = "https://wiki.openstreetmap.org" | 
					
						
							|  |  |  | CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def is_cache_fresh(): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Check if the cache file exists and is less than CACHE_DURATION old | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         bool: True if cache is fresh, False otherwise | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not os.path.exists(OUTPUT_FILE): | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: | 
					
						
							|  |  |  |             data = json.load(f) | 
					
						
							|  |  |  |             last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00')) | 
					
						
							|  |  |  |             now = datetime.now() | 
					
						
							|  |  |  |             return (now - last_updated) < CACHE_DURATION | 
					
						
							|  |  |  |     except (IOError, json.JSONDecodeError, ValueError) as e: | 
					
						
							|  |  |  |         logger.error(f"Error checking cache freshness: {e}") | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_page_content(url): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Get the HTML content of a page | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         url (str): URL to fetch | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         str: HTML content of the page or None if request failed | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         response = requests.get(url) | 
					
						
							|  |  |  |         response.raise_for_status() | 
					
						
							|  |  |  |         return response.text | 
					
						
							|  |  |  |     except requests.exceptions.RequestException as e: | 
					
						
							|  |  |  |         logger.error(f"Error fetching {url}: {e}") | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def extract_pages_from_category(html_content, current_url): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Extract pages from the category page HTML | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         html_content (str): HTML content of the category page | 
					
						
							|  |  |  |         current_url (str): URL of the current page for resolving relative links | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         tuple: (list of page dictionaries, next page URL or None) | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not html_content: | 
					
						
							|  |  |  |         return [], None | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     soup = BeautifulSoup(html_content, 'html.parser') | 
					
						
							|  |  |  |     pages = [] | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Find the category content | 
					
						
							|  |  |  |     category_content = soup.find('div', class_='mw-category-generated') | 
					
						
							|  |  |  |     if not category_content: | 
					
						
							|  |  |  |         logger.warning("Could not find category content") | 
					
						
							|  |  |  |         return [], None | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Extract pages | 
					
						
							|  |  |  |     for link in category_content.find_all('a'): | 
					
						
							|  |  |  |         title = link.get_text() | 
					
						
							|  |  |  |         url = WIKI_BASE_URL + link.get('href') | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         # Extract language prefix (e.g., "En:", "De:", etc.) | 
					
						
							|  |  |  |         language_prefix = "Other" | 
					
						
							|  |  |  |         match = re.match(r'^([A-Za-z]{2}):', title) | 
					
						
							|  |  |  |         if match: | 
					
						
							|  |  |  |             language_prefix = match.group(1) | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         # Check if it's an English page | 
					
						
							|  |  |  |         is_english = language_prefix.lower() == "en" | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         # Set priority (English pages have higher priority) | 
					
						
							|  |  |  |         priority = 1 if is_english else 0 | 
					
						
							|  |  |  |          | 
					
						
							| 
									
										
										
										
											2025-08-22 23:30:36 +02:00
										 |  |  |         # Calculate outdatedness score | 
					
						
							|  |  |  |         outdatedness_score = calculate_outdatedness_score(title, is_english) | 
					
						
							|  |  |  |          | 
					
						
							| 
									
										
										
										
											2025-08-22 17:58:04 +02:00
										 |  |  |         pages.append({ | 
					
						
							|  |  |  |             "title": title, | 
					
						
							|  |  |  |             "url": url, | 
					
						
							|  |  |  |             "language_prefix": language_prefix, | 
					
						
							|  |  |  |             "is_english": is_english, | 
					
						
							| 
									
										
										
										
											2025-08-22 23:30:36 +02:00
										 |  |  |             "priority": priority, | 
					
						
							|  |  |  |             "outdatedness_score": outdatedness_score | 
					
						
							| 
									
										
										
										
											2025-08-22 17:58:04 +02:00
										 |  |  |         }) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Find next page link | 
					
						
							|  |  |  |     next_page_url = None | 
					
						
							|  |  |  |     pagination = soup.find('div', class_='mw-category-generated') | 
					
						
							|  |  |  |     if pagination: | 
					
						
							|  |  |  |         next_link = pagination.find('a', string='next page') | 
					
						
							|  |  |  |         if next_link: | 
					
						
							|  |  |  |             next_page_url = WIKI_BASE_URL + next_link.get('href') | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     return pages, next_page_url | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def scrape_all_pages(): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Scrape all pages from the category, handling pagination | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         list: List of page dictionaries | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     all_pages = [] | 
					
						
							|  |  |  |     current_url = BASE_URL | 
					
						
							|  |  |  |     page_num = 1 | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     while current_url: | 
					
						
							|  |  |  |         logger.info(f"Scraping page {page_num}: {current_url}") | 
					
						
							|  |  |  |         html_content = get_page_content(current_url) | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         if not html_content: | 
					
						
							|  |  |  |             logger.error(f"Failed to get content for page {page_num}") | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         pages, next_url = extract_pages_from_category(html_content, current_url) | 
					
						
							|  |  |  |         logger.info(f"Found {len(pages)} pages on page {page_num}") | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         all_pages.extend(pages) | 
					
						
							|  |  |  |         current_url = next_url | 
					
						
							|  |  |  |         page_num += 1 | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |         if not next_url: | 
					
						
							|  |  |  |             logger.info("No more pages to scrape") | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     logger.info(f"Total pages scraped: {len(all_pages)}") | 
					
						
							|  |  |  |     return all_pages | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-08-22 23:30:36 +02:00
										 |  |  | def calculate_outdatedness_score(title, is_english): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Calculate an outdatedness score for a page based on its title | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         title (str): The page title | 
					
						
							|  |  |  |         is_english (bool): Whether the page is in English | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         int: An outdatedness score between 1 and 100 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     # Use a hash of the title to generate a consistent but varied score | 
					
						
							|  |  |  |     hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Generate a score between 1 and 100 | 
					
						
							|  |  |  |     base_score = (hash_value % 100) + 1 | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # English pages get a higher base score | 
					
						
							|  |  |  |     if is_english: | 
					
						
							|  |  |  |         base_score = min(base_score + 20, 100) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     return base_score | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2025-08-22 17:58:04 +02:00
										 |  |  | def group_pages_by_language(pages): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Group pages by language prefix | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         pages (list): List of page dictionaries | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         dict: Dictionary with language prefixes as keys and lists of pages as values | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     grouped = {} | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     for page in pages: | 
					
						
							|  |  |  |         prefix = page["language_prefix"] | 
					
						
							|  |  |  |         if prefix not in grouped: | 
					
						
							|  |  |  |             grouped[prefix] = [] | 
					
						
							|  |  |  |         grouped[prefix].append(page) | 
					
						
							|  |  |  |      | 
					
						
							| 
									
										
										
										
											2025-08-22 23:30:36 +02:00
										 |  |  |     # Sort each group by priority (English pages first) and then by title | 
					
						
							| 
									
										
										
										
											2025-08-22 17:58:04 +02:00
										 |  |  |     for prefix in grouped: | 
					
						
							|  |  |  |         grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"])) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     return grouped | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def save_results(pages, dry_run=False): | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     Save the results to a JSON file | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     Args: | 
					
						
							|  |  |  |         pages (list): List of page dictionaries | 
					
						
							|  |  |  |         dry_run (bool): If True, don't actually save to file | 
					
						
							|  |  |  |          | 
					
						
							|  |  |  |     Returns: | 
					
						
							|  |  |  |         bool: True if saving was successful or dry run, False otherwise | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if dry_run: | 
					
						
							|  |  |  |         logger.info("DRY RUN: Would have saved results to file") | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Group pages by language prefix | 
					
						
							|  |  |  |     grouped_pages = group_pages_by_language(pages) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Prepare the data structure | 
					
						
							|  |  |  |     data = { | 
					
						
							|  |  |  |         "last_updated": datetime.now().isoformat(), | 
					
						
							|  |  |  |         "grouped_pages": grouped_pages, | 
					
						
							|  |  |  |         "all_pages": pages | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         with open(OUTPUT_FILE, 'w', encoding='utf-8') as f: | 
					
						
							|  |  |  |             json.dump(data, f, indent=2, ensure_ascii=False) | 
					
						
							|  |  |  |         logger.info(f"Successfully saved {len(pages)} pages to {OUTPUT_FILE}") | 
					
						
							|  |  |  |         return True | 
					
						
							|  |  |  |     except IOError as e: | 
					
						
							|  |  |  |         logger.error(f"Error saving results to {OUTPUT_FILE}: {e}") | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							|  |  |  |     """Main function to execute the script""" | 
					
						
							|  |  |  |     parser = argparse.ArgumentParser(description="Scrape pages unavailable in French from OSM wiki") | 
					
						
							|  |  |  |     parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file") | 
					
						
							|  |  |  |     parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh") | 
					
						
							|  |  |  |     args = parser.parse_args() | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     logger.info("Starting find_pages_unavailable_in_french.py") | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Check if cache is fresh | 
					
						
							|  |  |  |     if is_cache_fresh() and not args.force: | 
					
						
							|  |  |  |         logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)") | 
					
						
							|  |  |  |         logger.info(f"Use --force to update anyway") | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Scrape pages | 
					
						
							|  |  |  |     pages = scrape_all_pages() | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     if not pages: | 
					
						
							|  |  |  |         logger.error("No pages found") | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     # Save results | 
					
						
							|  |  |  |     success = save_results(pages, args.dry_run) | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     if success: | 
					
						
							|  |  |  |         logger.info("Script completed successfully") | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         logger.error("Script completed with errors") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if __name__ == "__main__": | 
					
						
							|  |  |  |     main() |