1868 lines
		
	
	
		
			No EOL
		
	
	
		
			73 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			1868 lines
		
	
	
		
			No EOL
		
	
	
		
			73 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
| #!/usr/bin/env python3
 | |
| # -*- coding: utf-8 -*-
 | |
| 
 | |
| """
 | |
| wiki_compare.py
 | |
| 
 | |
| This script fetches the most used OpenStreetMap keys from TagInfo,
 | |
| compares their English and French wiki pages, and identifies which pages
 | |
| need updating based on modification dates and content analysis.
 | |
| 
 | |
| The script also compares a specific list of wiki pages defined in the
 | |
| SPECIFIC_PAGES constant. This list can include regular page titles,
 | |
| full URLs, or pages with FR: prefix.
 | |
| 
 | |
| Usage:
 | |
|     python wiki_compare.py
 | |
| 
 | |
| Output:
 | |
|     - top_keys.json: JSON file containing the most used OSM keys
 | |
|     - wiki_pages.csv: CSV file with information about each wiki page
 | |
|     - outdated_pages.json: JSON file containing pages that need updating
 | |
|     - staleness_histogram.png: Histogram of staleness scores
 | |
|     - A console output listing the wiki pages that need updating
 | |
| """
 | |
| 
 | |
| import json
 | |
| import csv
 | |
| import requests
 | |
| import re
 | |
| import os
 | |
| import subprocess
 | |
| import tempfile
 | |
| import hashlib
 | |
| import argparse
 | |
| from datetime import datetime
 | |
| from bs4 import BeautifulSoup
 | |
| import logging
 | |
| import matplotlib.pyplot as plt
 | |
| import numpy as np
 | |
| from pathlib import Path
 | |
| 
 | |
| # Try to import nltk, but make it optional
 | |
| try:
 | |
|     import nltk
 | |
|     NLTK_AVAILABLE = True
 | |
| except ImportError:
 | |
|     NLTK_AVAILABLE = False
 | |
| 
 | |
# Configure logging: INFO level, each record prefixed with a timestamp and
# its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
# TagInfo endpoint queried by fetch_top_keys() for the most used keys.
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
# France-scoped TagInfo endpoint queried by fetch_keys_without_wiki_page().
TAGINFO_FRANCE_API_URL = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
# URL prefixes of the English and French "Key:" wiki pages.
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
# URL prefix shared by every wiki page.
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
# Category page scraped by fetch_desynchronized_pages().
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
# Special page scraped by fetch_deadend_pages().
# NOTE(review): limit=500 and offset=1000 are hard-coded in the query string,
# so only one fixed slice of the list is requested - confirm this is intended.
WIKI_DEADEND_PAGES_URL = "https://wiki.openstreetmap.org/w/index.php?title=Special:DeadendPages&limit=500&offset=1000"
# Output file names (see the module docstring for the documented ones).
TOP_KEYS_FILE = "top_keys.json"
# Presumably receives the result of fetch_keys_without_wiki_page() - verify
# against the caller.
KEYS_WITHOUT_WIKI_FILE = "keys_without_wiki.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
# Presumably receives the result of fetch_deadend_pages() - verify against
# the caller.
DEADEND_PAGES_FILE = "deadend_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine (default `limit` of fetch_top_keys()).
# NOTE(review): 2 looks like a debugging value - confirm before a full run.
NUM_WIKI_PAGES = 2
# HTML cache folder
HTML_CACHE_DIR = "html_cache"

# Initialize NLTK for sentence tokenization if available.
# This runs at import time; a missing resource triggers nltk.download().
if NLTK_AVAILABLE:
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # Also download punkt_tab resource which is needed for sent_tokenize
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')

# Create HTML cache directory if it doesn't exist (relative to the current
# working directory, also at import time).
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
 | |
| 
 | |
# List of specific pages to compare (in addition to top keys)
# This list can include:
# 1. Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
# 2. Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
# Titles appear both with underscores and with spaces (e.g. "Proposal process"),
# and "Tag:" entries use a literal "=" rather than "%3D".
# NOTE(review): whether the consumer normalises these forms is not visible
# here - verify against the code that reads this list.
SPECIFIC_PAGES = [
    "Anatomie_des_étiquettes_osm",
    "Tag:leisure=children_club",
    "Tag:harassment_prevention=ask_angela",
    "Key:harassment_prevention",
    "Proposal process",
    "Outil de Manipulation et d'Organisation",
    "Automated_Edits_code_of_conduct",
    "Key:cuisine",
    "Libre_Charge_Map",
    "OSM_Mon_Commerce",
    "Complète_Tes_Commerces",
    "Tag:amenity=charging_station",
    "Organised_Editing/Activities/MapYourGrid_Initiative",
    "Key:highway",
    "Quality_assurance",
    "Verifiability",
    "Good_practice",
    "Mapping_parties",
    "State_of_the_Map",
    "Diversity",
    "Mapping_private_information",
    "Any_tags_you_like",
    "Organised_Editing/Best_Practices",
    "Map_features",
    "Wiki"
]
 | |
| 
 | |
def fetch_desynchronized_pages():
    """
    Fetch pages from the FR:Traductions_désynchronisées category

    Downloads WIKI_CATEGORY_URL and collects every link whose href starts
    with "/wiki/FR:", skipping links that point to a category or to an
    edit action.

    Returns:
        list: Absolute page URLs in document order. An empty list is
            returned when the request fails or times out.
    """
    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")

    try:
        # requests waits forever by default; with a timeout a stalled
        # connection raises requests.exceptions.Timeout, which is a
        # RequestException and is therefore handled just below.
        response = requests.get(WIKI_CATEGORY_URL, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching category page: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links to French pages in the category
    page_links = []
    for link in soup.select('a[href^="/wiki/FR:"]'):
        href = link.get('href', '')

        # Skip category links and edit links
        if '/Category:' in href or 'action=edit' in href:
            continue

        # hrefs are site-relative, so prepend the wiki host
        page_links.append('https://wiki.openstreetmap.org' + href)

    logger.info(f"Found {len(page_links)} pages in the category")
    return page_links
 | |
| 
 | |
def suggest_categories(page_title, page_url):
    """
    Suggest categories for an uncategorized page based on its title and content

    Title-based suggestions ("France" and any French region named in the
    title) come first. The page is then downloaded and the lower-cased text
    of its "#mw-content-text" element is searched for keyword substrings,
    each keyword group mapping to one category. When the download fails, a
    smaller title-only heuristic is used instead and the general
    "Documentation OSM en français" category is appended.

    NOTE(review): the general documentation category is only added on the
    failure path, although the inline comment says "always" - confirm
    whether it should also be added after a successful fetch.

    Args:
        page_title (str): Title of the page
        page_url (str): URL of the page

    Returns:
        list: Suggested categories, without duplicates, in the order they
            were first suggested
    """
    logger.info(f"Suggesting categories for page: {page_title}")

    suggested_categories = []

    # Add geography-related categories for pages about France
    if "France" in page_title:
        suggested_categories.append("France")

        # Check for specific regions or departments
        regions = [
            "Auvergne-Rhône-Alpes", "Bourgogne-Franche-Comté", "Bretagne",
            "Centre-Val de Loire", "Corse", "Grand Est", "Hauts-de-France",
            "Île-de-France", "Normandie", "Nouvelle-Aquitaine",
            "Occitanie", "Pays de la Loire", "Provence-Alpes-Côte d'Azur"
        ]
        suggested_categories.extend(
            region for region in regions if region in page_title
        )

    # (keywords, category) pairs: a category is suggested as soon as one of
    # its keywords occurs as a substring of the page text. List order fixes
    # the order of the suggestions.
    keyword_rules = [
        (["carte", "cartographie", "mapper"], "Cartographie"),
        (["contribuer", "contributeur", "éditer"], "Contributeurs"),
        (["développeur", "programmer", "code", "api"], "Développeurs"),
        (["tag", "clé", "valeur", "élément", "nœud", "way", "relation"], "Éléments cartographiés"),
        (["import", "données", "dataset"], "Imports"),
        (["logiciel", "application", "outil"], "Logiciels"),
        (["projet", "initiative"], "Projets"),
        (["rencontre", "réunion", "événement", "conférence"], "Rencontres"),
        (["utiliser", "utilisateur", "usage"], "Utilisateurs"),
    ]

    # Try to fetch the page content to make better suggestions
    try:
        # The timeout keeps a stalled connection from blocking the run; a
        # Timeout is a RequestException and triggers the fallback below.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content
        content = soup.select_one('#mw-content-text')
        if content:
            text = content.get_text(separator=' ', strip=True).lower()
            for keywords, category in keyword_rules:
                if any(keyword in text for keyword in keywords):
                    suggested_categories.append(category)

    except requests.exceptions.RequestException as e:
        logger.warning(f"Error fetching page content for category suggestions: {e}")
        # If we can't fetch the content, suggest common categories based on
        # title only (at most one of them, first match wins)
        lowered_title = page_title.lower()
        if "projet" in lowered_title:
            suggested_categories.append("Projets")
        elif "logiciel" in lowered_title or "application" in lowered_title:
            suggested_categories.append("Logiciels")
        elif "rencontre" in lowered_title or "réunion" in lowered_title:
            suggested_categories.append("Rencontres")

        # Suggest the general French documentation category on this path
        suggested_categories.append("Documentation OSM en français")

    # Remove duplicates while preserving order (dict keys keep insertion order)
    unique_categories = list(dict.fromkeys(suggested_categories))

    logger.info(f"Suggested {len(unique_categories)} categories for {page_title}: {', '.join(unique_categories)}")
    return unique_categories
 | |
| 
 | |
def fetch_deadend_pages():
    """
    Fetch pages starting with "France" from the DeadendPages list

    Downloads WIKI_DEADEND_PAGES_URL, keeps the listed wiki links whose
    text starts with "France" and asks suggest_categories() for category
    suggestions for each of them (one extra HTTP request per kept page).

    Returns:
        list: List of dictionaries with the keys 'title', 'url' and
            'suggested_categories'. An empty list is returned when the
            list page cannot be fetched or the request times out.
    """
    logger.info(f"Fetching pages from DeadendPages list: {WIKI_DEADEND_PAGES_URL}")

    try:
        # Bound the wait: without a timeout a stalled connection would hang
        # the script. Timeout is a RequestException, handled just below.
        response = requests.get(WIKI_DEADEND_PAGES_URL, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching DeadendPages list: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links in the DeadendPages list
    page_links = []
    for link in soup.select('.mw-spcontent li a'):
        href = link.get('href', '')
        title = link.get_text(strip=True)

        # Skip if it's not a wiki page or if it's a special page
        if not href.startswith('/wiki/') or 'Special:' in href:
            continue

        # Only pages whose title starts with "France" are of interest
        if not title.startswith('France'):
            continue

        # hrefs are site-relative, so prepend the wiki host
        full_url = 'https://wiki.openstreetmap.org' + href

        page_links.append({
            'title': title,
            'url': full_url,
            'suggested_categories': suggest_categories(title, full_url)
        })

    logger.info(f"Found {len(page_links)} pages starting with 'France' in the DeadendPages list")
    return page_links
 | |
| 
 | |
def fetch_top_keys(limit=NUM_WIKI_PAGES):
    """
    Fetch the most used OSM keys from TagInfo API

    Requests the first page of TAGINFO_API_URL sorted by 'count_all' in
    descending order.

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries with the keys 'key' and 'count'. An
            empty list is returned when the request fails, times out, or
            the response does not have the expected structure.
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        # Bound the wait so a stalled connection cannot hang the script
        response = requests.get(TAGINFO_API_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

    except (ValueError, KeyError, TypeError) as e:
        # A body that is not JSON (ValueError on older requests versions) or
        # that lacks the expected fields must not crash the whole run.
        logger.error(f"Unexpected response from TagInfo API: {e!r}")
        return []

    logger.info(f"Successfully fetched {len(top_keys)} keys")
    return top_keys
 | |
| 
 | |
def fetch_keys_without_wiki_page(limit=36):
    """
    Fetch keys used in France that are missing a wiki page from TagInfo API

    Requests the first page of TAGINFO_FRANCE_API_URL sorted by 'count_all'
    in descending order, with the 'english' parameter set to 0.

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries with the keys 'key' and 'count'. An
            empty list is returned when the request fails, times out, or
            the response does not have the expected structure.
    """
    logger.info(f"Fetching top {limit} OSM keys without wiki pages used in France...")

    params = {
        'page': 1,
        'rp': limit,
        'english': 0,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        # Bound the wait so a stalled connection cannot hang the script
        response = requests.get(TAGINFO_FRANCE_API_URL, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        keys_without_wiki = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo France API: {e}")
        return []

    except (ValueError, KeyError, TypeError) as e:
        # A body that is not JSON (ValueError on older requests versions) or
        # that lacks the expected fields must not crash the whole run.
        logger.error(f"Unexpected response from TagInfo France API: {e!r}")
        return []

    logger.info(f"Successfully fetched {len(keys_without_wiki)} keys without wiki pages")
    return keys_without_wiki
 | |
| 
 | |
def load_json_data(filename):
    """
    Read and parse a JSON file

    Args:
        filename (str): Path of the file to read

    Returns:
        The parsed content of the file. An empty dict is returned when the
        file does not exist, cannot be read, or does not contain valid JSON.
    """
    try:
        # Guard clause: a missing file is an expected situation, not an error
        if not os.path.exists(filename):
            logger.info(f"File {filename} doesn't exist, returning empty dict")
            return {}

        with open(filename, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)

        logger.info(f"Data loaded from {filename}")
        return parsed
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading data from {filename}: {e}")
        return {}
 | |
| 
 | |
def save_to_json(data, filename):
    """
    Save data to a JSON file

    The data is serialised with two-space indentation and non-ASCII
    characters kept as-is, then written to the file as UTF-8. Before
    writing, a few diagnostic messages describing the top-level shape of
    the data are logged. I/O errors are logged, not raised.

    Args:
        data: Data to save (must be JSON-serialisable; json.dumps raises
            otherwise and that exception is not handled here)
        filename (str): Name of the file
    """
    try:
        # Convert data to JSON string
        json_str = json.dumps(data, indent=2, ensure_ascii=False)

        # Diagnostic output about what is going to be written
        logger.info(f"JSON string to be written to {filename}:")

        if isinstance(data, dict):
            logger.info(f"JSON keys at top level: {list(data.keys())}")
            translations = data.get('translations')
            # Inspect 'translations' only when it is a mapping: this is pure
            # diagnostics and must never prevent the save from happening.
            if isinstance(translations, dict):
                logger.info(f"JSON keys in translations: {list(translations.keys())}")
                if 'type' in translations:
                    logger.info("'type' key exists in translations")
                if 'type_key' in translations:
                    logger.info("'type_key' key exists in translations")
        elif isinstance(data, list):
            logger.info(f"Data is a list with {len(data)} items")

        # Write the JSON string to the file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_str)

        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")
 | |
| 
 | |
def calculate_global_metrics(data):
    """
    Aggregate per-page figures into one set of global metrics

    The pages listed under 'regular_pages' and 'specific_pages' are
    considered together. For each page the function counts whether a French
    version is present ('fr_page' truthy), sorts its 'staleness_score' into
    a bucket, and accumulates its 'section_diff', 'word_diff', 'link_diff'
    and 'media_diff' values plus the number of 'en_only' categories.

    Args:
        data: Mapping that may contain 'regular_pages' and 'specific_pages'

    Returns:
        dict: Page counts, averages rounded to two decimals and the
            staleness distribution; all zero when there are no pages
    """
    pages = data.get('regular_pages', []) + data.get('specific_pages', [])

    # Inclusive upper bound of each staleness bucket, in ascending order;
    # anything above the last bound is counted under '100+'.
    buckets = (
        (20, '0-20'),
        (40, '21-40'),
        (60, '41-60'),
        (80, '61-80'),
        (100, '81-100'),
    )
    distribution = {label: 0 for _, label in buckets}
    distribution['100+'] = 0

    # (average key in the result, page field that feeds it)
    diff_fields = (
        ('avg_sections', 'section_diff'),
        ('avg_words', 'word_diff'),
        ('avg_links', 'link_diff'),
        ('avg_images', 'media_diff'),
    )

    metrics = {
        'total_pages': len(pages),
        'avg_sections': 0,
        'avg_words': 0,
        'avg_links': 0,
        'avg_images': 0,
        'avg_categories': 0,
        'avg_staleness': 0,
        'pages_with_en_fr': 0,
        'pages_missing_fr': 0,
        'staleness_distribution': distribution,
    }

    # Nothing to average
    if not pages:
        return metrics

    totals = {avg_key: 0 for avg_key, _ in diff_fields}
    totals['avg_categories'] = 0
    totals['avg_staleness'] = 0

    for page in pages:
        # French version present or not
        counter = 'pages_with_en_fr' if page.get('fr_page') else 'pages_missing_fr'
        metrics[counter] += 1

        # Staleness: running total, then the first bucket whose bound fits
        score = page.get('staleness_score', 0)
        totals['avg_staleness'] += score
        label = next((name for bound, name in buckets if score <= bound), '100+')
        distribution[label] += 1

        # Differences; a missing field counts as 0
        for avg_key, field in diff_fields:
            totals[avg_key] += page.get(field, 0)

        # Categories that exist on the English side only
        comparison = page.get('category_comparison')
        if comparison:
            totals['avg_categories'] += len(comparison.get('en_only', []))

    page_count = len(pages)
    for avg_key, total in totals.items():
        metrics[avg_key] = round(total / page_count, 2)

    return metrics
 | |
| 
 | |
def save_with_history(data, filename):
    """
    Save data to a JSON file while preserving history

    This function loads existing data from the file (if it exists),
    adds the new data to the history under the current timestamp, and
    saves the updated data back to the file. It also calculates global
    metrics for the current data and, after a successful save, writes the
    ranking history to "page_rankings.json".

    If the existing file does not contain a JSON object (for example a
    plain list), its content cannot carry a history and a fresh structure
    is started instead.

    Args:
        data: New data to save; read through 'regular_pages' and
            'specific_pages'
        filename (str): Name of the file
    """
    try:
        # Load existing data
        existing_data = load_json_data(filename)
        if not isinstance(existing_data, dict):
            # json.load may yield a list or a scalar; history needs a dict
            logger.warning(f"Existing content of {filename} is not a JSON object, starting a new history")
            existing_data = {}

        # Create a timestamp for the current data
        current_timestamp = datetime.now().isoformat()

        # Initialize history if it doesn't exist
        if 'history' not in existing_data:
            existing_data['history'] = {}

        # Calculate global metrics for the current data
        global_metrics = calculate_global_metrics(data)
        regular_pages = data.get('regular_pages', [])
        specific_pages = data.get('specific_pages', [])

        # Add the entry to history with timestamp as key
        existing_data['history'][current_timestamp] = {
            'regular_pages': regular_pages,
            'specific_pages': specific_pages,
            'global_metrics': global_metrics
        }

        # Update the current data
        existing_data['regular_pages'] = regular_pages
        existing_data['specific_pages'] = specific_pages
        existing_data['global_metrics'] = global_metrics
        existing_data['last_updated'] = current_timestamp

        # Serialise before opening the file: opening with 'w' truncates it,
        # so a serialisation failure must happen first to keep the
        # accumulated history intact.
        json_str = json.dumps(existing_data, indent=2, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_str)

        logger.info(f"Data with history saved to {filename}")

        # Also save a separate ranking history file
        save_ranking_history(existing_data, "page_rankings.json")

    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error saving data with history to {filename}: {e}")
        # Fallback to regular save if there's an error
        save_to_json(data, filename)
 | |
|         
 | |
def _extract_ranking_metrics(page):
    """Return the per-page figures tracked over time (0 for missing ones)."""
    return {
        'staleness_score': page.get('staleness_score', 0),
        'word_diff': page.get('word_diff', 0),
        'section_diff': page.get('section_diff', 0),
        'link_diff': page.get('link_diff', 0),
        'media_diff': page.get('media_diff', 0)
    }


def save_ranking_history(data, filename):
    """
    Save ranking history to a separate JSON file

    This function walks the 'history' entries of the data in chronological
    order and writes a structure with three parts:

    - 'timestamps': the sorted history timestamps
    - 'pages': for every page key, its title and its metrics per timestamp
    - 'global_metrics': the global metrics per timestamp, when recorded

    Regular and specific pages are handled the same way. Pages appear in
    the output in the order they are first met, so the file content is
    stable from one run to the next. I/O errors are logged, not raised.

    Args:
        data: Data containing history entries
        filename (str): Name of the file to save rankings
    """
    try:
        # Extract history entries and sort timestamps chronologically
        history = data.get('history', {})
        sorted_timestamps = sorted(history.keys())

        pages = {}
        global_metrics = {}

        for timestamp in sorted_timestamps:
            entry = history[timestamp]

            # Add global metrics for this timestamp
            if 'global_metrics' in entry:
                global_metrics[timestamp] = entry['global_metrics']

            # Regular pages first, then specific pages, as one sequence
            entry_pages = entry.get('regular_pages', []) + entry.get('specific_pages', [])
            for page in entry_pages:
                page_key = page['key']

                # The title defaults to the page key until a better one is found
                record = pages.setdefault(page_key, {'title': page_key, 'metrics': {}})

                # Store metrics for this timestamp
                record['metrics'][timestamp] = _extract_ranking_metrics(page)

                # Store page title if available
                en_page = page.get('en_page')
                if en_page:
                    record['title'] = en_page.get('page_title', page_key)

        ranking_data = {
            'timestamps': sorted_timestamps,
            'pages': pages,
            'global_metrics': global_metrics
        }

        # Save the ranking data
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(ranking_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Ranking history saved to {filename}")

    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error saving ranking history to {filename}: {e}")
 | |
| 
 | |
| def check_grammar_with_grammalecte(text):
 | |
|     """
 | |
|     Check grammar in French text using grammalecte-cli
 | |
|     
 | |
|     Args:
 | |
|         text (str): French text to check
 | |
|         
 | |
|     Returns:
 | |
|         list: List of grammar suggestions
 | |
|     """
 | |
|     if not text or len(text.strip()) == 0:
 | |
|         logger.warning("Empty text provided for grammar checking")
 | |
|         return []
 | |
|     
 | |
|     # Check if grammalecte-cli is available
 | |
|     try:
 | |
|         subprocess.run(['which', 'grammalecte-cli'], capture_output=True, check=True)
 | |
|     except subprocess.CalledProcessError:
 | |
|         logger.warning("grammalecte-cli not found, skipping grammar check")
 | |
|         return []
 | |
|     
 | |
|     logger.info("Checking grammar with grammalecte-cli...")
 | |
|     
 | |
|     try:
 | |
|         # Create a temporary file with the text
 | |
|         with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
 | |
|             temp_file.write(text)
 | |
|             temp_file_path = temp_file.name
 | |
|         
 | |
|         # Run grammalecte-cli on the temporary file
 | |
|         cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
 | |
|         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
 | |
|         
 | |
|         # Parse the JSON output
 | |
|         grammar_data = json.loads(result.stdout)
 | |
|         
 | |
|         # Extract grammar errors from all paragraphs
 | |
|         grammar_suggestions = []
 | |
|         for paragraph in grammar_data.get('data', []):
 | |
|             paragraph_index = paragraph.get('iParagraph', 0)
 | |
|             
 | |
|             # Process grammar errors
 | |
|             for error in paragraph.get('lGrammarErrors', []):
 | |
|                 suggestion = {
 | |
|                     'paragraph': paragraph_index,
 | |
|                     'start': error.get('nStart', 0),
 | |
|                     'end': error.get('nEnd', 0),
 | |
|                     'type': error.get('sType', ''),
 | |
|                     'message': error.get('sMessage', ''),
 | |
|                     'suggestions': error.get('aSuggestions', []),
 | |
|                     'text': error.get('sUnderlined', ''),
 | |
|                     'before': error.get('sBefore', ''),
 | |
|                     'after': error.get('sAfter', '')
 | |
|                 }
 | |
|                 grammar_suggestions.append(suggestion)
 | |
|             
 | |
|             # Process spelling errors
 | |
|             for error in paragraph.get('lSpellingErrors', []):
 | |
|                 suggestion = {
 | |
|                     'paragraph': paragraph_index,
 | |
|                     'start': error.get('nStart', 0),
 | |
|                     'end': error.get('nEnd', 0),
 | |
|                     'type': 'spelling',
 | |
|                     'message': 'Erreur d\'orthographe',
 | |
|                     'suggestions': error.get('aSuggestions', []),
 | |
|                     'text': error.get('sUnderlined', ''),
 | |
|                     'before': error.get('sBefore', ''),
 | |
|                     'after': error.get('sAfter', '')
 | |
|                 }
 | |
|                 grammar_suggestions.append(suggestion)
 | |
|         
 | |
|         # Clean up the temporary file
 | |
|         os.unlink(temp_file_path)
 | |
|         
 | |
|         logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
 | |
|         return grammar_suggestions
 | |
|     
 | |
|     except subprocess.CalledProcessError as e:
 | |
|         logger.error(f"Error running grammalecte-cli: {e}")
 | |
|         logger.error(f"stdout: {e.stdout}")
 | |
|         logger.error(f"stderr: {e.stderr}")
 | |
|         return []
 | |
|     
 | |
|     except json.JSONDecodeError as e:
 | |
|         logger.error(f"Error parsing grammalecte-cli output: {e}")
 | |
|         return []
 | |
|     
 | |
|     except Exception as e:
 | |
|         logger.error(f"Unexpected error during grammar checking: {e}")
 | |
|         return []
 | |
| 
 | |
def _is_relevant_image(img):
    """
    Tell whether an <img> element is worth using as a page illustration

    Args:
        img: Element exposing ``get(attr, default)`` (e.g. a BeautifulSoup tag)

    Returns:
        bool: False for the OSM logo and for images whose declared width or
            height is below 30, True otherwise
    """
    if 'osm_logo' in img.get('src', ''):
        return False

    for attr in ('width', 'height'):
        raw_size = img.get(attr)
        if not raw_size:
            continue
        try:
            size = int(raw_size)
        except (TypeError, ValueError):
            # Non-numeric dimension (e.g. "50%"): size unknown, keep the image
            continue
        if size < 30:
            return False

    return True


def _absolute_wiki_url(url):
    """
    Turn a protocol-relative or site-relative URL into an absolute one

    Args:
        url (str): URL as found in a src/href attribute

    Returns:
        str: URL prefixed with "https:" or the OSM wiki host when it was
            relative, unchanged otherwise
    """
    # Test '//' first: a protocol-relative URL also starts with '/'
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return 'https://wiki.openstreetmap.org' + url
    return url


def _find_description_image(content, key):
    """
    Pick the most representative image of a wiki page

    Selectors are tried from the most specific (description box) to the most
    generic (any image); the first relevant image wins.

    Args:
        content: Parsed '#mw-content-text' element of the page
        key (str): OSM key or page identifier (the 'highway' key gets a
            dedicated selector)

    Returns:
        The selected <img> element, or None if no relevant image was found
    """
    # Special case for highway key - directly target the image we want
    if key == 'highway':
        highway_img_elements = content.select('figure.mw-halign-center img')
        logger.info(f"  Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")

        relevant_images = [img for img in highway_img_elements if _is_relevant_image(img)]
        logger.info(f"  Found {len(relevant_images)} relevant images for highway")

        if relevant_images:
            logger.info(f"  Using highway-specific image: {relevant_images[0].get('src', '')}")
            return relevant_images[0]

    # (CSS selector, label used in the "Found ... relevant images in ..." log line)
    selector_table = (
        ('td.d_image img', 'td.d_image'),
        ('.description img.mw-file-element', '.description'),
        ('.description figure img', '.description figure'),
        ('.description img', '.description general'),
        ('table.DescriptionBox img', 'DescriptionBox'),
        ('figure img', 'figure elements'),
    )
    for selector, label in selector_table:
        candidates = content.select(selector)
        logger.info(f"  Selector '{selector}' found {len(candidates)} elements")

        relevant_images = [img for img in candidates if _is_relevant_image(img)]
        logger.info(f"  Found {len(relevant_images)} relevant images in {label}")

        if relevant_images:
            logger.info(f"  Using image from '{selector}': {relevant_images[0].get('src', '')}")
            return relevant_images[0]

    # Last resort: any image of the page that is not a logo or a small icon
    relevant_images = [img for img in content.select('img') if _is_relevant_image(img)]
    logger.info(f"  Found {len(relevant_images)} relevant images in the entire page")

    if relevant_images:
        logger.info(f"  Using fallback image: {relevant_images[0].get('src', '')}")
        return relevant_images[0]

    return None


def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
    """
    Fetch wiki page for a given key or specific page

    This function handles different types of wiki pages:
    1. Regular OSM key pages (e.g., "building", "highway")
    2. Specific wiki pages that can be in various formats:
       - Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
       - Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
       - Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")

    The raw HTML is read from the on-disk cache when a cached copy exists and
    is downloaded (then cached) otherwise.

    Args:
        key (str): OSM key or specific page title/URL
        language (str): Language code ('en' or 'fr'); overridden for specific
            pages given as a full URL or with an FR: prefix
        is_specific_page (bool): Whether this is a specific page rather than a key
        check_grammar (bool): Whether to check grammar for French pages (also
            gates the NLTK-based sentence count)

    Returns:
        dict: Dictionary with page information, or None if the page is
            excluded, does not exist (HTTP 404) or could not be fetched.
            A page without a '#mw-content-text' element yields a dictionary
            with zeroed/empty content statistics.
    """
    # Skip pages with "FR:User:" or "FR:Réunions"
    if "FR:User:" in key or "FR:Réunions" in key:
        logger.info(f"Skipping excluded page: {key}")
        return None

    # Handle different URL formats
    if is_specific_page:
        if key.startswith('http'):
            # Case 1: Full URL; title is the last path segment
            url = key
            page_title = key.split('/')[-1]
            language = 'fr' if 'FR:' in key else 'en'
        elif key.startswith('FR:'):
            # Case 2: Page with FR: prefix
            url = f"{WIKI_BASE_URL}{key}"
            page_title = key[3:]  # Remove FR: prefix for title
            language = 'fr'
        else:
            # Case 3: Regular page title
            if language == 'fr':
                url = f"{WIKI_BASE_URL}FR:{key}"
            else:
                url = f"{WIKI_BASE_URL}{key}"
            page_title = key
    else:
        # Regular key page
        base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
        url = f"{base_url}{key}"
        page_title = key

    page_kind = 'page' if is_specific_page else 'key'

    # Create a unique cache filename based on the URL (MD5 is only a file name here)
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

    html_content = None

    # Try to load from cache first
    if cache_file.exists():
        logger.info(f"Loading {language} wiki page from cache for {page_kind} '{key}'")
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
        except Exception as e:
            logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
            html_content = None

    # If not in cache or cache read failed, fetch from web
    if html_content is None:
        logger.info(f"Fetching {language} wiki page for {page_kind} '{key}': {url}")
        try:
            # Without a timeout a stalled connection would block the whole run;
            # timeouts are RequestException subclasses and are handled below
            response = requests.get(url, timeout=30)

            # Check if page exists
            if response.status_code == 404:
                logger.warning(f"Wiki page for {page_kind} '{key}' in {language} does not exist")
                return None

            response.raise_for_status()
            html_content = response.text

            # Save to cache (best effort: a cache failure must not lose the page)
            try:
                with open(cache_file, 'w', encoding='utf-8') as f:
                    f.write(html_content)
                logger.info(f"Saved {language} wiki page to cache for {page_kind} '{key}'")
            except Exception as e:
                logger.warning(f"Error saving to cache: {e}")
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching wiki page for {page_kind} '{key}' in {language}: {e}")
            return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Get last modification date
    last_modified = None
    footer_info = soup.select_one('#footer-info-lastmod')
    if footer_info:
        date_match = re.search(r'(\d{1,2} \w+ \d{4})', footer_info.text)
        if date_match:
            date_str = date_match.group(1)
            try:
                # Parse date (format may vary based on wiki language)
                last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
            except ValueError:
                logger.warning(f"Could not parse date: {date_str}")

    # Extract sections (h2, h3, h4); the count includes headings skipped below
    section_elements = soup.select('h2, h3, h4')
    sections = len(section_elements)

    # Extract section titles
    section_titles = []
    for section_elem in section_elements:
        # Skip sections that are part of the table of contents or navigation
        if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
            continue

        # Skip sections that are inside a table with class DescriptionBox
        if section_elem.find_parent('table', class_='DescriptionBox'):
            continue

        # Remove edit links so they do not end up in the title text
        for edit_link in section_elem.select('.mw-editsection'):
            edit_link.extract()

        section_titles.append({
            'title': section_elem.get_text(strip=True),
            'level': int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4
        })

    # Defaults used when the page has no '#mw-content-text' element
    word_count = 0
    sentence_count = 0
    link_count = 0
    link_details = []
    media_count = 0
    media_details = []
    categories = []
    grammar_suggestions = []
    description_img_url = None

    content = soup.select_one('#mw-content-text')
    if content:
        # Remove script and style elements
        for script in content.select('script, style'):
            script.extract()

        # Remove .languages elements
        for languages_elem in content.select('.languages'):
            languages_elem.extract()

        # Get text and count words
        clean_text = content.get_text(separator=' ', strip=True)
        word_count = len(clean_text.split())

        # Count sentences using NLTK if available, otherwise use a simple approximation
        if NLTK_AVAILABLE and check_grammar:
            sentence_count = len(nltk.sent_tokenize(clean_text))
        else:
            # Simple approximation: count runs of periods, exclamation and question marks
            sentence_count = len(re.findall(r'[.!?]+', clean_text))

        # Check grammar for French pages
        if language == 'fr' and check_grammar:
            logger.info(f"Checking grammar for French page: {key}")
            grammar_suggestions = check_grammar_with_grammalecte(clean_text)
        elif language == 'fr' and not check_grammar:
            logger.info(f"Grammar checking disabled for French page: {key}")

        # Extract links (the count covers every anchor, the details are filtered)
        links = content.select('a')
        link_count = len(links)

        for link in links:
            href = link.get('href', '')
            # Skip edit section links and other non-content links
            if 'action=edit' in href or 'redlink=1' in href or not href:
                continue

            link_text = link.get_text(strip=True)
            if link_text:  # Only include links with text
                link_details.append({
                    'text': link_text,
                    'href': _absolute_wiki_url(href)
                })

        # Extract media (images)
        media_elements = content.select('img')
        media_count = len(media_elements)

        # Extract description image specifically
        logger.info(f"Looking for description image for key '{key}' in {language}")
        description_img = _find_description_image(content, key)
        if description_img:
            src = description_img.get('src', '')
            if src:
                description_img_url = _absolute_wiki_url(src)

        # Process all images, whether or not a description image was found
        for img in media_elements:
            src = img.get('src', '')
            if src:
                media_details.append({
                    'src': _absolute_wiki_url(src),
                    'alt': img.get('alt', '')
                })

        # Extract categories
        for cat_link in soup.select('#mw-normal-catlinks li a'):
            categories.append(cat_link.get_text(strip=True))

    return {
        'key': key,
        'page_title': page_title,
        'language': language,
        'url': url,
        'last_modified': last_modified,
        'sections': sections,
        'section_titles': section_titles,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'link_count': link_count,
        'link_details': link_details,
        'media_count': media_count,
        'media_details': media_details,
        'categories': categories,
        'description_img_url': description_img_url,
        'is_specific_page': is_specific_page,
        'grammar_suggestions': grammar_suggestions,
        'html_content': html_content
    }
 | |
| 
 | |
def generate_staleness_histogram(wiki_pages):
    """
    Draw a bar chart of staleness scores grouped into buckets of width 10

    Args:
        wiki_pages (list): Page dictionaries; falsy entries and entries without
            a 'staleness_score' key are ignored

    Returns:
        None: The chart is written to STALENESS_HISTOGRAM_FILE; nothing is
            written when no score is available
    """
    logger.info("Generating histogram of staleness scores by 10% ranges...")

    # Keep only the pages that actually carry a score
    scores = [
        entry['staleness_score']
        for entry in wiki_pages
        if entry and 'staleness_score' in entry
    ]

    if len(scores) == 0:
        logger.warning("No staleness scores found. Cannot generate histogram.")
        return

    # Upper edge: highest score rounded up to the next multiple of 10, so that
    # every score lands in a bucket
    top_edge = np.ceil(max(scores) / 10) * 10
    bucket_edges = np.arange(0, top_edge + 10, 10)

    # Number of scores per bucket
    counts, edges = np.histogram(scores, bins=bucket_edges)

    # One bar per bucket, labelled "<low>-<high>%"
    positions = range(len(counts))
    tick_labels = []
    for low, high in zip(edges[:-1], edges[1:]):
        tick_labels.append(f"{int(low)}-{int(high)}%")

    plt.figure(figsize=(12, 6))
    plt.bar(positions, counts, align='center')
    plt.xticks(positions, tick_labels, rotation=45)

    # Axis captions and title (French, user-facing)
    plt.xlabel('Tranches de score de décrépitude (en %)')
    plt.ylabel('Nombre de pages')
    plt.title('Répartition du score de décrépitude par tranches de 10%')

    # Horizontal guide lines make the bar heights easier to read
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    plt.savefig(STALENESS_HISTOGRAM_FILE)
    logger.info(f"Histogram saved to {STALENESS_HISTOGRAM_FILE}")

    # Release the figure so repeated calls do not accumulate open figures
    plt.close()
 | |
| 
 | |
| def analyze_wiki_pages(pages):
 | |
|     """
 | |
|     Analyze wiki pages to determine which ones need updating
 | |
|     
 | |
|     Args:
 | |
|         pages (list): List of dictionaries containing page information
 | |
|         
 | |
|     Returns:
 | |
|         list: List of pages that need updating, sorted by priority
 | |
|     """
 | |
|     logger.info("Analyzing wiki pages to identify those needing updates...")
 | |
|     
 | |
|     # Group pages by key
 | |
|     pages_by_key = {}
 | |
|     for page in pages:
 | |
|         if page is None:
 | |
|             continue
 | |
|         
 | |
|         key = page['key']
 | |
|         if key not in pages_by_key:
 | |
|             pages_by_key[key] = {}
 | |
|         
 | |
|         pages_by_key[key][page['language']] = page
 | |
|     
 | |
|     # Analyze each key's pages
 | |
|     needs_update = []
 | |
|     
 | |
|     for key, lang_pages in pages_by_key.items():
 | |
|         # Skip if either language is missing
 | |
|         if 'en' not in lang_pages or 'fr' not in lang_pages:
 | |
|             if 'en' in lang_pages:
 | |
|                 # French page is missing
 | |
|                 # For missing French pages, calculate a high staleness score
 | |
|                 # Use word count as the main factor (50% weight)
 | |
|                 missing_staleness_score = (
 | |
|                     30 * 0.2 +  # Assume 30 days outdated (20%)
 | |
|                     lang_pages['en']['word_count'] / 100 * 0.5 +  # Word count (50%)
 | |
|                     lang_pages['en']['sections'] * 0.15 +  # Sections (15%)
 | |
|                     lang_pages['en']['link_count'] / 10 * 0.15  # Links (15%)
 | |
|                 )
 | |
|                 
 | |
|                 # Round to 2 decimal places and ensure it's high
 | |
|                 missing_staleness_score = max(100, round(missing_staleness_score, 2))
 | |
|                 
 | |
|                 # Get media count or default to 0
 | |
|                 media_count = lang_pages['en'].get('media_count', 0)
 | |
|                 
 | |
|                 needs_update.append({
 | |
|                     'key': key,
 | |
|                     'reason': 'French page missing',
 | |
|                     'en_page': lang_pages['en'],
 | |
|                     'fr_page': None,
 | |
|                     'date_diff': 0,
 | |
|                     'word_diff': lang_pages['en']['word_count'],
 | |
|                     'section_diff': lang_pages['en']['sections'],
 | |
|                     'link_diff': lang_pages['en']['link_count'],
 | |
|                     'media_diff': media_count,
 | |
|                     'staleness_score': missing_staleness_score,
 | |
|                     'priority': missing_staleness_score,  # Use staleness score as priority
 | |
|                     'section_comparison': None,  # No comparison possible
 | |
|                     'link_comparison': None,     # No comparison possible
 | |
|                     'media_comparison': None,    # No comparison possible
 | |
|                     'category_comparison': None  # No comparison possible
 | |
|                 })
 | |
|             continue
 | |
|         
 | |
|         en_page = lang_pages['en']
 | |
|         fr_page = lang_pages['fr']
 | |
|         
 | |
|         # Skip if dates are missing
 | |
|         if not en_page['last_modified'] or not fr_page['last_modified']:
 | |
|             continue
 | |
|         
 | |
|         # Calculate date difference in days
 | |
|         en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
 | |
|         fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
 | |
|         date_diff = (en_date - fr_date).days
 | |
|         
 | |
|         # Calculate content differences
 | |
|         word_diff = en_page['word_count'] - fr_page['word_count']
 | |
|         section_diff = en_page['sections'] - fr_page['sections']
 | |
|         link_diff = en_page['link_count'] - fr_page['link_count']
 | |
|         media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)
 | |
|         
 | |
|         # Calculate staleness score (higher means more outdated/stale)
 | |
|         # Weight factors adjusted to emphasize word count differences
 | |
|         staleness_score = (
 | |
|             abs(date_diff) * 0.2 +  # Date difference (20%)
 | |
|             abs(word_diff) / 100 * 0.5 +  # Word count difference (normalized) (50%)
 | |
|             abs(section_diff) * 0.15 +  # Section difference (15%)
 | |
|             abs(link_diff) / 10 * 0.15  # Link count difference (normalized) (15%)
 | |
|         )
 | |
|         
 | |
|         # Round to 2 decimal places for display
 | |
|         staleness_score = round(staleness_score, 2)
 | |
|         
 | |
|         # Compare sections between English and French pages
 | |
|         section_comparison = {
 | |
|             'en_only': [],
 | |
|             'fr_only': [],
 | |
|             'common': []
 | |
|         }
 | |
|         
 | |
|         # Group sections by their level for hierarchical comparison
 | |
|         en_sections_by_level = {}
 | |
|         fr_sections_by_level = {}
 | |
|         
 | |
|         # Organize English sections by level
 | |
|         for section in en_page.get('section_titles', []):
 | |
|             level = section['level']
 | |
|             if level not in en_sections_by_level:
 | |
|                 en_sections_by_level[level] = []
 | |
|             en_sections_by_level[level].append(section)
 | |
|             
 | |
|         # Organize French sections by level
 | |
|         for section in fr_page.get('section_titles', []):
 | |
|             level = section['level']
 | |
|             if level not in fr_sections_by_level:
 | |
|                 fr_sections_by_level[level] = []
 | |
|             fr_sections_by_level[level].append(section)
 | |
|         
 | |
|         # Process each level to find matching sections
 | |
|         all_levels = set(list(en_sections_by_level.keys()) + list(fr_sections_by_level.keys()))
 | |
|         
 | |
|         for level in all_levels:
 | |
|             en_level_sections = en_sections_by_level.get(level, [])
 | |
|             fr_level_sections = fr_sections_by_level.get(level, [])
 | |
|             
 | |
|             # Create dictionaries for easier lookup, using lowercase titles
 | |
|             en_dict = {section['title'].lower(): section for section in en_level_sections}
 | |
|             fr_dict = {section['title'].lower(): section for section in fr_level_sections}
 | |
|             
 | |
|             # Find sections at this level only in English
 | |
|             for title, section in en_dict.items():
 | |
|                 if title not in fr_dict:
 | |
|                     section_comparison['en_only'].append(section)
 | |
|             
 | |
|             # Find sections at this level only in French
 | |
|             for title, section in fr_dict.items():
 | |
|                 if title not in en_dict:
 | |
|                     section_comparison['fr_only'].append(section)
 | |
|             
 | |
|             # Find common sections at this level
 | |
|             for title in en_dict.keys():
 | |
|                 if title in fr_dict:
 | |
|                     section_comparison['common'].append({
 | |
|                         'en': en_dict[title],
 | |
|                         'fr': fr_dict[title]
 | |
|                     })
 | |
|         
 | |
|         # Compare links between English and French pages
 | |
|         link_comparison = {
 | |
|             'en_only': [],
 | |
|             'fr_only': [],
 | |
|             'common': []
 | |
|         }
 | |
|         
 | |
|         # Extract link texts for comparison (case insensitive)
 | |
|         en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
 | |
|         fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}
 | |
|         
 | |
|         # Find links only in English
 | |
|         for text, link in en_links.items():
 | |
|             if text not in fr_links:
 | |
|                 link_comparison['en_only'].append(link)
 | |
|         
 | |
|         # Find links only in French
 | |
|         for text, link in fr_links.items():
 | |
|             if text not in en_links:
 | |
|                 link_comparison['fr_only'].append(link)
 | |
|         
 | |
|         # Find common links
 | |
|         for text in en_links.keys():
 | |
|             if text in fr_links:
 | |
|                 link_comparison['common'].append({
 | |
|                     'en': en_links[text],
 | |
|                     'fr': fr_links[text]
 | |
|                 })
 | |
|         
 | |
|         # Compare media between English and French pages
 | |
|         media_comparison = {
 | |
|             'en_only': [],
 | |
|             'fr_only': [],
 | |
|             'common': []
 | |
|         }
 | |
|         
 | |
|         # Extract media alt texts for comparison (case insensitive)
 | |
|         en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
 | |
|         fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}
 | |
|         
 | |
|         # Find media only in English
 | |
|         for alt, media in en_media.items():
 | |
|             if alt not in fr_media:
 | |
|                 media_comparison['en_only'].append(media)
 | |
|         
 | |
|         # Find media only in French
 | |
|         for alt, media in fr_media.items():
 | |
|             if alt not in en_media:
 | |
|                 media_comparison['fr_only'].append(media)
 | |
|         
 | |
|         # Find common media
 | |
|         for alt in en_media.keys():
 | |
|             if alt in fr_media:
 | |
|                 media_comparison['common'].append({
 | |
|                     'en': en_media[alt],
 | |
|                     'fr': fr_media[alt]
 | |
|                 })
 | |
|         
 | |
|         # Add media without alt text to their respective language-only lists
 | |
|         for media in en_page.get('media_details', []):
 | |
|             if not media['alt'] or media['alt'].lower() not in en_media:
 | |
|                 media_comparison['en_only'].append(media)
 | |
|         
 | |
|         for media in fr_page.get('media_details', []):
 | |
|             if not media['alt'] or media['alt'].lower() not in fr_media:
 | |
|                 media_comparison['fr_only'].append(media)
 | |
|         
 | |
|         # Compare categories between English and French pages
 | |
|         category_comparison = {
 | |
|             'en_only': [],
 | |
|             'fr_only': [],
 | |
|             'common': []
 | |
|         }
 | |
|         
 | |
|         # Extract categories for comparison (case insensitive)
 | |
|         en_categories = [cat.lower() for cat in en_page.get('categories', [])]
 | |
|         fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]
 | |
|         
 | |
|         # Find categories only in English
 | |
|         for cat in en_page.get('categories', []):
 | |
|             if cat.lower() not in fr_categories:
 | |
|                 category_comparison['en_only'].append(cat)
 | |
|         
 | |
|         # Find categories only in French
 | |
|         for cat in fr_page.get('categories', []):
 | |
|             if cat.lower() not in en_categories:
 | |
|                 category_comparison['fr_only'].append(cat)
 | |
|         
 | |
|         # Find common categories
 | |
|         for cat in en_page.get('categories', []):
 | |
|             if cat.lower() in fr_categories:
 | |
|                 category_comparison['common'].append(cat)
 | |
|         
 | |
|         if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
 | |
|             reason = []
 | |
|             if date_diff > 30:
 | |
|                 reason.append(f"La version Française est datée de {date_diff} jours")
 | |
|             if word_diff > 200:
 | |
|                 reason.append(f"La version Anglaise a {word_diff} mots de plus")
 | |
|             if section_diff > 2:
 | |
|                 reason.append(f"La version Anglaise a {section_diff} sections de plus")
 | |
|             if link_diff > 20:
 | |
|                 reason.append(f"La version Anglaise a {link_diff} liens de plus")
 | |
|             if media_diff > 5:
 | |
|                 reason.append(f"La version Anglaise a {media_diff} images de plus")
 | |
|             if fr_page['word_count'] < en_page['word_count'] * 0.7:
 | |
|                 reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")
 | |
|             
 | |
|             needs_update.append({
 | |
|                 'key': key,
 | |
|                 'reason': ', '.join(reason),
 | |
|                 'en_page': en_page,
 | |
|                 'fr_page': fr_page,
 | |
|                 'date_diff': date_diff,
 | |
|                 'word_diff': word_diff,
 | |
|                 'section_diff': section_diff,
 | |
|                 'link_diff': link_diff,
 | |
|                 'media_diff': media_diff,
 | |
|                 'staleness_score': staleness_score,
 | |
|                 'priority': staleness_score,  # Use staleness score as priority
 | |
|                 'section_comparison': section_comparison,
 | |
|                 'link_comparison': link_comparison,
 | |
|                 'media_comparison': media_comparison,
 | |
|                 'category_comparison': category_comparison
 | |
|             })
 | |
|     
 | |
|     # Sort by priority (descending)
 | |
|     needs_update.sort(key=lambda x: x['priority'], reverse=True)
 | |
|     
 | |
|     return needs_update
 | |
| 
 | |
def _strip_fr_prefix(title):
    """Return ``title`` without a leading ``FR:``/``fr:`` language prefix.

    Only the prefix is removed. Later occurrences of ``FR:`` inside the title
    (for example ``Key:ref:FR:SIRET``) belong to the page name itself and
    must be preserved to reach the right English page.
    """
    if title.startswith(('FR:', 'fr:')):
        return title[3:]
    return title


def _fetch_and_collect(wiki_pages, target, language, check_grammar, specific=False):
    """Fetch one wiki page and append it to ``wiki_pages`` when the fetch succeeds.

    Args:
        wiki_pages: List that receives the fetched page (mutated in place).
        target: Key, page title or full URL handed to ``fetch_wiki_page``.
        language: Language code handed to ``fetch_wiki_page`` ('en' or 'fr').
        check_grammar: Forwarded to ``fetch_wiki_page``.
        specific: When True, ``is_specific_page=True`` is forwarded as well;
            otherwise the keyword is omitted so the callee's default applies.

    Returns:
        Whatever ``fetch_wiki_page`` returned (falsy when nothing was fetched).
    """
    kwargs = {'check_grammar': check_grammar}
    if specific:
        kwargs['is_specific_page'] = True

    page = fetch_wiki_page(target, language, **kwargs)
    if page:
        wiki_pages.append(page)
    return page


def _record_deadend_pages(deadend_pages):
    """Store the current dead-end pages and the ones that left the list.

    The data file keeps a ``history`` mapping keyed by ISO timestamp. Pages
    present in the most recent history entry but absent from ``deadend_pages``
    are reported as "categorized".
    """
    # load_json_data is defined elsewhere; guard against a None/empty result.
    existing_data = load_json_data(DEADEND_PAGES_FILE) or {}
    history = existing_data.setdefault('history', {})

    # ISO timestamps sort lexicographically, so max() is the latest entry.
    previous_pages = []
    if history:
        previous_pages = history[max(history)].get('pages', [])

    # Set lookup instead of scanning the current list once per previous page.
    current_urls = {page['url'] for page in deadend_pages}
    categorized_pages = [
        page for page in previous_pages if page['url'] not in current_urls
    ]

    current_timestamp = datetime.now().isoformat()
    history[current_timestamp] = {
        'pages': deadend_pages,
        'categorized_pages': categorized_pages
    }

    # Mirror the latest snapshot at the top level of the file.
    existing_data['pages'] = deadend_pages
    existing_data['categorized_pages'] = categorized_pages
    existing_data['last_updated'] = current_timestamp

    save_to_json(existing_data, DEADEND_PAGES_FILE)
    logger.info(f"Saved {len(deadend_pages)} deadend pages to {DEADEND_PAGES_FILE}")
    logger.info(f"Found {len(categorized_pages)} pages that have been categorized since the last run")


def _collect_specific_pages(wiki_pages, check_grammar):
    """Fetch every entry of SPECIFIC_PAGES together with its other-language twin."""
    for page in SPECIFIC_PAGES:
        # Case 1: full URL (e.g. "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
        if page.startswith('http'):
            page_info = _fetch_and_collect(wiki_pages, page, 'en', check_grammar, specific=True)
            if not page_info:
                continue

            if page_info['language'] == 'fr':
                # French page: the English twin has the same title minus the prefix.
                en_url = f"{WIKI_BASE_URL}{_strip_fr_prefix(page_info['page_title'])}"
                logger.info(f"Trying to find English equivalent for {page}: {en_url}")
                _fetch_and_collect(wiki_pages, en_url, 'en', check_grammar, specific=True)
            else:
                # English page: the French twin lives under the FR: prefix.
                fr_url = f"{WIKI_BASE_URL}FR:{page_info['page_title']}"
                logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
                _fetch_and_collect(wiki_pages, fr_url, 'fr', check_grammar, specific=True)

        # Case 2: title with FR: prefix (e.g. "FR:Tag:leisure%3Dchildren_club")
        elif page.startswith('FR:'):
            fr_page = _fetch_and_collect(wiki_pages, page, 'fr', check_grammar, specific=True)
            if fr_page:
                en_url = f"{WIKI_BASE_URL}{page[3:]}"
                logger.info(f"Trying to find English equivalent for {page}: {en_url}")
                _fetch_and_collect(wiki_pages, en_url, 'en', check_grammar, specific=True)

        # Case 3: plain page title (e.g. "Anatomie_des_étiquettes_osm")
        else:
            _fetch_and_collect(wiki_pages, page, 'en', check_grammar, specific=True)
            _fetch_and_collect(wiki_pages, page, 'fr', check_grammar, specific=True)


def _collect_desynchronized_pages(wiki_pages, check_grammar):
    """Fetch pages of the FR:Traductions_désynchronisées category and their English twins."""
    for page_url in fetch_desynchronized_pages():
        fr_page = _fetch_and_collect(wiki_pages, page_url, 'fr', check_grammar, specific=True)
        if not fr_page:
            continue

        en_url = f"{WIKI_BASE_URL}{_strip_fr_prefix(fr_page['page_title'])}"
        logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
        _fetch_and_collect(wiki_pages, en_url, 'en', check_grammar, specific=True)


def _score_page_pairs(wiki_pages):
    """Pair pages by key and attach a ``staleness_score`` to each of them.

    Returns:
        A list of page dicts in key order (English first, then French). English
        pages and French pages without an English counterpart are copies.
    """
    pages_by_key = {}
    for page in wiki_pages:
        if page is None:
            continue
        # A later page with the same key and language replaces the earlier one.
        pages_by_key.setdefault(page['key'], {})[page['language']] = page

    scored = []
    for lang_pages in pages_by_key.values():
        if 'en' not in lang_pages:
            # French page without English counterpart (rare case).
            if 'fr' in lang_pages:
                fr_page = lang_pages['fr'].copy()
                fr_page['staleness_score'] = 0
                scored.append(fr_page)
            continue

        en_page = lang_pages['en'].copy()

        if 'fr' not in lang_pages:
            # Missing translation: score the whole English page, floor at 100.
            missing_score = (
                30 * 0.2 +
                en_page['word_count'] / 100 * 0.5 +
                en_page['sections'] * 0.15 +
                en_page['link_count'] / 10 * 0.15
            )
            en_page['staleness_score'] = max(100, round(missing_score, 2))
            scored.append(en_page)
            continue

        # NOTE: the French page is annotated in place (not copied), so the
        # score is also visible on the dict stored in ``wiki_pages``.
        fr_page = lang_pages['fr']

        if en_page['last_modified'] and fr_page['last_modified']:
            en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
            fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
            date_diff = (en_date - fr_date).days

            word_diff = en_page['word_count'] - fr_page['word_count']
            section_diff = en_page['sections'] - fr_page['sections']
            link_diff = en_page['link_count'] - fr_page['link_count']

            # Weighted sum: 0.2 per day, 0.5 per 100 words,
            # 0.15 per section, 0.15 per 10 links.
            staleness_score = round(
                abs(date_diff) * 0.2 +
                abs(word_diff) / 100 * 0.5 +
                abs(section_diff) * 0.15 +
                abs(link_diff) / 10 * 0.15,
                2
            )
        else:
            # Without both dates no score can be computed.
            staleness_score = 0

        en_page['staleness_score'] = staleness_score
        fr_page['staleness_score'] = staleness_score
        scored.append(en_page)
        scored.append(fr_page)

    return scored


def _write_pages_csv(pages):
    """Write the summary columns of ``pages`` to WIKI_PAGES_CSV.

    Raises:
        IOError: If the file cannot be opened or written.
    """
    # Basic fields for CSV (detailed content will be in JSON only)
    fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score', 'description_img_url']

    with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
        # extrasaction='ignore' drops non-CSV keys; missing keys are written
        # as '' through DictWriter's default restval.
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(page for page in pages if page)


def main():
    """
    Main function to execute the script

    This function:
    1. Parses the --no-grammar-check command-line flag
    2. Fetches the top OSM keys and saves them (stops if none are returned)
    3. Fetches keys without a wiki page and saves them
    4. Fetches dead-end pages and updates their history file
    5. Fetches the English and French wiki pages for the top keys, for
       SPECIFIC_PAGES and for the FR:Traductions_désynchronisées category
    6. Calculates staleness scores and generates their histogram
    7. Saves the scored pages to CSV (stops if the file cannot be written)
    8. Analyzes the pages and saves those needing an update to JSON
    9. Prints a list of pages that need updating
    """
    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
    parser.add_argument('--no-grammar-check', action='store_true',
                        help='Disable grammar checking for French pages', default=False)
    args = parser.parse_args()

    # Whether to check grammar for French pages
    check_grammar = not args.no_grammar_check

    logger.info("Starting wiki_compare.py")
    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")

    # Targets the directory containing this script, which normally exists
    # already; kept as a harmless safeguard.
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    save_to_json(top_keys, TOP_KEYS_FILE)

    # Keys without wiki pages used in France
    keys_without_wiki = fetch_keys_without_wiki_page()
    if keys_without_wiki:
        save_to_json(keys_without_wiki, KEYS_WITHOUT_WIKI_FILE)
        logger.info(f"Saved {len(keys_without_wiki)} keys without wiki pages to {KEYS_WITHOUT_WIKI_FILE}")
    else:
        logger.warning("No keys without wiki pages were fetched.")

    # Pages starting with "France" from the DeadendPages list
    deadend_pages = fetch_deadend_pages()
    if deadend_pages:
        _record_deadend_pages(deadend_pages)
    else:
        logger.warning("No deadend pages were fetched.")

    # Collect every page; the order matters because a later page with the
    # same key and language replaces an earlier one when pairing.
    wiki_pages = []

    logger.info("Processing top keys...")
    for key_info in top_keys:
        key = key_info['key']
        _fetch_and_collect(wiki_pages, key, 'en', check_grammar)
        _fetch_and_collect(wiki_pages, key, 'fr', check_grammar)

    logger.info("Processing specific pages...")
    _collect_specific_pages(wiki_pages, check_grammar)

    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
    _collect_desynchronized_pages(wiki_pages, check_grammar)

    processed_wiki_pages = _score_page_pairs(wiki_pages)

    generate_staleness_histogram(processed_wiki_pages)

    try:
        _write_pages_csv(processed_wiki_pages)
    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return
    else:
        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # An entry is "specific" as soon as one of its two pages is flagged so.
    regular_pages = []
    specific_pages = []
    for page in pages_to_update:
        is_specific = any(
            side and side.get('is_specific_page', False)
            for side in (page['en_page'], page['fr_page'])
        )
        if is_specific:
            specific_pages.append(page)
        else:
            regular_pages.append(page)

    output_data = {
        "regular_pages": regular_pages,
        "specific_pages": specific_pages,
        "last_updated": datetime.now().isoformat()
    }

    # Save pages that need updating to JSON with history
    save_with_history(output_data, OUTDATED_PAGES_FILE)

    print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

    for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")
 | |
| 
 | |
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
