qualiwiki/wiki_compare/wiki_compare.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_compare.py

This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

The script also compares a specific list of wiki pages defined in the
SPECIFIC_PAGES constant. This list can include regular page titles,
full URLs, or pages with FR: prefix.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - staleness_histogram.png: Histogram of staleness scores
    - A console output listing the wiki pages that need updating
"""

import json
import csv
import requests
import re
import os
import subprocess
import tempfile
import hashlib
import argparse
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 2
# HTML cache folder
HTML_CACHE_DIR = "html_cache"

# Initialize NLTK for sentence tokenization
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Also download punkt_tab resource which is needed for sent_tokenize
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

# List of specific pages to compare (in addition to top keys)
# This list can include:
# 1. Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
# 2. Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
SPECIFIC_PAGES = [
    "Anatomie_des_étiquettes_osm",
    "Tag:leisure=children_club",
    "Tag:harassment_prevention=ask_angela",
    "Key:harassment_prevention",
    "Proposal process",
    "Automated_Edits_code_of_conduct",
    "Key:cuisine",
    "Libre_Charge_Map",
    "OSM_Mon_Commerce",
    "Complète_Tes_Commerces",
    "Tag:amenity=charging_station",
    "Organised_Editing/Activities/MapYourGrid_Initiative",
    "Key:highway",
    "Quality_assurance",
    "Verifiability",
    "Good_practice",
    "Mapping_parties",
    "State_of_the_Map",
    "Diversity",
    "Mapping_private_information",
    "Any_tags_you_like",
    "Organised_Editing/Best_Practices",
    "Map_features"
]

def fetch_desynchronized_pages():
    """
    Fetch pages from the FR:Traductions_désynchronisées category

    Returns:
        list: List of page URLs from the category
    """
    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")

    try:
        response = requests.get(WIKI_CATEGORY_URL)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links to French pages in the category
        page_links = []
        for link in soup.select('a[href^="/wiki/FR:"]'):
            href = link.get('href', '')
            # Skip if it's a category link or a language link
            if '/Category:' in href or 'action=edit' in href:
                continue

            # Get the full URL
            full_url = 'https://wiki.openstreetmap.org' + href
            page_links.append(full_url)

        logger.info(f"Found {len(page_links)} pages in the category")
        return page_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching category page: {e}")
        return []

def fetch_top_keys(limit=NUM_WIKI_PAGES):
    """
    Fetch the most used OSM keys from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

def load_json_data(filename):
    """
    Load data from a JSON file

    Args:
        filename (str): Name of the file

    Returns:
        dict: Data loaded from the file or empty dict if file doesn't exist
    """
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            logger.info(f"Data loaded from {filename}")
            return data
        else:
            logger.info(f"File {filename} doesn't exist, returning empty dict")
            return {}
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading data from {filename}: {e}")
        return {}

def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        # Convert data to JSON string
        json_str = json.dumps(data, indent=2, ensure_ascii=False)

        # Print the JSON string for debugging
        logger.info(f"JSON string to be written to {filename}:")

        # Check if data is a dictionary before trying to access keys
        if isinstance(data, dict):
            logger.info(f"JSON keys at top level: {list(data.keys())}")
            if 'translations' in data:
                logger.info(f"JSON keys in translations: {list(data['translations'].keys())}")
                if 'type' in data['translations']:
                    logger.info(f"'type' key exists in translations")
                if 'type_key' in data['translations']:
                    logger.info(f"'type_key' key exists in translations")
        elif isinstance(data, list):
            logger.info(f"Data is a list with {len(data)} items")

        # Write the JSON string to the file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(json_str)

        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def save_with_history(data, filename):
    """
    Save data to a JSON file while preserving history

    This function loads existing data from the file (if it exists),
    adds the new data to the history, and saves the updated data back to the file.

    Args:
        data: New data to save
        filename (str): Name of the file
    """
    try:
        # Load existing data
        existing_data = load_json_data(filename)

        # Create a timestamp for the current data
        current_timestamp = datetime.now().isoformat()

        # Initialize history if it doesn't exist
        if 'history' not in existing_data:
            existing_data['history'] = {}

        # Add current regular_pages and specific_pages to history
        history_entry = {
            'regular_pages': data.get('regular_pages', []),
            'specific_pages': data.get('specific_pages', [])
        }

        # Add the entry to history with timestamp as key
        existing_data['history'][current_timestamp] = history_entry

        # Update the current data
        existing_data['regular_pages'] = data.get('regular_pages', [])
        existing_data['specific_pages'] = data.get('specific_pages', [])
        existing_data['last_updated'] = current_timestamp

        # Save the updated data
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Data with history saved to {filename}")
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error saving data with history to {filename}: {e}")
        # Fallback to regular save if there's an error
        save_to_json(data, filename)

def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    try:
        # Create a temporary file with the text
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # Run grammalecte-cli on the temporary file
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Parse the JSON output
        grammar_data = json.loads(result.stdout)

        # Extract grammar errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            # Process grammar errors
            for error in paragraph.get('lGrammarErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': error.get('sType', ''),
                    'message': error.get('sMessage', ''),
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

            # Process spelling errors
            for error in paragraph.get('lSpellingErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': 'spelling',
                    'message': 'Erreur d\'orthographe',
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

        # Clean up the temporary file
        os.unlink(temp_file_path)

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
    """
    Fetch wiki page for a given key or specific page

    This function handles different types of wiki pages:
    1. Regular OSM key pages (e.g., "building", "highway")
    2. Specific wiki pages that can be in various formats:
       - Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
       - Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
       - Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")

    Args:
        key (str): OSM key or specific page title/URL
        language (str): Language code ('en' or 'fr')
        is_specific_page (bool): Whether this is a specific page rather than a key
        check_grammar (bool): Whether to check grammar for French pages

    Returns:
        dict: Dictionary with page information or None if page doesn't exist
    """
    # Skip pages with "FR:User:" or "FR:Réunions"
    if "FR:User:" in key or "FR:Réunions" in key:
        logger.info(f"Skipping excluded page: {key}")
        return None
    # Handle different URL formats
    if is_specific_page:
        # Case 1: Full URL
        if key.startswith('http'):
            url = key
            # Extract the page title from the URL
            page_title = key.split('/')[-1]
            # Determine language from URL
            if 'FR:' in key or '/FR:' in key:
                language = 'fr'
            else:
                language = 'en'
        # Case 2: Page with FR: prefix
        elif key.startswith('FR:'):
            url = f"{WIKI_BASE_URL}{key}"
            page_title = key[3:]  # Remove FR: prefix for title
            language = 'fr'
        # Case 3: Regular page title
        else:
            if language == 'fr':
                url = f"{WIKI_BASE_URL}FR:{key}"
            else:
                url = f"{WIKI_BASE_URL}{key}"
            page_title = key
    else:
        # Regular key page
        base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
        url = f"{base_url}{key}"
        page_title = key

    # Create a unique cache filename based on the URL
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

    html_content = None

    # Try to load from cache first
    if cache_file.exists():
        logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
        except Exception as e:
            logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
            html_content = None

    # If not in cache or cache read failed, fetch from web
    if html_content is None:
        logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
        try:
            response = requests.get(url)

            # Check if page exists
            if response.status_code == 404:
                logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
                return None

            response.raise_for_status()
            html_content = response.text

            # Save to cache
            try:
                with open(cache_file, 'w', encoding='utf-8') as f:
                    f.write(html_content)
                logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
            except Exception as e:
                logger.warning(f"Error saving to cache: {e}")
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
            return None

    soup = BeautifulSoup(html_content, 'html.parser')

    # Get last modification date
    last_modified = None
    footer_info = soup.select_one('#footer-info-lastmod')
    if footer_info:
        date_text = footer_info.text
        # Extract date using regex
        date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
        if date_match:
            date_str = date_match.group(1)
            try:
                # Parse date (format may vary based on wiki language)
                last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
            except ValueError:
                logger.warning(f"Could not parse date: {date_str}")

    # Extract sections (h2, h3, h4)
    section_elements = soup.select('h2, h3, h4')
    sections = len(section_elements)

    # Extract section titles
    section_titles = []
    for section_elem in section_elements:
        # Skip sections that are part of the table of contents, navigation, or DescriptionBox
        if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
            continue

        # Skip sections that are inside a table with class DescriptionBox
        if section_elem.find_parent('table', class_='DescriptionBox'):
            continue

        # Get the text of the section title, removing any edit links
        for edit_link in section_elem.select('.mw-editsection'):
            edit_link.extract()

        section_title = section_elem.get_text(strip=True)
        section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4

        section_titles.append({
            'title': section_title,
            'level': section_level
        })

    # Count words and sentences in the content
    content = soup.select_one('#mw-content-text')
    clean_text = ""
    if content:
        # Remove script and style elements
        for script in content.select('script, style'):
            script.extract()

        # Remove .languages elements
        for languages_elem in content.select('.languages'):
            languages_elem.extract()

        # Get text and count words
        clean_text = content.get_text(separator=' ', strip=True)
        word_count = len(clean_text.split())

        # Count sentences using NLTK
        sentences = nltk.sent_tokenize(clean_text)
        sentence_count = len(sentences)

        # Check grammar for French pages
        grammar_suggestions = []
        if language == 'fr' and check_grammar:
            logger.info(f"Checking grammar for French page: {key}")
            grammar_suggestions = check_grammar_with_grammalecte(clean_text)
        elif language == 'fr' and not check_grammar:
            logger.info(f"Grammar checking disabled for French page: {key}")

        # Extract links
        links = content.select('a')
        link_count = len(links)

        # Get link details (text and href)
        link_details = []
        for link in links:
            href = link.get('href', '')
            # Skip edit section links and other non-content links
            if 'action=edit' in href or 'redlink=1' in href or not href:
                continue

            # Make relative URLs absolute
            if href.startswith('/'):
                href = 'https://wiki.openstreetmap.org' + href

            link_text = link.get_text(strip=True)
            if link_text:  # Only include links with text
                link_details.append({
                    'text': link_text,
                    'href': href
                })

        # Extract media (images)
        media_elements = content.select('img')
        media_count = len(media_elements)

        # Get media details (src and alt text)
        media_details = []

        # Extract description image specifically
        # Try multiple selectors to find the description image
        description_img = None

        # Debug: Log the key we're processing
        logger.info(f"Looking for description image for key '{key}' in {language}")

        # Function to filter out OSM logo and small icons
        def is_relevant_image(img):
            src = img.get('src', '')
            # Skip OSM logo
            if 'osm_logo' in src:
                return False
            # Skip small icons (usually less than 30px)
            width = img.get('width')
            if width and int(width) < 30:
                return False
            height = img.get('height')
            if height and int(height) < 30:
                return False
            return True

        # Special case for highway key - directly target the image we want
        if key == 'highway':
            # Try to find the specific image in figure elements
            highway_img_elements = content.select('figure.mw-halign-center img')
            logger.info(f"  Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images for highway")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using highway-specific image: {description_img.get('src', '')}")

        # If not found with highway-specific selector, try the td.d_image selector
        if not description_img:
            description_img_elements = content.select('td.d_image img')
            logger.info(f"  Selector 'td.d_image img' found {len(description_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in td.d_image")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using image from 'td.d_image img': {description_img.get('src', '')}")

        # If still not found, try the specific selector for .description img.mw-file-element
        if not description_img:
            description_img_elements = content.select('.description img.mw-file-element')
            logger.info(f"  Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in .description")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

        # If still not found, try images in figures within the description box
        if not description_img:
            description_img_elements = content.select('.description figure img')
            logger.info(f"  Selector '.description figure img' found {len(description_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in .description figure")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using image from '.description figure img': {description_img.get('src', '')}")

        # If still not found, try any image in the description box
        if not description_img:
            description_img_elements = content.select('.description img')
            logger.info(f"  Selector '.description img' found {len(description_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in .description general")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using image from '.description img': {description_img.get('src', '')}")

        # If still not found, try images in the DescriptionBox table
        if not description_img:
            description_img_elements = content.select('table.DescriptionBox img')
            logger.info(f"  Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in DescriptionBox")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

        # If still not found, try images in figure elements anywhere in the content
        if not description_img:
            description_img_elements = content.select('figure img')
            logger.info(f"  Selector 'figure img' found {len(description_img_elements)} elements")

            # Filter for relevant images
            relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in figure elements")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using image from 'figure img': {description_img.get('src', '')}")

        # If we still don't have an image, use any image that's not the OSM logo
        if not description_img:
            all_images = content.select('img')
            relevant_images = [img for img in all_images if is_relevant_image(img)]
            logger.info(f"  Found {len(relevant_images)} relevant images in the entire page")

            if relevant_images:
                description_img = relevant_images[0]
                logger.info(f"  Using fallback image: {description_img.get('src', '')}")

        # Process the found image
        description_img_url = None
        if description_img:
            src = description_img.get('src', '')
            if src:
                # Make relative URLs absolute
                if src.startswith('//'):
                    src = 'https:' + src
                elif src.startswith('/'):
                    src = 'https://wiki.openstreetmap.org' + src

                description_img_url = src

            # Process all images
            for img in media_elements:
                src = img.get('src', '')
                if src:
                    # Make relative URLs absolute
                    if src.startswith('//'):
                        src = 'https:' + src
                    elif src.startswith('/'):
                        src = 'https://wiki.openstreetmap.org' + src

                    alt_text = img.get('alt', '')
                    media_details.append({
                        'src': src,
                        'alt': alt_text
                    })

            # Extract categories
            categories = []
            category_links = soup.select('#mw-normal-catlinks li a')
            for cat_link in category_links:
                categories.append(cat_link.get_text(strip=True))
        else:
            word_count = 0
            link_count = 0
            link_details = []
            media_count = 0
            media_details = []
            categories = []
            grammar_suggestions = []

        return {
            'key': key,
            'page_title': page_title,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'section_titles': section_titles,
            'word_count': word_count,
            'sentence_count': sentence_count,
            'link_count': link_count,
            'link_details': link_details,
            'media_count': media_count,
            'media_details': media_details,
            'categories': categories,
            'description_img_url': description_img_url,
            'is_specific_page': is_specific_page,
            'grammar_suggestions': grammar_suggestions,
            'html_content': html_content
        }

def generate_staleness_histogram(wiki_pages):
    """
    Generate a histogram of staleness scores by 10% ranges

    Args:
        wiki_pages (list): List of dictionaries containing page information with staleness scores

    Returns:
        None: Saves the histogram to a file
    """
    logger.info("Generating histogram of staleness scores by 10% ranges...")

    # Extract staleness scores
    staleness_scores = []
    for page in wiki_pages:
        if page and 'staleness_score' in page:
            staleness_scores.append(page['staleness_score'])

    if not staleness_scores:
        logger.warning("No staleness scores found. Cannot generate histogram.")
        return

    # Determine the maximum score for binning
    max_score = max(staleness_scores)
    # Round up to the nearest 10 to ensure all scores are included
    max_bin_edge = np.ceil(max_score / 10) * 10

    # Create bins for 10% ranges
    bins = np.arange(0, max_bin_edge + 10, 10)

    # Count scores in each bin
    hist, bin_edges = np.histogram(staleness_scores, bins=bins)

    # Create histogram
    plt.figure(figsize=(12, 6))

    # Create bar chart
    plt.bar(range(len(hist)), hist, align='center')

    # Set x-axis labels for each bin
    bin_labels = [f"{int(bin_edges[i])}-{int(bin_edges[i+1])}%" for i in range(len(bin_edges)-1)]
    plt.xticks(range(len(hist)), bin_labels, rotation=45)

    # Set labels and title
    plt.xlabel('Tranches de score de décrépitude (en %)')
    plt.ylabel('Nombre de pages')
    plt.title('Répartition du score de décrépitude par tranches de 10%')

    # Add grid for better readability
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Adjust layout
    plt.tight_layout()

    # Save figure
    plt.savefig(STALENESS_HISTOGRAM_FILE)
    logger.info(f"Histogram saved to {STALENESS_HISTOGRAM_FILE}")

    # Close the figure to free memory
    plt.close()

def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []

    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # French page is missing
                # For missing French pages, calculate a high staleness score
                # Use word count as the main factor (50% weight)
                missing_staleness_score = (
                    30 * 0.2 +  # Assume 30 days outdated (20%)
                    lang_pages['en']['word_count'] / 100 * 0.5 +  # Word count (50%)
                    lang_pages['en']['sections'] * 0.15 +  # Sections (15%)
                    lang_pages['en']['link_count'] / 10 * 0.15  # Links (15%)
                )

                # Round to 2 decimal places and ensure it's high
                missing_staleness_score = max(100, round(missing_staleness_score, 2))

                # Get media count or default to 0
                media_count = lang_pages['en'].get('media_count', 0)

                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'media_diff': media_count,
                    'staleness_score': missing_staleness_score,
                    'priority': missing_staleness_score,  # Use staleness score as priority
                    'section_comparison': None,  # No comparison possible
                    'link_comparison': None,     # No comparison possible
                    'media_comparison': None,    # No comparison possible
                    'category_comparison': None  # No comparison possible
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate date difference in days
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']
        media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)

        # Calculate staleness score (higher means more outdated/stale)
        # Weight factors adjusted to emphasize word count differences
        staleness_score = (
            abs(date_diff) * 0.2 +  # Date difference (20%)
            abs(word_diff) / 100 * 0.5 +  # Word count difference (normalized) (50%)
            abs(section_diff) * 0.15 +  # Section difference (15%)
            abs(link_diff) / 10 * 0.15  # Link count difference (normalized) (15%)
        )

        # Round to 2 decimal places for display
        staleness_score = round(staleness_score, 2)

        # Compare sections between English and French pages
        section_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Group sections by their level for hierarchical comparison
        en_sections_by_level = {}
        fr_sections_by_level = {}

        # Organize English sections by level
        for section in en_page.get('section_titles', []):
            level = section['level']
            if level not in en_sections_by_level:
                en_sections_by_level[level] = []
            en_sections_by_level[level].append(section)

        # Organize French sections by level
        for section in fr_page.get('section_titles', []):
            level = section['level']
            if level not in fr_sections_by_level:
                fr_sections_by_level[level] = []
            fr_sections_by_level[level].append(section)

        # Process each level to find matching sections
        all_levels = set(list(en_sections_by_level.keys()) + list(fr_sections_by_level.keys()))

        for level in all_levels:
            en_level_sections = en_sections_by_level.get(level, [])
            fr_level_sections = fr_sections_by_level.get(level, [])

            # Create dictionaries for easier lookup, using lowercase titles
            en_dict = {section['title'].lower(): section for section in en_level_sections}
            fr_dict = {section['title'].lower(): section for section in fr_level_sections}

            # Find sections at this level only in English
            for title, section in en_dict.items():
                if title not in fr_dict:
                    section_comparison['en_only'].append(section)

            # Find sections at this level only in French
            for title, section in fr_dict.items():
                if title not in en_dict:
                    section_comparison['fr_only'].append(section)

            # Find common sections at this level
            for title in en_dict.keys():
                if title in fr_dict:
                    section_comparison['common'].append({
                        'en': en_dict[title],
                        'fr': fr_dict[title]
                    })

        # Compare links between English and French pages
        link_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract link texts for comparison (case insensitive)
        en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
        fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}

        # Find links only in English
        for text, link in en_links.items():
            if text not in fr_links:
                link_comparison['en_only'].append(link)

        # Find links only in French
        for text, link in fr_links.items():
            if text not in en_links:
                link_comparison['fr_only'].append(link)

        # Find common links
        for text in en_links.keys():
            if text in fr_links:
                link_comparison['common'].append({
                    'en': en_links[text],
                    'fr': fr_links[text]
                })

        # Compare media between English and French pages
        media_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract media alt texts for comparison (case insensitive)
        en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
        fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}

        # Find media only in English
        for alt, media in en_media.items():
            if alt not in fr_media:
                media_comparison['en_only'].append(media)

        # Find media only in French
        for alt, media in fr_media.items():
            if alt not in en_media:
                media_comparison['fr_only'].append(media)

        # Find common media
        for alt in en_media.keys():
            if alt in fr_media:
                media_comparison['common'].append({
                    'en': en_media[alt],
                    'fr': fr_media[alt]
                })

        # Add media without alt text to their respective language-only lists
        for media in en_page.get('media_details', []):
            if not media['alt'] or media['alt'].lower() not in en_media:
                media_comparison['en_only'].append(media)

        for media in fr_page.get('media_details', []):
            if not media['alt'] or media['alt'].lower() not in fr_media:
                media_comparison['fr_only'].append(media)

        # Compare categories between English and French pages
        category_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract categories for comparison (case insensitive)
        en_categories = [cat.lower() for cat in en_page.get('categories', [])]
        fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]

        # Find categories only in English
        for cat in en_page.get('categories', []):
            if cat.lower() not in fr_categories:
                category_comparison['en_only'].append(cat)

        # Find categories only in French
        for cat in fr_page.get('categories', []):
            if cat.lower() not in en_categories:
                category_comparison['fr_only'].append(cat)

        # Find common categories
        for cat in en_page.get('categories', []):
            if cat.lower() in fr_categories:
                category_comparison['common'].append(cat)

        if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
            reason = []
            if date_diff > 30:
                reason.append(f"La version Française est datée de {date_diff} jours")
            if word_diff > 200:
                reason.append(f"La version Anglaise a {word_diff} mots de plus")
            if section_diff > 2:
                reason.append(f"La version Anglaise a {section_diff} sections de plus")
            if link_diff > 20:
                reason.append(f"La version Anglaise a {link_diff} liens de plus")
            if media_diff > 5:
                reason.append(f"La version Anglaise a {media_diff} images de plus")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'media_diff': media_diff,
                'staleness_score': staleness_score,
                'priority': staleness_score,  # Use staleness score as priority
                'section_comparison': section_comparison,
                'link_comparison': link_comparison,
                'media_comparison': media_comparison,
                'category_comparison': category_comparison
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update

def main():
    """
    Main function to execute the script

    This function:
    1. Fetches the top OSM keys from TagInfo API
    2. Fetches and processes wiki pages for these keys
    3. Processes specific wiki pages listed in SPECIFIC_PAGES
    4. Processes pages from the FR:Traductions_désynchronisées category
    5. Calculates staleness scores for all pages
    6. Generates a histogram of staleness scores
    7. Saves the results to CSV and JSON files
    8. Prints a list of pages that need updating
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
    parser.add_argument('--no-grammar-check', action='store_true',
                        help='Disable grammar checking for French pages')
    args = parser.parse_args()

    # Whether to check grammar for French pages
    check_grammar = not args.no_grammar_check

    logger.info("Starting wiki_compare.py")
    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)

    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []

    # Process top keys
    logger.info("Processing top keys...")
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
        if fr_page:
            wiki_pages.append(fr_page)

    # Process specific pages from the SPECIFIC_PAGES list
    # These are additional pages to compare beyond the top keys from TagInfo
    logger.info("Processing specific pages...")
    for page in SPECIFIC_PAGES:
        # For specific pages, we need to handle different formats

        # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
        if page.startswith('http'):
            # For full URLs, we directly fetch the page
            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
            if page_info:
                wiki_pages.append(page_info)

                # If it's a French page, try to find the English equivalent
                if page_info['language'] == 'fr':
                    # Try to get the English version by removing FR: prefix
                    en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                    en_url = f"{WIKI_BASE_URL}{en_title}"
                    logger.info(f"Trying to find English equivalent for {page}: {en_url}")
                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                    if en_page:
                        wiki_pages.append(en_page)
                # If it's an English page, try to find the French equivalent
                else:
                    # Try to get the French version by adding FR: prefix
                    fr_title = f"FR:{page_info['page_title']}"
                    fr_url = f"{WIKI_BASE_URL}{fr_title}"
                    logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                    if fr_page:
                        wiki_pages.append(fr_page)

        # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
        elif page.startswith('FR:'):
            # Fetch the French page
            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
            if fr_page:
                wiki_pages.append(fr_page)

                # Try to get the English version by removing FR: prefix
                en_title = page[3:]  # Remove FR: prefix
                en_url = f"{WIKI_BASE_URL}{en_title}"
                logger.info(f"Trying to find English equivalent for {page}: {en_url}")
                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                if en_page:
                    wiki_pages.append(en_page)

        # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
        else:
            # Fetch the English page
            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
            if en_page:
                wiki_pages.append(en_page)

            # Fetch the French page (by adding FR: prefix)
            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
            if fr_page:
                wiki_pages.append(fr_page)

    # Process pages from the FR:Traductions_désynchronisées category
    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
    desynchronized_pages = fetch_desynchronized_pages()
    for page_url in desynchronized_pages:
        # Fetch the French page
        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
        if fr_page:
            wiki_pages.append(fr_page)

            # Try to find the English equivalent
            if fr_page['page_title'].startswith('FR:'):
                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
            else:
                en_title = fr_page['page_title']

            en_url = f"{WIKI_BASE_URL}{en_title}"
            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
            if en_page:
                wiki_pages.append(en_page)

    # Process wiki pages to add staleness score
    processed_wiki_pages = []
    pages_by_key = {}

    # Group pages by key
    for page in wiki_pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Calculate staleness score for each pair of pages
    for key, lang_pages in pages_by_key.items():
        # Add English page with staleness score
        if 'en' in lang_pages:
            en_page = lang_pages['en'].copy()

            # If French page exists, calculate staleness score
            if 'fr' in lang_pages:
                fr_page = lang_pages['fr']

                # Skip if dates are missing
                if en_page['last_modified'] and fr_page['last_modified']:
                    # Calculate date difference in days
                    en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
                    fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
                    date_diff = (en_date - fr_date).days

                    # Calculate content differences
                    word_diff = en_page['word_count'] - fr_page['word_count']
                    section_diff = en_page['sections'] - fr_page['sections']
                    link_diff = en_page['link_count'] - fr_page['link_count']

                    # Calculate staleness score
                    staleness_score = (
                        abs(date_diff) * 0.2 +
                        abs(word_diff) / 100 * 0.5 +
                        abs(section_diff) * 0.15 +
                        abs(link_diff) / 10 * 0.15
                    )

                    # Round to 2 decimal places
                    staleness_score = round(staleness_score, 2)

                    en_page['staleness_score'] = staleness_score
                    fr_page['staleness_score'] = staleness_score
                else:
                    en_page['staleness_score'] = 0
                    fr_page['staleness_score'] = 0

                processed_wiki_pages.append(en_page)
                processed_wiki_pages.append(fr_page)
            else:
                # French page is missing, calculate a high staleness score
                missing_staleness_score = (
                    30 * 0.2 +
                    en_page['word_count'] / 100 * 0.5 +
                    en_page['sections'] * 0.15 +
                    en_page['link_count'] / 10 * 0.15
                )

                # Round to 2 decimal places and ensure it's high
                missing_staleness_score = max(100, round(missing_staleness_score, 2))

                en_page['staleness_score'] = missing_staleness_score
                processed_wiki_pages.append(en_page)

        # Add French page without English counterpart (rare case)
        elif 'fr' in lang_pages:
            fr_page = lang_pages['fr'].copy()
            fr_page['staleness_score'] = 0
            processed_wiki_pages.append(fr_page)

    # Generate histogram of staleness scores
    generate_staleness_histogram(processed_wiki_pages)

    # Save processed wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            # Basic fields for CSV (detailed content will be in JSON only)
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score', 'description_img_url']
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()
            for page in processed_wiki_pages:
                if page:  # Skip None values
                    # Create a copy with only the CSV fields
                    csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
                    writer.writerow(csv_page)

        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")

    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Separate regular pages and specific pages
    regular_pages = []
    specific_pages = []

    for page in pages_to_update:
        # Check if either English or French page is marked as specific
        is_specific = False
        if page['en_page'] and page['en_page'].get('is_specific_page', False):
            is_specific = True
        elif page['fr_page'] and page['fr_page'].get('is_specific_page', False):
            is_specific = True

        if is_specific:
            specific_pages.append(page)
        else:
            regular_pages.append(page)

    # Create a structured output with separate sections
    output_data = {
        "regular_pages": regular_pages,
        "specific_pages": specific_pages,
        "last_updated": datetime.now().isoformat()
    }

    # Save pages that need updating to JSON with history
    save_with_history(output_data, OUTDATED_PAGES_FILE)

    # Print the top pages needing updates
    print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

    for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()