osm-labo/wiki_compare/wiki_compare.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_compare.py

This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - A console output listing the wiki pages that need updating
"""

import json
import csv
import requests
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 100

def fetch_top_keys(limit=NUM_WIKI_PAGES):
    """
    Fetch the most used OSM keys from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def fetch_wiki_page(key, language='en'):
    """
    Fetch wiki page for a given key

    Args:
        key (str): OSM key
        language (str): Language code ('en' or 'fr')

    Returns:
        dict: Dictionary with page information or None if page doesn't exist
    """
    base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
    url = f"{base_url}{key}"

    logger.info(f"Fetching {language} wiki page for key '{key}': {url}")

    try:
        response = requests.get(url)

        # Check if page exists
        if response.status_code == 404:
            logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
            return None

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get last modification date
        last_modified = None
        footer_info = soup.select_one('#footer-info-lastmod')
        if footer_info:
            date_text = footer_info.text
            # Extract date using regex
            date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
            if date_match:
                date_str = date_match.group(1)
                try:
                    # Parse date (format may vary based on wiki language)
                    last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                except ValueError:
                    logger.warning(f"Could not parse date: {date_str}")

        # Extract sections (h2, h3, h4)
        section_elements = soup.select('h2, h3, h4')
        sections = len(section_elements)

        # Extract section titles
        section_titles = []
        for section_elem in section_elements:
            # Skip sections that are part of the table of contents, navigation, or DescriptionBox
            if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
                continue

            # Skip sections that are inside a table with class DescriptionBox
            if section_elem.find_parent('table', class_='DescriptionBox'):
                continue

            # Get the text of the section title, removing any edit links
            for edit_link in section_elem.select('.mw-editsection'):
                edit_link.extract()

            section_title = section_elem.get_text(strip=True)
            section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4

            section_titles.append({
                'title': section_title,
                'level': section_level
            })

        # Count words in the content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Remove .languages elements
            for languages_elem in content.select('.languages'):
                languages_elem.extract()

            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())

            # Extract links
            links = content.select('a')
            link_count = len(links)

            # Get link details (text and href)
            link_details = []
            for link in links:
                href = link.get('href', '')
                # Skip edit section links and other non-content links
                if 'action=edit' in href or 'redlink=1' in href or not href:
                    continue

                # Make relative URLs absolute
                if href.startswith('/'):
                    href = 'https://wiki.openstreetmap.org' + href

                link_text = link.get_text(strip=True)
                if link_text:  # Only include links with text
                    link_details.append({
                        'text': link_text,
                        'href': href
                    })

            # Extract media (images)
            media_elements = content.select('img')
            media_count = len(media_elements)

            # Get media details (src and alt text)
            media_details = []
            for img in media_elements:
                src = img.get('src', '')
                if src:
                    # Make relative URLs absolute
                    if src.startswith('//'):
                        src = 'https:' + src
                    elif src.startswith('/'):
                        src = 'https://wiki.openstreetmap.org' + src

                    alt_text = img.get('alt', '')
                    media_details.append({
                        'src': src,
                        'alt': alt_text
                    })

            # Extract categories
            categories = []
            category_links = soup.select('#mw-normal-catlinks li a')
            for cat_link in category_links:
                categories.append(cat_link.get_text(strip=True))
        else:
            word_count = 0
            link_count = 0
            link_details = []
            media_count = 0
            media_details = []
            categories = []

        return {
            'key': key,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'section_titles': section_titles,
            'word_count': word_count,
            'link_count': link_count,
            'link_details': link_details,
            'media_count': media_count,
            'media_details': media_details,
            'categories': categories
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
        return None

def generate_staleness_histogram(wiki_pages):
    """
    Generate a histogram of staleness scores by 10% ranges

    Args:
        wiki_pages (list): List of dictionaries containing page information with staleness scores

    Returns:
        None: Saves the histogram to a file
    """
    logger.info("Generating histogram of staleness scores by 10% ranges...")

    # Extract staleness scores
    staleness_scores = []
    for page in wiki_pages:
        if page and 'staleness_score' in page:
            staleness_scores.append(page['staleness_score'])

    if not staleness_scores:
        logger.warning("No staleness scores found. Cannot generate histogram.")
        return

    # Determine the maximum score for binning
    max_score = max(staleness_scores)
    # Round up to the nearest 10 to ensure all scores are included
    max_bin_edge = np.ceil(max_score / 10) * 10

    # Create bins for 10% ranges
    bins = np.arange(0, max_bin_edge + 10, 10)

    # Count scores in each bin
    hist, bin_edges = np.histogram(staleness_scores, bins=bins)

    # Create histogram
    plt.figure(figsize=(12, 6))

    # Create bar chart
    plt.bar(range(len(hist)), hist, align='center')

    # Set x-axis labels for each bin
    bin_labels = [f"{int(bin_edges[i])}-{int(bin_edges[i+1])}%" for i in range(len(bin_edges)-1)]
    plt.xticks(range(len(hist)), bin_labels, rotation=45)

    # Set labels and title
    plt.xlabel('Tranches de score de décrépitude (en %)')
    plt.ylabel('Nombre de pages')
    plt.title('Répartition du score de décrépitude par tranches de 10%')

    # Add grid for better readability
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Adjust layout
    plt.tight_layout()

    # Save figure
    plt.savefig(STALENESS_HISTOGRAM_FILE)
    logger.info(f"Histogram saved to {STALENESS_HISTOGRAM_FILE}")

    # Close the figure to free memory
    plt.close()

def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []

    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # French page is missing
                # For missing French pages, calculate a high staleness score
                # Use word count as the main factor (50% weight)
                missing_staleness_score = (
                    30 * 0.2 +  # Assume 30 days outdated (20%)
                    lang_pages['en']['word_count'] / 100 * 0.5 +  # Word count (50%)
                    lang_pages['en']['sections'] * 0.15 +  # Sections (15%)
                    lang_pages['en']['link_count'] / 10 * 0.15  # Links (15%)
                )

                # Round to 2 decimal places and ensure it's high
                missing_staleness_score = max(100, round(missing_staleness_score, 2))

                # Get media count or default to 0
                media_count = lang_pages['en'].get('media_count', 0)

                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'media_diff': media_count,
                    'staleness_score': missing_staleness_score,
                    'priority': missing_staleness_score,  # Use staleness score as priority
                    'section_comparison': None,  # No comparison possible
                    'link_comparison': None,     # No comparison possible
                    'media_comparison': None,    # No comparison possible
                    'category_comparison': None  # No comparison possible
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate date difference in days
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']
        media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)

        # Calculate staleness score (higher means more outdated/stale)
        # Weight factors adjusted to emphasize word count differences
        staleness_score = (
            abs(date_diff) * 0.2 +  # Date difference (20%)
            abs(word_diff) / 100 * 0.5 +  # Word count difference (normalized) (50%)
            abs(section_diff) * 0.15 +  # Section difference (15%)
            abs(link_diff) / 10 * 0.15  # Link count difference (normalized) (15%)
        )

        # Round to 2 decimal places for display
        staleness_score = round(staleness_score, 2)

        # Compare sections between English and French pages
        section_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract section titles for comparison
        en_sections = {section['title'].lower(): section for section in en_page.get('section_titles', [])}
        fr_sections = {section['title'].lower(): section for section in fr_page.get('section_titles', [])}

        # Find sections only in English
        for title, section in en_sections.items():
            if title not in fr_sections:
                section_comparison['en_only'].append(section)

        # Find sections only in French
        for title, section in fr_sections.items():
            if title not in en_sections:
                section_comparison['fr_only'].append(section)

        # Find common sections
        for title in en_sections.keys():
            if title in fr_sections:
                section_comparison['common'].append({
                    'en': en_sections[title],
                    'fr': fr_sections[title]
                })

        # Compare links between English and French pages
        link_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract link texts for comparison (case insensitive)
        en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
        fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}

        # Find links only in English
        for text, link in en_links.items():
            if text not in fr_links:
                link_comparison['en_only'].append(link)

        # Find links only in French
        for text, link in fr_links.items():
            if text not in en_links:
                link_comparison['fr_only'].append(link)

        # Find common links
        for text in en_links.keys():
            if text in fr_links:
                link_comparison['common'].append({
                    'en': en_links[text],
                    'fr': fr_links[text]
                })

        # Compare media between English and French pages
        media_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract media alt texts for comparison (case insensitive)
        en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
        fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}

        # Find media only in English
        for alt, media in en_media.items():
            if alt not in fr_media:
                media_comparison['en_only'].append(media)

        # Find media only in French
        for alt, media in fr_media.items():
            if alt not in en_media:
                media_comparison['fr_only'].append(media)

        # Find common media
        for alt in en_media.keys():
            if alt in fr_media:
                media_comparison['common'].append({
                    'en': en_media[alt],
                    'fr': fr_media[alt]
                })

        # Add media without alt text to their respective language-only lists
        for media in en_page.get('media_details', []):
            if not media['alt'] or media['alt'].lower() not in en_media:
                media_comparison['en_only'].append(media)

        for media in fr_page.get('media_details', []):
            if not media['alt'] or media['alt'].lower() not in fr_media:
                media_comparison['fr_only'].append(media)

        # Compare categories between English and French pages
        category_comparison = {
            'en_only': [],
            'fr_only': [],
            'common': []
        }

        # Extract categories for comparison (case insensitive)
        en_categories = [cat.lower() for cat in en_page.get('categories', [])]
        fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]

        # Find categories only in English
        for cat in en_page.get('categories', []):
            if cat.lower() not in fr_categories:
                category_comparison['en_only'].append(cat)

        # Find categories only in French
        for cat in fr_page.get('categories', []):
            if cat.lower() not in en_categories:
                category_comparison['fr_only'].append(cat)

        # Find common categories
        for cat in en_page.get('categories', []):
            if cat.lower() in fr_categories:
                category_comparison['common'].append(cat)

        if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
            reason = []
            if date_diff > 30:
                reason.append(f"La version Française est datée de {date_diff} jours")
            if word_diff > 200:
                reason.append(f"La version Anglaise a {word_diff} plus de mots")
            if section_diff > 2:
                reason.append(f"La version Anglaise a {section_diff} plus de sections")
            if link_diff > 20:
                reason.append(f"La version Anglaise a {link_diff} plus de liens")
            if media_diff > 5:
                reason.append(f"La version Anglaise a {media_diff} plus d'images")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'media_diff': media_diff,
                'staleness_score': staleness_score,
                'priority': staleness_score,  # Use staleness score as priority
                'section_comparison': section_comparison,
                'link_comparison': link_comparison,
                'media_comparison': media_comparison,
                'category_comparison': category_comparison
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update

def main():
    """Main function to execute the script"""
    logger.info("Starting wiki_compare.py")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)

    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []

    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr')
        if fr_page:
            wiki_pages.append(fr_page)

    # Process wiki pages to add staleness score
    processed_wiki_pages = []
    pages_by_key = {}

    # Group pages by key
    for page in wiki_pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Calculate staleness score for each pair of pages
    for key, lang_pages in pages_by_key.items():
        # Add English page with staleness score
        if 'en' in lang_pages:
            en_page = lang_pages['en'].copy()

            # If French page exists, calculate staleness score
            if 'fr' in lang_pages:
                fr_page = lang_pages['fr']

                # Skip if dates are missing
                if en_page['last_modified'] and fr_page['last_modified']:
                    # Calculate date difference in days
                    en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
                    fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
                    date_diff = (en_date - fr_date).days

                    # Calculate content differences
                    word_diff = en_page['word_count'] - fr_page['word_count']
                    section_diff = en_page['sections'] - fr_page['sections']
                    link_diff = en_page['link_count'] - fr_page['link_count']

                    # Calculate staleness score
                    staleness_score = (
                        abs(date_diff) * 0.2 +
                        abs(word_diff) / 100 * 0.5 +
                        abs(section_diff) * 0.15 +
                        abs(link_diff) / 10 * 0.15
                    )

                    # Round to 2 decimal places
                    staleness_score = round(staleness_score, 2)

                    en_page['staleness_score'] = staleness_score
                    fr_page['staleness_score'] = staleness_score
                else:
                    en_page['staleness_score'] = 0
                    fr_page['staleness_score'] = 0

                processed_wiki_pages.append(en_page)
                processed_wiki_pages.append(fr_page)
            else:
                # French page is missing, calculate a high staleness score
                missing_staleness_score = (
                    30 * 0.2 +
                    en_page['word_count'] / 100 * 0.5 +
                    en_page['sections'] * 0.15 +
                    en_page['link_count'] / 10 * 0.15
                )

                # Round to 2 decimal places and ensure it's high
                missing_staleness_score = max(100, round(missing_staleness_score, 2))

                en_page['staleness_score'] = missing_staleness_score
                processed_wiki_pages.append(en_page)

        # Add French page without English counterpart (rare case)
        elif 'fr' in lang_pages:
            fr_page = lang_pages['fr'].copy()
            fr_page['staleness_score'] = 0
            processed_wiki_pages.append(fr_page)

    # Generate histogram of staleness scores
    generate_staleness_histogram(processed_wiki_pages)

    # Save processed wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            # Basic fields for CSV (detailed content will be in JSON only)
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score']
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()
            for page in processed_wiki_pages:
                if page:  # Skip None values
                    # Create a copy with only the CSV fields
                    csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
                    writer.writerow(csv_page)

        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")

    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Save pages that need updating to JSON
    save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

    # Print the top pages needing updates
    print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

    for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()