add wiki freshness comparison English/French
parent 0aaddb44c5
commit 83d1972589
12 changed files with 1332 additions and 0 deletions
348
wiki_compare/wiki_compare.py
Executable file
@@ -0,0 +1,348 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_compare.py

This script fetches the 10 most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the 10 most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - A console output listing the 10 wiki pages that need updating
"""

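# Example of the console summary this script prints (illustrative values only;
# real keys, dates and URLs depend on live TagInfo and wiki data):
#
#   1. Key: building
#      Reason: French page outdated by 90 days
#      English: https://wiki.openstreetmap.org/wiki/Key:building
#      French: https://wiki.openstreetmap.org/wiki/FR:Key:building
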
import json
import csv
import requests
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"

def fetch_top_keys(limit=50):
    """
    Fetch the most used OSM keys from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []
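
# The TagInfo /api/4/keys/all response is assumed to look roughly like
# {"data": [{"key": "building", "count_all": 123456789, ...}, ...]} (counts are
# illustrative), so each entry written to top_keys.json comes out as
# {"key": "building", "count": 123456789}.
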
def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def fetch_wiki_page(key, language='en'):
    """
    Fetch wiki page for a given key

    Args:
        key (str): OSM key
        language (str): Language code ('en' or 'fr')

    Returns:
        dict: Dictionary with page information or None if page doesn't exist
    """
    base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
    url = f"{base_url}{key}"

    logger.info(f"Fetching {language} wiki page for key '{key}': {url}")

    try:
        response = requests.get(url)

        # Check if page exists
        if response.status_code == 404:
            logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
            return None

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get last modification date
        last_modified = None
        footer_info = soup.select_one('#footer-info-lastmod')
        if footer_info:
            date_text = footer_info.text
            # Extract date using regex
            date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
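            # On the OSM wiki this footer normally reads something like
            # "This page was last edited on 12 June 2025, at 14:03." (date shown
            # here is illustrative, and the wording can differ on FR pages);
            # the regex above captures only the day-month-year portion.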
            if date_match:
                date_str = date_match.group(1)
                try:
                    # Parse date (format may vary based on wiki language)
                    last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                except ValueError:
                    logger.warning(f"Could not parse date: {date_str}")

        # Count sections (h2, h3, h4)
        sections = len(soup.select('h2, h3, h4'))

        # Count words in the content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())

            # Count links
            links = content.select('a')
            link_count = len(links)
        else:
            word_count = 0
            link_count = 0

        return {
            'key': key,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'word_count': word_count,
            'link_count': link_count
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
        return None

def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []

    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # French page is missing
                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'priority': 100  # High priority for missing pages
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate date difference in days
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']

        # Calculate priority score (higher means needs more urgent update)
        # Weight factors can be adjusted
        priority = (
            abs(date_diff) * 0.4 +         # Date difference
            abs(word_diff) / 100 * 0.25 +  # Word count difference (normalized)
            abs(section_diff) * 0.2 +      # Section difference
            abs(link_diff) / 10 * 0.15     # Link count difference (normalized)
        )
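        # Illustrative example with made-up numbers: date_diff=90, word_diff=500,
        # section_diff=3, link_diff=40 gives a priority of
        # 90*0.4 + 500/100*0.25 + 3*0.2 + 40/10*0.15 = 36 + 1.25 + 0.6 + 0.6 = 38.45.
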
        if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
            reason = []
            if date_diff > 30:
                reason.append(f"French page outdated by {date_diff} days")
            if word_diff > 200:
                reason.append(f"English page has {word_diff} more words")
            if section_diff > 2:
                reason.append(f"English page has {section_diff} more sections")
            if link_diff > 20:
                reason.append(f"English page has {link_diff} more links")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} of English content")

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'priority': priority
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update
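
# Each entry in outdated_pages.json therefore carries the key, a human-readable
# reason, the full en_page/fr_page dictionaries, the raw date/word/section/link
# differences and the computed priority score.
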
def main():
    """Main function to execute the script"""
    logger.info("Starting wiki_compare.py")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(10)

    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []

    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr')
        if fr_page:
            wiki_pages.append(fr_page)

    # Save wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
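            # A typical row would then look like this (values are illustrative):
            # building,en,https://wiki.openstreetmap.org/wiki/Key:building,2025-06-12,14,2500,320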
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()
            for page in wiki_pages:
                if page:  # Skip None values
                    writer.writerow(page)

        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")

    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Save pages that need updating to JSON
    save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

    # Print the top 10 pages needing updates
    print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")

    for i, page in enumerate(pages_to_update[:10], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()