add wiki compare

parent 692e609a46
commit 38fbc451f5

9 changed files with 81151 additions and 126 deletions
wiki_compare.py

@@ -4,7 +4,7 @@
"""
wiki_compare.py

- This script fetches the 10 most used OpenStreetMap keys from TagInfo,
+ This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.
@@ -12,10 +12,10 @@ Usage:
python wiki_compare.py

Output:
- - top_keys.json: JSON file containing the 10 most used OSM keys
+ - top_keys.json: JSON file containing the most used OSM keys
- wiki_pages.csv: CSV file with information about each wiki page
- outdated_pages.json: JSON file containing pages that need updating
- - A console output listing the 10 wiki pages that need updating
+ - A console output listing the wiki pages that need updating
"""

import json
@@ -42,8 +42,10 @@ WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
+ # Number of wiki pages to examine
+ NUM_WIKI_PAGES = 20

- def fetch_top_keys(limit=50):
+ def fetch_top_keys(limit=NUM_WIKI_PAGES):
"""
Fetch the most used OSM keys from TagInfo API
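Note: the body of fetch_top_keys() is outside this diff. A rough sketch of the TagInfo call presumably behind it is given below; the endpoint, parameter names and response fields are assumptions about the public TagInfo API, not code from this commit.

import requests

TAGINFO_KEYS_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"  # assumed endpoint

def fetch_top_keys_sketch(limit=20):
    """Return the `limit` most used OSM keys, most used first (illustrative only)."""
    params = {
        "page": 1,
        "rp": limit,              # results per page (assumed parameter name)
        "sortname": "count_all",  # sort by total number of objects using the key
        "sortorder": "desc",
    }
    response = requests.get(TAGINFO_KEYS_URL, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    return [{"key": item["key"], "count": item["count_all"]} for item in data.get("data", [])]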
@@ -135,8 +137,28 @@ def fetch_wiki_page(key, language='en'):
except ValueError:
logger.warning(f"Could not parse date: {date_str}")

- # Count sections (h2, h3, h4)
- sections = len(soup.select('h2, h3, h4'))
+ # Extract sections (h2, h3, h4)
+ section_elements = soup.select('h2, h3, h4')
+ sections = len(section_elements)
+
+ # Extract section titles
+ section_titles = []
+ for section_elem in section_elements:
+ # Skip sections that are part of the table of contents or navigation
+ if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
+ continue
+
+ # Get the text of the section title, removing any edit links
+ for edit_link in section_elem.select('.mw-editsection'):
+ edit_link.extract()
+
+ section_title = section_elem.get_text(strip=True)
+ section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
+
+ section_titles.append({
+ 'title': section_title,
+ 'level': section_level
+ })

# Count words in the content
content = soup.select_one('#mw-content-text')
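For reference, the new section-title extraction can be exercised on a tiny, made-up HTML snippet; this is only an illustration using the BeautifulSoup calls the script already relies on.

from bs4 import BeautifulSoup

html = '<h2>Values<span class="mw-editsection">[edit]</span></h2><h3>Notes</h3>'
soup = BeautifulSoup(html, 'html.parser')
titles = []
for elem in soup.select('h2, h3, h4'):
    for edit_link in elem.select('.mw-editsection'):
        edit_link.extract()                       # drop the "[edit]" widget text
    titles.append({'title': elem.get_text(strip=True),
                   'level': int(elem.name[1])})   # h2 -> 2, h3 -> 3
print(titles)  # [{'title': 'Values', 'level': 2}, {'title': 'Notes', 'level': 3}]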
@@ -149,12 +171,55 @@ def fetch_wiki_page(key, language='en'):
text = content.get_text(separator=' ', strip=True)
word_count = len(text.split())

- # Count links
+ # Extract links
links = content.select('a')
link_count = len(links)

+ # Get link details (text and href)
+ link_details = []
+ for link in links:
+ href = link.get('href', '')
+ # Skip edit section links and other non-content links
+ if 'action=edit' in href or 'redlink=1' in href or not href:
+ continue
+
+ # Make relative URLs absolute
+ if href.startswith('/'):
+ href = 'https://wiki.openstreetmap.org' + href
+
+ link_text = link.get_text(strip=True)
+ if link_text: # Only include links with text
+ link_details.append({
+ 'text': link_text,
+ 'href': href
+ })
+
+ # Extract media (images)
+ media_elements = content.select('img')
+ media_count = len(media_elements)
+
+ # Get media details (src and alt text)
+ media_details = []
+ for img in media_elements:
+ src = img.get('src', '')
+ if src:
+ # Make relative URLs absolute
+ if src.startswith('//'):
+ src = 'https:' + src
+ elif src.startswith('/'):
+ src = 'https://wiki.openstreetmap.org' + src
+
+ alt_text = img.get('alt', '')
+ media_details.append({
+ 'src': src,
+ 'alt': alt_text
+ })
else:
word_count = 0
link_count = 0
+ link_details = []
+ media_count = 0
+ media_details = []

return {
'key': key,
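A quick check of the URL normalisation rules applied to links and images above, using made-up values:

src = '//upload.wikimedia.org/osm/example.png'   # hypothetical protocol-relative src
href = '/wiki/FR:Key:building'                   # hypothetical wiki-relative href

if src.startswith('//'):
    src = 'https:' + src
elif src.startswith('/'):
    src = 'https://wiki.openstreetmap.org' + src

if href.startswith('/'):
    href = 'https://wiki.openstreetmap.org' + href

print(src)   # https://upload.wikimedia.org/osm/example.png
print(href)  # https://wiki.openstreetmap.org/wiki/FR:Key:building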
@@ -162,8 +227,12 @@ def fetch_wiki_page(key, language='en'):
'url': url,
'last_modified': last_modified,
'sections': sections,
+ 'section_titles': section_titles,
'word_count': word_count,
- 'link_count': link_count
+ 'link_count': link_count,
+ 'link_details': link_details,
+ 'media_count': media_count,
+ 'media_details': media_details
}

except requests.exceptions.RequestException as e:
@@ -202,6 +271,21 @@ def analyze_wiki_pages(pages):
if 'en' not in lang_pages or 'fr' not in lang_pages:
if 'en' in lang_pages:
# French page is missing
+ # For missing French pages, calculate a high staleness score
+ # Use word count as the main factor (50% weight)
+ missing_staleness_score = (
+ 30 * 0.2 + # Assume 30 days outdated (20%)
+ lang_pages['en']['word_count'] / 100 * 0.5 + # Word count (50%)
+ lang_pages['en']['sections'] * 0.15 + # Sections (15%)
+ lang_pages['en']['link_count'] / 10 * 0.15 # Links (15%)
+ )
+
+ # Round to 2 decimal places and ensure it's high
+ missing_staleness_score = max(100, round(missing_staleness_score, 2))
+
+ # Get media count or default to 0
+ media_count = lang_pages['en'].get('media_count', 0)
+
needs_update.append({
'key': key,
'reason': 'French page missing',
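To make the weighting concrete, here is the missing-page score worked through with made-up numbers, assuming an English page of 2000 words, 12 sections and 150 links:

score = 30 * 0.2 + 2000 / 100 * 0.5 + 12 * 0.15 + 150 / 10 * 0.15
print(round(score, 2))             # 20.05 = 6 + 10 + 1.8 + 2.25
print(max(100, round(score, 2)))   # 100 -- the max() clamp keeps missing pages at the top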
@@ -211,7 +295,12 @@ def analyze_wiki_pages(pages):
'word_diff': lang_pages['en']['word_count'],
'section_diff': lang_pages['en']['sections'],
'link_diff': lang_pages['en']['link_count'],
- 'priority': 100 # High priority for missing pages
+ 'media_diff': media_count,
+ 'staleness_score': missing_staleness_score,
+ 'priority': missing_staleness_score, # Use staleness score as priority
+ 'section_comparison': None, # No comparison possible
+ 'link_comparison': None, # No comparison possible
+ 'media_comparison': None # No comparison possible
})
continue
@@ -231,28 +320,130 @@ def analyze_wiki_pages(pages):
word_diff = en_page['word_count'] - fr_page['word_count']
section_diff = en_page['sections'] - fr_page['sections']
link_diff = en_page['link_count'] - fr_page['link_count']
+ media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)

- # Calculate priority score (higher means needs more urgent update)
- # Weight factors can be adjusted
- priority = (
- abs(date_diff) * 0.4 + # Date difference
- abs(word_diff) / 100 * 0.25 + # Word count difference (normalized)
- abs(section_diff) * 0.2 + # Section difference
- abs(link_diff) / 10 * 0.15 # Link count difference (normalized)
+ # Calculate staleness score (higher means more outdated/stale)
+ # Weight factors adjusted to emphasize word count differences
+ staleness_score = (
+ abs(date_diff) * 0.2 + # Date difference (20%)
+ abs(word_diff) / 100 * 0.5 + # Word count difference (normalized) (50%)
+ abs(section_diff) * 0.15 + # Section difference (15%)
+ abs(link_diff) / 10 * 0.15 # Link count difference (normalized) (15%)
)

+ # Round to 2 decimal places for display
+ staleness_score = round(staleness_score, 2)
+
+ # Compare sections between English and French pages
+ section_comparison = {
+ 'en_only': [],
+ 'fr_only': [],
+ 'common': []
+ }
+
+ # Extract section titles for comparison
+ en_sections = {section['title'].lower(): section for section in en_page.get('section_titles', [])}
+ fr_sections = {section['title'].lower(): section for section in fr_page.get('section_titles', [])}
+
+ # Find sections only in English
+ for title, section in en_sections.items():
+ if title not in fr_sections:
+ section_comparison['en_only'].append(section)
+
+ # Find sections only in French
+ for title, section in fr_sections.items():
+ if title not in en_sections:
+ section_comparison['fr_only'].append(section)
+
+ # Find common sections
+ for title in en_sections.keys():
+ if title in fr_sections:
+ section_comparison['common'].append({
+ 'en': en_sections[title],
+ 'fr': fr_sections[title]
+ })
+
+ # Compare links between English and French pages
+ link_comparison = {
+ 'en_only': [],
+ 'fr_only': [],
+ 'common': []
+ }
+
+ # Extract link texts for comparison (case insensitive)
+ en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
+ fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}
+
+ # Find links only in English
+ for text, link in en_links.items():
+ if text not in fr_links:
+ link_comparison['en_only'].append(link)
+
+ # Find links only in French
+ for text, link in fr_links.items():
+ if text not in en_links:
+ link_comparison['fr_only'].append(link)
+
+ # Find common links
+ for text in en_links.keys():
+ if text in fr_links:
+ link_comparison['common'].append({
+ 'en': en_links[text],
+ 'fr': fr_links[text]
+ })
+
+ # Compare media between English and French pages
+ media_comparison = {
+ 'en_only': [],
+ 'fr_only': [],
+ 'common': []
+ }
+
+ # Extract media alt texts for comparison (case insensitive)
+ en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
+ fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}
+
+ # Find media only in English
+ for alt, media in en_media.items():
+ if alt not in fr_media:
+ media_comparison['en_only'].append(media)
+
+ # Find media only in French
+ for alt, media in fr_media.items():
+ if alt not in en_media:
+ media_comparison['fr_only'].append(media)
+
+ # Find common media
+ for alt in en_media.keys():
+ if alt in fr_media:
+ media_comparison['common'].append({
+ 'en': en_media[alt],
+ 'fr': fr_media[alt]
+ })
+
+ # Add media without alt text to their respective language-only lists
+ for media in en_page.get('media_details', []):
+ if not media['alt'] or media['alt'].lower() not in en_media:
+ media_comparison['en_only'].append(media)
+
+ for media in fr_page.get('media_details', []):
+ if not media['alt'] or media['alt'].lower() not in fr_media:
+ media_comparison['fr_only'].append(media)
+
if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
reason = []
if date_diff > 30:
- reason.append(f"French page outdated by {date_diff} days")
+ reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
- reason.append(f"English page has {word_diff} more words")
+ reason.append(f"La version Anglaise a {word_diff} plus de mots")
if section_diff > 2:
- reason.append(f"English page has {section_diff} more sections")
+ reason.append(f"La version Anglaise a {section_diff} plus de sections")
if link_diff > 20:
- reason.append(f"English page has {link_diff} more links")
+ reason.append(f"La version Anglaise a {link_diff} plus de liens")
+ if media_diff > 5:
+ reason.append(f"La version Anglaise a {media_diff} plus d'images")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
- reason.append(f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} of English content")
+ reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")

needs_update.append({
'key': key,
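For a sense of the revised weighting, the same staleness formula evaluated with made-up numbers:

# Made-up example: French page 60 days older, 800 words shorter,
# 3 sections and 40 links behind the English page.
staleness_score = (
    abs(60) * 0.2 +         # 12.0  date difference (20%)
    abs(800) / 100 * 0.5 +  #  4.0  word difference (50%)
    abs(3) * 0.15 +         #  0.45 section difference (15%)
    abs(40) / 10 * 0.15     #  0.6  link difference (15%)
)
print(round(staleness_score, 2))  # 17.05
# The old 0.4/0.25/0.2/0.15 weights would give 24 + 2 + 0.6 + 0.6 = 27.2 for the
# same inputs, dominated by the date gap.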
@@ -263,7 +454,12 @@ def analyze_wiki_pages(pages):
'word_diff': word_diff,
'section_diff': section_diff,
'link_diff': link_diff,
- 'priority': priority
+ 'media_diff': media_diff,
+ 'staleness_score': staleness_score,
+ 'priority': staleness_score, # Use staleness score as priority
+ 'section_comparison': section_comparison,
+ 'link_comparison': link_comparison,
+ 'media_comparison': media_comparison
})

# Sort by priority (descending)
@@ -279,7 +475,7 @@ def main():
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

# Fetch top keys
- top_keys = fetch_top_keys(10)
+ top_keys = fetch_top_keys(NUM_WIKI_PAGES)

if not top_keys:
logger.error("Failed to fetch top keys. Exiting.")
@@ -304,16 +500,96 @@ def main():
if fr_page:
wiki_pages.append(fr_page)

- # Save wiki pages to CSV
+ # Process wiki pages to add staleness score
+ processed_wiki_pages = []
+ pages_by_key = {}
+
+ # Group pages by key
+ for page in wiki_pages:
+ if page is None:
+ continue
+
+ key = page['key']
+ if key not in pages_by_key:
+ pages_by_key[key] = {}
+
+ pages_by_key[key][page['language']] = page
+
+ # Calculate staleness score for each pair of pages
+ for key, lang_pages in pages_by_key.items():
+ # Add English page with staleness score
+ if 'en' in lang_pages:
+ en_page = lang_pages['en'].copy()
+
+ # If French page exists, calculate staleness score
+ if 'fr' in lang_pages:
+ fr_page = lang_pages['fr']
+
+ # Skip if dates are missing
+ if en_page['last_modified'] and fr_page['last_modified']:
+ # Calculate date difference in days
+ en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
+ fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
+ date_diff = (en_date - fr_date).days
+
+ # Calculate content differences
+ word_diff = en_page['word_count'] - fr_page['word_count']
+ section_diff = en_page['sections'] - fr_page['sections']
+ link_diff = en_page['link_count'] - fr_page['link_count']
+
+ # Calculate staleness score
+ staleness_score = (
+ abs(date_diff) * 0.2 +
+ abs(word_diff) / 100 * 0.5 +
+ abs(section_diff) * 0.15 +
+ abs(link_diff) / 10 * 0.15
+ )
+
+ # Round to 2 decimal places
+ staleness_score = round(staleness_score, 2)
+
+ en_page['staleness_score'] = staleness_score
+ fr_page['staleness_score'] = staleness_score
+ else:
+ en_page['staleness_score'] = 0
+ fr_page['staleness_score'] = 0
+
+ processed_wiki_pages.append(en_page)
+ processed_wiki_pages.append(fr_page)
+ else:
+ # French page is missing, calculate a high staleness score
+ missing_staleness_score = (
+ 30 * 0.2 +
+ en_page['word_count'] / 100 * 0.5 +
+ en_page['sections'] * 0.15 +
+ en_page['link_count'] / 10 * 0.15
+ )
+
+ # Round to 2 decimal places and ensure it's high
+ missing_staleness_score = max(100, round(missing_staleness_score, 2))
+
+ en_page['staleness_score'] = missing_staleness_score
+ processed_wiki_pages.append(en_page)
+
+ # Add French page without English counterpart (rare case)
+ elif 'fr' in lang_pages:
+ fr_page = lang_pages['fr'].copy()
+ fr_page['staleness_score'] = 0
+ processed_wiki_pages.append(fr_page)
+
+ # Save processed wiki pages to CSV
try:
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
- fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
+ # Basic fields for CSV (detailed content will be in JSON only)
+ fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score']
writer = csv.DictWriter(f, fieldnames=fieldnames)

writer.writeheader()
- for page in wiki_pages:
+ for page in processed_wiki_pages:
if page: # Skip None values
- writer.writerow(page)
+ # Create a copy with only the CSV fields
+ csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
+ writer.writerow(csv_page)

logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")
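As a small illustration of the csv_page filtering (all values below are made up): nested details such as section_titles or link_details stay out of the CSV, which keeps only the flat fields listed in fieldnames.

fieldnames = ['key', 'language', 'url', 'last_modified', 'sections',
              'word_count', 'link_count', 'media_count', 'staleness_score']
page = {
    'key': 'building', 'language': 'fr',
    'url': 'https://wiki.openstreetmap.org/wiki/FR:Key:building',
    'last_modified': '2024-01-15', 'sections': 8, 'word_count': 1200,
    'link_count': 45, 'media_count': 3, 'staleness_score': 17.05,
    'section_titles': [{'title': 'Valeurs', 'level': 2}],  # JSON-only detail, dropped below
}
csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
print(sorted(csv_page))  # only the nine flat CSV columns remain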
@@ -327,10 +603,10 @@ def main():
# Save pages that need updating to JSON
save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

- # Print the top 10 pages needing updates
- print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")
+ # Print the top pages needing updates
+ print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

- for i, page in enumerate(pages_to_update[:10], 1):
+ for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
key = page['key']
reason = page['reason']
en_url = page['en_page']['url'] if page['en_page'] else "N/A"