Improve comparison wording

Tykayn 2025-09-03 16:04:16 +02:00 committed by tykayn
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions

@@ -30,11 +30,14 @@ import re
 import os
 import subprocess
 import tempfile
+import hashlib
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk
+from pathlib import Path

 # Configure logging
 logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"

 # Number of wiki pages to examine
 NUM_WIKI_PAGES = 50

+# HTML cache folder
+HTML_CACHE_DIR = "html_cache"
+
+# Initialize NLTK for sentence tokenization
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+
+# Create HTML cache directory if it doesn't exist
+Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
+
 # List of specific pages to compare (in addition to top keys)
 # This list can include:
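
Note on the cache layout introduced above: pages are keyed by the MD5 of their URL, so repeated runs reuse the downloaded HTML. A minimal sketch of the round-trip under the same constants (the helper name cached_fetch is illustrative, not part of this script):

import hashlib
from pathlib import Path

import requests

HTML_CACHE_DIR = "html_cache"

def cached_fetch(url):
    # Cache filename derived as in the script: <md5 hex digest of url>.html
    cache_file = Path(HTML_CACHE_DIR) / f"{hashlib.md5(url.encode()).hexdigest()}.html"
    if cache_file.exists():
        return cache_file.read_text(encoding='utf-8')
    response = requests.get(url)
    response.raise_for_status()
    cache_file.parent.mkdir(exist_ok=True)
    cache_file.write_text(response.text, encoding='utf-8')
    return response.text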
@@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     url = f"{base_url}{key}"
     page_title = key
-    logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
-    try:
-        response = requests.get(url)
-        # Check if page exists
-        if response.status_code == 404:
-            logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
-            return None
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
+    # Create a unique cache filename based on the URL
+    cache_key = hashlib.md5(url.encode()).hexdigest()
+    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
+
+    html_content = None
+
+    # Try to load from cache first
+    if cache_file.exists():
+        logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
+        try:
+            with open(cache_file, 'r', encoding='utf-8') as f:
+                html_content = f.read()
+        except Exception as e:
+            logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
+            html_content = None
+
+    # If not in cache or cache read failed, fetch from web
+    if html_content is None:
+        logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
+        try:
+            response = requests.get(url)
+            # Check if page exists
+            if response.status_code == 404:
+                logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
+                return None
+            response.raise_for_status()
+            html_content = response.text
+            # Save to cache
+            try:
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    f.write(html_content)
+                logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
+            except Exception as e:
+                logger.warning(f"Error saving to cache: {e}")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
+            return None
+
+    soup = BeautifulSoup(html_content, 'html.parser')

     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
     if footer_info:
         date_text = footer_info.text
         # Extract date using regex
         date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
         if date_match:
             date_str = date_match.group(1)
             try:
                 # Parse date (format may vary based on wiki language)
                 last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
             except ValueError:
                 logger.warning(f"Could not parse date: {date_str}")

     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)

     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue
         # Skip sections that are inside a table with class DescriptionBox
         if section_elem.find_parent('table', class_='DescriptionBox'):
             continue
         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()
         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4
         section_titles.append({
             'title': section_title,
             'level': section_level
         })

-    # Count words in the content
+    # Count words and sentences in the content
     content = soup.select_one('#mw-content-text')
     clean_text = ""
     if content:
         # Remove script and style elements
         for script in content.select('script, style'):
             script.extract()
         # Remove .languages elements
         for languages_elem in content.select('.languages'):
             languages_elem.extract()
         # Get text and count words
         clean_text = content.get_text(separator=' ', strip=True)
         word_count = len(clean_text.split())
+        # Count sentences using NLTK
+        sentences = nltk.sent_tokenize(clean_text)
+        sentence_count = len(sentences)
         # Check grammar for French pages
         grammar_suggestions = []
         if language == 'fr':
             logger.info(f"Checking grammar for French page: {key}")
             grammar_suggestions = check_grammar_with_grammalecte(clean_text)
         # Extract links
         links = content.select('a')
         link_count = len(links)
         # Get link details (text and href)
         link_details = []
         for link in links:
             href = link.get('href', '')
             # Skip edit section links and other non-content links
             if 'action=edit' in href or 'redlink=1' in href or not href:
                 continue
             # Make relative URLs absolute
             if href.startswith('/'):
                 href = 'https://wiki.openstreetmap.org' + href
             link_text = link.get_text(strip=True)
             if link_text:  # Only include links with text
                 link_details.append({
                     'text': link_text,
                     'href': href
                 })

         # Extract media (images)
         media_elements = content.select('img')
         media_count = len(media_elements)
         # Get media details (src and alt text)
         media_details = []

         # Extract description image specifically
         # Try multiple selectors to find the description image
         description_img = None
         # Debug: Log the key we're processing
         logger.info(f"Looking for description image for key '{key}' in {language}")

         # Function to filter out OSM logo and small icons
         def is_relevant_image(img):
             src = img.get('src', '')
             # Skip OSM logo
             if 'osm_logo' in src:
                 return False
             # Skip small icons (usually less than 30px)
             width = img.get('width')
             if width and int(width) < 30:
                 return False
             height = img.get('height')
             if height and int(height) < 30:
                 return False
             return True

         # Special case for highway key - directly target the image we want
         if key == 'highway':
             # Try to find the specific image in figure elements
             highway_img_elements = content.select('figure.mw-halign-center img')
             logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images for highway")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using highway-specific image: {description_img.get('src', '')}")

         # If not found with highway-specific selector, try the td.d_image selector
         if not description_img:
             description_img_elements = content.select('td.d_image img')
             logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")

         # If still not found, try the specific selector for .description img.mw-file-element
         if not description_img:
             description_img_elements = content.select('.description img.mw-file-element')
             logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in .description")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

         # If still not found, try images in figures within the description box
         if not description_img:
             description_img_elements = content.select('.description figure img')
             logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")

         # If still not found, try any image in the description box
         if not description_img:
             description_img_elements = content.select('.description img')
             logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in .description general")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from '.description img': {description_img.get('src', '')}")

         # If still not found, try images in the DescriptionBox table
         if not description_img:
             description_img_elements = content.select('table.DescriptionBox img')
             logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

         # If still not found, try images in figure elements anywhere in the content
         if not description_img:
             description_img_elements = content.select('figure img')
             logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")

         # If we still don't have an image, use any image that's not the OSM logo
         if not description_img:
             all_images = content.select('img')
             relevant_images = [img for img in all_images if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using fallback image: {description_img.get('src', '')}")

         # Process the found image
         description_img_url = None
         if description_img:
             src = description_img.get('src', '')
             if src:
                 # Make relative URLs absolute
                 if src.startswith('//'):
                     src = 'https:' + src
                 elif src.startswith('/'):
                     src = 'https://wiki.openstreetmap.org' + src
                 description_img_url = src

         # Process all images
         for img in media_elements:
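
The description-image fallback chain above repeats one select/filter/log pattern per selector. A more compact equivalent (a sketch, not the committed code) would walk a priority list and keep the first relevant hit:

# Selectors in the same priority order as the cascade above.
DESCRIPTION_IMG_SELECTORS = [
    'td.d_image img',
    '.description img.mw-file-element',
    '.description figure img',
    '.description img',
    'table.DescriptionBox img',
    'figure img',
    'img',  # last resort: any relevant image on the page
]

def find_description_image(content, is_relevant_image):
    # Return the first image that passes the relevance filter, or None.
    for selector in DESCRIPTION_IMG_SELECTORS:
        for img in content.select(selector):
            if is_relevant_image(img):
                return img
    return None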
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         'sections': sections,
         'section_titles': section_titles,
         'word_count': word_count,
+        'sentence_count': sentence_count,
         'link_count': link_count,
         'link_details': link_details,
         'media_count': media_count,
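
The new sentence_count field relies on NLTK's punkt tokenizer, downloaded once at startup. Illustrative behaviour:

import nltk
nltk.download('punkt', quiet=True)  # no-op after the first run

text = "OpenStreetMap is a collaborative map. Anyone can edit it."
print(len(nltk.sent_tokenize(text)))  # -> 2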
@@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         'categories': categories,
         'description_img_url': description_img_url,
         'is_specific_page': is_specific_page,
-        'grammar_suggestions': grammar_suggestions
+        'grammar_suggestions': grammar_suggestions,
+        'html_content': html_content
     }
 except requests.exceptions.RequestException as e:
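
With html_content now part of the returned dict, callers can re-parse a page without another request. A hypothetical consumer, assuming the signature shown in the hunk headers:

page = fetch_wiki_page('highway', language='fr')
if page:
    print(page['sentence_count'])  # sentences counted via NLTK
    soup = BeautifulSoup(page['html_content'], 'html.parser')  # no re-fetch needed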
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
     if date_diff > 30:
         reason.append(f"La version Française est datée de {date_diff} jours")
     if word_diff > 200:
-        reason.append(f"La version Anglaise a {word_diff} plus de mots")
+        reason.append(f"La version Anglaise a {word_diff} mots de plus")
     if section_diff > 2:
-        reason.append(f"La version Anglaise a {section_diff} plus de sections")
+        reason.append(f"La version Anglaise a {section_diff} sections de plus")
     if link_diff > 20:
-        reason.append(f"La version Anglaise a {link_diff} plus de liens")
+        reason.append(f"La version Anglaise a {link_diff} liens de plus")
     if media_diff > 5:
-        reason.append(f"La version Anglaise a {media_diff} plus d'images")
+        reason.append(f"La version Anglaise a {media_diff} images de plus")
     if fr_page['word_count'] < en_page['word_count'] * 0.7:
         reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} du contenu en Anglais.")