up wording comparison
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions
.gitignore (vendored): 1 change

@@ -30,6 +30,7 @@ venv
wiki_compare/.env
wiki_compare/*.png
wiki_compare/*.json
wiki_compare/html_cache/
public/*.json

.idea
.idea/php.xml (generated): 2 changes

@@ -141,7 +141,7 @@
    <path value="$PROJECT_DIR$/vendor/symfony/mime" />
  </include_path>
</component>
<component name="PhpProjectSharedConfiguration" php_language_level="8.3">
<component name="PhpProjectSharedConfiguration" php_language_level="8.2">
  <option name="suggestChangeDefaultLanguageLevel" value="false" />
</component>
<component name="PhpStanOptionsConfiguration">

@@ -720,10 +720,67 @@ class WikiController extends AbstractController
        $englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
        $frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";

        // Fetch the HTML content of the English page using wiki_compare.py
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
        $englishHtml = null;
        $frenchHtml = null;

        if (file_exists($scriptPath)) {
            // Create a temporary Python script to fetch the page content
            $tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
            $pythonCode = <<<EOT
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import json
from wiki_compare import fetch_wiki_page

# Get the key from command line arguments
key = sys.argv[1]
language = sys.argv[2]

# Fetch the page
page = fetch_wiki_page(key, language)

# Output the HTML content
if page and 'html_content' in page:
    print(page['html_content'])
else:
    print("")
EOT;

            file_put_contents($tempScriptPath, $pythonCode);
            chmod($tempScriptPath, 0755);

            // Fetch English page
            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
            $englishHtml = shell_exec($command);

            // Extract only the content part from the HTML (remove headers, footers, etc.)
            if ($englishHtml) {
                $englishHtml = $this->extractMainContent($englishHtml);
            }

            // Fetch French page (might not exist, but we'll try)
            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
            $frenchHtml = shell_exec($command);

            // Extract only the content part from the HTML (remove headers, footers, etc.)
            if ($frenchHtml) {
                $frenchHtml = $this->extractMainContent($frenchHtml);
            }

            // Clean up the temporary script
            unlink($tempScriptPath);
        }

        return $this->render('admin/wiki_create_french.html.twig', [
            'key' => $key,
            'english_url' => $englishUrl,
            'french_edit_url' => $frenchEditUrl
            'french_edit_url' => $frenchEditUrl,
            'english_html' => $englishHtml,
            'french_html' => $frenchHtml
        ]);
    }
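The heredoc script above depends only on fetch_wiki_page(key, language) returning a dict that carries an 'html_content' key (added to wiki_compare.py later in this commit). A minimal sketch of that contract, assuming it is run from the wiki_compare directory; 'highway' is only an illustrative key:

    from wiki_compare import fetch_wiki_page

    page = fetch_wiki_page("highway", "en")
    if page and 'html_content' in page:
        print(page['html_content'][:200])  # smoke test: first 200 characters
    else:
        print("page missing or html_content not populated")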
@@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
            'fr_links' => $frLinks
        ]);
    }

    /**
     * Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
     *
     * @param string $html The full HTML content
     * @return string The extracted main content
     */
    private function extractMainContent(string $html): string
    {
        // Use a simple approach to extract the content
        // This could be improved with a more sophisticated HTML parser if needed

        // Create a DOMDocument to parse the HTML
        $dom = new \DOMDocument();

        // Suppress warnings about malformed HTML
        libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();

        // Try to find the main content element
        $contentElement = null;

        // First, try to find the element with id "mw-content-text"
        $contentElement = $dom->getElementById('mw-content-text');

        // If not found, try to find the element with class "mw-content-ltr"
        if (!$contentElement) {
            $xpath = new \DOMXPath($dom);
            $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
            if ($elements->length > 0) {
                $contentElement = $elements->item(0);
            }
        }

        // If still not found, return the original HTML
        if (!$contentElement) {
            return $html;
        }

        // Get the HTML of the content element
        $contentHtml = $dom->saveHTML($contentElement);

        // Clean up the content HTML
        // Remove script and style elements
        $contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
        $contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);

        // Remove edit section links
        $contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);

        return $contentHtml;
    }
}
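For comparison, the same selection strategy (prefer #mw-content-text, fall back to the mw-content-ltr class, then strip scripts, styles, and edit-section links) can be sketched with BeautifulSoup, which wiki_compare.py already depends on. This is a hypothetical Python equivalent, not code from the commit:

    from bs4 import BeautifulSoup

    def extract_main_content(html: str) -> str:
        soup = BeautifulSoup(html, 'html.parser')
        # Prefer the element with id "mw-content-text"
        content = soup.find(id='mw-content-text')
        if content is None:
            # Fall back to the first element carrying the mw-content-ltr class
            content = soup.find(class_='mw-content-ltr')
        if content is None:
            # Nothing recognizable: return the input unchanged
            return html
        # Drop scripts, styles, and "[edit]" section links
        for tag in content.select('script, style, span.mw-editsection'):
            tag.decompose()
        return str(content)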
@@ -48,7 +48,9 @@
</td>
<td>
    <a href="https://wiki.openstreetmap.org/{{ key }}">
        <strong>{{ key }}</strong>
    </a>
</td>

{% set diff = page_differences[key] %}

@@ -212,7 +214,6 @@
{% endif %}
<div>
    <strong>{{ page.key }}</strong>
    <span class="badge bg-primary">Spécifique</span>
</div>
</div>
</td>
@@ -81,6 +81,49 @@
    </ul>
</div>

{% if english_html or french_html %}
    <div class="card mb-4">
        <div class="card-header">
            <h2>Contenu HTML des pages</h2>
            <p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
        </div>
        <div class="card-body">
            <div class="row">
                <div class="col-md-6">
                    <div class="card h-100">
                        <div class="card-header bg-primary text-white">
                            <i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
                        </div>
                        <div class="card-body">
                            <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
                                {{ english_html|raw }}
                            </div>
                        </div>
                    </div>
                </div>
                <div class="col-md-6">
                    <div class="card h-100">
                        <div class="card-header bg-info text-white">
                            <i class="bi bi-translate"></i> Contenu HTML de la version française
                        </div>
                        <div class="card-body">
                            <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
                                {% if french_html %}
                                    {{ french_html|raw }}
                                {% else %}
                                    <div class="alert alert-warning">
                                        <i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
                                    </div>
                                {% endif %}
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
{% endif %}

<div class="row">
    <div class="col-md-6">
        <div class="iframe-header">
@@ -30,11 +30,14 @@ import re
import os
import subprocess
import tempfile
import hashlib
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path

# Configure logging
logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 50
# HTML cache folder
HTML_CACHE_DIR = "html_cache"

# Initialize NLTK for sentence tokenization
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

# List of specific pages to compare (in addition to top keys)
# This list can include:
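The cache layout introduced here is one file per page URL, named with the md5 hex digest of the full URL. A small sketch of the naming scheme; cache_file_for is a hypothetical helper, while fetch_wiki_page below inlines the same two lines:

    import hashlib
    from pathlib import Path

    HTML_CACHE_DIR = "html_cache"

    def cache_file_for(url: str) -> Path:
        # md5 of the full URL, hex-encoded: one .html file per page
        cache_key = hashlib.md5(url.encode()).hexdigest()
        return Path(HTML_CACHE_DIR) / f"{cache_key}.html"

    # e.g. html_cache/<32 hex chars>.html
    print(cache_file_for("https://wiki.openstreetmap.org/wiki/Key:highway"))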
@@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
url = f"{base_url}{key}"
page_title = key

logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
# Create a unique cache filename based on the URL
cache_key = hashlib.md5(url.encode()).hexdigest()
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

try:
response = requests.get(url)
html_content = None

# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
# Try to load from cache first
if cache_file.exists():
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
try:
with open(cache_file, 'r', encoding='utf-8') as f:
html_content = f.read()
except Exception as e:
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
html_content = None

# If not in cache or cache read failed, fetch from web
if html_content is None:
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)

# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
return None

response.raise_for_status()
html_content = response.text

# Save to cache
try:
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
except Exception as e:
logger.warning(f"Error saving to cache: {e}")
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None

response.raise_for_status()
soup = BeautifulSoup(html_content, 'html.parser')

soup = BeautifulSoup(response.text, 'html.parser')
# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
if footer_info:
date_text = footer_info.text
# Extract date using regex
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
if date_match:
date_str = date_match.group(1)
try:
# Parse date (format may vary based on wiki language)
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")

# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
if footer_info:
date_text = footer_info.text
# Extract date using regex
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
if date_match:
date_str = date_match.group(1)
try:
# Parse date (format may vary based on wiki language)
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)

# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
continue

# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue

# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()

section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4

section_titles.append({
'title': section_title,
'level': section_level
})

# Count words and sentences in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
# Remove script and style elements
for script in content.select('script, style'):
script.extract()

# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()

# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())

# Count sentences using NLTK
sentences = nltk.sent_tokenize(clean_text)
sentence_count = len(sentences)

# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)

# Extract links
links = content.select('a')
link_count = len(links)

# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue

# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href

# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})

section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)

section_titles.append({
'title': section_title,
'level': section_level
})
# Get media details (src and alt text)
media_details = []

# Count words in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
# Remove script and style elements
for script in content.select('script, style'):
script.extract()
# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None

# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()
# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")

# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Function to filter out OSM logo and small icons
def is_relevant_image(img):
src = img.get('src', '')
# Skip OSM logo
if 'osm_logo' in src:
return False
# Skip small icons (usually less than 30px)
width = img.get('width')
if width and int(width) < 30:
return False
height = img.get('height')
if height and int(height) < 30:
return False
return True

# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")

# Extract links
links = content.select('a')
link_count = len(links)
# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")

# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")

# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")

# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")

# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")

# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")

# If we still don't have an image, use any image that's not the OSM logo
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")

# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src

link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})

# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)

# Get media details (src and alt text)
media_details = []

# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None

# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")

# Function to filter out OSM logo and small icons
def is_relevant_image(img):
src = img.get('src', '')
# Skip OSM logo
if 'osm_logo' in src:
return False
# Skip small icons (usually less than 30px)
width = img.get('width')
if width and int(width) < 30:
return False
height = img.get('height')
if height and int(height) < 30:
return False
return True

# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")

# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")

# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")

# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")

# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")

# If we still don't have an image, use any image that's not the OSM logo
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")

# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src

description_img_url = src
description_img_url = src

# Process all images
for img in media_elements:
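Distilled from the hunk above, the new control flow is cache-read, fetch-on-miss, cache-write. A condensed sketch of just that skeleton, with the logging and per-step error handling of the real code elided:

    from pathlib import Path
    from typing import Optional

    import requests

    def load_html(url: str, cache_file: Path) -> Optional[str]:
        # 1. Serve from cache when a previous run already saved the page
        if cache_file.exists():
            return cache_file.read_text(encoding='utf-8')
        # 2. On a miss, fetch from the wiki; a 404 means the page does not exist
        response = requests.get(url)
        if response.status_code == 404:
            return None
        response.raise_for_status()
        # 3. Write back so the next run is served locally
        cache_file.write_text(response.text, encoding='utf-8')
        return response.text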
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'sections': sections,
'section_titles': section_titles,
'word_count': word_count,
'sentence_count': sentence_count,
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,

@@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'categories': categories,
'description_img_url': description_img_url,
'is_specific_page': is_specific_page,
'grammar_suggestions': grammar_suggestions
'grammar_suggestions': grammar_suggestions,
'html_content': html_content
}

except requests.exceptions.RequestException as e:
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
if date_diff > 30:
reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
reason.append(f"La version Anglaise a {word_diff} plus de mots")
reason.append(f"La version Anglaise a {word_diff} mots de plus")
if section_diff > 2:
reason.append(f"La version Anglaise a {section_diff} plus de sections")
reason.append(f"La version Anglaise a {section_diff} sections de plus")
if link_diff > 20:
reason.append(f"La version Anglaise a {link_diff} plus de liens")
reason.append(f"La version Anglaise a {link_diff} liens de plus")
if media_diff > 5:
reason.append(f"La version Anglaise a {media_diff} plus d'images")
reason.append(f"La version Anglaise a {media_diff} images de plus")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} du contenu en Anglais.")
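A note on the last message: Python's ':.0%' format specifier multiplies by 100, rounds to zero decimals, and appends the percent sign itself, so no literal '%' is needed after it. For instance:

    fr_words, en_words = 350, 500
    print(f"{fr_words / en_words:.0%}")  # -> 70%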