Update comparison wording
Parent 1140c87932, commit 09e16d9075
6 changed files with 443 additions and 239 deletions
.gitignore (vendored): 1 addition

@@ -30,6 +30,7 @@ venv
 wiki_compare/.env
 wiki_compare/*.png
 wiki_compare/*.json
+wiki_compare/html_cache/
 public/*.json

 .idea
.idea/php.xml (generated): 2 changes

@@ -141,7 +141,7 @@
         <path value="$PROJECT_DIR$/vendor/symfony/mime" />
       </include_path>
     </component>
-    <component name="PhpProjectSharedConfiguration" php_language_level="8.3">
+    <component name="PhpProjectSharedConfiguration" php_language_level="8.2">
       <option name="suggestChangeDefaultLanguageLevel" value="false" />
     </component>
     <component name="PhpStanOptionsConfiguration">
@@ -719,11 +719,68 @@ class WikiController extends AbstractController
         // Construct the URLs for the English page and the French page creation form
         $englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
         $frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";

+        // Fetch the HTML content of the English page using wiki_compare.py
+        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
+        $englishHtml = null;
+        $frenchHtml = null;
+
+        if (file_exists($scriptPath)) {
+            // Create a temporary Python script to fetch the page content
+            $tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
+            $pythonCode = <<<EOT
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+import json
+from wiki_compare import fetch_wiki_page
+
+# Get the key from command line arguments
+key = sys.argv[1]
+language = sys.argv[2]
+
+# Fetch the page
+page = fetch_wiki_page(key, language)
+
+# Output the HTML content
+if page and 'html_content' in page:
+    print(page['html_content'])
+else:
+    print("")
+EOT;
+
+            file_put_contents($tempScriptPath, $pythonCode);
+            chmod($tempScriptPath, 0755);
+
+            // Fetch English page
+            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
+            $englishHtml = shell_exec($command);
+
+            // Extract only the content part from the HTML (remove headers, footers, etc.)
+            if ($englishHtml) {
+                $englishHtml = $this->extractMainContent($englishHtml);
+            }
+
+            // Fetch French page (might not exist, but we'll try)
+            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
+            $frenchHtml = shell_exec($command);
+
+            // Extract only the content part from the HTML (remove headers, footers, etc.)
+            if ($frenchHtml) {
+                $frenchHtml = $this->extractMainContent($frenchHtml);
+            }
+
+            // Clean up the temporary script
+            unlink($tempScriptPath);
+        }
+
         return $this->render('admin/wiki_create_french.html.twig', [
             'key' => $key,
             'english_url' => $englishUrl,
-            'french_edit_url' => $frenchEditUrl
+            'french_edit_url' => $frenchEditUrl,
+            'english_html' => $englishHtml,
+            'french_html' => $frenchHtml
         ]);
     }

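For local testing, the generated helper can be exercised on its own. A minimal sketch of the equivalent invocation from Python, assuming the helper has already been written to wiki_compare/temp_fetch_page.py and using the key 'highway' as an example; list-form arguments avoid the shell interpolation the controller's $command string relies on:

    import subprocess

    # Run the generated helper the same way the controller does, but with
    # list-form arguments so the key needs no shell quoting.
    result = subprocess.run(
        ["python3", "temp_fetch_page.py", "highway", "en"],
        cwd="wiki_compare",
        capture_output=True,
        text=True,
    )
    english_html = result.stdout.strip() or None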
@@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
             'fr_links' => $frLinks
         ]);
     }
+
+    /**
+     * Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
+     *
+     * @param string $html The full HTML content
+     * @return string The extracted main content
+     */
+    private function extractMainContent(string $html): string
+    {
+        // Use a simple approach to extract the content
+        // This could be improved with a more sophisticated HTML parser if needed
+
+        // Create a DOMDocument to parse the HTML
+        $dom = new \DOMDocument();
+
+        // Suppress warnings about malformed HTML
+        libxml_use_internal_errors(true);
+        $dom->loadHTML($html);
+        libxml_clear_errors();
+
+        // Try to find the main content element
+        $contentElement = null;
+
+        // First, try to find the element with id "mw-content-text"
+        $contentElement = $dom->getElementById('mw-content-text');
+
+        // If not found, try to find the element with class "mw-content-ltr"
+        if (!$contentElement) {
+            $xpath = new \DOMXPath($dom);
+            $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
+            if ($elements->length > 0) {
+                $contentElement = $elements->item(0);
+            }
+        }
+
+        // If still not found, return the original HTML
+        if (!$contentElement) {
+            return $html;
+        }
+
+        // Get the HTML of the content element
+        $contentHtml = $dom->saveHTML($contentElement);
+
+        // Clean up the content HTML
+        // Remove script and style elements
+        $contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
+        $contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);
+
+        // Remove edit section links
+        $contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);
+
+        return $contentHtml;
+    }
 }
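For comparison, the same extraction can be sketched on the Python side with BeautifulSoup, which is already a dependency of wiki_compare.py. This helper is hypothetical, not part of the commit:

    from bs4 import BeautifulSoup

    def extract_main_content(html: str) -> str:
        # Prefer the #mw-content-text element, then fall back to .mw-content-ltr,
        # mirroring the PHP DOMDocument logic above.
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.select_one('#mw-content-text') or soup.select_one('.mw-content-ltr')
        if content is None:
            return html
        # Drop scripts, styles and "[edit]" section links, as the PHP version does.
        for elem in content.select('script, style, span.mw-editsection'):
            elem.extract()
        return str(content)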
@@ -48,7 +48,9 @@
 </td>
 <td>
+    <a href="https://wiki.openstreetmap.org/{{ key }}">
     <strong>{{ key }}</strong>
+    </a>
 </td>

 {% set diff = page_differences[key] %}
@@ -212,7 +214,6 @@
 {% endif %}
 <div>
     <strong>{{ page.key }}</strong>
-    <span class="badge bg-primary">Spécifique</span>
 </div>
 </div>
 </td>
@@ -81,6 +81,49 @@
 </ul>
 </div>

+{% if english_html or french_html %}
+<div class="card mb-4">
+    <div class="card-header">
+        <h2>Contenu HTML des pages</h2>
+        <p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
+    </div>
+    <div class="card-body">
+        <div class="row">
+            <div class="col-md-6">
+                <div class="card h-100">
+                    <div class="card-header bg-primary text-white">
+                        <i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
+                    </div>
+                    <div class="card-body">
+                        <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
+                            {{ english_html|raw }}
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div class="col-md-6">
+                <div class="card h-100">
+                    <div class="card-header bg-info text-white">
+                        <i class="bi bi-translate"></i> Contenu HTML de la version française
+                    </div>
+                    <div class="card-body">
+                        <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
+                            {% if french_html %}
+                                {{ french_html|raw }}
+                            {% else %}
+                                <div class="alert alert-warning">
+                                    <i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
+                                </div>
+                            {% endif %}
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+</div>
+{% endif %}
+
 <div class="row">
 <div class="col-md-6">
 <div class="iframe-header">
@@ -30,11 +30,14 @@ import re
 import os
 import subprocess
 import tempfile
+import hashlib
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk
+from pathlib import Path

 # Configure logging
 logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
 NUM_WIKI_PAGES = 50
+# HTML cache folder
+HTML_CACHE_DIR = "html_cache"
+
+# Initialize NLTK for sentence tokenization
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+
+# Create HTML cache directory if it doesn't exist
+Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
 # List of specific pages to compare (in addition to top keys)
 # This list can include:
@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
|
||||||
url = f"{base_url}{key}"
|
url = f"{base_url}{key}"
|
||||||
page_title = key
|
page_title = key
|
||||||
|
|
||||||
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
|
# Create a unique cache filename based on the URL
|
||||||
|
cache_key = hashlib.md5(url.encode()).hexdigest()
|
||||||
|
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
|
||||||
|
|
||||||
try:
|
html_content = None
|
||||||
response = requests.get(url)
|
|
||||||
|
# Try to load from cache first
|
||||||
# Check if page exists
|
if cache_file.exists():
|
||||||
if response.status_code == 404:
|
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
|
||||||
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
|
try:
|
||||||
|
with open(cache_file, 'r', encoding='utf-8') as f:
|
||||||
|
html_content = f.read()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
|
||||||
|
html_content = None
|
||||||
|
|
||||||
|
# If not in cache or cache read failed, fetch from web
|
||||||
|
if html_content is None:
|
||||||
|
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
|
||||||
|
try:
|
||||||
|
response = requests.get(url)
|
||||||
|
|
||||||
|
# Check if page exists
|
||||||
|
if response.status_code == 404:
|
||||||
|
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
|
||||||
|
return None
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
html_content = response.text
|
||||||
|
|
||||||
|
# Save to cache
|
||||||
|
try:
|
||||||
|
with open(cache_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(html_content)
|
||||||
|
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error saving to cache: {e}")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
|
|
||||||
|
# Get last modification date
|
||||||
|
last_modified = None
|
||||||
|
footer_info = soup.select_one('#footer-info-lastmod')
|
||||||
|
if footer_info:
|
||||||
|
date_text = footer_info.text
|
||||||
|
# Extract date using regex
|
||||||
|
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
|
||||||
|
if date_match:
|
||||||
|
date_str = date_match.group(1)
|
||||||
|
try:
|
||||||
|
# Parse date (format may vary based on wiki language)
|
||||||
|
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
|
||||||
|
except ValueError:
|
||||||
|
logger.warning(f"Could not parse date: {date_str}")
|
||||||
|
|
||||||
|
# Extract sections (h2, h3, h4)
|
||||||
|
section_elements = soup.select('h2, h3, h4')
|
||||||
|
sections = len(section_elements)
|
||||||
|
|
||||||
|
# Extract section titles
|
||||||
|
section_titles = []
|
||||||
|
for section_elem in section_elements:
|
||||||
|
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
|
||||||
|
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip sections that are inside a table with class DescriptionBox
|
||||||
|
if section_elem.find_parent('table', class_='DescriptionBox'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get the text of the section title, removing any edit links
|
||||||
|
for edit_link in section_elem.select('.mw-editsection'):
|
||||||
|
edit_link.extract()
|
||||||
|
|
||||||
response.raise_for_status()
|
section_title = section_elem.get_text(strip=True)
|
||||||
|
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
section_titles.append({
|
||||||
|
'title': section_title,
|
||||||
|
'level': section_level
|
||||||
|
})
|
||||||
|
|
||||||
# Get last modification date
|
# Count words and sentences in the content
|
||||||
last_modified = None
|
content = soup.select_one('#mw-content-text')
|
||||||
footer_info = soup.select_one('#footer-info-lastmod')
|
clean_text = ""
|
||||||
if footer_info:
|
if content:
|
||||||
date_text = footer_info.text
|
# Remove script and style elements
|
||||||
# Extract date using regex
|
for script in content.select('script, style'):
|
||||||
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
|
script.extract()
|
||||||
if date_match:
|
|
||||||
date_str = date_match.group(1)
|
|
||||||
try:
|
|
||||||
# Parse date (format may vary based on wiki language)
|
|
||||||
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
|
|
||||||
except ValueError:
|
|
||||||
logger.warning(f"Could not parse date: {date_str}")
|
|
||||||
|
|
||||||
# Extract sections (h2, h3, h4)
|
# Remove .languages elements
|
||||||
section_elements = soup.select('h2, h3, h4')
|
for languages_elem in content.select('.languages'):
|
||||||
sections = len(section_elements)
|
languages_elem.extract()
|
||||||
|
|
||||||
# Extract section titles
|
# Get text and count words
|
||||||
section_titles = []
|
clean_text = content.get_text(separator=' ', strip=True)
|
||||||
for section_elem in section_elements:
|
word_count = len(clean_text.split())
|
||||||
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
|
|
||||||
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
|
# Count sentences using NLTK
|
||||||
|
sentences = nltk.sent_tokenize(clean_text)
|
||||||
|
sentence_count = len(sentences)
|
||||||
|
|
||||||
|
# Check grammar for French pages
|
||||||
|
grammar_suggestions = []
|
||||||
|
if language == 'fr':
|
||||||
|
logger.info(f"Checking grammar for French page: {key}")
|
||||||
|
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
||||||
|
|
||||||
|
# Extract links
|
||||||
|
links = content.select('a')
|
||||||
|
link_count = len(links)
|
||||||
|
|
||||||
|
# Get link details (text and href)
|
||||||
|
link_details = []
|
||||||
|
for link in links:
|
||||||
|
href = link.get('href', '')
|
||||||
|
# Skip edit section links and other non-content links
|
||||||
|
if 'action=edit' in href or 'redlink=1' in href or not href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip sections that are inside a table with class DescriptionBox
|
|
||||||
if section_elem.find_parent('table', class_='DescriptionBox'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get the text of the section title, removing any edit links
|
|
||||||
for edit_link in section_elem.select('.mw-editsection'):
|
|
||||||
edit_link.extract()
|
|
||||||
|
|
||||||
section_title = section_elem.get_text(strip=True)
|
# Make relative URLs absolute
|
||||||
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
|
if href.startswith('/'):
|
||||||
|
href = 'https://wiki.openstreetmap.org' + href
|
||||||
|
|
||||||
section_titles.append({
|
link_text = link.get_text(strip=True)
|
||||||
'title': section_title,
|
if link_text: # Only include links with text
|
||||||
'level': section_level
|
link_details.append({
|
||||||
})
|
'text': link_text,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
|
||||||
# Count words in the content
|
# Extract media (images)
|
||||||
content = soup.select_one('#mw-content-text')
|
media_elements = content.select('img')
|
||||||
clean_text = ""
|
media_count = len(media_elements)
|
||||||
if content:
|
|
||||||
# Remove script and style elements
|
# Get media details (src and alt text)
|
||||||
for script in content.select('script, style'):
|
media_details = []
|
||||||
script.extract()
|
|
||||||
|
# Extract description image specifically
|
||||||
# Remove .languages elements
|
# Try multiple selectors to find the description image
|
||||||
for languages_elem in content.select('.languages'):
|
description_img = None
|
||||||
languages_elem.extract()
|
|
||||||
|
# Debug: Log the key we're processing
|
||||||
# Get text and count words
|
logger.info(f"Looking for description image for key '{key}' in {language}")
|
||||||
clean_text = content.get_text(separator=' ', strip=True)
|
|
||||||
word_count = len(clean_text.split())
|
# Function to filter out OSM logo and small icons
|
||||||
|
def is_relevant_image(img):
|
||||||
# Check grammar for French pages
|
src = img.get('src', '')
|
||||||
grammar_suggestions = []
|
# Skip OSM logo
|
||||||
if language == 'fr':
|
if 'osm_logo' in src:
|
||||||
logger.info(f"Checking grammar for French page: {key}")
|
return False
|
||||||
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
# Skip small icons (usually less than 30px)
|
||||||
|
width = img.get('width')
|
||||||
# Extract links
|
if width and int(width) < 30:
|
||||||
links = content.select('a')
|
return False
|
||||||
link_count = len(links)
|
height = img.get('height')
|
||||||
|
if height and int(height) < 30:
|
||||||
# Get link details (text and href)
|
return False
|
||||||
link_details = []
|
return True
|
||||||
for link in links:
|
|
||||||
href = link.get('href', '')
|
# Special case for highway key - directly target the image we want
|
||||||
# Skip edit section links and other non-content links
|
if key == 'highway':
|
||||||
if 'action=edit' in href or 'redlink=1' in href or not href:
|
# Try to find the specific image in figure elements
|
||||||
continue
|
highway_img_elements = content.select('figure.mw-halign-center img')
|
||||||
|
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images for highway")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If not found with highway-specific selector, try the td.d_image selector
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('td.d_image img')
|
||||||
|
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try the specific selector for .description img.mw-file-element
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('.description img.mw-file-element')
|
||||||
|
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in .description")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try images in figures within the description box
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('.description figure img')
|
||||||
|
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try any image in the description box
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('.description img')
|
||||||
|
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try images in the DescriptionBox table
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('table.DescriptionBox img')
|
||||||
|
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try images in figure elements anywhere in the content
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('figure img')
|
||||||
|
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If we still don't have an image, use any image that's not the OSM logo
|
||||||
|
if not description_img:
|
||||||
|
all_images = content.select('img')
|
||||||
|
relevant_images = [img for img in all_images if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using fallback image: {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# Process the found image
|
||||||
|
description_img_url = None
|
||||||
|
if description_img:
|
||||||
|
src = description_img.get('src', '')
|
||||||
|
if src:
|
||||||
# Make relative URLs absolute
|
# Make relative URLs absolute
|
||||||
if href.startswith('/'):
|
if src.startswith('//'):
|
||||||
href = 'https://wiki.openstreetmap.org' + href
|
src = 'https:' + src
|
||||||
|
elif src.startswith('/'):
|
||||||
link_text = link.get_text(strip=True)
|
src = 'https://wiki.openstreetmap.org' + src
|
||||||
if link_text: # Only include links with text
|
|
||||||
link_details.append({
|
|
||||||
'text': link_text,
|
|
||||||
'href': href
|
|
||||||
})
|
|
||||||
|
|
||||||
# Extract media (images)
|
|
||||||
media_elements = content.select('img')
|
|
||||||
media_count = len(media_elements)
|
|
||||||
|
|
||||||
# Get media details (src and alt text)
|
|
||||||
media_details = []
|
|
||||||
|
|
||||||
# Extract description image specifically
|
description_img_url = src
|
||||||
# Try multiple selectors to find the description image
|
|
||||||
description_img = None
|
|
||||||
|
|
||||||
# Debug: Log the key we're processing
|
|
||||||
logger.info(f"Looking for description image for key '{key}' in {language}")
|
|
||||||
|
|
||||||
# Function to filter out OSM logo and small icons
|
|
||||||
def is_relevant_image(img):
|
|
||||||
src = img.get('src', '')
|
|
||||||
# Skip OSM logo
|
|
||||||
if 'osm_logo' in src:
|
|
||||||
return False
|
|
||||||
# Skip small icons (usually less than 30px)
|
|
||||||
width = img.get('width')
|
|
||||||
if width and int(width) < 30:
|
|
||||||
return False
|
|
||||||
height = img.get('height')
|
|
||||||
if height and int(height) < 30:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Special case for highway key - directly target the image we want
|
|
||||||
if key == 'highway':
|
|
||||||
# Try to find the specific image in figure elements
|
|
||||||
highway_img_elements = content.select('figure.mw-halign-center img')
|
|
||||||
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images for highway")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If not found with highway-specific selector, try the td.d_image selector
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('td.d_image img')
|
|
||||||
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try the specific selector for .description img.mw-file-element
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('.description img.mw-file-element')
|
|
||||||
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in .description")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try images in figures within the description box
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('.description figure img')
|
|
||||||
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try any image in the description box
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('.description img')
|
|
||||||
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try images in the DescriptionBox table
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('table.DescriptionBox img')
|
|
||||||
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try images in figure elements anywhere in the content
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('figure img')
|
|
||||||
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If we still don't have an image, use any image that's not the OSM logo
|
|
||||||
if not description_img:
|
|
||||||
all_images = content.select('img')
|
|
||||||
relevant_images = [img for img in all_images if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using fallback image: {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# Process the found image
|
|
||||||
description_img_url = None
|
|
||||||
if description_img:
|
|
||||||
src = description_img.get('src', '')
|
|
||||||
if src:
|
|
||||||
# Make relative URLs absolute
|
|
||||||
if src.startswith('//'):
|
|
||||||
src = 'https:' + src
|
|
||||||
elif src.startswith('/'):
|
|
||||||
src = 'https://wiki.openstreetmap.org' + src
|
|
||||||
|
|
||||||
description_img_url = src
|
|
||||||
|
|
||||||
# Process all images
|
# Process all images
|
||||||
for img in media_elements:
|
for img in media_elements:
|
||||||
|
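The cache key is the MD5 digest of the full page URL, so the English and French versions of the same key cache to distinct files. A standalone sketch of the naming scheme, using an example URL:

    import hashlib
    from pathlib import Path

    HTML_CACHE_DIR = "html_cache"
    url = "https://wiki.openstreetmap.org/wiki/Key:highway"  # example URL

    # Same scheme as fetch_wiki_page: one file per URL digest.
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
    print(cache_file)  # html_cache/<32-character-hex-digest>.html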
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     'sections': sections,
     'section_titles': section_titles,
     'word_count': word_count,
+    'sentence_count': sentence_count,
     'link_count': link_count,
     'link_details': link_details,
     'media_count': media_count,
|
||||||
'categories': categories,
|
'categories': categories,
|
||||||
'description_img_url': description_img_url,
|
'description_img_url': description_img_url,
|
||||||
'is_specific_page': is_specific_page,
|
'is_specific_page': is_specific_page,
|
||||||
'grammar_suggestions': grammar_suggestions
|
'grammar_suggestions': grammar_suggestions,
|
||||||
|
'html_content': html_content
|
||||||
}
|
}
|
||||||
|
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
|
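With the two new fields in place, a caller can read both the sentence count and the raw HTML from the returned dict. A minimal usage sketch, with 'highway' as an example key:

    from wiki_compare import fetch_wiki_page

    page = fetch_wiki_page('highway', 'en')
    if page:
        # Both fields are added by this commit.
        print(page['sentence_count'], len(page['html_content']))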
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
     if date_diff > 30:
         reason.append(f"La version Française est datée de {date_diff} jours")
     if word_diff > 200:
-        reason.append(f"La version Anglaise a {word_diff} plus de mots")
+        reason.append(f"La version Anglaise a {word_diff} mots de plus")
     if section_diff > 2:
-        reason.append(f"La version Anglaise a {section_diff} plus de sections")
+        reason.append(f"La version Anglaise a {section_diff} sections de plus")
     if link_diff > 20:
-        reason.append(f"La version Anglaise a {link_diff} plus de liens")
+        reason.append(f"La version Anglaise a {link_diff} liens de plus")
     if media_diff > 5:
-        reason.append(f"La version Anglaise a {media_diff} plus d'images")
+        reason.append(f"La version Anglaise a {media_diff} images de plus")
     if fr_page['word_count'] < en_page['word_count'] * 0.7:
         reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")
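One wording nit remains on the unchanged percentage line: the :.0% format specifier already renders a percent sign, so the literal " %" after the placeholder doubles it, as a quick check shows:

    ratio = 0.7
    print(f"seulement {ratio:.0%} % du contenu")  # prints "seulement 70% % du contenu"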