Update wording comparison

This commit is contained in:
Tykayn 2025-09-03 16:04:16 +02:00 committed by tykayn
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions

1
.gitignore vendored
View file

@ -30,6 +30,7 @@ venv
wiki_compare/.env
wiki_compare/*.png
wiki_compare/*.json
wiki_compare/html_cache/
public/*.json
.idea

2
.idea/php.xml generated
View file

@ -141,7 +141,7 @@
<path value="$PROJECT_DIR$/vendor/symfony/mime" />
</include_path>
</component>
<component name="PhpProjectSharedConfiguration" php_language_level="8.3">
<component name="PhpProjectSharedConfiguration" php_language_level="8.2">
<option name="suggestChangeDefaultLanguageLevel" value="false" />
</component>
<component name="PhpStanOptionsConfiguration">

View file

@ -720,10 +720,67 @@ class WikiController extends AbstractController
$englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
$frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";
// Fetch the HTML content of the English page using wiki_compare.py
$scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
$englishHtml = null;
$frenchHtml = null;
if (file_exists($scriptPath)) {
// Create a temporary Python script to fetch the page content
$tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
$pythonCode = <<<EOT
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import json
from wiki_compare import fetch_wiki_page
# Get the key from command line arguments
key = sys.argv[1]
language = sys.argv[2]
# Fetch the page
page = fetch_wiki_page(key, language)
# Output the HTML content
if page and 'html_content' in page:
print(page['html_content'])
else:
print("")
EOT;
file_put_contents($tempScriptPath, $pythonCode);
chmod($tempScriptPath, 0755);
// Fetch English page
$command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
$englishHtml = shell_exec($command);
// Extract only the content part from the HTML (remove headers, footers, etc.)
if ($englishHtml) {
$englishHtml = $this->extractMainContent($englishHtml);
}
// Fetch French page (might not exist, but we'll try)
$command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
$frenchHtml = shell_exec($command);
// Extract only the content part from the HTML (remove headers, footers, etc.)
if ($frenchHtml) {
$frenchHtml = $this->extractMainContent($frenchHtml);
}
// Clean up the temporary script
unlink($tempScriptPath);
}
return $this->render('admin/wiki_create_french.html.twig', [
'key' => $key,
'english_url' => $englishUrl,
'french_edit_url' => $frenchEditUrl
'french_edit_url' => $frenchEditUrl,
'english_html' => $englishHtml,
'french_html' => $frenchHtml
]);
}
@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
'fr_links' => $frLinks
]);
}
/**
 * Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
 *
 * @param string $html The full HTML content
 * @return string The extracted main content
 */
private function extractMainContent(string $html): string
{
    // Guard: loadHTML('') emits a warning and returns false, so an empty
    // document is returned untouched.
    if (trim($html) === '') {
        return $html;
    }

    // Parse the HTML with DOMDocument. Without an explicit encoding hint,
    // loadHTML() assumes ISO-8859-1 and mangles UTF-8 accented characters
    // (the wiki pages handled here are UTF-8, notably the French ones),
    // so prepend an XML encoding declaration before parsing.
    $dom = new \DOMDocument();
    // Suppress warnings about malformed wiki HTML
    libxml_use_internal_errors(true);
    $dom->loadHTML('<?xml encoding="utf-8"?>' . $html);
    libxml_clear_errors();

    // MediaWiki wraps the article body in an element with id "mw-content-text"
    $contentElement = $dom->getElementById('mw-content-text');

    // Fall back to the "mw-content-ltr" class used by some wiki skins
    if (!$contentElement) {
        $xpath = new \DOMXPath($dom);
        $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
        if ($elements->length > 0) {
            $contentElement = $elements->item(0);
        }
    }

    // If no recognizable content wrapper exists, return the original HTML
    if (!$contentElement) {
        return $html;
    }

    // Serialize only the content element back to an HTML string
    $contentHtml = $dom->saveHTML($contentElement);

    // Strip presentation chrome that is useless for translation:
    // script/style elements and MediaWiki "[edit]" section links.
    // NOTE(review): regex-on-HTML is best-effort — the non-greedy
    // mw-editsection pattern will stop early on nested <span>s; acceptable
    // here because the result is only displayed as a translation aid.
    $contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
    $contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);
    $contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);

    return $contentHtml;
}
}

View file

@ -48,7 +48,9 @@
</td>
<td>
<a href="https://wiki.openstreetmap.org/{{ key }}">
<strong>{{ key }}</strong>
</a>
</td>
{% set diff = page_differences[key] %}
@ -212,7 +214,6 @@
{% endif %}
<div>
<strong>{{ page.key }}</strong>
<span class="badge bg-primary">Spécifique</span>
</div>
</div>
</td>

View file

@ -81,6 +81,49 @@
</ul>
</div>
{% if english_html or french_html %}
<div class="card mb-4">
<div class="card-header">
<h2>Contenu HTML des pages</h2>
<p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<div class="card h-100">
<div class="card-header bg-primary text-white">
<i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
</div>
<div class="card-body">
<div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
{{ english_html|raw }}
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card h-100">
<div class="card-header bg-info text-white">
<i class="bi bi-translate"></i> Contenu HTML de la version française
</div>
<div class="card-body">
<div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
{% if french_html %}
{{ french_html|raw }}
{% else %}
<div class="alert alert-warning">
<i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
</div>
{% endif %}
</div>
</div>
</div>
</div>
</div>
</div>
</div>
{% endif %}
<div class="row">
<div class="col-md-6">
<div class="iframe-header">

View file

@ -30,11 +30,14 @@ import re
import os
import subprocess
import tempfile
import hashlib
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path
# Configure logging
logging.basicConfig(
@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 50
# HTML cache folder
HTML_CACHE_DIR = "html_cache"
# Initialize NLTK for sentence tokenization
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
# List of specific pages to compare (in addition to top keys)
# This list can include:
@ -262,8 +276,25 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
url = f"{base_url}{key}"
page_title = key
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
# Create a unique cache filename based on the URL
cache_key = hashlib.md5(url.encode()).hexdigest()
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
html_content = None
# Try to load from cache first
if cache_file.exists():
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
try:
with open(cache_file, 'r', encoding='utf-8') as f:
html_content = f.read()
except Exception as e:
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
html_content = None
# If not in cache or cache read failed, fetch from web
if html_content is None:
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)
@ -273,8 +304,20 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
return None
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(response.text, 'html.parser')
# Save to cache
try:
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
except Exception as e:
logger.warning(f"Error saving to cache: {e}")
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None
soup = BeautifulSoup(html_content, 'html.parser')
# Get last modification date
last_modified = None
@ -318,7 +361,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'level': section_level
})
# Count words in the content
# Count words and sentences in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
@ -334,6 +377,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Count sentences using NLTK
sentences = nltk.sent_tokenize(clean_text)
sentence_count = len(sentences)
# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'sections': sections,
'section_titles': section_titles,
'word_count': word_count,
'sentence_count': sentence_count,
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,
@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'categories': categories,
'description_img_url': description_img_url,
'is_specific_page': is_specific_page,
'grammar_suggestions': grammar_suggestions
'grammar_suggestions': grammar_suggestions,
'html_content': html_content
}
except requests.exceptions.RequestException as e:
@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
if date_diff > 30:
reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
reason.append(f"La version Anglaise a {word_diff} plus de mots")
reason.append(f"La version Anglaise a {word_diff} mots de plus")
if section_diff > 2:
reason.append(f"La version Anglaise a {section_diff} plus de sections")
reason.append(f"La version Anglaise a {section_diff} sections de plus")
if link_diff > 20:
reason.append(f"La version Anglaise a {link_diff} plus de liens")
reason.append(f"La version Anglaise a {link_diff} liens de plus")
if media_diff > 5:
reason.append(f"La version Anglaise a {media_diff} plus d'images")
reason.append(f"La version Anglaise a {media_diff} images de plus")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")