Update wording comparison

This commit is contained in:
Tykayn 2025-09-03 16:04:16 +02:00 committed by tykayn
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions

1
.gitignore vendored
View file

@ -30,6 +30,7 @@ venv
wiki_compare/.env
wiki_compare/*.png
wiki_compare/*.json
wiki_compare/html_cache/
public/*.json
.idea

2
.idea/php.xml generated
View file

@ -141,7 +141,7 @@
<path value="$PROJECT_DIR$/vendor/symfony/mime" />
</include_path>
</component>
<component name="PhpProjectSharedConfiguration" php_language_level="8.3">
<component name="PhpProjectSharedConfiguration" php_language_level="8.2">
<option name="suggestChangeDefaultLanguageLevel" value="false" />
</component>
<component name="PhpStanOptionsConfiguration">

View file

@ -720,10 +720,67 @@ class WikiController extends AbstractController
$englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
$frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";
// Fetch the HTML content of the English page using wiki_compare.py
$scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
$englishHtml = null;
$frenchHtml = null;
if (file_exists($scriptPath)) {
// Create a temporary Python script to fetch the page content
$tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
$pythonCode = <<<EOT
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import json
from wiki_compare import fetch_wiki_page
# Get the key from command line arguments
key = sys.argv[1]
language = sys.argv[2]
# Fetch the page
page = fetch_wiki_page(key, language)
# Output the HTML content
if page and 'html_content' in page:
print(page['html_content'])
else:
print("")
EOT;
file_put_contents($tempScriptPath, $pythonCode);
chmod($tempScriptPath, 0755);
// Fetch English page
$command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
$englishHtml = shell_exec($command);
// Extract only the content part from the HTML (remove headers, footers, etc.)
if ($englishHtml) {
$englishHtml = $this->extractMainContent($englishHtml);
}
// Fetch French page (might not exist, but we'll try)
$command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
$frenchHtml = shell_exec($command);
// Extract only the content part from the HTML (remove headers, footers, etc.)
if ($frenchHtml) {
$frenchHtml = $this->extractMainContent($frenchHtml);
}
// Clean up the temporary script
unlink($tempScriptPath);
}
return $this->render('admin/wiki_create_french.html.twig', [
'key' => $key,
'english_url' => $englishUrl,
'french_edit_url' => $frenchEditUrl
'french_edit_url' => $frenchEditUrl,
'english_html' => $englishHtml,
'french_html' => $frenchHtml
]);
}
@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
'fr_links' => $frLinks
]);
}
/**
 * Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
 *
 * @param string $html The full HTML content
 * @return string The extracted main content
 */
private function extractMainContent(string $html): string
{
    // Guard: loadHTML('') emits a warning and returns false, so an empty
    // document is returned untouched.
    if (trim($html) === '') {
        return $html;
    }

    // Parse the HTML with DOMDocument. Without an explicit encoding hint,
    // loadHTML() assumes ISO-8859-1 and mangles UTF-8 accented characters
    // (the wiki pages handled here are UTF-8, notably the French ones),
    // so prepend an XML encoding declaration before parsing.
    $dom = new \DOMDocument();
    // Suppress warnings about malformed wiki HTML
    libxml_use_internal_errors(true);
    $dom->loadHTML('<?xml encoding="utf-8"?>' . $html);
    libxml_clear_errors();

    // MediaWiki wraps the article body in an element with id "mw-content-text"
    $contentElement = $dom->getElementById('mw-content-text');

    // Fall back to the "mw-content-ltr" class used by some wiki skins
    if (!$contentElement) {
        $xpath = new \DOMXPath($dom);
        $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
        if ($elements->length > 0) {
            $contentElement = $elements->item(0);
        }
    }

    // If no recognizable content wrapper exists, return the original HTML
    if (!$contentElement) {
        return $html;
    }

    // Serialize only the content element back to an HTML string
    $contentHtml = $dom->saveHTML($contentElement);

    // Strip presentation chrome that is useless for translation:
    // script/style elements and MediaWiki "[edit]" section links.
    // NOTE(review): regex-on-HTML is best-effort — the non-greedy
    // mw-editsection pattern will stop early on nested <span>s; acceptable
    // here because the result is only displayed as a translation aid.
    $contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
    $contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);
    $contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);

    return $contentHtml;
}
}

View file

@ -48,7 +48,9 @@
</td>
<td>
<a href="https://wiki.openstreetmap.org/{{ key }}">
<strong>{{ key }}</strong>
</a>
</td>
{% set diff = page_differences[key] %}
@ -212,7 +214,6 @@
{% endif %}
<div>
<strong>{{ page.key }}</strong>
<span class="badge bg-primary">Spécifique</span>
</div>
</div>
</td>

View file

@ -81,6 +81,49 @@
</ul>
</div>
{% if english_html or french_html %}
<div class="card mb-4">
<div class="card-header">
<h2>Contenu HTML des pages</h2>
<p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<div class="card h-100">
<div class="card-header bg-primary text-white">
<i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
</div>
<div class="card-body">
<div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
{{ english_html|raw }}
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card h-100">
<div class="card-header bg-info text-white">
<i class="bi bi-translate"></i> Contenu HTML de la version française
</div>
<div class="card-body">
<div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
{% if french_html %}
{{ french_html|raw }}
{% else %}
<div class="alert alert-warning">
<i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
</div>
{% endif %}
</div>
</div>
</div>
</div>
</div>
</div>
</div>
{% endif %}
<div class="row">
<div class="col-md-6">
<div class="iframe-header">

View file

@ -30,11 +30,14 @@ import re
import os
import subprocess
import tempfile
import hashlib
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path
# Configure logging
logging.basicConfig(
@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 50
# HTML cache folder
HTML_CACHE_DIR = "html_cache"
# Initialize NLTK for sentence tokenization
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
# List of specific pages to compare (in addition to top keys)
# This list can include:
@ -262,8 +276,25 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
url = f"{base_url}{key}"
page_title = key
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
# Create a unique cache filename based on the URL
cache_key = hashlib.md5(url.encode()).hexdigest()
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
html_content = None
# Try to load from cache first
if cache_file.exists():
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
try:
with open(cache_file, 'r', encoding='utf-8') as f:
html_content = f.read()
except Exception as e:
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
html_content = None
# If not in cache or cache read failed, fetch from web
if html_content is None:
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)
@ -273,8 +304,20 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
return None
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(response.text, 'html.parser')
# Save to cache
try:
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
except Exception as e:
logger.warning(f"Error saving to cache: {e}")
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None
soup = BeautifulSoup(html_content, 'html.parser')
# Get last modification date
last_modified = None
@ -318,7 +361,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'level': section_level
})
# Count words in the content
# Count words and sentences in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
@ -334,6 +377,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Count sentences using NLTK
sentences = nltk.sent_tokenize(clean_text)
sentence_count = len(sentences)
# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'sections': sections,
'section_titles': section_titles,
'word_count': word_count,
'sentence_count': sentence_count,
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,
@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'categories': categories,
'description_img_url': description_img_url,
'is_specific_page': is_specific_page,
'grammar_suggestions': grammar_suggestions
'grammar_suggestions': grammar_suggestions,
'html_content': html_content
}
except requests.exceptions.RequestException as e:
@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
if date_diff > 30:
reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
reason.append(f"La version Anglaise a {word_diff} plus de mots")
reason.append(f"La version Anglaise a {word_diff} mots de plus")
if section_diff > 2:
reason.append(f"La version Anglaise a {section_diff} plus de sections")
reason.append(f"La version Anglaise a {section_diff} sections de plus")
if link_diff > 20:
reason.append(f"La version Anglaise a {link_diff} plus de liens")
reason.append(f"La version Anglaise a {link_diff} liens de plus")
if media_diff > 5:
reason.append(f"La version Anglaise a {media_diff} plus d'images")
reason.append(f"La version Anglaise a {media_diff} images de plus")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")