up wording comparison

Tykayn 2025-09-03 16:04:16 +02:00 committed by tykayn
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions

.gitignore (vendored)

@@ -30,6 +30,7 @@ venv
wiki_compare/.env
wiki_compare/*.png
wiki_compare/*.json
wiki_compare/html_cache/
public/*.json
.idea

.idea/php.xml (generated)

@@ -141,7 +141,7 @@
<path value="$PROJECT_DIR$/vendor/symfony/mime" />
</include_path>
</component>
<component name="PhpProjectSharedConfiguration" php_language_level="8.3">
<component name="PhpProjectSharedConfiguration" php_language_level="8.2">
<option name="suggestChangeDefaultLanguageLevel" value="false" />
</component>
<component name="PhpStanOptionsConfiguration">


@@ -720,10 +720,67 @@ class WikiController extends AbstractController
$englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
$frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";
// Fetch the HTML content of the English page using wiki_compare.py
$scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
$englishHtml = null;
$frenchHtml = null;
if (file_exists($scriptPath)) {
// Create a temporary Python script to fetch the page content
$tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
$pythonCode = <<<EOT
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import json
from wiki_compare import fetch_wiki_page
# Get the key from command line arguments
key = sys.argv[1]
language = sys.argv[2]
# Fetch the page
page = fetch_wiki_page(key, language)
# Output the HTML content
if page and 'html_content' in page:
    print(page['html_content'])
else:
    print("")
EOT;
file_put_contents($tempScriptPath, $pythonCode);
chmod($tempScriptPath, 0755);
// Fetch English page
$command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
$englishHtml = shell_exec($command);
// Extract only the content part from the HTML (remove headers, footers, etc.)
if ($englishHtml) {
$englishHtml = $this->extractMainContent($englishHtml);
}
// Fetch French page (might not exist, but we'll try)
$command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
$frenchHtml = shell_exec($command);
// Extract only the content part from the HTML (remove headers, footers, etc.)
if ($frenchHtml) {
$frenchHtml = $this->extractMainContent($frenchHtml);
}
// Clean up the temporary script
unlink($tempScriptPath);
}
return $this->render('admin/wiki_create_french.html.twig', [
'key' => $key,
'english_url' => $englishUrl,
'french_edit_url' => $frenchEditUrl
'french_edit_url' => $frenchEditUrl,
'english_html' => $englishHtml,
'french_html' => $frenchHtml
]);
}
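A note on the shell calls above: $key is interpolated into the command line unescaped, so a key containing shell metacharacters would break the command (PHP's escapeshellarg() would guard it). Writing and deleting temp_fetch_page.py on every request could also be avoided by keeping a small permanent CLI in wiki_compare/. A minimal sketch of such a wrapper, assuming the fetch_wiki_page import shown above (the file name fetch_page_cli.py is hypothetical, not part of this commit):

#!/usr/bin/env python3
# fetch_page_cli.py (hypothetical) - permanent replacement for the generated temp_fetch_page.py
import argparse
from wiki_compare import fetch_wiki_page

parser = argparse.ArgumentParser(description="Print the HTML content of an OSM wiki page")
parser.add_argument('key', help="wiki key or page title, e.g. highway")
parser.add_argument('language', choices=['en', 'fr'], help="wiki language code")
args = parser.parse_args()

page = fetch_wiki_page(args.key, args.language)
# Print the HTML if the page exists, an empty string otherwise (same contract as the temp script)
print(page['html_content'] if page and 'html_content' in page else "")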
@@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
'fr_links' => $frLinks
]);
}
/**
* Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
*
* @param string $html The full HTML content
* @return string The extracted main content
*/
private function extractMainContent(string $html): string
{
// Use a simple approach to extract the content
// This could be improved with a more sophisticated HTML parser if needed
// Create a DOMDocument to parse the HTML
$dom = new \DOMDocument();
// Suppress warnings about malformed HTML
libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
// Try to find the main content element
$contentElement = null;
// First, try to find the element with id "mw-content-text"
$contentElement = $dom->getElementById('mw-content-text');
// If not found, try to find the element with class "mw-content-ltr"
if (!$contentElement) {
$xpath = new \DOMXPath($dom);
$elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
if ($elements->length > 0) {
$contentElement = $elements->item(0);
}
}
// If still not found, return the original HTML
if (!$contentElement) {
return $html;
}
// Get the HTML of the content element
$contentHtml = $dom->saveHTML($contentElement);
// Clean up the content HTML
// Remove script and style elements
$contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
$contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);
// Remove edit section links
$contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);
return $contentHtml;
}
}
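For reference, the same content extraction can be sketched on the Python side with BeautifulSoup, which wiki_compare.py already uses; the selectors mirror the PHP method above, and the helper name is an assumption:

from bs4 import BeautifulSoup

def extract_main_content(html: str) -> str:
    # Parse leniently, as the PHP side does with libxml warnings suppressed
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id='mw-content-text') or soup.select_one('.mw-content-ltr')
    if content is None:
        # Fall back to the full document, like the PHP method
        return html
    # Drop scripts, styles and the per-section edit links
    for elem in content.select('script, style, span.mw-editsection'):
        elem.decompose()
    return str(content)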


@@ -48,7 +48,9 @@
</td>
<td>
<a href="https://wiki.openstreetmap.org/{{ key }}">
<strong>{{ key }}</strong>
</a>
</td>
{% set diff = page_differences[key] %}
@@ -212,7 +214,6 @@
{% endif %}
<div>
<strong>{{ page.key }}</strong>
<span class="badge bg-primary">Spécifique</span>
</div>
</div>
</td>


@@ -81,6 +81,49 @@
</ul>
</div>
{% if english_html or french_html %}
<div class="card mb-4">
<div class="card-header">
<h2>Contenu HTML des pages</h2>
<p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<div class="card h-100">
<div class="card-header bg-primary text-white">
<i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
</div>
<div class="card-body">
<div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
{{ english_html|raw }}
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card h-100">
<div class="card-header bg-info text-white">
<i class="bi bi-translate"></i> Contenu HTML de la version française
</div>
<div class="card-body">
<div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
{% if french_html %}
{{ french_html|raw }}
{% else %}
<div class="alert alert-warning">
<i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
</div>
{% endif %}
</div>
</div>
</div>
</div>
</div>
</div>
</div>
{% endif %}
<div class="row">
<div class="col-md-6">
<div class="iframe-header">


@@ -30,11 +30,14 @@ import re
import os
import subprocess
import tempfile
import hashlib
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path
# Configure logging
logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 50
# HTML cache folder
HTML_CACHE_DIR = "html_cache"
# Initialize NLTK for sentence tokenization
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
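Depending on the installed NLTK version, sent_tokenize may also need the newer 'punkt_tab' resource (NLTK 3.9 split it out of 'punkt'); a slightly broader bootstrap, an assumption rather than part of this commit, would be:

for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{resource}')
    except LookupError:
        nltk.download(resource)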
# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
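The cache introduced below stores one file per page, keyed by the MD5 of the full URL, so the same key fetched in two languages never collides. A minimal illustration of the scheme used in fetch_wiki_page (the helper name is hypothetical):

import hashlib
from pathlib import Path

def cache_path(url: str) -> Path:
    # One .html file per distinct URL inside html_cache/
    return Path(HTML_CACHE_DIR) / f"{hashlib.md5(url.encode()).hexdigest()}.html"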
# List of specific pages to compare (in addition to top keys)
# This list can include:
@@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
url = f"{base_url}{key}"
page_title = key
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
# Create a unique cache filename based on the URL
cache_key = hashlib.md5(url.encode()).hexdigest()
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
try:
response = requests.get(url)
html_content = None
# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
# Try to load from cache first
if cache_file.exists():
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
try:
with open(cache_file, 'r', encoding='utf-8') as f:
html_content = f.read()
except Exception as e:
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
html_content = None
# If not in cache or cache read failed, fetch from web
if html_content is None:
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)
# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
return None
response.raise_for_status()
html_content = response.text
# Save to cache
try:
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
except Exception as e:
logger.warning(f"Error saving to cache: {e}")
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None
response.raise_for_status()
soup = BeautifulSoup(html_content, 'html.parser')
soup = BeautifulSoup(response.text, 'html.parser')
# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
if footer_info:
date_text = footer_info.text
# Extract date using regex
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
if date_match:
date_str = date_match.group(1)
try:
# Parse date (format may vary based on wiki language)
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
if footer_info:
date_text = footer_info.text
# Extract date using regex
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
if date_match:
date_str = date_match.group(1)
try:
# Parse date (format may vary based on wiki language)
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)
# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
continue
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue
# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
section_titles.append({
'title': section_title,
'level': section_level
})
# Count words and sentences in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
# Remove script and style elements
for script in content.select('script, style'):
script.extract()
# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()
# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Count sentences using NLTK
sentences = nltk.sent_tokenize(clean_text)
sentence_count = len(sentences)
# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
# Extract links
links = content.select('a')
link_count = len(links)
# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue
# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href
# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})
section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)
section_titles.append({
'title': section_title,
'level': section_level
})
# Get media details (src and alt text)
media_details = []
# Count words in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
# Remove script and style elements
for script in content.select('script, style'):
script.extract()
# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None
# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()
# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")
# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Function to filter out OSM logo and small icons
def is_relevant_image(img):
src = img.get('src', '')
# Skip OSM logo
if 'osm_logo' in src:
return False
# Skip small icons (usually less than 30px)
width = img.get('width')
if width and int(width) < 30:
return False
height = img.get('height')
if height and int(height) < 30:
return False
return True
# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
# Extract links
links = content.select('a')
link_count = len(links)
# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")
# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
# If we still don't have an image, use any image that's not the OSM logo
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")
# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})
# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)
# Get media details (src and alt text)
media_details = []
# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None
# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")
# Function to filter out OSM logo and small icons
def is_relevant_image(img):
src = img.get('src', '')
# Skip OSM logo
if 'osm_logo' in src:
return False
# Skip small icons (usually less than 30px)
width = img.get('width')
if width and int(width) < 30:
return False
height = img.get('height')
if height and int(height) < 30:
return False
return True
# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
# If we still don't have an image, use any image that's not the OSM logo
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")
# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src
description_img_url = src
description_img_url = src
# Process all images
for img in media_elements:
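The description-image lookup above repeats the same select/filter/log block for each selector; a table-driven variant, sketched here as a possible follow-up (the helper name is hypothetical, is_relevant_image and logger as defined above), would keep the priority order in one list:

def find_description_image(content, key):
    # Ordered from most specific to most generic, matching the cascade above
    selectors = ['td.d_image img',
                 '.description img.mw-file-element',
                 '.description figure img',
                 '.description img',
                 'table.DescriptionBox img',
                 'figure img',
                 'img']
    if key == 'highway':
        selectors.insert(0, 'figure.mw-halign-center img')
    for selector in selectors:
        relevant = [img for img in content.select(selector) if is_relevant_image(img)]
        if relevant:
            logger.info(f"  Using image from '{selector}': {relevant[0].get('src', '')}")
            return relevant[0]
    return None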
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'sections': sections,
'section_titles': section_titles,
'word_count': word_count,
'sentence_count': sentence_count,
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,
@@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'categories': categories,
'description_img_url': description_img_url,
'is_specific_page': is_specific_page,
'grammar_suggestions': grammar_suggestions
'grammar_suggestions': grammar_suggestions,
'html_content': html_content
}
except requests.exceptions.RequestException as e:
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
if date_diff > 30:
reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
reason.append(f"La version Anglaise a {word_diff} plus de mots")
reason.append(f"La version Anglaise a {word_diff} mots de plus")
if section_diff > 2:
reason.append(f"La version Anglaise a {section_diff} plus de sections")
reason.append(f"La version Anglaise a {section_diff} sections de plus")
if link_diff > 20:
reason.append(f"La version Anglaise a {link_diff} plus de liens")
reason.append(f"La version Anglaise a {link_diff} liens de plus")
if media_diff > 5:
reason.append(f"La version Anglaise a {media_diff} plus d'images")
reason.append(f"La version Anglaise a {media_diff} images de plus")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")