Update comparison wording
Parent 1140c87932, commit 09e16d9075
6 changed files with 443 additions and 239 deletions
.gitignore (vendored): 1 addition

@@ -30,6 +30,7 @@ venv
 wiki_compare/.env
 wiki_compare/*.png
 wiki_compare/*.json
+wiki_compare/html_cache/
 public/*.json

 .idea
.idea/php.xml (generated): 2 changes

@@ -141,7 +141,7 @@
         <path value="$PROJECT_DIR$/vendor/symfony/mime" />
       </include_path>
     </component>
-    <component name="PhpProjectSharedConfiguration" php_language_level="8.3">
+    <component name="PhpProjectSharedConfiguration" php_language_level="8.2">
       <option name="suggestChangeDefaultLanguageLevel" value="false" />
     </component>
     <component name="PhpStanOptionsConfiguration">
@@ -719,11 +719,68 @@ class WikiController extends AbstractController
         // Construct the URLs for the English page and the French page creation form
         $englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
         $frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";

+        // Fetch the HTML content of the English page using wiki_compare.py
+        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
+        $englishHtml = null;
+        $frenchHtml = null;
+
+        if (file_exists($scriptPath)) {
+            // Create a temporary Python script to fetch the page content
+            $tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
+            $pythonCode = <<<EOT
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+import json
+from wiki_compare import fetch_wiki_page
+
+# Get the key from command line arguments
+key = sys.argv[1]
+language = sys.argv[2]
+
+# Fetch the page
+page = fetch_wiki_page(key, language)
+
+# Output the HTML content
+if page and 'html_content' in page:
+    print(page['html_content'])
+else:
+    print("")
+EOT;
+
+            file_put_contents($tempScriptPath, $pythonCode);
+            chmod($tempScriptPath, 0755);
+
+            // Fetch English page
+            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
+            $englishHtml = shell_exec($command);
+
+            // Extract only the content part from the HTML (remove headers, footers, etc.)
+            if ($englishHtml) {
+                $englishHtml = $this->extractMainContent($englishHtml);
+            }
+
+            // Fetch French page (might not exist, but we'll try)
+            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
+            $frenchHtml = shell_exec($command);
+
+            // Extract only the content part from the HTML (remove headers, footers, etc.)
+            if ($frenchHtml) {
+                $frenchHtml = $this->extractMainContent($frenchHtml);
+            }
+
+            // Clean up the temporary script
+            unlink($tempScriptPath);
+        }
+
         return $this->render('admin/wiki_create_french.html.twig', [
             'key' => $key,
             'english_url' => $englishUrl,
-            'french_edit_url' => $frenchEditUrl
+            'french_edit_url' => $frenchEditUrl,
+            'english_html' => $englishHtml,
+            'french_html' => $frenchHtml
         ]);
     }

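For local testing, the generated helper can be exercised on its own. A minimal sketch of the equivalent invocation from Python, assuming the helper has already been written to wiki_compare/temp_fetch_page.py and using the key 'highway' as an example; list-form arguments avoid the shell interpolation the controller's $command string relies on:

    import subprocess

    # Run the generated helper the same way the controller does, but with
    # list-form arguments so the key needs no shell quoting.
    result = subprocess.run(
        ["python3", "temp_fetch_page.py", "highway", "en"],
        cwd="wiki_compare",
        capture_output=True,
        text=True,
    )
    english_html = result.stdout.strip() or None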
@@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
             'fr_links' => $frLinks
         ]);
     }
+
+    /**
+     * Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
+     *
+     * @param string $html The full HTML content
+     * @return string The extracted main content
+     */
+    private function extractMainContent(string $html): string
+    {
+        // Use a simple approach to extract the content
+        // This could be improved with a more sophisticated HTML parser if needed
+
+        // Create a DOMDocument to parse the HTML
+        $dom = new \DOMDocument();
+
+        // Suppress warnings about malformed HTML
+        libxml_use_internal_errors(true);
+        $dom->loadHTML($html);
+        libxml_clear_errors();
+
+        // Try to find the main content element
+        $contentElement = null;
+
+        // First, try to find the element with id "mw-content-text"
+        $contentElement = $dom->getElementById('mw-content-text');
+
+        // If not found, try to find the element with class "mw-content-ltr"
+        if (!$contentElement) {
+            $xpath = new \DOMXPath($dom);
+            $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
+            if ($elements->length > 0) {
+                $contentElement = $elements->item(0);
+            }
+        }
+
+        // If still not found, return the original HTML
+        if (!$contentElement) {
+            return $html;
+        }
+
+        // Get the HTML of the content element
+        $contentHtml = $dom->saveHTML($contentElement);
+
+        // Clean up the content HTML
+        // Remove script and style elements
+        $contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
+        $contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);
+
+        // Remove edit section links
+        $contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);
+
+        return $contentHtml;
+    }
 }
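For comparison, the same extraction can be sketched on the Python side with BeautifulSoup, which is already a dependency of wiki_compare.py. This helper is hypothetical, not part of the commit:

    from bs4 import BeautifulSoup

    def extract_main_content(html: str) -> str:
        # Prefer the #mw-content-text element, then fall back to .mw-content-ltr,
        # mirroring the PHP DOMDocument logic above.
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.select_one('#mw-content-text') or soup.select_one('.mw-content-ltr')
        if content is None:
            return html
        # Drop scripts, styles and "[edit]" section links, as the PHP version does.
        for elem in content.select('script, style, span.mw-editsection'):
            elem.extract()
        return str(content)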
@@ -48,7 +48,9 @@
 </td>
 <td>
+    <a href="https://wiki.openstreetmap.org/{{ key }}">
     <strong>{{ key }}</strong>
+    </a>
 </td>

 {% set diff = page_differences[key] %}
@@ -212,7 +214,6 @@
 {% endif %}
 <div>
     <strong>{{ page.key }}</strong>
-    <span class="badge bg-primary">Spécifique</span>
 </div>
 </div>
 </td>
@@ -81,6 +81,49 @@
 </ul>
 </div>

+{% if english_html or french_html %}
+<div class="card mb-4">
+    <div class="card-header">
+        <h2>Contenu HTML des pages</h2>
+        <p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
+    </div>
+    <div class="card-body">
+        <div class="row">
+            <div class="col-md-6">
+                <div class="card h-100">
+                    <div class="card-header bg-primary text-white">
+                        <i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
+                    </div>
+                    <div class="card-body">
+                        <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
+                            {{ english_html|raw }}
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div class="col-md-6">
+                <div class="card h-100">
+                    <div class="card-header bg-info text-white">
+                        <i class="bi bi-translate"></i> Contenu HTML de la version française
+                    </div>
+                    <div class="card-body">
+                        <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
+                            {% if french_html %}
+                                {{ french_html|raw }}
+                            {% else %}
+                                <div class="alert alert-warning">
+                                    <i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
+                                </div>
+                            {% endif %}
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+</div>
+{% endif %}
+
 <div class="row">
 <div class="col-md-6">
 <div class="iframe-header">
@@ -30,11 +30,14 @@ import re
 import os
 import subprocess
 import tempfile
+import hashlib
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk
+from pathlib import Path

 # Configure logging
 logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
 NUM_WIKI_PAGES = 50
+# HTML cache folder
+HTML_CACHE_DIR = "html_cache"
+
+# Initialize NLTK for sentence tokenization
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+
+# Create HTML cache directory if it doesn't exist
+Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
 # List of specific pages to compare (in addition to top keys)
 # This list can include:
@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
|
||||||
url = f"{base_url}{key}"
|
url = f"{base_url}{key}"
|
||||||
page_title = key
|
page_title = key
|
||||||
|
|
||||||
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
|
# Create a unique cache filename based on the URL
|
||||||
|
cache_key = hashlib.md5(url.encode()).hexdigest()
|
||||||
|
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
|
||||||
|
|
||||||
try:
|
html_content = None
|
||||||
response = requests.get(url)
|
|
||||||
|
# Try to load from cache first
|
||||||
# Check if page exists
|
if cache_file.exists():
|
||||||
if response.status_code == 404:
|
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
|
||||||
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
|
try:
|
||||||
|
with open(cache_file, 'r', encoding='utf-8') as f:
|
||||||
|
html_content = f.read()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
|
||||||
|
html_content = None
|
||||||
|
|
||||||
|
# If not in cache or cache read failed, fetch from web
|
||||||
|
if html_content is None:
|
||||||
|
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
|
||||||
|
try:
|
||||||
|
response = requests.get(url)
|
||||||
|
|
||||||
|
# Check if page exists
|
||||||
|
if response.status_code == 404:
|
||||||
|
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
|
||||||
|
return None
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
html_content = response.text
|
||||||
|
|
||||||
|
# Save to cache
|
||||||
|
try:
|
||||||
|
with open(cache_file, 'w', encoding='utf-8') as f:
|
||||||
|
f.write(html_content)
|
||||||
|
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error saving to cache: {e}")
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
|
|
||||||
|
# Get last modification date
|
||||||
|
last_modified = None
|
||||||
|
footer_info = soup.select_one('#footer-info-lastmod')
|
||||||
|
if footer_info:
|
||||||
|
date_text = footer_info.text
|
||||||
|
# Extract date using regex
|
||||||
|
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
|
||||||
|
if date_match:
|
||||||
|
date_str = date_match.group(1)
|
||||||
|
try:
|
||||||
|
# Parse date (format may vary based on wiki language)
|
||||||
|
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
|
||||||
|
except ValueError:
|
||||||
|
logger.warning(f"Could not parse date: {date_str}")
|
||||||
|
|
||||||
|
# Extract sections (h2, h3, h4)
|
||||||
|
section_elements = soup.select('h2, h3, h4')
|
||||||
|
sections = len(section_elements)
|
||||||
|
|
||||||
|
# Extract section titles
|
||||||
|
section_titles = []
|
||||||
|
for section_elem in section_elements:
|
||||||
|
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
|
||||||
|
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip sections that are inside a table with class DescriptionBox
|
||||||
|
if section_elem.find_parent('table', class_='DescriptionBox'):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get the text of the section title, removing any edit links
|
||||||
|
for edit_link in section_elem.select('.mw-editsection'):
|
||||||
|
edit_link.extract()
|
||||||
|
|
||||||
response.raise_for_status()
|
section_title = section_elem.get_text(strip=True)
|
||||||
|
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, 'html.parser')
|
section_titles.append({
|
||||||
|
'title': section_title,
|
||||||
|
'level': section_level
|
||||||
|
})
|
||||||
|
|
||||||
# Get last modification date
|
# Count words and sentences in the content
|
||||||
last_modified = None
|
content = soup.select_one('#mw-content-text')
|
||||||
footer_info = soup.select_one('#footer-info-lastmod')
|
clean_text = ""
|
||||||
if footer_info:
|
if content:
|
||||||
date_text = footer_info.text
|
# Remove script and style elements
|
||||||
# Extract date using regex
|
for script in content.select('script, style'):
|
||||||
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
|
script.extract()
|
||||||
if date_match:
|
|
||||||
date_str = date_match.group(1)
|
|
||||||
try:
|
|
||||||
# Parse date (format may vary based on wiki language)
|
|
||||||
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
|
|
||||||
except ValueError:
|
|
||||||
logger.warning(f"Could not parse date: {date_str}")
|
|
||||||
|
|
||||||
# Extract sections (h2, h3, h4)
|
# Remove .languages elements
|
||||||
section_elements = soup.select('h2, h3, h4')
|
for languages_elem in content.select('.languages'):
|
||||||
sections = len(section_elements)
|
languages_elem.extract()
|
||||||
|
|
||||||
# Extract section titles
|
# Get text and count words
|
||||||
section_titles = []
|
clean_text = content.get_text(separator=' ', strip=True)
|
||||||
for section_elem in section_elements:
|
word_count = len(clean_text.split())
|
||||||
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
|
|
||||||
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
|
# Count sentences using NLTK
|
||||||
|
sentences = nltk.sent_tokenize(clean_text)
|
||||||
|
sentence_count = len(sentences)
|
||||||
|
|
||||||
|
# Check grammar for French pages
|
||||||
|
grammar_suggestions = []
|
||||||
|
if language == 'fr':
|
||||||
|
logger.info(f"Checking grammar for French page: {key}")
|
||||||
|
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
||||||
|
|
||||||
|
# Extract links
|
||||||
|
links = content.select('a')
|
||||||
|
link_count = len(links)
|
||||||
|
|
||||||
|
# Get link details (text and href)
|
||||||
|
link_details = []
|
||||||
|
for link in links:
|
||||||
|
href = link.get('href', '')
|
||||||
|
# Skip edit section links and other non-content links
|
||||||
|
if 'action=edit' in href or 'redlink=1' in href or not href:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip sections that are inside a table with class DescriptionBox
|
|
||||||
if section_elem.find_parent('table', class_='DescriptionBox'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get the text of the section title, removing any edit links
|
|
||||||
for edit_link in section_elem.select('.mw-editsection'):
|
|
||||||
edit_link.extract()
|
|
||||||
|
|
||||||
section_title = section_elem.get_text(strip=True)
|
# Make relative URLs absolute
|
||||||
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
|
if href.startswith('/'):
|
||||||
|
href = 'https://wiki.openstreetmap.org' + href
|
||||||
|
|
||||||
section_titles.append({
|
link_text = link.get_text(strip=True)
|
||||||
'title': section_title,
|
if link_text: # Only include links with text
|
||||||
'level': section_level
|
link_details.append({
|
||||||
})
|
'text': link_text,
|
||||||
|
'href': href
|
||||||
|
})
|
||||||
|
|
||||||
# Count words in the content
|
# Extract media (images)
|
||||||
content = soup.select_one('#mw-content-text')
|
media_elements = content.select('img')
|
||||||
clean_text = ""
|
media_count = len(media_elements)
|
||||||
if content:
|
|
||||||
# Remove script and style elements
|
# Get media details (src and alt text)
|
||||||
for script in content.select('script, style'):
|
media_details = []
|
||||||
script.extract()
|
|
||||||
|
# Extract description image specifically
|
||||||
# Remove .languages elements
|
# Try multiple selectors to find the description image
|
||||||
for languages_elem in content.select('.languages'):
|
description_img = None
|
||||||
languages_elem.extract()
|
|
||||||
|
# Debug: Log the key we're processing
|
||||||
# Get text and count words
|
logger.info(f"Looking for description image for key '{key}' in {language}")
|
||||||
clean_text = content.get_text(separator=' ', strip=True)
|
|
||||||
word_count = len(clean_text.split())
|
# Function to filter out OSM logo and small icons
|
||||||
|
def is_relevant_image(img):
|
||||||
# Check grammar for French pages
|
src = img.get('src', '')
|
||||||
grammar_suggestions = []
|
# Skip OSM logo
|
||||||
if language == 'fr':
|
if 'osm_logo' in src:
|
||||||
logger.info(f"Checking grammar for French page: {key}")
|
return False
|
||||||
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
# Skip small icons (usually less than 30px)
|
||||||
|
width = img.get('width')
|
||||||
# Extract links
|
if width and int(width) < 30:
|
||||||
links = content.select('a')
|
return False
|
||||||
link_count = len(links)
|
height = img.get('height')
|
||||||
|
if height and int(height) < 30:
|
||||||
# Get link details (text and href)
|
return False
|
||||||
link_details = []
|
return True
|
||||||
for link in links:
|
|
||||||
href = link.get('href', '')
|
# Special case for highway key - directly target the image we want
|
||||||
# Skip edit section links and other non-content links
|
if key == 'highway':
|
||||||
if 'action=edit' in href or 'redlink=1' in href or not href:
|
# Try to find the specific image in figure elements
|
||||||
continue
|
highway_img_elements = content.select('figure.mw-halign-center img')
|
||||||
|
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images for highway")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If not found with highway-specific selector, try the td.d_image selector
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('td.d_image img')
|
||||||
|
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try the specific selector for .description img.mw-file-element
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('.description img.mw-file-element')
|
||||||
|
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in .description")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try images in figures within the description box
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('.description figure img')
|
||||||
|
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try any image in the description box
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('.description img')
|
||||||
|
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try images in the DescriptionBox table
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('table.DescriptionBox img')
|
||||||
|
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If still not found, try images in figure elements anywhere in the content
|
||||||
|
if not description_img:
|
||||||
|
description_img_elements = content.select('figure img')
|
||||||
|
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
|
||||||
|
|
||||||
|
# Filter for relevant images
|
||||||
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# If we still don't have an image, use any image that's not the OSM logo
|
||||||
|
if not description_img:
|
||||||
|
all_images = content.select('img')
|
||||||
|
relevant_images = [img for img in all_images if is_relevant_image(img)]
|
||||||
|
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
|
||||||
|
|
||||||
|
if relevant_images:
|
||||||
|
description_img = relevant_images[0]
|
||||||
|
logger.info(f" Using fallback image: {description_img.get('src', '')}")
|
||||||
|
|
||||||
|
# Process the found image
|
||||||
|
description_img_url = None
|
||||||
|
if description_img:
|
||||||
|
src = description_img.get('src', '')
|
||||||
|
if src:
|
||||||
# Make relative URLs absolute
|
# Make relative URLs absolute
|
||||||
if href.startswith('/'):
|
if src.startswith('//'):
|
||||||
href = 'https://wiki.openstreetmap.org' + href
|
src = 'https:' + src
|
||||||
|
elif src.startswith('/'):
|
||||||
link_text = link.get_text(strip=True)
|
src = 'https://wiki.openstreetmap.org' + src
|
||||||
if link_text: # Only include links with text
|
|
||||||
link_details.append({
|
|
||||||
'text': link_text,
|
|
||||||
'href': href
|
|
||||||
})
|
|
||||||
|
|
||||||
# Extract media (images)
|
|
||||||
media_elements = content.select('img')
|
|
||||||
media_count = len(media_elements)
|
|
||||||
|
|
||||||
# Get media details (src and alt text)
|
|
||||||
media_details = []
|
|
||||||
|
|
||||||
# Extract description image specifically
|
description_img_url = src
|
||||||
# Try multiple selectors to find the description image
|
|
||||||
description_img = None
|
|
||||||
|
|
||||||
# Debug: Log the key we're processing
|
|
||||||
logger.info(f"Looking for description image for key '{key}' in {language}")
|
|
||||||
|
|
||||||
# Function to filter out OSM logo and small icons
|
|
||||||
def is_relevant_image(img):
|
|
||||||
src = img.get('src', '')
|
|
||||||
# Skip OSM logo
|
|
||||||
if 'osm_logo' in src:
|
|
||||||
return False
|
|
||||||
# Skip small icons (usually less than 30px)
|
|
||||||
width = img.get('width')
|
|
||||||
if width and int(width) < 30:
|
|
||||||
return False
|
|
||||||
height = img.get('height')
|
|
||||||
if height and int(height) < 30:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Special case for highway key - directly target the image we want
|
|
||||||
if key == 'highway':
|
|
||||||
# Try to find the specific image in figure elements
|
|
||||||
highway_img_elements = content.select('figure.mw-halign-center img')
|
|
||||||
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images for highway")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If not found with highway-specific selector, try the td.d_image selector
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('td.d_image img')
|
|
||||||
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try the specific selector for .description img.mw-file-element
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('.description img.mw-file-element')
|
|
||||||
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in .description")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try images in figures within the description box
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('.description figure img')
|
|
||||||
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try any image in the description box
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('.description img')
|
|
||||||
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try images in the DescriptionBox table
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('table.DescriptionBox img')
|
|
||||||
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If still not found, try images in figure elements anywhere in the content
|
|
||||||
if not description_img:
|
|
||||||
description_img_elements = content.select('figure img')
|
|
||||||
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
|
|
||||||
|
|
||||||
# Filter for relevant images
|
|
||||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# If we still don't have an image, use any image that's not the OSM logo
|
|
||||||
if not description_img:
|
|
||||||
all_images = content.select('img')
|
|
||||||
relevant_images = [img for img in all_images if is_relevant_image(img)]
|
|
||||||
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
|
|
||||||
|
|
||||||
if relevant_images:
|
|
||||||
description_img = relevant_images[0]
|
|
||||||
logger.info(f" Using fallback image: {description_img.get('src', '')}")
|
|
||||||
|
|
||||||
# Process the found image
|
|
||||||
description_img_url = None
|
|
||||||
if description_img:
|
|
||||||
src = description_img.get('src', '')
|
|
||||||
if src:
|
|
||||||
# Make relative URLs absolute
|
|
||||||
if src.startswith('//'):
|
|
||||||
src = 'https:' + src
|
|
||||||
elif src.startswith('/'):
|
|
||||||
src = 'https://wiki.openstreetmap.org' + src
|
|
||||||
|
|
||||||
description_img_url = src
|
|
||||||
|
|
||||||
# Process all images
|
# Process all images
|
||||||
for img in media_elements:
|
for img in media_elements:
|
||||||
|
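The cache key is the MD5 digest of the full page URL, so the English and French versions of the same key cache to distinct files. A standalone sketch of the naming scheme, using an example URL:

    import hashlib
    from pathlib import Path

    HTML_CACHE_DIR = "html_cache"
    url = "https://wiki.openstreetmap.org/wiki/Key:highway"  # example URL

    # Same scheme as fetch_wiki_page: one file per URL digest.
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
    print(cache_file)  # html_cache/<32-character-hex-digest>.html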
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     'sections': sections,
     'section_titles': section_titles,
     'word_count': word_count,
+    'sentence_count': sentence_count,
     'link_count': link_count,
     'link_details': link_details,
     'media_count': media_count,
|
||||||
'categories': categories,
|
'categories': categories,
|
||||||
'description_img_url': description_img_url,
|
'description_img_url': description_img_url,
|
||||||
'is_specific_page': is_specific_page,
|
'is_specific_page': is_specific_page,
|
||||||
'grammar_suggestions': grammar_suggestions
|
'grammar_suggestions': grammar_suggestions,
|
||||||
|
'html_content': html_content
|
||||||
}
|
}
|
||||||
|
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
|
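With the two new fields in place, a caller can read both the sentence count and the raw HTML from the returned dict. A minimal usage sketch, with 'highway' as an example key:

    from wiki_compare import fetch_wiki_page

    page = fetch_wiki_page('highway', 'en')
    if page:
        # Both fields are added by this commit.
        print(page['sentence_count'], len(page['html_content']))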
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
     if date_diff > 30:
         reason.append(f"La version Française est datée de {date_diff} jours")
     if word_diff > 200:
-        reason.append(f"La version Anglaise a {word_diff} plus de mots")
+        reason.append(f"La version Anglaise a {word_diff} mots de plus")
     if section_diff > 2:
-        reason.append(f"La version Anglaise a {section_diff} plus de sections")
+        reason.append(f"La version Anglaise a {section_diff} sections de plus")
     if link_diff > 20:
-        reason.append(f"La version Anglaise a {link_diff} plus de liens")
+        reason.append(f"La version Anglaise a {link_diff} liens de plus")
     if media_diff > 5:
-        reason.append(f"La version Anglaise a {media_diff} plus d'images")
+        reason.append(f"La version Anglaise a {media_diff} images de plus")
     if fr_page['word_count'] < en_page['word_count'] * 0.7:
         reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")
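One wording nit remains on the unchanged percentage line: the :.0% format specifier already renders a percent sign, so the literal " %" after the placeholder doubles it, as a quick check shows:

    ratio = 0.7
    print(f"seulement {ratio:.0%} % du contenu")  # prints "seulement 70% % du contenu"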