up wording comparison
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions
.gitignore (vendored): 1 change

@@ -30,6 +30,7 @@ venv
wiki_compare/.env
wiki_compare/*.png
wiki_compare/*.json
wiki_compare/html_cache/
public/*.json

.idea
.idea/php.xml (generated): 2 changes

@@ -141,7 +141,7 @@
    <path value="$PROJECT_DIR$/vendor/symfony/mime" />
  </include_path>
</component>
<component name="PhpProjectSharedConfiguration" php_language_level="8.3">
<component name="PhpProjectSharedConfiguration" php_language_level="8.2">
  <option name="suggestChangeDefaultLanguageLevel" value="false" />
</component>
<component name="PhpStanOptionsConfiguration">

@@ -720,10 +720,67 @@ class WikiController extends AbstractController
        $englishUrl = "https://wiki.openstreetmap.org/wiki/Key:{$key}";
        $frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";

        // Fetch the HTML content of the English page using wiki_compare.py
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
        $englishHtml = null;
        $frenchHtml = null;

        if (file_exists($scriptPath)) {
            // Create a temporary Python script to fetch the page content
            $tempScriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/temp_fetch_page.py';
            $pythonCode = <<<EOT
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import json
from wiki_compare import fetch_wiki_page

# Get the key from command line arguments
key = sys.argv[1]
language = sys.argv[2]

# Fetch the page
page = fetch_wiki_page(key, language)

# Output the HTML content
if page and 'html_content' in page:
    print(page['html_content'])
else:
    print("")
EOT;

            file_put_contents($tempScriptPath, $pythonCode);
            chmod($tempScriptPath, 0755);

            // Fetch English page
            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
            $englishHtml = shell_exec($command);

            // Extract only the content part from the HTML (remove headers, footers, etc.)
            if ($englishHtml) {
                $englishHtml = $this->extractMainContent($englishHtml);
            }

            // Fetch French page (might not exist, but we'll try)
            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr";
            $frenchHtml = shell_exec($command);

            // Extract only the content part from the HTML (remove headers, footers, etc.)
            if ($frenchHtml) {
                $frenchHtml = $this->extractMainContent($frenchHtml);
            }

            // Clean up the temporary script
            unlink($tempScriptPath);
        }

        return $this->render('admin/wiki_create_french.html.twig', [
            'key' => $key,
            'english_url' => $englishUrl,
            'french_edit_url' => $frenchEditUrl
            'french_edit_url' => $frenchEditUrl,
            'english_html' => $englishHtml,
            'french_html' => $frenchHtml
        ]);
    }
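The heredoc script above depends only on fetch_wiki_page(key, language) returning a dict that carries an 'html_content' key (added to wiki_compare.py later in this commit). A minimal sketch of that contract, assuming it is run from the wiki_compare directory; 'highway' is only an illustrative key:

    from wiki_compare import fetch_wiki_page

    page = fetch_wiki_page("highway", "en")
    if page and 'html_content' in page:
        print(page['html_content'][:200])  # smoke test: first 200 characters
    else:
        print("page missing or html_content not populated")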
@@ -1436,4 +1493,57 @@ class WikiController extends AbstractController
            'fr_links' => $frLinks
        ]);
    }

    /**
     * Extracts the main content from the HTML, removing headers, footers, and other unnecessary elements
     *
     * @param string $html The full HTML content
     * @return string The extracted main content
     */
    private function extractMainContent(string $html): string
    {
        // Use a simple approach to extract the content
        // This could be improved with a more sophisticated HTML parser if needed

        // Create a DOMDocument to parse the HTML
        $dom = new \DOMDocument();

        // Suppress warnings about malformed HTML
        libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();

        // Try to find the main content element
        $contentElement = null;

        // First, try to find the element with id "mw-content-text"
        $contentElement = $dom->getElementById('mw-content-text');

        // If not found, try to find the element with class "mw-content-ltr"
        if (!$contentElement) {
            $xpath = new \DOMXPath($dom);
            $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
            if ($elements->length > 0) {
                $contentElement = $elements->item(0);
            }
        }

        // If still not found, return the original HTML
        if (!$contentElement) {
            return $html;
        }

        // Get the HTML of the content element
        $contentHtml = $dom->saveHTML($contentElement);

        // Clean up the content HTML
        // Remove script and style elements
        $contentHtml = preg_replace('/<script\b[^>]*>(.*?)<\/script>/is', '', $contentHtml);
        $contentHtml = preg_replace('/<style\b[^>]*>(.*?)<\/style>/is', '', $contentHtml);

        // Remove edit section links
        $contentHtml = preg_replace('/<span class="mw-editsection">(.*?)<\/span>/is', '', $contentHtml);

        return $contentHtml;
    }
}
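For comparison, the same selection strategy (prefer #mw-content-text, fall back to the mw-content-ltr class, then strip scripts, styles, and edit-section links) can be sketched with BeautifulSoup, which wiki_compare.py already depends on. This is a hypothetical Python equivalent, not code from the commit:

    from bs4 import BeautifulSoup

    def extract_main_content(html: str) -> str:
        soup = BeautifulSoup(html, 'html.parser')
        # Prefer the element with id "mw-content-text"
        content = soup.find(id='mw-content-text')
        if content is None:
            # Fall back to the first element carrying the mw-content-ltr class
            content = soup.find(class_='mw-content-ltr')
        if content is None:
            # Nothing recognizable: return the input unchanged
            return html
        # Drop scripts, styles, and "[edit]" section links
        for tag in content.select('script, style, span.mw-editsection'):
            tag.decompose()
        return str(content)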
@@ -48,7 +48,9 @@
</td>
<td>
    <a href="https://wiki.openstreetmap.org/{{ key }}">
        <strong>{{ key }}</strong>
    </a>
</td>

{% set diff = page_differences[key] %}

@@ -212,7 +214,6 @@
{% endif %}
<div>
    <strong>{{ page.key }}</strong>
    <span class="badge bg-primary">Spécifique</span>
</div>
</div>
</td>
@@ -81,6 +81,49 @@
    </ul>
</div>

{% if english_html or french_html %}
    <div class="card mb-4">
        <div class="card-header">
            <h2>Contenu HTML des pages</h2>
            <p class="mb-0">Vous pouvez consulter le contenu HTML des pages ci-dessous pour faciliter la traduction.</p>
        </div>
        <div class="card-body">
            <div class="row">
                <div class="col-md-6">
                    <div class="card h-100">
                        <div class="card-header bg-primary text-white">
                            <i class="bi bi-flag-fill"></i> Contenu HTML de la version anglaise
                        </div>
                        <div class="card-body">
                            <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
                                {{ english_html|raw }}
                            </div>
                        </div>
                    </div>
                </div>
                <div class="col-md-6">
                    <div class="card h-100">
                        <div class="card-header bg-info text-white">
                            <i class="bi bi-translate"></i> Contenu HTML de la version française
                        </div>
                        <div class="card-body">
                            <div style="max-height: 500px; overflow-y: auto; font-family: monospace; font-size: 0.8rem; white-space: pre-wrap;">
                                {% if french_html %}
                                    {{ french_html|raw }}
                                {% else %}
                                    <div class="alert alert-warning">
                                        <i class="bi bi-exclamation-triangle"></i> La page française n'existe pas encore.
                                    </div>
                                {% endif %}
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
{% endif %}

<div class="row">
    <div class="col-md-6">
        <div class="iframe-header">
@@ -30,11 +30,14 @@ import re
import os
import subprocess
import tempfile
import hashlib
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path

# Configure logging
logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 50
# HTML cache folder
HTML_CACHE_DIR = "html_cache"

# Initialize NLTK for sentence tokenization
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

# List of specific pages to compare (in addition to top keys)
# This list can include:
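The cache layout introduced here is one file per page URL, named with the md5 hex digest of the full URL. A small sketch of the naming scheme; cache_file_for is a hypothetical helper, while fetch_wiki_page below inlines the same two lines:

    import hashlib
    from pathlib import Path

    HTML_CACHE_DIR = "html_cache"

    def cache_file_for(url: str) -> Path:
        # md5 of the full URL, hex-encoded: one .html file per page
        cache_key = hashlib.md5(url.encode()).hexdigest()
        return Path(HTML_CACHE_DIR) / f"{cache_key}.html"

    # e.g. html_cache/<32 hex chars>.html
    print(cache_file_for("https://wiki.openstreetmap.org/wiki/Key:highway"))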
@@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
url = f"{base_url}{key}"
page_title = key

logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
# Create a unique cache filename based on the URL
cache_key = hashlib.md5(url.encode()).hexdigest()
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

try:
response = requests.get(url)
html_content = None

# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
# Try to load from cache first
if cache_file.exists():
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
try:
with open(cache_file, 'r', encoding='utf-8') as f:
html_content = f.read()
except Exception as e:
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
html_content = None

# If not in cache or cache read failed, fetch from web
if html_content is None:
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)

# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
return None

response.raise_for_status()
html_content = response.text

# Save to cache
try:
with open(cache_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
except Exception as e:
logger.warning(f"Error saving to cache: {e}")
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None

response.raise_for_status()
soup = BeautifulSoup(html_content, 'html.parser')

soup = BeautifulSoup(response.text, 'html.parser')
# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
if footer_info:
date_text = footer_info.text
# Extract date using regex
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
if date_match:
date_str = date_match.group(1)
try:
# Parse date (format may vary based on wiki language)
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")

# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
if footer_info:
date_text = footer_info.text
# Extract date using regex
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
if date_match:
date_str = date_match.group(1)
try:
# Parse date (format may vary based on wiki language)
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)

# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
continue

# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue

# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()

section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4

section_titles.append({
'title': section_title,
'level': section_level
})

# Count words and sentences in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
# Remove script and style elements
for script in content.select('script, style'):
script.extract()

# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()

# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())

# Count sentences using NLTK
sentences = nltk.sent_tokenize(clean_text)
sentence_count = len(sentences)

# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)

# Extract links
links = content.select('a')
link_count = len(links)

# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue

# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href

# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})

section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)

section_titles.append({
'title': section_title,
'level': section_level
})
# Get media details (src and alt text)
media_details = []

# Count words in the content
content = soup.select_one('#mw-content-text')
clean_text = ""
if content:
# Remove script and style elements
for script in content.select('script, style'):
script.extract()
# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None

# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()
# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")

# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Function to filter out OSM logo and small icons
def is_relevant_image(img):
src = img.get('src', '')
# Skip OSM logo
if 'osm_logo' in src:
return False
# Skip small icons (usually less than 30px)
width = img.get('width')
if width and int(width) < 30:
return False
height = img.get('height')
if height and int(height) < 30:
return False
return True

# Check grammar for French pages
grammar_suggestions = []
if language == 'fr':
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")

# Extract links
links = content.select('a')
link_count = len(links)
# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")

# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")

# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")

# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")

# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")

# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")

# If we still don't have an image, use any image that's not the OSM logo
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")

# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src

link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})

# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)

# Get media details (src and alt text)
media_details = []

# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None

# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")

# Function to filter out OSM logo and small icons
def is_relevant_image(img):
src = img.get('src', '')
# Skip OSM logo
if 'osm_logo' in src:
return False
# Skip small icons (usually less than 30px)
width = img.get('width')
if width and int(width) < 30:
return False
height = img.get('height')
if height and int(height) < 30:
return False
return True

# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")

# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")

# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")

# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")

# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")

# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")

# If we still don't have an image, use any image that's not the OSM logo
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")

if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")

# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src

description_img_url = src
description_img_url = src

# Process all images
for img in media_elements:
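Distilled from the hunk above, the new control flow is cache-read, fetch-on-miss, cache-write. A condensed sketch of just that skeleton, with the logging and per-step error handling of the real code elided:

    from pathlib import Path
    from typing import Optional

    import requests

    def load_html(url: str, cache_file: Path) -> Optional[str]:
        # 1. Serve from cache when a previous run already saved the page
        if cache_file.exists():
            return cache_file.read_text(encoding='utf-8')
        # 2. On a miss, fetch from the wiki; a 404 means the page does not exist
        response = requests.get(url)
        if response.status_code == 404:
            return None
        response.raise_for_status()
        # 3. Write back so the next run is served locally
        cache_file.write_text(response.text, encoding='utf-8')
        return response.text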
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'sections': sections,
'section_titles': section_titles,
'word_count': word_count,
'sentence_count': sentence_count,
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,

@@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
'categories': categories,
'description_img_url': description_img_url,
'is_specific_page': is_specific_page,
'grammar_suggestions': grammar_suggestions
'grammar_suggestions': grammar_suggestions,
'html_content': html_content
}

except requests.exceptions.RequestException as e:
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
if date_diff > 30:
reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
reason.append(f"La version Anglaise a {word_diff} plus de mots")
reason.append(f"La version Anglaise a {word_diff} mots de plus")
if section_diff > 2:
reason.append(f"La version Anglaise a {section_diff} plus de sections")
reason.append(f"La version Anglaise a {section_diff} sections de plus")
if link_diff > 20:
reason.append(f"La version Anglaise a {link_diff} plus de liens")
reason.append(f"La version Anglaise a {link_diff} liens de plus")
if media_diff > 5:
reason.append(f"La version Anglaise a {media_diff} plus d'images")
reason.append(f"La version Anglaise a {media_diff} images de plus")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} du contenu en Anglais.")
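A note on the last message: Python's ':.0%' format specifier multiplies by 100, rounds to zero decimals, and appends the percent sign itself, so no literal '%' is needed after it. For instance:

    fr_words, en_words = 350, 500
    print(f"{fr_words / en_words:.0%}")  # -> 70%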