Improve comparison wording

Tykayn 2025-09-03 16:04:16 +02:00 committed by tykayn
parent 1140c87932
commit 09e16d9075
6 changed files with 443 additions and 239 deletions

@@ -30,11 +30,14 @@ import re
 import os
 import subprocess
 import tempfile
+import hashlib
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk
+from pathlib import Path

 # Configure logging
 logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"

 # Number of wiki pages to examine
 NUM_WIKI_PAGES = 50

+# HTML cache folder
+HTML_CACHE_DIR = "html_cache"
+
+# Initialize NLTK for sentence tokenization
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+
+# Create HTML cache directory if it doesn't exist
+Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
+
 # List of specific pages to compare (in addition to top keys)
 # This list can include:
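
Note on the cache layout introduced above: pages are keyed by the MD5 of their URL, so repeated runs reuse the downloaded HTML. A minimal sketch of the round-trip under the same constants (the helper name cached_fetch is illustrative, not part of this script):

import hashlib
from pathlib import Path

import requests

HTML_CACHE_DIR = "html_cache"

def cached_fetch(url):
    # Cache filename derived as in the script: <md5 hex digest of url>.html
    cache_file = Path(HTML_CACHE_DIR) / f"{hashlib.md5(url.encode()).hexdigest()}.html"
    if cache_file.exists():
        return cache_file.read_text(encoding='utf-8')
    response = requests.get(url)
    response.raise_for_status()
    cache_file.parent.mkdir(exist_ok=True)
    cache_file.write_text(response.text, encoding='utf-8')
    return response.text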
@@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     url = f"{base_url}{key}"
     page_title = key
-    logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
-    try:
-        response = requests.get(url)
-        # Check if page exists
-        if response.status_code == 404:
-            logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
-            return None
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
+    # Create a unique cache filename based on the URL
+    cache_key = hashlib.md5(url.encode()).hexdigest()
+    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
+
+    html_content = None
+
+    # Try to load from cache first
+    if cache_file.exists():
+        logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
+        try:
+            with open(cache_file, 'r', encoding='utf-8') as f:
+                html_content = f.read()
+        except Exception as e:
+            logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
+            html_content = None
+
+    # If not in cache or cache read failed, fetch from web
+    if html_content is None:
+        logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
+        try:
+            response = requests.get(url)
+            # Check if page exists
+            if response.status_code == 404:
+                logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
+                return None
+            response.raise_for_status()
+            html_content = response.text
+            # Save to cache
+            try:
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    f.write(html_content)
+                logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
+            except Exception as e:
+                logger.warning(f"Error saving to cache: {e}")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
+            return None
+
+    soup = BeautifulSoup(html_content, 'html.parser')

     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
     if footer_info:
         date_text = footer_info.text
         # Extract date using regex
         date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
         if date_match:
             date_str = date_match.group(1)
             try:
                 # Parse date (format may vary based on wiki language)
                 last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
             except ValueError:
                 logger.warning(f"Could not parse date: {date_str}")

     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)

     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue
         # Skip sections that are inside a table with class DescriptionBox
         if section_elem.find_parent('table', class_='DescriptionBox'):
             continue
         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()
         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4
         section_titles.append({
             'title': section_title,
             'level': section_level
         })

-    # Count words in the content
+    # Count words and sentences in the content
     content = soup.select_one('#mw-content-text')
     clean_text = ""
     if content:
         # Remove script and style elements
         for script in content.select('script, style'):
             script.extract()
         # Remove .languages elements
         for languages_elem in content.select('.languages'):
             languages_elem.extract()
         # Get text and count words
         clean_text = content.get_text(separator=' ', strip=True)
         word_count = len(clean_text.split())
+        # Count sentences using NLTK
+        sentences = nltk.sent_tokenize(clean_text)
+        sentence_count = len(sentences)
         # Check grammar for French pages
         grammar_suggestions = []
         if language == 'fr':
             logger.info(f"Checking grammar for French page: {key}")
             grammar_suggestions = check_grammar_with_grammalecte(clean_text)
         # Extract links
         links = content.select('a')
         link_count = len(links)
         # Get link details (text and href)
         link_details = []
         for link in links:
             href = link.get('href', '')
             # Skip edit section links and other non-content links
             if 'action=edit' in href or 'redlink=1' in href or not href:
                 continue
             # Make relative URLs absolute
             if href.startswith('/'):
                 href = 'https://wiki.openstreetmap.org' + href
             link_text = link.get_text(strip=True)
             if link_text:  # Only include links with text
                 link_details.append({
                     'text': link_text,
                     'href': href
                 })

         # Extract media (images)
         media_elements = content.select('img')
         media_count = len(media_elements)
         # Get media details (src and alt text)
         media_details = []

         # Extract description image specifically
         # Try multiple selectors to find the description image
         description_img = None
         # Debug: Log the key we're processing
         logger.info(f"Looking for description image for key '{key}' in {language}")

         # Function to filter out OSM logo and small icons
         def is_relevant_image(img):
             src = img.get('src', '')
             # Skip OSM logo
             if 'osm_logo' in src:
                 return False
             # Skip small icons (usually less than 30px)
             width = img.get('width')
             if width and int(width) < 30:
                 return False
             height = img.get('height')
             if height and int(height) < 30:
                 return False
             return True

         # Special case for highway key - directly target the image we want
         if key == 'highway':
             # Try to find the specific image in figure elements
             highway_img_elements = content.select('figure.mw-halign-center img')
             logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images for highway")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using highway-specific image: {description_img.get('src', '')}")

         # If not found with highway-specific selector, try the td.d_image selector
         if not description_img:
             description_img_elements = content.select('td.d_image img')
             logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")

         # If still not found, try the specific selector for .description img.mw-file-element
         if not description_img:
             description_img_elements = content.select('.description img.mw-file-element')
             logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in .description")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")

         # If still not found, try images in figures within the description box
         if not description_img:
             description_img_elements = content.select('.description figure img')
             logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")

         # If still not found, try any image in the description box
         if not description_img:
             description_img_elements = content.select('.description img')
             logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in .description general")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from '.description img': {description_img.get('src', '')}")

         # If still not found, try images in the DescriptionBox table
         if not description_img:
             description_img_elements = content.select('table.DescriptionBox img')
             logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")

         # If still not found, try images in figure elements anywhere in the content
         if not description_img:
             description_img_elements = content.select('figure img')
             logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
             # Filter for relevant images
             relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")

         # If we still don't have an image, use any image that's not the OSM logo
         if not description_img:
             all_images = content.select('img')
             relevant_images = [img for img in all_images if is_relevant_image(img)]
             logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
             if relevant_images:
                 description_img = relevant_images[0]
                 logger.info(f" Using fallback image: {description_img.get('src', '')}")

         # Process the found image
         description_img_url = None
         if description_img:
             src = description_img.get('src', '')
             if src:
                 # Make relative URLs absolute
                 if src.startswith('//'):
                     src = 'https:' + src
                 elif src.startswith('/'):
                     src = 'https://wiki.openstreetmap.org' + src
                 description_img_url = src

         # Process all images
         for img in media_elements:
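
The description-image fallback chain above repeats one select/filter/log pattern per selector. A more compact equivalent (a sketch, not the committed code) would walk a priority list and keep the first relevant hit:

# Selectors in the same priority order as the cascade above.
DESCRIPTION_IMG_SELECTORS = [
    'td.d_image img',
    '.description img.mw-file-element',
    '.description figure img',
    '.description img',
    'table.DescriptionBox img',
    'figure img',
    'img',  # last resort: any relevant image on the page
]

def find_description_image(content, is_relevant_image):
    # Return the first image that passes the relevance filter, or None.
    for selector in DESCRIPTION_IMG_SELECTORS:
        for img in content.select(selector):
            if is_relevant_image(img):
                return img
    return None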
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         'sections': sections,
         'section_titles': section_titles,
         'word_count': word_count,
+        'sentence_count': sentence_count,
         'link_count': link_count,
         'link_details': link_details,
         'media_count': media_count,
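
The new sentence_count field relies on NLTK's punkt tokenizer, downloaded once at startup. Illustrative behaviour:

import nltk
nltk.download('punkt', quiet=True)  # no-op after the first run

text = "OpenStreetMap is a collaborative map. Anyone can edit it."
print(len(nltk.sent_tokenize(text)))  # -> 2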
@@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         'categories': categories,
         'description_img_url': description_img_url,
         'is_specific_page': is_specific_page,
-        'grammar_suggestions': grammar_suggestions
+        'grammar_suggestions': grammar_suggestions,
+        'html_content': html_content
     }
 except requests.exceptions.RequestException as e:
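
With html_content now part of the returned dict, callers can re-parse a page without another request. A hypothetical consumer, assuming the signature shown in the hunk headers:

page = fetch_wiki_page('highway', language='fr')
if page:
    print(page['sentence_count'])  # sentences counted via NLTK
    soup = BeautifulSoup(page['html_content'], 'html.parser')  # no re-fetch needed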
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
     if date_diff > 30:
         reason.append(f"La version Française est datée de {date_diff} jours")
     if word_diff > 200:
-        reason.append(f"La version Anglaise a {word_diff} plus de mots")
+        reason.append(f"La version Anglaise a {word_diff} mots de plus")
     if section_diff > 2:
-        reason.append(f"La version Anglaise a {section_diff} plus de sections")
+        reason.append(f"La version Anglaise a {section_diff} sections de plus")
     if link_diff > 20:
-        reason.append(f"La version Anglaise a {link_diff} plus de liens")
+        reason.append(f"La version Anglaise a {link_diff} liens de plus")
     if media_diff > 5:
-        reason.append(f"La version Anglaise a {media_diff} plus d'images")
+        reason.append(f"La version Anglaise a {media_diff} images de plus")
     if fr_page['word_count'] < en_page['word_count'] * 0.7:
         reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} du contenu en Anglais.")