Update comparison wording
parent 1140c87932
commit 09e16d9075

6 changed files with 443 additions and 239 deletions
@@ -30,11 +30,14 @@ import re
 import os
 import subprocess
 import tempfile
+import hashlib
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
+import nltk
+from pathlib import Path
 
 # Configure logging
 logging.basicConfig(
@@ -55,6 +58,17 @@ OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
 NUM_WIKI_PAGES = 50
+# HTML cache folder
+HTML_CACHE_DIR = "html_cache"
+
+# Initialize NLTK for sentence tokenization
+try:
+    nltk.data.find('tokenizers/punkt')
+except LookupError:
+    nltk.download('punkt')
+
+# Create HTML cache directory if it doesn't exist
+Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
 
 # List of specific pages to compare (in addition to top keys)
 # This list can include:
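For reference, the punkt bootstrap above is a one-time download; after it, sentence splitting is a single call. A minimal sketch of what the new sentence counting does (the sample text is illustrative, not from the script):

import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Illustrative input; the script passes the cleaned wiki-page text instead.
text = "OpenStreetMap is a collaborative map of the world. Anyone can edit it."
print(len(nltk.sent_tokenize(text)))  # -> 2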
@@ -262,250 +276,283 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     url = f"{base_url}{key}"
     page_title = key
 
-    logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
+    # Create a unique cache filename based on the URL
+    cache_key = hashlib.md5(url.encode()).hexdigest()
+    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
+
+    html_content = None
+    # Try to load from cache first
+    if cache_file.exists():
+        logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
+        try:
+            with open(cache_file, 'r', encoding='utf-8') as f:
+                html_content = f.read()
+        except Exception as e:
+            logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
+            html_content = None
+
+    # If not in cache or cache read failed, fetch from web
+    if html_content is None:
+        logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
+        try:
+            response = requests.get(url)
+
+            # Check if page exists
+            if response.status_code == 404:
+                logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
+                return None
+
+            response.raise_for_status()
+            html_content = response.text
+
+            # Save to cache
+            try:
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    f.write(html_content)
+                logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
+            except Exception as e:
+                logger.warning(f"Error saving to cache: {e}")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
+            return None
 
     try:
-        response = requests.get(url)
-
-        # Check if page exists
-        if response.status_code == 404:
-            logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
-            return None
-
-        response.raise_for_status()
-
-        soup = BeautifulSoup(response.text, 'html.parser')
+        soup = BeautifulSoup(html_content, 'html.parser')
 
         # Get last modification date
         last_modified = None
         footer_info = soup.select_one('#footer-info-lastmod')
         if footer_info:
             date_text = footer_info.text
             # Extract date using regex
             date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
             if date_match:
                 date_str = date_match.group(1)
                 try:
                     # Parse date (format may vary based on wiki language)
                     last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                 except ValueError:
                     logger.warning(f"Could not parse date: {date_str}")
 
         # Extract sections (h2, h3, h4)
         section_elements = soup.select('h2, h3, h4')
         sections = len(section_elements)
 
         # Extract section titles
         section_titles = []
         for section_elem in section_elements:
             # Skip sections that are part of the table of contents, navigation, or DescriptionBox
             if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
                 continue
 
             # Skip sections that are inside a table with class DescriptionBox
             if section_elem.find_parent('table', class_='DescriptionBox'):
                 continue
 
             # Get the text of the section title, removing any edit links
             for edit_link in section_elem.select('.mw-editsection'):
                 edit_link.extract()
 
             section_title = section_elem.get_text(strip=True)
             section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4
 
             section_titles.append({
                 'title': section_title,
                 'level': section_level
             })
 
-        # Count words in the content
+        # Count words and sentences in the content
         content = soup.select_one('#mw-content-text')
         clean_text = ""
         if content:
             # Remove script and style elements
             for script in content.select('script, style'):
                 script.extract()
 
             # Remove .languages elements
             for languages_elem in content.select('.languages'):
                 languages_elem.extract()
 
             # Get text and count words
             clean_text = content.get_text(separator=' ', strip=True)
             word_count = len(clean_text.split())
 
+            # Count sentences using NLTK
+            sentences = nltk.sent_tokenize(clean_text)
+            sentence_count = len(sentences)
+
             # Check grammar for French pages
             grammar_suggestions = []
             if language == 'fr':
                 logger.info(f"Checking grammar for French page: {key}")
                 grammar_suggestions = check_grammar_with_grammalecte(clean_text)
 
             # Extract links
             links = content.select('a')
             link_count = len(links)
 
             # Get link details (text and href)
             link_details = []
             for link in links:
                 href = link.get('href', '')
                 # Skip edit section links and other non-content links
                 if 'action=edit' in href or 'redlink=1' in href or not href:
                     continue
 
                 # Make relative URLs absolute
                 if href.startswith('/'):
                     href = 'https://wiki.openstreetmap.org' + href
 
                 link_text = link.get_text(strip=True)
                 if link_text:  # Only include links with text
                     link_details.append({
                         'text': link_text,
                         'href': href
                     })
 
             # Extract media (images)
             media_elements = content.select('img')
             media_count = len(media_elements)
 
             # Get media details (src and alt text)
             media_details = []
 
             # Extract description image specifically
             # Try multiple selectors to find the description image
             description_img = None
 
             # Debug: Log the key we're processing
             logger.info(f"Looking for description image for key '{key}' in {language}")
 
             # Function to filter out OSM logo and small icons
             def is_relevant_image(img):
                 src = img.get('src', '')
                 # Skip OSM logo
                 if 'osm_logo' in src:
                     return False
                 # Skip small icons (usually less than 30px)
                 width = img.get('width')
                 if width and int(width) < 30:
                     return False
                 height = img.get('height')
                 if height and int(height) < 30:
                     return False
                 return True
 
             # Special case for highway key - directly target the image we want
             if key == 'highway':
                 # Try to find the specific image in figure elements
                 highway_img_elements = content.select('figure.mw-halign-center img')
                 logger.info(f"  Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images for highway")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using highway-specific image: {description_img.get('src', '')}")
 
             # If not found with highway-specific selector, try the td.d_image selector
             if not description_img:
                 description_img_elements = content.select('td.d_image img')
                 logger.info(f"  Selector 'td.d_image img' found {len(description_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in td.d_image")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using image from 'td.d_image img': {description_img.get('src', '')}")
 
             # If still not found, try the specific selector for .description img.mw-file-element
             if not description_img:
                 description_img_elements = content.select('.description img.mw-file-element')
                 logger.info(f"  Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in .description")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
 
             # If still not found, try images in figures within the description box
             if not description_img:
                 description_img_elements = content.select('.description figure img')
                 logger.info(f"  Selector '.description figure img' found {len(description_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in .description figure")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using image from '.description figure img': {description_img.get('src', '')}")
 
             # If still not found, try any image in the description box
             if not description_img:
                 description_img_elements = content.select('.description img')
                 logger.info(f"  Selector '.description img' found {len(description_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in .description general")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using image from '.description img': {description_img.get('src', '')}")
 
             # If still not found, try images in the DescriptionBox table
             if not description_img:
                 description_img_elements = content.select('table.DescriptionBox img')
                 logger.info(f"  Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in DescriptionBox")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
 
             # If still not found, try images in figure elements anywhere in the content
             if not description_img:
                 description_img_elements = content.select('figure img')
                 logger.info(f"  Selector 'figure img' found {len(description_img_elements)} elements")
 
                 # Filter for relevant images
                 relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in figure elements")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using image from 'figure img': {description_img.get('src', '')}")
 
             # If we still don't have an image, use any image that's not the OSM logo
             if not description_img:
                 all_images = content.select('img')
                 relevant_images = [img for img in all_images if is_relevant_image(img)]
                 logger.info(f"  Found {len(relevant_images)} relevant images in the entire page")
 
                 if relevant_images:
                     description_img = relevant_images[0]
                     logger.info(f"  Using fallback image: {description_img.get('src', '')}")
 
             # Process the found image
             description_img_url = None
             if description_img:
                 src = description_img.get('src', '')
                 if src:
                     # Make relative URLs absolute
                     if src.startswith('//'):
                         src = 'https:' + src
                     elif src.startswith('/'):
                         src = 'https://wiki.openstreetmap.org' + src
 
                     description_img_url = src
 
             # Process all images
             for img in media_elements:
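The bulk of this hunk is a read-through cache: the URL's MD5 digest names the cache file, a hit skips the network, and a miss fetches the page and writes it back. A condensed sketch of the same pattern outside the script (fetch_html is an illustrative name, not part of the commit):

import hashlib
from pathlib import Path

import requests

CACHE_DIR = Path("html_cache")
CACHE_DIR.mkdir(exist_ok=True)

def fetch_html(url):
    # One cache file per URL, named by the URL's MD5 hex digest.
    cache_file = CACHE_DIR / f"{hashlib.md5(url.encode()).hexdigest()}.html"
    if cache_file.exists():
        return cache_file.read_text(encoding='utf-8')
    response = requests.get(url)
    response.raise_for_status()
    cache_file.write_text(response.text, encoding='utf-8')
    return response.text

Hashing the URL sidesteps characters in page titles that are unfriendly to filesystems; the trade-off is that cache files are opaque unless a digest-to-URL index is kept alongside them.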
@@ -546,6 +593,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'sections': sections,
             'section_titles': section_titles,
             'word_count': word_count,
+            'sentence_count': sentence_count,
             'link_count': link_count,
             'link_details': link_details,
             'media_count': media_count,
@@ -553,7 +601,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'categories': categories,
             'description_img_url': description_img_url,
             'is_specific_page': is_specific_page,
-            'grammar_suggestions': grammar_suggestions
+            'grammar_suggestions': grammar_suggestions,
+            'html_content': html_content
         }
 
     except requests.exceptions.RequestException as e:
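With these two hunks, each page record now carries a sentence count and the raw HTML alongside the existing metrics. A hypothetical caller (the printed values are illustrative):

page = fetch_wiki_page('highway', language='fr')
if page:
    print(page['word_count'], page['sentence_count'])
    # The raw HTML travels with the record, so later steps can re-parse
    # it without another fetch or a second cache lookup.
    html = page['html_content']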
@@ -869,13 +918,13 @@ def analyze_wiki_pages(pages):
         if date_diff > 30:
             reason.append(f"La version Française est datée de {date_diff} jours")
         if word_diff > 200:
-            reason.append(f"La version Anglaise a {word_diff} plus de mots")
+            reason.append(f"La version Anglaise a {word_diff} mots de plus")
         if section_diff > 2:
-            reason.append(f"La version Anglaise a {section_diff} plus de sections")
+            reason.append(f"La version Anglaise a {section_diff} sections de plus")
         if link_diff > 20:
-            reason.append(f"La version Anglaise a {link_diff} plus de liens")
+            reason.append(f"La version Anglaise a {link_diff} liens de plus")
         if media_diff > 5:
-            reason.append(f"La version Anglaise a {media_diff} plus d'images")
+            reason.append(f"La version Anglaise a {media_diff} images de plus")
         if fr_page['word_count'] < en_page['word_count'] * 0.7:
             reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")
 
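One nit survives in the untouched last context line: the :.0% format spec already appends a percent sign, so the literal " %" after it renders a doubled percent. A quick check (the ratio is illustrative):

ratio = 350 / 1000
print(f"seulement {ratio:.0%} % du contenu")  # -> seulement 35% % du contenu
print(f"seulement {ratio:.0%} du contenu")    # -> seulement 35% du contenu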