up fetch desynchronised pages

This commit is contained in:
Tykayn 2025-09-04 22:41:38 +02:00 committed by tykayn
parent 8008e0291e
commit 685efd6710
4 changed files with 140 additions and 62 deletions

Binary file not shown (image added, 167 KiB).

@@ -1,5 +1,5 @@
 qualiwiki.cipherbliss.com {
-	root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
+	root * /home/poule/encrypted/qualiwiki/public
 	# serve files directly if they can be found (e.g. CSS or JS files in public/)
 	encode zstd gzip
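After the site root is moved like this, the running Caddy instance still has to reload its configuration before the new path takes effect; with a standard Caddy v2 setup that is typically done with "caddy reload" (pointing --config at the Caddyfile used on the host) or by restarting the caddy service. The exact command and config path depend on how this server is deployed.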

Binary file not shown.


@@ -31,6 +31,7 @@ import os
 import subprocess
 import tempfile
 import hashlib
+import argparse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
 WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
 WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
+WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
 TOP_KEYS_FILE = "top_keys.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
@@ -72,7 +74,7 @@ try:
     nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
     nltk.download('punkt_tab')

 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
     "Key:cuisine",
     "Libre_Charge_Map",
     "OSM_Mon_Commerce",
+    "Complète_Tes_Commerces",
     "Tag:amenity=charging_station",
+    "Organised_Editing/Activities/MapYourGrid_Initiative",
     "Key:highway",
-    "Quality_assurance"
+    "Quality_assurance",
+    "Verifiability",
+    "Good_practice",
+    "Mapping_parties",
+    "State_of_the_Map",
+    "Diversity"
 ]
+
+
+def fetch_desynchronized_pages():
+    """
+    Fetch pages from the FR:Traductions_désynchronisées category
+
+    Returns:
+        list: List of page URLs from the category
+    """
+    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
+
+    try:
+        response = requests.get(WIKI_CATEGORY_URL)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all links to French pages in the category
+        page_links = []
+        for link in soup.select('a[href^="/wiki/FR:"]'):
+            href = link.get('href', '')
+
+            # Skip if it's a category link or a language link
+            if '/Category:' in href or 'action=edit' in href:
+                continue
+
+            # Get the full URL
+            full_url = 'https://wiki.openstreetmap.org' + href
+            page_links.append(full_url)
+
+        logger.info(f"Found {len(page_links)} pages in the category")
+        return page_links
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching category page: {e}")
+        return []
+
+
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
     """
     Fetch the most used OSM keys from TagInfo API
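For reference, the category scrape added above can be exercised on its own. A minimal standalone sketch, using the same category URL and link selector as in the diff (the constant CATEGORY_URL and helper name list_desynchronized_pages are only illustrative, not part of the script):

    import requests
    from bs4 import BeautifulSoup

    CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"

    def list_desynchronized_pages():
        """Return absolute URLs of the FR: pages listed in the category."""
        response = requests.get(CATEGORY_URL, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        urls = []
        for link in soup.select('a[href^="/wiki/FR:"]'):
            href = link.get("href", "")
            # Skip category and edit links, as the script above does
            if "/Category:" in href or "action=edit" in href:
                continue
            urls.append("https://wiki.openstreetmap.org" + href)
        return urls

    if __name__ == "__main__":
        for url in list_desynchronized_pages():
            print(url)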
@@ -133,10 +176,10 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
 def load_json_data(filename):
     """
     Load data from a JSON file

     Args:
         filename (str): Name of the file

     Returns:
         dict: Data loaded from the file or empty dict if file doesn't exist
     """
@@ -164,7 +207,7 @@ def save_to_json(data, filename):
     try:
         # Convert data to JSON string
         json_str = json.dumps(data, indent=2, ensure_ascii=False)

         # Print the JSON string for debugging
         logger.info(f"JSON string to be written to {filename}:")
         logger.info(f"JSON keys at top level: {list(data.keys())}")
@@ -174,22 +217,22 @@ def save_to_json(data, filename):
                 logger.info(f"'type' key exists in translations")
                 if 'type_key' in data['translations']:
                     logger.info(f"'type_key' key exists in translations")

         # Write the JSON string to the file
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(json_str)

         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")


 def save_with_history(data, filename):
     """
     Save data to a JSON file while preserving history

     This function loads existing data from the file (if it exists),
     adds the new data to the history, and saves the updated data back to the file.

     Args:
         data: New data to save
         filename (str): Name of the file
@@ -197,32 +240,32 @@ def save_with_history(data, filename):
     try:
         # Load existing data
         existing_data = load_json_data(filename)

         # Create a timestamp for the current data
         current_timestamp = datetime.now().isoformat()

         # Initialize history if it doesn't exist
         if 'history' not in existing_data:
             existing_data['history'] = {}

         # Add current regular_pages and specific_pages to history
         history_entry = {
             'regular_pages': data.get('regular_pages', []),
             'specific_pages': data.get('specific_pages', [])
         }

         # Add the entry to history with timestamp as key
         existing_data['history'][current_timestamp] = history_entry

         # Update the current data
         existing_data['regular_pages'] = data.get('regular_pages', [])
         existing_data['specific_pages'] = data.get('specific_pages', [])
         existing_data['last_updated'] = current_timestamp

         # Save the updated data
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(existing_data, f, indent=2, ensure_ascii=False)

         logger.info(f"Data with history saved to {filename}")
     except (IOError, json.JSONDecodeError) as e:
         logger.error(f"Error saving data with history to {filename}: {e}")
@@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
         logger.error(f"Unexpected error during grammar checking: {e}")
         return []

-def fetch_wiki_page(key, language='en', is_specific_page=False):
+def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
     """
     Fetch wiki page for a given key or specific page

@@ -328,7 +371,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         key (str): OSM key or specific page title/URL
         language (str): Language code ('en' or 'fr')
         is_specific_page (bool): Whether this is a specific page rather than a key
+        check_grammar (bool): Whether to check grammar for French pages

     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
@@ -369,9 +413,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     # Create a unique cache filename based on the URL
     cache_key = hashlib.md5(url.encode()).hexdigest()
     cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

     html_content = None

     # Try to load from cache first
     if cache_file.exists():
         logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
@@ -381,21 +425,21 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except Exception as e:
             logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
             html_content = None

     # If not in cache or cache read failed, fetch from web
     if html_content is None:
         logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
         try:
             response = requests.get(url)

             # Check if page exists
             if response.status_code == 404:
                 logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
                 return None

             response.raise_for_status()
             html_content = response.text

             # Save to cache
             try:
                 with open(cache_file, 'w', encoding='utf-8') as f:
@@ -406,9 +450,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except requests.exceptions.RequestException as e:
             logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
             return None

     soup = BeautifulSoup(html_content, 'html.parser')

     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
@@ -423,29 +467,29 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
                 last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
             except ValueError:
                 logger.warning(f"Could not parse date: {date_str}")

     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)

     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue

         # Skip sections that are inside a table with class DescriptionBox
         if section_elem.find_parent('table', class_='DescriptionBox'):
             continue

         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()

         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4

         section_titles.append({
             'title': section_title,
             'level': section_level
@@ -458,29 +502,31 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Remove script and style elements
         for script in content.select('script, style'):
             script.extract()

         # Remove .languages elements
         for languages_elem in content.select('.languages'):
             languages_elem.extract()

         # Get text and count words
         clean_text = content.get_text(separator=' ', strip=True)
         word_count = len(clean_text.split())

         # Count sentences using NLTK
         sentences = nltk.sent_tokenize(clean_text)
         sentence_count = len(sentences)

         # Check grammar for French pages
         grammar_suggestions = []
-        # if language == 'fr':
-        #     logger.info(f"Checking grammar for French page: {key}")
-        #     grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        if language == 'fr' and check_grammar:
+            logger.info(f"Checking grammar for French page: {key}")
+            grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        elif language == 'fr' and not check_grammar:
+            logger.info(f"Grammar checking disabled for French page: {key}")

         # Extract links
         links = content.select('a')
         link_count = len(links)

         # Get link details (text and href)
         link_details = []
         for link in links:
@@ -488,22 +534,22 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             # Skip edit section links and other non-content links
             if 'action=edit' in href or 'redlink=1' in href or not href:
                 continue

             # Make relative URLs absolute
             if href.startswith('/'):
                 href = 'https://wiki.openstreetmap.org' + href

             link_text = link.get_text(strip=True)
             if link_text:  # Only include links with text
                 link_details.append({
                     'text': link_text,
                     'href': href
                 })

         # Extract media (images)
         media_elements = content.select('img')
         media_count = len(media_elements)

         # Get media details (src and alt text)
         media_details = []
@@ -1045,13 +1091,24 @@ def main():
     1. Fetches the top OSM keys from TagInfo API
     2. Fetches and processes wiki pages for these keys
     3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Calculates staleness scores for all pages
-    5. Generates a histogram of staleness scores
-    6. Saves the results to CSV and JSON files
-    7. Prints a list of pages that need updating
+    4. Processes pages from the FR:Traductions_désynchronisées category
+    5. Calculates staleness scores for all pages
+    6. Generates a histogram of staleness scores
+    7. Saves the results to CSV and JSON files
+    8. Prints a list of pages that need updating
     """
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
+    parser.add_argument('--no-grammar-check', action='store_true',
+                        help='Disable grammar checking for French pages')
+    args = parser.parse_args()
+
+    # Whether to check grammar for French pages
+    check_grammar = not args.no_grammar_check
+
     logger.info("Starting wiki_compare.py")
+    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")

     # Create output directory if it doesn't exist
     os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
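With the argument parsing above in place, a run that skips Grammalecte entirely would presumably be launched as "python wiki_compare.py --no-grammar-check"; the script name comes from the log message in the diff, and the working directory depends on where the file lives in the repository.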
@@ -1074,12 +1131,12 @@ def main():
         key = key_info['key']

         # Fetch English page
-        en_page = fetch_wiki_page(key, 'en')
+        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
         if en_page:
             wiki_pages.append(en_page)

         # Fetch French page
-        fr_page = fetch_wiki_page(key, 'fr')
+        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
         if fr_page:
             wiki_pages.append(fr_page)
@@ -1092,7 +1149,7 @@ def main():
         # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
         if page.startswith('http'):
             # For full URLs, we directly fetch the page
-            page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
+            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if page_info:
                 wiki_pages.append(page_info)
@@ -1102,7 +1159,7 @@ def main():
                     en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                     en_url = f"{WIKI_BASE_URL}{en_title}"
                     logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                     if en_page:
                         wiki_pages.append(en_page)
                 # If it's an English page, try to find the French equivalent
@@ -1111,14 +1168,14 @@ def main():
                     fr_title = f"FR:{page_info['page_title']}"
                     fr_url = f"{WIKI_BASE_URL}{fr_title}"
                     logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
-                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
+                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                     if fr_page:
                         wiki_pages.append(fr_page)

         # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
         elif page.startswith('FR:'):
             # Fetch the French page
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
@@ -1126,22 +1183,43 @@ def main():
             en_title = page[3:]  # Remove FR: prefix
             en_url = f"{WIKI_BASE_URL}{en_title}"
             logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

         # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
         else:
             # Fetch the English page
-            en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

             # Fetch the French page (by adding FR: prefix)
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)

+    # Process pages from the FR:Traductions_désynchronisées category
+    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
+    desynchronized_pages = fetch_desynchronized_pages()
+
+    for page_url in desynchronized_pages:
+        # Fetch the French page
+        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
+        if fr_page:
+            wiki_pages.append(fr_page)
+
+            # Try to find the English equivalent
+            if fr_page['page_title'].startswith('FR:'):
+                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
+            else:
+                en_title = fr_page['page_title']
+
+            en_url = f"{WIKI_BASE_URL}{en_title}"
+            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
+            if en_page:
+                wiki_pages.append(en_page)
+
     # Process wiki pages to add staleness score
     processed_wiki_pages = []
     pages_by_key = {}