up fetch desynchronised pages

Tykayn 2025-09-04 22:41:38 +02:00 committed by tykayn
parent 8008e0291e
commit 685efd6710
4 changed files with 140 additions and 62 deletions

Binary file not shown.


@ -1,5 +1,5 @@
qualiwiki.cipherbliss.com {
root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
root * /home/poule/encrypted/qualiwiki/public
# serve files directly if they can be found (e.g. CSS or JS files in public/)
encode zstd gzip

Binary file not shown.


@ -31,6 +31,7 @@ import os
import subprocess
import tempfile
import hashlib
import argparse
from datetime import datetime
from bs4 import BeautifulSoup
import logging
@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
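# French category listing pages whose translation is flagged as out of sync
# (URL-encoded form of "Category:FR:Traductions_désynchronisées")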
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
@ -72,7 +74,7 @@ try:
nltk.data.find('tokenizers/punkt_tab')
except LookupError:
nltk.download('punkt_tab')
# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
"Key:cuisine",
"Libre_Charge_Map",
"OSM_Mon_Commerce",
"Complète_Tes_Commerces",
"Tag:amenity=charging_station",
"Organised_Editing/Activities/MapYourGrid_Initiative",
"Key:highway",
"Quality_assurance"
"Quality_assurance",
"Verifiability",
"Good_practice",
"Mapping_parties",
"State_of_the_Map",
"Diversity"
]
def fetch_desynchronized_pages():
"""
Fetch pages from the FR:Traductions_désynchronisées category
Returns:
list: List of page URLs from the category
"""
logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
try:
response = requests.get(WIKI_CATEGORY_URL)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Find all links to French pages in the category
page_links = []
for link in soup.select('a[href^="/wiki/FR:"]'):
href = link.get('href', '')
# Skip category links and edit links
if '/Category:' in href or 'action=edit' in href:
continue
# Get the full URL
full_url = 'https://wiki.openstreetmap.org' + href
page_links.append(full_url)
logger.info(f"Found {len(page_links)} pages in the category")
return page_links
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching category page: {e}")
return []
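# Usage sketch (illustrative, not part of this commit): the returned URLs can be
# fed directly to fetch_wiki_page(); deduplicating first is a cheap safeguard in
# case the category page links to the same article more than once.
#
#     for url in sorted(set(fetch_desynchronized_pages())):
#         fr_page = fetch_wiki_page(url, 'fr', is_specific_page=True)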
def fetch_top_keys(limit=NUM_WIKI_PAGES):
"""
Fetch the most used OSM keys from TagInfo API
@ -133,10 +176,10 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
def load_json_data(filename):
"""
Load data from a JSON file
Args:
filename (str): Name of the file
Returns:
dict: Data loaded from the file or empty dict if file doesn't exist
"""
@ -164,7 +207,7 @@ def save_to_json(data, filename):
try:
# Convert data to JSON string
json_str = json.dumps(data, indent=2, ensure_ascii=False)
# Print the JSON string for debugging
logger.info(f"JSON string to be written to {filename}:")
logger.info(f"JSON keys at top level: {list(data.keys())}")
@ -174,22 +217,22 @@ def save_to_json(data, filename):
logger.info(f"'type' key exists in translations")
if 'type_key' in data['translations']:
logger.info(f"'type_key' key exists in translations")
# Write the JSON string to the file
with open(filename, 'w', encoding='utf-8') as f:
f.write(json_str)
logger.info(f"Data saved to {filename}")
except IOError as e:
logger.error(f"Error saving data to {filename}: {e}")
def save_with_history(data, filename):
"""
Save data to a JSON file while preserving history
This function loads existing data from the file (if it exists),
adds the new data to the history, and saves the updated data back to the file.
Args:
data: New data to save
filename (str): Name of the file
@ -197,32 +240,32 @@ def save_with_history(data, filename):
try:
# Load existing data
existing_data = load_json_data(filename)
# Create a timestamp for the current data
current_timestamp = datetime.now().isoformat()
# Initialize history if it doesn't exist
if 'history' not in existing_data:
existing_data['history'] = {}
# Add current regular_pages and specific_pages to history
history_entry = {
'regular_pages': data.get('regular_pages', []),
'specific_pages': data.get('specific_pages', [])
}
# Add the entry to history with timestamp as key
existing_data['history'][current_timestamp] = history_entry
# Update the current data
existing_data['regular_pages'] = data.get('regular_pages', [])
existing_data['specific_pages'] = data.get('specific_pages', [])
existing_data['last_updated'] = current_timestamp
# Save the updated data
with open(filename, 'w', encoding='utf-8') as f:
json.dump(existing_data, f, indent=2, ensure_ascii=False)
logger.info(f"Data with history saved to {filename}")
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Error saving data with history to {filename}: {e}")
@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
logger.error(f"Unexpected error during grammar checking: {e}")
return []
def fetch_wiki_page(key, language='en', is_specific_page=False):
def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
"""
Fetch wiki page for a given key or specific page
@ -328,7 +371,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
key (str): OSM key or specific page title/URL
language (str): Language code ('en' or 'fr')
is_specific_page (bool): Whether this is a specific page rather than a key
check_grammar (bool): Whether to check grammar for French pages
Returns:
dict: Dictionary with page information or None if page doesn't exist
"""
@ -369,9 +413,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
# Create a unique cache filename based on the URL
cache_key = hashlib.md5(url.encode()).hexdigest()
cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
html_content = None
# Try to load from cache first
if cache_file.exists():
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
@ -381,21 +425,21 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
except Exception as e:
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
html_content = None
# If not in cache or cache read failed, fetch from web
if html_content is None:
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)
# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
return None
response.raise_for_status()
html_content = response.text
# Save to cache
try:
with open(cache_file, 'w', encoding='utf-8') as f:
@ -406,9 +450,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None
soup = BeautifulSoup(html_content, 'html.parser')
# Get last modification date
last_modified = None
footer_info = soup.select_one('#footer-info-lastmod')
@ -423,29 +467,29 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
continue
# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue
# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
section_titles.append({
'title': section_title,
'level': section_level
@ -458,29 +502,31 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
# Remove script and style elements
for script in content.select('script, style'):
script.extract()
# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()
# Get text and count words
clean_text = content.get_text(separator=' ', strip=True)
word_count = len(clean_text.split())
# Count sentences using NLTK
sentences = nltk.sent_tokenize(clean_text)
sentence_count = len(sentences)
# Check grammar for French pages
grammar_suggestions = []
# if language == 'fr':
# logger.info(f"Checking grammar for French page: {key}")
# grammar_suggestions = check_grammar_with_grammalecte(clean_text)
if language == 'fr' and check_grammar:
logger.info(f"Checking grammar for French page: {key}")
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
elif language == 'fr' and not check_grammar:
logger.info(f"Grammar checking disabled for French page: {key}")
# Extract links
links = content.select('a')
link_count = len(links)
# Get link details (text and href)
link_details = []
for link in links:
@ -488,22 +534,22 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})
# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)
# Get media details (src and alt text)
media_details = []
@ -1045,13 +1091,24 @@ def main():
1. Fetches the top OSM keys from TagInfo API
2. Fetches and processes wiki pages for these keys
3. Processes specific wiki pages listed in SPECIFIC_PAGES
4. Calculates staleness scores for all pages
5. Generates a histogram of staleness scores
6. Saves the results to CSV and JSON files
7. Prints a list of pages that need updating
4. Processes pages from the FR:Traductions_désynchronisées category
5. Calculates staleness scores for all pages
6. Generates a histogram of staleness scores
7. Saves the results to CSV and JSON files
8. Prints a list of pages that need updating
"""
# Parse command-line arguments
parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
parser.add_argument('--no-grammar-check', action='store_true',
help='Disable grammar checking for French pages')
args = parser.parse_args()
# Whether to check grammar for French pages
check_grammar = not args.no_grammar_check
logger.info("Starting wiki_compare.py")
logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
# Create output directory if it doesn't exist
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
@ -1074,12 +1131,12 @@ def main():
key = key_info['key']
# Fetch English page
en_page = fetch_wiki_page(key, 'en')
en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
if en_page:
wiki_pages.append(en_page)
# Fetch French page
fr_page = fetch_wiki_page(key, 'fr')
fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
if fr_page:
wiki_pages.append(fr_page)
@ -1092,7 +1149,7 @@ def main():
# Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
if page.startswith('http'):
# For full URLs, we directly fetch the page
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
if page_info:
wiki_pages.append(page_info)
@ -1102,7 +1159,7 @@ def main():
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page:
wiki_pages.append(en_page)
# If it's an English page, try to find the French equivalent
@ -1111,14 +1168,14 @@ def main():
fr_title = f"FR:{page_info['page_title']}"
fr_url = f"{WIKI_BASE_URL}{fr_title}"
logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page:
wiki_pages.append(fr_page)
# Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
elif page.startswith('FR:'):
# Fetch the French page
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page:
wiki_pages.append(fr_page)
@ -1126,22 +1183,43 @@ def main():
en_title = page[3:] # Remove FR: prefix
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page:
wiki_pages.append(en_page)
# Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
else:
# Fetch the English page
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page:
wiki_pages.append(en_page)
# Fetch the French page (by adding FR: prefix)
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page:
wiki_pages.append(fr_page)
# Process pages from the FR:Traductions_désynchronisées category
logger.info("Processing pages from FR:Traductions_désynchronisées category...")
desynchronized_pages = fetch_desynchronized_pages()
for page_url in desynchronized_pages:
# Fetch the French page
fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page:
wiki_pages.append(fr_page)
# Try to find the English equivalent
if fr_page['page_title'].startswith('FR:'):
en_title = fr_page['page_title'][3:] # Remove FR: prefix
else:
en_title = fr_page['page_title']
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page:
wiki_pages.append(en_page)
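# Illustrative example (hypothetical title): a category entry
# "https://wiki.openstreetmap.org/wiki/FR:Exemple_de_page" produces the English
# candidate "https://wiki.openstreetmap.org/wiki/Exemple_de_page"; if that page
# returns 404, fetch_wiki_page() yields None and only the French page is kept.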
# Process wiki pages to add staleness score
processed_wiki_pages = []
pages_by_key = {}