Update: fetch desynchronised pages

This commit is contained in:
Tykayn 2025-09-04 22:41:38 +02:00 committed by tykayn
parent 8008e0291e
commit 685efd6710
4 changed files with 140 additions and 62 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 167 KiB

View file

@ -1,5 +1,5 @@
qualiwiki.cipherbliss.com { qualiwiki.cipherbliss.com {
root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public root * /home/poule/encrypted/qualiwiki/public
# serve files directly if they can be found (e.g. CSS or JS files in public/) # serve files directly if they can be found (e.g. CSS or JS files in public/)
encode zstd gzip encode zstd gzip

Binary file not shown.

View file

@ -31,6 +31,7 @@ import os
import subprocess import subprocess
import tempfile import tempfile
import hashlib import hashlib
import argparse
from datetime import datetime from datetime import datetime
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import logging import logging
@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:" WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:" WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/" WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
TOP_KEYS_FILE = "top_keys.json" TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv" WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json" OUTDATED_PAGES_FILE = "outdated_pages.json"
@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
"Key:cuisine", "Key:cuisine",
"Libre_Charge_Map", "Libre_Charge_Map",
"OSM_Mon_Commerce", "OSM_Mon_Commerce",
"Complète_Tes_Commerces",
"Tag:amenity=charging_station", "Tag:amenity=charging_station",
"Organised_Editing/Activities/MapYourGrid_Initiative",
"Key:highway", "Key:highway",
"Quality_assurance" "Quality_assurance",
"Verifiability",
"Good_practice",
"Mapping_parties",
"State_of_the_Map",
"Diversity"
] ]
def fetch_desynchronized_pages():
    """
    Fetch pages from the FR:Traductions_désynchronisées category.

    Scrapes the category listing page and collects links to French wiki
    pages (paths starting with ``/wiki/FR:``), skipping category and edit
    links. Duplicate links are removed while preserving first-seen order,
    so each page is fetched at most once downstream.

    Returns:
        list: List of unique full page URLs from the category, or an
        empty list if the category page could not be fetched.
    """
    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")

    try:
        # Timeout prevents the whole run from hanging on a stalled connection.
        response = requests.get(WIKI_CATEGORY_URL, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links to French pages in the category, deduplicated
        # in insertion order (category pages may repeat the same link).
        page_links = []
        seen = set()
        for link in soup.select('a[href^="/wiki/FR:"]'):
            href = link.get('href', '')

            # Skip category links and edit links
            if '/Category:' in href or 'action=edit' in href:
                continue

            # Build the absolute URL
            full_url = 'https://wiki.openstreetmap.org' + href
            if full_url not in seen:
                seen.add(full_url)
                page_links.append(full_url)

        logger.info(f"Found {len(page_links)} pages in the category")
        return page_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching category page: {e}")
        return []
def fetch_top_keys(limit=NUM_WIKI_PAGES): def fetch_top_keys(limit=NUM_WIKI_PAGES):
""" """
Fetch the most used OSM keys from TagInfo API Fetch the most used OSM keys from TagInfo API
@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
logger.error(f"Unexpected error during grammar checking: {e}") logger.error(f"Unexpected error during grammar checking: {e}")
return [] return []
def fetch_wiki_page(key, language='en', is_specific_page=False): def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
""" """
Fetch wiki page for a given key or specific page Fetch wiki page for a given key or specific page
@ -328,6 +371,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
key (str): OSM key or specific page title/URL key (str): OSM key or specific page title/URL
language (str): Language code ('en' or 'fr') language (str): Language code ('en' or 'fr')
is_specific_page (bool): Whether this is a specific page rather than a key is_specific_page (bool): Whether this is a specific page rather than a key
check_grammar (bool): Whether to check grammar for French pages
Returns: Returns:
dict: Dictionary with page information or None if page doesn't exist dict: Dictionary with page information or None if page doesn't exist
@ -473,9 +517,11 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
# Check grammar for French pages # Check grammar for French pages
grammar_suggestions = [] grammar_suggestions = []
# if language == 'fr': if language == 'fr' and check_grammar:
# logger.info(f"Checking grammar for French page: {key}") logger.info(f"Checking grammar for French page: {key}")
# grammar_suggestions = check_grammar_with_grammalecte(clean_text) grammar_suggestions = check_grammar_with_grammalecte(clean_text)
elif language == 'fr' and not check_grammar:
logger.info(f"Grammar checking disabled for French page: {key}")
# Extract links # Extract links
links = content.select('a') links = content.select('a')
@ -1045,12 +1091,23 @@ def main():
1. Fetches the top OSM keys from TagInfo API 1. Fetches the top OSM keys from TagInfo API
2. Fetches and processes wiki pages for these keys 2. Fetches and processes wiki pages for these keys
3. Processes specific wiki pages listed in SPECIFIC_PAGES 3. Processes specific wiki pages listed in SPECIFIC_PAGES
4. Calculates staleness scores for all pages 4. Processes pages from the FR:Traductions_désynchronisées category
5. Generates a histogram of staleness scores 5. Calculates staleness scores for all pages
6. Saves the results to CSV and JSON files 6. Generates a histogram of staleness scores
7. Prints a list of pages that need updating 7. Saves the results to CSV and JSON files
8. Prints a list of pages that need updating
""" """
# Parse command-line arguments
parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
parser.add_argument('--no-grammar-check', action='store_true',
help='Disable grammar checking for French pages')
args = parser.parse_args()
# Whether to check grammar for French pages
check_grammar = not args.no_grammar_check
logger.info("Starting wiki_compare.py") logger.info("Starting wiki_compare.py")
logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
# Create output directory if it doesn't exist # Create output directory if it doesn't exist
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True) os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
@ -1074,12 +1131,12 @@ def main():
key = key_info['key'] key = key_info['key']
# Fetch English page # Fetch English page
en_page = fetch_wiki_page(key, 'en') en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
if en_page: if en_page:
wiki_pages.append(en_page) wiki_pages.append(en_page)
# Fetch French page # Fetch French page
fr_page = fetch_wiki_page(key, 'fr') fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
if fr_page: if fr_page:
wiki_pages.append(fr_page) wiki_pages.append(fr_page)
@ -1092,7 +1149,7 @@ def main():
# Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois") # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
if page.startswith('http'): if page.startswith('http'):
# For full URLs, we directly fetch the page # For full URLs, we directly fetch the page
page_info = fetch_wiki_page(page, 'en', is_specific_page=True) page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
if page_info: if page_info:
wiki_pages.append(page_info) wiki_pages.append(page_info)
@ -1102,7 +1159,7 @@ def main():
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '') en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
en_url = f"{WIKI_BASE_URL}{en_title}" en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}") logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True) en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page: if en_page:
wiki_pages.append(en_page) wiki_pages.append(en_page)
# If it's an English page, try to find the French equivalent # If it's an English page, try to find the French equivalent
@ -1111,14 +1168,14 @@ def main():
fr_title = f"FR:{page_info['page_title']}" fr_title = f"FR:{page_info['page_title']}"
fr_url = f"{WIKI_BASE_URL}{fr_title}" fr_url = f"{WIKI_BASE_URL}{fr_title}"
logger.info(f"Trying to find French equivalent for {page}: {fr_url}") logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True) fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page: if fr_page:
wiki_pages.append(fr_page) wiki_pages.append(fr_page)
# Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club") # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
elif page.startswith('FR:'): elif page.startswith('FR:'):
# Fetch the French page # Fetch the French page
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True) fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page: if fr_page:
wiki_pages.append(fr_page) wiki_pages.append(fr_page)
@ -1126,22 +1183,43 @@ def main():
en_title = page[3:] # Remove FR: prefix en_title = page[3:] # Remove FR: prefix
en_url = f"{WIKI_BASE_URL}{en_title}" en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}") logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True) en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page: if en_page:
wiki_pages.append(en_page) wiki_pages.append(en_page)
# Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm") # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
else: else:
# Fetch the English page # Fetch the English page
en_page = fetch_wiki_page(page, 'en', is_specific_page=True) en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page: if en_page:
wiki_pages.append(en_page) wiki_pages.append(en_page)
# Fetch the French page (by adding FR: prefix) # Fetch the French page (by adding FR: prefix)
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True) fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page: if fr_page:
wiki_pages.append(fr_page) wiki_pages.append(fr_page)
# Process pages from the FR:Traductions_désynchronisées category
logger.info("Processing pages from FR:Traductions_désynchronisées category...")
desynchronized_pages = fetch_desynchronized_pages()
for page_url in desynchronized_pages:
# Fetch the French page
fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
if fr_page:
wiki_pages.append(fr_page)
# Try to find the English equivalent
if fr_page['page_title'].startswith('FR:'):
en_title = fr_page['page_title'][3:] # Remove FR: prefix
else:
en_title = fr_page['page_title']
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
if en_page:
wiki_pages.append(en_page)
# Process wiki pages to add staleness score # Process wiki pages to add staleness score
processed_wiki_pages = [] processed_wiki_pages = []
pages_by_key = {} pages_by_key = {}