up fetch desynchronised pages

Tykayn 2025-09-04 22:41:38 +02:00 committed by tykayn
parent 8008e0291e
commit 685efd6710
4 changed files with 140 additions and 62 deletions

Binary file not shown (image, 167 KiB after this commit).

@@ -1,5 +1,5 @@
 qualiwiki.cipherbliss.com {
-	root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
+	root * /home/poule/encrypted/qualiwiki/public
 	# serve files directly if they can be found (e.g. CSS or JS files in public/)
 	encode zstd gzip

Binary file not shown.


@@ -31,6 +31,7 @@ import os
 import subprocess
 import tempfile
 import hashlib
+import argparse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
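The new WIKI_CATEGORY_URL constant hard-codes the percent-encoded form of the category title. As a side note, the same URL could be derived from the readable title with the standard library, which is easier to keep in sync with the wiki; a minimal sketch, not part of the commit (variable names are illustrative):

```python
from urllib.parse import quote

# Build the percent-encoded category URL from the readable title.
# safe=":/" keeps the namespace colons intact while "é" becomes %C3%A9.
title = "Category:FR:Traductions_désynchronisées"
url = "https://wiki.openstreetmap.org/wiki/" + quote(title, safe=":/")
# -> https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es
```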
@@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
     "Key:cuisine",
     "Libre_Charge_Map",
     "OSM_Mon_Commerce",
+    "Complète_Tes_Commerces",
+    "Tag:amenity=charging_station",
+    "Organised_Editing/Activities/MapYourGrid_Initiative",
+    "Key:highway",
-    "Quality_assurance"
+    "Quality_assurance",
+    "Verifiability",
+    "Good_practice",
+    "Mapping_parties",
+    "State_of_the_Map",
+    "Diversity"
 ]

+def fetch_desynchronized_pages():
+    """
+    Fetch pages from the FR:Traductions_désynchronisées category
+
+    Returns:
+        list: List of page URLs from the category
+    """
+    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
+
+    try:
+        response = requests.get(WIKI_CATEGORY_URL)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all links to French pages in the category
+        page_links = []
+        for link in soup.select('a[href^="/wiki/FR:"]'):
+            href = link.get('href', '')
+            # Skip if it's a category link or a language link
+            if '/Category:' in href or 'action=edit' in href:
+                continue
+            # Get the full URL
+            full_url = 'https://wiki.openstreetmap.org' + href
+            page_links.append(full_url)
+
+        logger.info(f"Found {len(page_links)} pages in the category")
+        return page_links
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching category page: {e}")
+        return []
+
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
     """
     Fetch the most used OSM keys from TagInfo API
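The new fetch_desynchronized_pages() scrapes the rendered category page with BeautifulSoup. Since the OSM wiki runs MediaWiki, the same list could also be obtained through the MediaWiki API's list=categorymembers query, which is less brittle than HTML scraping. A hedged sketch, not part of the commit (the function name is hypothetical, and pagination via the API's continue parameter is left out):

```python
import requests

def fetch_desynchronized_pages_api():
    """Hypothetical variant: list the category via the MediaWiki API."""
    api_url = "https://wiki.openstreetmap.org/w/api.php"
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "Category:FR:Traductions désynchronisées",
        "cmlimit": "500",  # maximum page size for anonymous clients
        "format": "json",
    }
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()
    members = response.json()["query"]["categorymembers"]
    # Keep only FR: pages and rebuild canonical wiki URLs from the titles
    return [
        "https://wiki.openstreetmap.org/wiki/" + m["title"].replace(" ", "_")
        for m in members
        if m["title"].startswith("FR:")
    ]
```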
@@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
         logger.error(f"Unexpected error during grammar checking: {e}")
         return []

-def fetch_wiki_page(key, language='en', is_specific_page=False):
+def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
     """
     Fetch wiki page for a given key or specific page
@@ -328,6 +371,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         key (str): OSM key or specific page title/URL
         language (str): Language code ('en' or 'fr')
         is_specific_page (bool): Whether this is a specific page rather than a key
+        check_grammar (bool): Whether to check grammar for French pages

     Returns:
         dict: Dictionary with page information or None if page doesn't exist
@@ -473,9 +517,11 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Check grammar for French pages
         grammar_suggestions = []
-        # if language == 'fr':
-        #     logger.info(f"Checking grammar for French page: {key}")
-        #     grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        if language == 'fr' and check_grammar:
+            logger.info(f"Checking grammar for French page: {key}")
+            grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        elif language == 'fr' and not check_grammar:
+            logger.info(f"Grammar checking disabled for French page: {key}")

         # Extract links
         links = content.select('a')
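This hunk re-enables the previously commented-out Grammalecte call behind the new flag. A toy harness to confirm the toggle semantics in isolation; everything here is illustrative, with fake_checker standing in for check_grammar_with_grammalecte:

```python
calls = []

def fake_checker(text):
    """Stand-in for check_grammar_with_grammalecte that records its calls."""
    calls.append(text)
    return []

def grammar_pass(clean_text, language, check_grammar, checker=fake_checker):
    # Mirrors the if/elif above: only French text with the flag enabled
    # ever reaches the (slow) grammar checker.
    suggestions = []
    if language == 'fr' and check_grammar:
        suggestions = checker(clean_text)
    return suggestions

grammar_pass("Bonjour", "fr", check_grammar=False)
assert calls == []             # checker never invoked when disabled
grammar_pass("Bonjour", "fr", check_grammar=True)
assert calls == ["Bonjour"]    # invoked exactly once when enabled
```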
@@ -1045,12 +1091,23 @@ def main():
     1. Fetches the top OSM keys from TagInfo API
     2. Fetches and processes wiki pages for these keys
     3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Calculates staleness scores for all pages
-    5. Generates a histogram of staleness scores
-    6. Saves the results to CSV and JSON files
-    7. Prints a list of pages that need updating
+    4. Processes pages from the FR:Traductions_désynchronisées category
+    5. Calculates staleness scores for all pages
+    6. Generates a histogram of staleness scores
+    7. Saves the results to CSV and JSON files
+    8. Prints a list of pages that need updating
     """
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
+    parser.add_argument('--no-grammar-check', action='store_true',
+                        help='Disable grammar checking for French pages')
+    args = parser.parse_args()
+
+    # Whether to check grammar for French pages
+    check_grammar = not args.no_grammar_check
+
     logger.info("Starting wiki_compare.py")
+    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")

     # Create output directory if it doesn't exist
     os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
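The new flag can be exercised without running the whole pipeline; a quick sanity check of the argparse wiring, feeding parse_args an explicit list instead of sys.argv:

```python
import argparse

parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
parser.add_argument('--no-grammar-check', action='store_true',
                    help='Disable grammar checking for French pages')

args = parser.parse_args(['--no-grammar-check'])
assert args.no_grammar_check is True                     # flag present
assert parser.parse_args([]).no_grammar_check is False   # default: grammar on
```

In practice the script would be launched as `python wiki_compare.py --no-grammar-check` to skip Grammalecte on large runs.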
@@ -1074,12 +1131,12 @@ def main():
         key = key_info['key']

         # Fetch English page
-        en_page = fetch_wiki_page(key, 'en')
+        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
         if en_page:
             wiki_pages.append(en_page)

         # Fetch French page
-        fr_page = fetch_wiki_page(key, 'fr')
+        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
         if fr_page:
             wiki_pages.append(fr_page)
@@ -1092,7 +1149,7 @@ def main():
         # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
         if page.startswith('http'):
             # For full URLs, we directly fetch the page
-            page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
+            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if page_info:
                 wiki_pages.append(page_info)
@@ -1102,7 +1159,7 @@ def main():
                 en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                 en_url = f"{WIKI_BASE_URL}{en_title}"
                 logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                 if en_page:
                     wiki_pages.append(en_page)
             # If it's an English page, try to find the French equivalent
@@ -1111,14 +1168,14 @@ def main():
                 fr_title = f"FR:{page_info['page_title']}"
                 fr_url = f"{WIKI_BASE_URL}{fr_title}"
                 logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
-                fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
+                fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                 if fr_page:
                     wiki_pages.append(fr_page)

         # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
         elif page.startswith('FR:'):
             # Fetch the French page
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
@@ -1126,22 +1183,43 @@ def main():
             en_title = page[3:]  # Remove FR: prefix
             en_url = f"{WIKI_BASE_URL}{en_title}"
             logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

         # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
         else:
             # Fetch the English page
-            en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

             # Fetch the French page (by adding FR: prefix)
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)

+    # Process pages from the FR:Traductions_désynchronisées category
+    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
+    desynchronized_pages = fetch_desynchronized_pages()
+
+    for page_url in desynchronized_pages:
+        # Fetch the French page
+        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
+        if fr_page:
+            wiki_pages.append(fr_page)
+
+            # Try to find the English equivalent
+            if fr_page['page_title'].startswith('FR:'):
+                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
+            else:
+                en_title = fr_page['page_title']
+
+            en_url = f"{WIKI_BASE_URL}{en_title}"
+            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
+            if en_page:
+                wiki_pages.append(en_page)
+
     # Process wiki pages to add staleness score
     processed_wiki_pages = []
     pages_by_key = {}
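One consequence of the new category pass: a page listed both in SPECIFIC_PAGES and in the category is appended to wiki_pages twice. A hypothetical de-duplication step before the staleness pass, not in this commit (it assumes each page dict carries a stable 'url' key, which this diff does not show):

```python
# Hypothetical: drop duplicate page entries before computing staleness.
seen = set()
deduped = []
for p in wiki_pages:
    marker = p.get('url') or p.get('page_title')  # 'url' key is an assumption
    if marker not in seen:
        seen.add(marker)
        deduped.append(p)
wiki_pages = deduped
```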