up fetch desynchronised pages
This commit is contained in:
parent
8008e0291e
commit
685efd6710
4 changed files with 140 additions and 62 deletions
Binary file not shown.
After Width: | Height: | Size: 167 KiB |
|
@ -1,5 +1,5 @@
|
|||
qualiwiki.cipherbliss.com {
|
||||
root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
|
||||
root * /home/poule/encrypted/qualiwiki/public
|
||||
|
||||
# serve files directly if they can be found (e.g. CSS or JS files in public/)
|
||||
encode zstd gzip
|
BIN
wiki_compare/__pycache__/wiki_compare.cpython-312.pyc
Normal file
BIN
wiki_compare/__pycache__/wiki_compare.cpython-312.pyc
Normal file
Binary file not shown.
|
@ -31,6 +31,7 @@ import os
|
|||
import subprocess
|
||||
import tempfile
|
||||
import hashlib
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
|
@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
|
|||
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
|
||||
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
|
||||
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
|
||||
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
|
||||
TOP_KEYS_FILE = "top_keys.json"
|
||||
WIKI_PAGES_CSV = "wiki_pages.csv"
|
||||
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
||||
|
@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
|
|||
"Key:cuisine",
|
||||
"Libre_Charge_Map",
|
||||
"OSM_Mon_Commerce",
|
||||
"Complète_Tes_Commerces",
|
||||
"Tag:amenity=charging_station",
|
||||
"Organised_Editing/Activities/MapYourGrid_Initiative",
|
||||
"Key:highway",
|
||||
"Quality_assurance"
|
||||
"Quality_assurance",
|
||||
"Verifiability",
|
||||
"Good_practice",
|
||||
"Mapping_parties",
|
||||
"State_of_the_Map",
|
||||
"Diversity"
|
||||
]
|
||||
|
||||
def fetch_desynchronized_pages():
    """
    Fetch pages from the FR:Traductions_désynchronisées category.

    Scrapes the category page at WIKI_CATEGORY_URL and collects the full
    URLs of every French wiki page listed there, skipping category and
    edit links.

    Returns:
        list: List of absolute page URLs found in the category; an empty
        list if the category page could not be fetched.
    """
    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")

    try:
        # timeout prevents the whole run from hanging forever if the
        # wiki server stalls (requests has no default timeout)
        response = requests.get(WIKI_CATEGORY_URL, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links to French pages in the category
        page_links = []
        for link in soup.select('a[href^="/wiki/FR:"]'):
            href = link.get('href', '')
            # Skip if it's a category link or a language link
            if '/Category:' in href or 'action=edit' in href:
                continue

            # Get the full URL
            full_url = 'https://wiki.openstreetmap.org' + href
            page_links.append(full_url)

        logger.info(f"Found {len(page_links)} pages in the category")
        return page_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching category page: {e}")
        return []
|
||||
|
||||
def fetch_top_keys(limit=NUM_WIKI_PAGES):
|
||||
"""
|
||||
Fetch the most used OSM keys from TagInfo API
|
||||
|
@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
|
|||
logger.error(f"Unexpected error during grammar checking: {e}")
|
||||
return []
|
||||
|
||||
def fetch_wiki_page(key, language='en', is_specific_page=False):
|
||||
def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
|
||||
"""
|
||||
Fetch wiki page for a given key or specific page
|
||||
|
||||
|
@ -328,6 +371,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
|
|||
key (str): OSM key or specific page title/URL
|
||||
language (str): Language code ('en' or 'fr')
|
||||
is_specific_page (bool): Whether this is a specific page rather than a key
|
||||
check_grammar (bool): Whether to check grammar for French pages
|
||||
|
||||
Returns:
|
||||
dict: Dictionary with page information or None if page doesn't exist
|
||||
|
@ -473,9 +517,11 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
|
|||
|
||||
# Check grammar for French pages
|
||||
grammar_suggestions = []
|
||||
# if language == 'fr':
|
||||
# logger.info(f"Checking grammar for French page: {key}")
|
||||
# grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
||||
if language == 'fr' and check_grammar:
|
||||
logger.info(f"Checking grammar for French page: {key}")
|
||||
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
||||
elif language == 'fr' and not check_grammar:
|
||||
logger.info(f"Grammar checking disabled for French page: {key}")
|
||||
|
||||
# Extract links
|
||||
links = content.select('a')
|
||||
|
@ -1045,12 +1091,23 @@ def main():
|
|||
1. Fetches the top OSM keys from TagInfo API
|
||||
2. Fetches and processes wiki pages for these keys
|
||||
3. Processes specific wiki pages listed in SPECIFIC_PAGES
|
||||
4. Calculates staleness scores for all pages
|
||||
5. Generates a histogram of staleness scores
|
||||
6. Saves the results to CSV and JSON files
|
||||
7. Prints a list of pages that need updating
|
||||
4. Processes pages from the FR:Traductions_désynchronisées category
|
||||
5. Calculates staleness scores for all pages
|
||||
6. Generates a histogram of staleness scores
|
||||
7. Saves the results to CSV and JSON files
|
||||
8. Prints a list of pages that need updating
|
||||
"""
|
||||
# Parse command-line arguments
|
||||
parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
|
||||
parser.add_argument('--no-grammar-check', action='store_true',
|
||||
help='Disable grammar checking for French pages')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Whether to check grammar for French pages
|
||||
check_grammar = not args.no_grammar_check
|
||||
|
||||
logger.info("Starting wiki_compare.py")
|
||||
logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
|
||||
|
@ -1074,12 +1131,12 @@ def main():
|
|||
key = key_info['key']
|
||||
|
||||
# Fetch English page
|
||||
en_page = fetch_wiki_page(key, 'en')
|
||||
en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Fetch French page
|
||||
fr_page = fetch_wiki_page(key, 'fr')
|
||||
fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
|
@ -1092,7 +1149,7 @@ def main():
|
|||
# Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
|
||||
if page.startswith('http'):
|
||||
# For full URLs, we directly fetch the page
|
||||
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
|
||||
page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
|
||||
if page_info:
|
||||
wiki_pages.append(page_info)
|
||||
|
||||
|
@ -1102,7 +1159,7 @@ def main():
|
|||
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
# If it's an English page, try to find the French equivalent
|
||||
|
@ -1111,14 +1168,14 @@ def main():
|
|||
fr_title = f"FR:{page_info['page_title']}"
|
||||
fr_url = f"{WIKI_BASE_URL}{fr_title}"
|
||||
logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
|
||||
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
|
||||
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
|
||||
elif page.startswith('FR:'):
|
||||
# Fetch the French page
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
|
@ -1126,22 +1183,43 @@ def main():
|
|||
en_title = page[3:] # Remove FR: prefix
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
|
||||
else:
|
||||
# Fetch the English page
|
||||
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
|
||||
en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Fetch the French page (by adding FR: prefix)
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Process pages from the FR:Traductions_désynchronisées category
|
||||
logger.info("Processing pages from FR:Traductions_désynchronisées category...")
|
||||
desynchronized_pages = fetch_desynchronized_pages()
|
||||
for page_url in desynchronized_pages:
|
||||
# Fetch the French page
|
||||
fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Try to find the English equivalent
|
||||
if fr_page['page_title'].startswith('FR:'):
|
||||
en_title = fr_page['page_title'][3:] # Remove FR: prefix
|
||||
else:
|
||||
en_title = fr_page['page_title']
|
||||
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Process wiki pages to add staleness score
|
||||
processed_wiki_pages = []
|
||||
pages_by_key = {}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue