up fetch desynchronised pages

This commit is contained in:
Tykayn 2025-09-04 22:41:38 +02:00 committed by tykayn
parent 8008e0291e
commit 685efd6710
4 changed files with 140 additions and 62 deletions

Binary file not shown (image added, 167 KiB).

@@ -1,5 +1,5 @@
 qualiwiki.cipherbliss.com {
-	root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
+	root * /home/poule/encrypted/qualiwiki/public
 	# serve files directly if they can be found (e.g. CSS or JS files in public/)
 	encode zstd gzip
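After the site root is moved like this, the running Caddy instance still has to reload its configuration before the new path takes effect; with a standard Caddy v2 setup that is typically done with "caddy reload" (pointing --config at the Caddyfile used on the host) or by restarting the caddy service. The exact command and config path depend on how this server is deployed.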

Binary file not shown.


@@ -31,6 +31,7 @@ import os
 import subprocess
 import tempfile
 import hashlib
+import argparse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
 WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
 WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
+WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
 TOP_KEYS_FILE = "top_keys.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
@@ -72,7 +74,7 @@ try:
     nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
     nltk.download('punkt_tab')

 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
     "Key:cuisine",
     "Libre_Charge_Map",
     "OSM_Mon_Commerce",
+    "Complète_Tes_Commerces",
     "Tag:amenity=charging_station",
+    "Organised_Editing/Activities/MapYourGrid_Initiative",
     "Key:highway",
-    "Quality_assurance"
+    "Quality_assurance",
+    "Verifiability",
+    "Good_practice",
+    "Mapping_parties",
+    "State_of_the_Map",
+    "Diversity"
 ]
+
+
+def fetch_desynchronized_pages():
+    """
+    Fetch pages from the FR:Traductions_désynchronisées category
+
+    Returns:
+        list: List of page URLs from the category
+    """
+    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
+
+    try:
+        response = requests.get(WIKI_CATEGORY_URL)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all links to French pages in the category
+        page_links = []
+        for link in soup.select('a[href^="/wiki/FR:"]'):
+            href = link.get('href', '')
+
+            # Skip if it's a category link or a language link
+            if '/Category:' in href or 'action=edit' in href:
+                continue
+
+            # Get the full URL
+            full_url = 'https://wiki.openstreetmap.org' + href
+            page_links.append(full_url)
+
+        logger.info(f"Found {len(page_links)} pages in the category")
+        return page_links
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching category page: {e}")
+        return []
+
+
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
     """
     Fetch the most used OSM keys from TagInfo API
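For reference, the category scrape added above can be exercised on its own. A minimal standalone sketch, using the same category URL and link selector as in the diff (the constant CATEGORY_URL and helper name list_desynchronized_pages are only illustrative, not part of the script):

    import requests
    from bs4 import BeautifulSoup

    CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"

    def list_desynchronized_pages():
        """Return absolute URLs of the FR: pages listed in the category."""
        response = requests.get(CATEGORY_URL, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        urls = []
        for link in soup.select('a[href^="/wiki/FR:"]'):
            href = link.get("href", "")
            # Skip category and edit links, as the script above does
            if "/Category:" in href or "action=edit" in href:
                continue
            urls.append("https://wiki.openstreetmap.org" + href)
        return urls

    if __name__ == "__main__":
        for url in list_desynchronized_pages():
            print(url)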
@@ -133,10 +176,10 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
 def load_json_data(filename):
     """
     Load data from a JSON file

     Args:
         filename (str): Name of the file

     Returns:
         dict: Data loaded from the file or empty dict if file doesn't exist
     """
@@ -164,7 +207,7 @@ def save_to_json(data, filename):
     try:
         # Convert data to JSON string
         json_str = json.dumps(data, indent=2, ensure_ascii=False)

         # Print the JSON string for debugging
         logger.info(f"JSON string to be written to {filename}:")
         logger.info(f"JSON keys at top level: {list(data.keys())}")
@@ -174,22 +217,22 @@ def save_to_json(data, filename):
                 logger.info(f"'type' key exists in translations")
                 if 'type_key' in data['translations']:
                     logger.info(f"'type_key' key exists in translations")

         # Write the JSON string to the file
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(json_str)

         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")


 def save_with_history(data, filename):
     """
     Save data to a JSON file while preserving history

     This function loads existing data from the file (if it exists),
     adds the new data to the history, and saves the updated data back to the file.

     Args:
         data: New data to save
         filename (str): Name of the file
@@ -197,32 +240,32 @@ def save_with_history(data, filename):
     try:
         # Load existing data
         existing_data = load_json_data(filename)

         # Create a timestamp for the current data
         current_timestamp = datetime.now().isoformat()

         # Initialize history if it doesn't exist
         if 'history' not in existing_data:
             existing_data['history'] = {}

         # Add current regular_pages and specific_pages to history
         history_entry = {
             'regular_pages': data.get('regular_pages', []),
             'specific_pages': data.get('specific_pages', [])
         }

         # Add the entry to history with timestamp as key
         existing_data['history'][current_timestamp] = history_entry

         # Update the current data
         existing_data['regular_pages'] = data.get('regular_pages', [])
         existing_data['specific_pages'] = data.get('specific_pages', [])
         existing_data['last_updated'] = current_timestamp

         # Save the updated data
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(existing_data, f, indent=2, ensure_ascii=False)

         logger.info(f"Data with history saved to {filename}")
     except (IOError, json.JSONDecodeError) as e:
         logger.error(f"Error saving data with history to {filename}: {e}")
@@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
         logger.error(f"Unexpected error during grammar checking: {e}")
         return []

-def fetch_wiki_page(key, language='en', is_specific_page=False):
+def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
     """
     Fetch wiki page for a given key or specific page

@@ -328,7 +371,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         key (str): OSM key or specific page title/URL
         language (str): Language code ('en' or 'fr')
         is_specific_page (bool): Whether this is a specific page rather than a key
+        check_grammar (bool): Whether to check grammar for French pages

     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
@@ -369,9 +413,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     # Create a unique cache filename based on the URL
     cache_key = hashlib.md5(url.encode()).hexdigest()
     cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

     html_content = None

     # Try to load from cache first
     if cache_file.exists():
         logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
@@ -381,21 +425,21 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except Exception as e:
             logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
             html_content = None

     # If not in cache or cache read failed, fetch from web
     if html_content is None:
         logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
         try:
             response = requests.get(url)

             # Check if page exists
             if response.status_code == 404:
                 logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
                 return None

             response.raise_for_status()
             html_content = response.text

             # Save to cache
             try:
                 with open(cache_file, 'w', encoding='utf-8') as f:
@@ -406,9 +450,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except requests.exceptions.RequestException as e:
             logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
             return None

     soup = BeautifulSoup(html_content, 'html.parser')

     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
@@ -423,29 +467,29 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
                 last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
             except ValueError:
                 logger.warning(f"Could not parse date: {date_str}")

     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)

     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue

         # Skip sections that are inside a table with class DescriptionBox
         if section_elem.find_parent('table', class_='DescriptionBox'):
             continue

         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()

         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4

         section_titles.append({
             'title': section_title,
             'level': section_level
@@ -458,29 +502,31 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Remove script and style elements
         for script in content.select('script, style'):
             script.extract()

         # Remove .languages elements
         for languages_elem in content.select('.languages'):
             languages_elem.extract()

         # Get text and count words
         clean_text = content.get_text(separator=' ', strip=True)
         word_count = len(clean_text.split())

         # Count sentences using NLTK
         sentences = nltk.sent_tokenize(clean_text)
         sentence_count = len(sentences)

         # Check grammar for French pages
         grammar_suggestions = []
-        # if language == 'fr':
-        #     logger.info(f"Checking grammar for French page: {key}")
-        #     grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        if language == 'fr' and check_grammar:
+            logger.info(f"Checking grammar for French page: {key}")
+            grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        elif language == 'fr' and not check_grammar:
+            logger.info(f"Grammar checking disabled for French page: {key}")

         # Extract links
         links = content.select('a')
         link_count = len(links)

         # Get link details (text and href)
         link_details = []
         for link in links:
@@ -488,22 +534,22 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             # Skip edit section links and other non-content links
             if 'action=edit' in href or 'redlink=1' in href or not href:
                 continue

             # Make relative URLs absolute
             if href.startswith('/'):
                 href = 'https://wiki.openstreetmap.org' + href

             link_text = link.get_text(strip=True)
             if link_text:  # Only include links with text
                 link_details.append({
                     'text': link_text,
                     'href': href
                 })

         # Extract media (images)
         media_elements = content.select('img')
         media_count = len(media_elements)

         # Get media details (src and alt text)
         media_details = []
@@ -1045,13 +1091,24 @@ def main():
     1. Fetches the top OSM keys from TagInfo API
     2. Fetches and processes wiki pages for these keys
     3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Calculates staleness scores for all pages
-    5. Generates a histogram of staleness scores
-    6. Saves the results to CSV and JSON files
-    7. Prints a list of pages that need updating
+    4. Processes pages from the FR:Traductions_désynchronisées category
+    5. Calculates staleness scores for all pages
+    6. Generates a histogram of staleness scores
+    7. Saves the results to CSV and JSON files
+    8. Prints a list of pages that need updating
     """
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
+    parser.add_argument('--no-grammar-check', action='store_true',
+                        help='Disable grammar checking for French pages')
+    args = parser.parse_args()
+
+    # Whether to check grammar for French pages
+    check_grammar = not args.no_grammar_check
+
     logger.info("Starting wiki_compare.py")
+    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")

     # Create output directory if it doesn't exist
     os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
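With the argument parsing above in place, a run that skips Grammalecte entirely would presumably be launched as "python wiki_compare.py --no-grammar-check"; the script name comes from the log message in the diff, and the working directory depends on where the file lives in the repository.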
@@ -1074,12 +1131,12 @@ def main():
         key = key_info['key']

         # Fetch English page
-        en_page = fetch_wiki_page(key, 'en')
+        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
         if en_page:
             wiki_pages.append(en_page)

         # Fetch French page
-        fr_page = fetch_wiki_page(key, 'fr')
+        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
         if fr_page:
             wiki_pages.append(fr_page)
@@ -1092,7 +1149,7 @@ def main():
         # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
         if page.startswith('http'):
             # For full URLs, we directly fetch the page
-            page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
+            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if page_info:
                 wiki_pages.append(page_info)
@@ -1102,7 +1159,7 @@ def main():
                     en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                     en_url = f"{WIKI_BASE_URL}{en_title}"
                     logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                     if en_page:
                         wiki_pages.append(en_page)
                 # If it's an English page, try to find the French equivalent
@@ -1111,14 +1168,14 @@ def main():
                     fr_title = f"FR:{page_info['page_title']}"
                     fr_url = f"{WIKI_BASE_URL}{fr_title}"
                     logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
-                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
+                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                     if fr_page:
                         wiki_pages.append(fr_page)

         # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
         elif page.startswith('FR:'):
             # Fetch the French page
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
@@ -1126,22 +1183,43 @@ def main():
             en_title = page[3:]  # Remove FR: prefix
             en_url = f"{WIKI_BASE_URL}{en_title}"
             logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

         # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
         else:
             # Fetch the English page
-            en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

             # Fetch the French page (by adding FR: prefix)
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)

+    # Process pages from the FR:Traductions_désynchronisées category
+    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
+    desynchronized_pages = fetch_desynchronized_pages()
+
+    for page_url in desynchronized_pages:
+        # Fetch the French page
+        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
+        if fr_page:
+            wiki_pages.append(fr_page)
+
+            # Try to find the English equivalent
+            if fr_page['page_title'].startswith('FR:'):
+                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
+            else:
+                en_title = fr_page['page_title']
+
+            en_url = f"{WIKI_BASE_URL}{en_title}"
+            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
+            if en_page:
+                wiki_pages.append(en_page)
+
     # Process wiki pages to add staleness score
     processed_wiki_pages = []
     pages_by_key = {}