add tracking of specific translation pages

Tykayn 2025-09-01 00:29:17 +02:00 committed by tykayn
parent 7a7704bc01
commit bd3d14e9f8
11 changed files with 48190 additions and 268240 deletions


@@ -8,6 +8,10 @@ This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.
The script also compares a specific list of wiki pages defined in the
SPECIFIC_PAGES constant. This list can include regular page titles,
full URLs, or pages with the FR: prefix.
Usage:
python wiki_compare.py
@@ -15,6 +19,7 @@ Output:
- top_keys.json: JSON file containing the most used OSM keys
- wiki_pages.csv: CSV file with information about each wiki page
- outdated_pages.json: JSON file containing pages that need updating
- staleness_histogram.png: Histogram of staleness scores (see the plotting sketch below)
- A console output listing the wiki pages that need updating
"""
@@ -47,14 +52,21 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 100
NUM_WIKI_PAGES = 1
# List of specific pages to compare
# List of specific pages to compare (in addition to top keys); see the decoding note after the list
# This list can include:
# 1. Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
# 2. Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
SPECIFIC_PAGES = [
"Anatomie_des_étiquettes_osm",
"https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois",
"FR:Tag:leisure%3Dchildren_club",
"FR:Tag:harassment_prevention%3Dask_angela"
"FR:Tag:harassment_prevention%3Dask_angela",
"Key:harassment_prevention",
"Proposal process",
"Automated_Edits_code_of_conduct",
"Key:cuisine"
]
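# Note: some entries above are URL-encoded ("%3D" is "="). A minimal
# decoding sketch using the standard library (normalize_entry is a
# hypothetical helper, not part of this commit):
#
#     from urllib.parse import unquote
#
#     def normalize_entry(entry):
#         if entry.startswith('http'):
#             entry = entry.rsplit('/wiki/', 1)[-1]  # keep only the page title
#         return unquote(entry)  # "leisure%3Dchildren_club" -> "leisure=children_club"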
def fetch_top_keys(limit=NUM_WIKI_PAGES):
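
The body of fetch_top_keys is elided between these hunks; fetching the most-used keys from TagInfo generally looks like the sketch below (a minimal version, not the committed implementation; /api/4/keys/all and its parameters are taginfo's documented public API, and error handling is omitted):

import requests

def fetch_top_keys_sketch(limit=100):
    """Return the `limit` most-used OSM keys, most common first."""
    response = requests.get(
        "https://taginfo.openstreetmap.org/api/4/keys/all",
        params={"page": 1, "rp": limit, "sortname": "count_all", "sortorder": "desc"},
    )
    response.raise_for_status()
    return [item["key"] for item in response.json()["data"]]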
@@ -110,6 +122,13 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
"""
Fetch wiki page for a given key or specific page
This function handles different types of wiki pages (URL resolution is sketched after this hunk):
1. Regular OSM key pages (e.g., "building", "highway")
2. Specific wiki pages that can be in various formats:
- Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
- Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
- Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
Args:
key (str): OSM key or specific page title/URL
language (str): Language code ('en' or 'fr')
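
Resolving those input shapes to a concrete URL could look like this. A minimal sketch of the URL resolution only; build_wiki_url and its exact rules are hypothetical, not the committed code, and the WIKI_BASE_URL value is inferred from the constant used later in this diff:

WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"  # as in the script's constants

def build_wiki_url(key, language='en'):
    """Build the wiki URL to fetch for an OSM key or a specific page."""
    if key.startswith('http'):
        return key                          # already a full URL
    if key.startswith('FR:'):
        return f"{WIKI_BASE_URL}{key}"      # explicit FR: page title
    prefix = 'FR:' if language == 'fr' else ''
    # Regular OSM keys would additionally need their "Key:" namespace here.
    return f"{WIKI_BASE_URL}{prefix}{key}"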
@@ -780,7 +799,18 @@ def analyze_wiki_pages(pages):
return needs_update
def main():
"""Main function to execute the script"""
"""
Main function to execute the script
This function:
1. Fetches the top OSM keys from TagInfo API
2. Fetches and processes wiki pages for these keys
3. Processes specific wiki pages listed in SPECIFIC_PAGES
4. Calculates staleness scores for all pages
5. Generates a histogram of staleness scores
6. Saves the results to CSV and JSON files
7. Prints a list of pages that need updating
"""
logger.info("Starting wiki_compare.py")
# Create output directory if it doesn't exist
@@ -814,12 +844,13 @@ def main():
if fr_page:
wiki_pages.append(fr_page)
# Process specific pages
# Process specific pages from the SPECIFIC_PAGES list
# These are additional pages to compare beyond the top keys from TagInfo
logger.info("Processing specific pages...")
for page in SPECIFIC_PAGES:
# For specific pages, we need to handle different formats
# Case 1: Full URL
# Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
if page.startswith('http'):
# For full URLs, we directly fetch the page
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
@@ -831,6 +862,7 @@ def main():
# Try to get the English version by removing FR: prefix
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
@@ -839,11 +871,12 @@ def main():
# Try to get the French version by adding FR: prefix
fr_title = f"FR:{page_info['page_title']}"
fr_url = f"{WIKI_BASE_URL}{fr_title}"
logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
# Case 2: Page with FR: prefix
# Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
elif page.startswith('FR:'):
# Fetch the French page
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
@@ -853,18 +886,19 @@ def main():
# Try to get the English version by removing FR: prefix
en_title = page[3:] # Remove FR: prefix
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# Case 3: Regular page title
# Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
else:
# Fetch the English page
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# Fetch the French page
# Fetch the French page (by adding FR: prefix)
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
@@ -972,8 +1006,32 @@ def main():
# Analyze pages to find those needing updates
pages_to_update = analyze_wiki_pages(wiki_pages)
# Separate regular pages and specific pages
regular_pages = []
specific_pages = []
for page in pages_to_update:
# Check if either English or French page is marked as specific
is_specific = False
if page['en_page'] and page['en_page'].get('is_specific_page', False):
is_specific = True
elif page['fr_page'] and page['fr_page'].get('is_specific_page', False):
is_specific = True
if is_specific:
specific_pages.append(page)
else:
regular_pages.append(page)
# Create a structured output with separate sections
output_data = {
"regular_pages": regular_pages,
"specific_pages": specific_pages,
"last_updated": datetime.now().isoformat()
}
# Save pages that need updating to JSON
save_to_json(pages_to_update, OUTDATED_PAGES_FILE)
save_to_json(output_data, OUTDATED_PAGES_FILE)
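# With this change, outdated_pages.json carries the sections built above;
# a consumer could read it roughly like this (a sketch, not part of this script):
#
#     import json
#     with open(OUTDATED_PAGES_FILE) as f:
#         data = json.load(f)
#     print(len(data["regular_pages"]), len(data["specific_pages"]),
#           data["last_updated"])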
# Print the top pages needing updates
print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")