add missing wiki pages from taginfo fr

Tykayn 2025-09-05 11:37:19 +02:00 committed by tykayn
parent e056cfc8fa
commit dffb21b56e
8 changed files with 469 additions and 131 deletions


@@ -37,9 +37,15 @@ from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path
# Try to import nltk, but make it optional
try:
    import nltk
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

# Configure logging
logging.basicConfig(
    level=logging.INFO,
@@ -50,11 +56,13 @@ logger = logging.getLogger(__name__)
# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
TAGINFO_FRANCE_API_URL = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
TOP_KEYS_FILE = "top_keys.json"
KEYS_WITHOUT_WIKI_FILE = "keys_without_wiki.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
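As a point of reference, here is a minimal sketch (not part of the commit) of querying the new France-specific taginfo endpoint; it assumes the Geofabrik instance answers with the usual taginfo shape, a 'data' array of entries carrying 'key' and 'count_all', which is what fetch_keys_without_wiki_page below reads:

import requests

url = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
params = {'page': 1, 'rp': 5, 'english': 0, 'sortname': 'count_all', 'sortorder': 'desc'}
resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
for item in resp.json()['data']:
    # Each entry is expected to expose the key name and its usage count in France.
    print(item['key'], item['count_all'])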
@@ -63,17 +71,18 @@ NUM_WIKI_PAGES = 2
# HTML cache folder
HTML_CACHE_DIR = "html_cache"
# Initialize NLTK for sentence tokenization
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Initialize NLTK for sentence tokenization if available
if NLTK_AVAILABLE:
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

# Also download punkt_tab resource which is needed for sent_tokenize
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

    # Also download punkt_tab resource which is needed for sent_tokenize
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')
# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -177,6 +186,41 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

def fetch_keys_without_wiki_page(limit=36):
    """
    Fetch keys used in France that are missing a wiki page from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys without wiki pages used in France...")

    params = {
        'page': 1,
        'rp': limit,
        'english': 0,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_FRANCE_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        keys_without_wiki = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(keys_without_wiki)} keys without wiki pages")
        return keys_without_wiki
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo France API: {e}")
        return []

def load_json_data(filename):
    """
    Load data from a JSON file
@@ -295,6 +339,13 @@ def check_grammar_with_grammalecte(text):
        logger.warning("Empty text provided for grammar checking")
        return []

    # Check if grammalecte-cli is available
    try:
        subprocess.run(['which', 'grammalecte-cli'], capture_output=True, check=True)
    except subprocess.CalledProcessError:
        logger.warning("grammalecte-cli not found, skipping grammar check")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    try:
@@ -520,9 +571,13 @@ def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=Tr
        clean_text = content.get_text(separator=' ', strip=True)
        word_count = len(clean_text.split())

        # Count sentences using NLTK
        sentences = nltk.sent_tokenize(clean_text)
        sentence_count = len(sentences)

        # Count sentences using NLTK if available, otherwise use a simple approximation
        if NLTK_AVAILABLE and check_grammar:
            sentences = nltk.sent_tokenize(clean_text)
            sentence_count = len(sentences)
        else:
            # Simple approximation: count periods, exclamation marks, and question marks
            sentence_count = len(re.findall(r'[.!?]+', clean_text))

        # Check grammar for French pages
        grammar_suggestions = []
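Note that the regex fallback only approximates sentence boundaries. A small illustration (made-up text, assuming the fallback path is taken) of how it can overcount on pages containing URLs:

import re

text = "Bonjour. Voir https://wiki.openstreetmap.org. Merci !"
# Every run of '.', '!' or '?' is treated as a sentence end, so the dots inside
# the URL are counted too: this prints 5, whereas there are only about 3 sentences.
print(len(re.findall(r'[.!?]+', text)))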
@@ -1098,18 +1153,19 @@ def main():
    This function:
    1. Fetches the top OSM keys from TagInfo API
    2. Fetches and processes wiki pages for these keys
    3. Processes specific wiki pages listed in SPECIFIC_PAGES
    4. Processes pages from the FR:Traductions_désynchronisées category
    5. Calculates staleness scores for all pages
    6. Generates a histogram of staleness scores
    7. Saves the results to CSV and JSON files
    8. Prints a list of pages that need updating
    2. Fetches keys used in France that are missing a wiki page from TagInfo API
    3. Fetches and processes wiki pages for these keys
    4. Processes specific wiki pages listed in SPECIFIC_PAGES
    5. Processes pages from the FR:Traductions_désynchronisées category
    6. Calculates staleness scores for all pages
    7. Generates a histogram of staleness scores
    8. Saves the results to CSV and JSON files
    9. Prints a list of pages that need updating
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
    parser.add_argument('--no-grammar-check', action='store_true',
                        help='Disable grammar checking for French pages')
                        help='Disable grammar checking for French pages', default=False)
    args = parser.parse_args()

    # Whether to check grammar for French pages
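A hypothetical invocation of the new flag (the script's filename does not appear in this hunk, so the name below is illustrative); note that store_true already defaults to False, so the added default=False is redundant but harmless:

python compare_wiki_pages.py --no-grammar-check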
@@ -1131,6 +1187,16 @@ def main():
    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch keys without wiki pages used in France
    keys_without_wiki = fetch_keys_without_wiki_page()

    if keys_without_wiki:
        # Save keys without wiki pages to JSON
        save_to_json(keys_without_wiki, KEYS_WITHOUT_WIKI_FILE)
        logger.info(f"Saved {len(keys_without_wiki)} keys without wiki pages to {KEYS_WITHOUT_WIKI_FILE}")
    else:
        logger.warning("No keys without wiki pages were fetched.")

    # Fetch wiki pages for each key
    wiki_pages = []
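For reference, and assuming save_to_json serializes the list as-is, keys_without_wiki.json should end up looking roughly like this (key names and counts are made up for illustration):

[
  {"key": "example:key_one", "count": 12345},
  {"key": "example:key_two", "count": 678}
]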