Tracking and exclusion of FR meeting pages

Tykayn 2025-09-01 12:38:43 +02:00 committed by tykayn
parent 471eab4cd0
commit 466f9c773b
5 changed files with 231 additions and 4 deletions

wiki_compare/.gitignore

@@ -0,0 +1,2 @@
*.json
.env

wiki_compare/README.md

@@ -5,7 +5,7 @@ updates or translations, and publish suggestions on Mastodon to encourage
## Overview
-The project includes ten main scripts:
+The project includes eleven main scripts:
1. **wiki_compare.py**: Fetches the 50 most-used OSM keys, compares their English and French wiki pages, and
identifies those that need updating.
@@ -30,6 +30,9 @@ The project includes ten main scripts:
10. **fetch_osm_fr_groups.py**: Fetches information about OSM-FR working groups and local groups from the
#Pages_des_groupes_locaux section and saves it to a JSON file for display on the website.
The data is cached for one hour.
11. **fetch_recent_changes.py**: Fetches recent changes from the OSM wiki French namespace, detects newly created
pages that were previously in the list of pages unavailable in French, and saves them to a JSON file for display
on the website. The data is cached for one hour.
## Installation
@@ -286,6 +289,9 @@ Example cron configuration for publishing suggestions and updating the
0 */6 * * * cd /chemin/vers/wiki_compare && ./find_untranslated_french_pages.py
0 */6 * * * cd /chemin/vers/wiki_compare && ./find_pages_unavailable_in_french.py
0 */6 * * * cd /chemin/vers/wiki_compare && ./fetch_osm_fr_groups.py
# Fetch recent changes and detect newly created pages (every hour)
0 * * * * cd /chemin/vers/wiki_compare && ./fetch_recent_changes.py
```
Note: The scripts that update data for the website already include a cache-freshness check (1 hour),
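A minimal sketch of such a freshness check, assuming the `last_updated` ISO timestamp that the scripts write to their JSON output (the helper name is hypothetical):

```python
from datetime import datetime, timedelta

CACHE_DURATION = timedelta(hours=1)

def is_cache_fresh(last_updated_iso):
    """Return True if the cached data is younger than CACHE_DURATION."""
    if not last_updated_iso:
        return False
    # The scripts store datetime.now().isoformat(), so
    # fromisoformat() can parse the value back directly.
    last_updated = datetime.fromisoformat(last_updated_iso)
    return datetime.now() - last_updated < CACHE_DURATION
```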

wiki_compare/fetch_recent_changes.py

@@ -25,6 +25,7 @@ import argparse
import logging
import os
import re
import shutil
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
# Use the directory of this script to determine the output file path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
@@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
return False
def load_unavailable_pages():
"""
Load the list of pages unavailable in French
Returns:
tuple: (all_pages, grouped_pages, last_updated)
"""
if not os.path.exists(UNAVAILABLE_PAGES_FILE):
logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
return [], {}, None
try:
with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
all_pages = data.get('all_pages', [])
grouped_pages = data.get('grouped_pages', {})
last_updated = data.get('last_updated')
return all_pages, grouped_pages, last_updated
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Error loading unavailable pages file: {e}")
return [], {}, None
def load_created_pages():
"""
Load the list of newly created French pages
Returns:
tuple: (created_pages, last_updated)
"""
if not os.path.exists(CREATED_PAGES_FILE):
logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
return [], None
try:
with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
created_pages = data.get('created_pages', [])
last_updated = data.get('last_updated')
return created_pages, last_updated
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Error loading created pages file: {e}")
return [], None
def save_created_pages(created_pages, dry_run=False):
"""
Save the list of newly created French pages
Args:
created_pages (list): List of newly created French pages
dry_run (bool): If True, don't actually save to file
Returns:
bool: True if saving was successful or dry run, False otherwise
"""
if dry_run:
logger.info("DRY RUN: Would have saved created pages to file")
return True
data = {
"last_updated": datetime.now().isoformat(),
"created_pages": created_pages
}
try:
with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
# Copy the file to the public directory
public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
shutil.copy2(CREATED_PAGES_FILE, public_file)
return True
except IOError as e:
logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
return False
def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
"""
Save the updated list of pages unavailable in French
Args:
all_pages (list): List of all unavailable pages
grouped_pages (dict): Dictionary of pages grouped by language prefix
dry_run (bool): If True, don't actually save to file
Returns:
bool: True if saving was successful or dry run, False otherwise
"""
if dry_run:
logger.info("DRY RUN: Would have saved updated unavailable pages to file")
return True
data = {
"last_updated": datetime.now().isoformat(),
"all_pages": all_pages,
"grouped_pages": grouped_pages
}
try:
with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
# Copy the file to the public directory
public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
return True
except IOError as e:
logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
return False
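For reference, a hypothetical example of what newly_created_french_pages.json ends up containing after save_created_pages runs (field names taken from the code in this commit; all values below are invented for illustration):

```python
example = {
    "last_updated": "2025-09-01T12:38:43",
    "created_pages": [
        {
            "title": "FR:Key:cuisine",
            "url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
            "en_title": "Key:cuisine",
            "en_url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
            "created_at": "2025-09-01T10:00:00",
            "created_by": "SomeUser",
            "comment": "page created",
        }
    ],
}
```

pages_unavailable_in_french.json follows the same pattern, with `all_pages` and `grouped_pages` instead of `created_pages`.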
def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
"""
Check if any of the recent changes are newly created French pages that were previously in the list of pages unavailable in French
Args:
recent_changes (list): List of recent change dictionaries
all_pages (list): List of all unavailable pages
grouped_pages (dict): Dictionary of pages grouped by language prefix
Returns:
tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
"""
newly_created_pages = []
updated_all_pages = all_pages.copy()
updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}
# Check each recent change
for change in recent_changes:
page_name = change['page_name']
page_url = change['page_url']
comment = change['comment'].lower()
# Check if this is a new page creation
is_new_page = "page created" in comment or "nouvelle page" in comment
if is_new_page and page_name.startswith("FR:"):
logger.info(f"Found newly created French page: {page_name}")
# Check if this page was previously in the list of unavailable pages
# We need to check if the English version of this page is in the list
en_page_name = page_name.replace("FR:", "")
# Find the English page in the list of unavailable pages
found_en_page = None
for page in all_pages:
if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
found_en_page = page
break
if found_en_page:
logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
# Remove the English page from the list of unavailable pages
updated_all_pages.remove(found_en_page)
# Remove the English page from the grouped pages
lang_prefix = found_en_page['language_prefix']
if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
updated_grouped_pages[lang_prefix].remove(found_en_page)
# If the group is now empty, remove it
if not updated_grouped_pages[lang_prefix]:
del updated_grouped_pages[lang_prefix]
# Add the newly created page to the list
newly_created_pages.append({
"title": page_name,
"url": page_url,
"en_title": found_en_page['title'],
"en_url": found_en_page['url'],
"created_at": change['timestamp'],
"created_by": change['user'],
"comment": change['comment']
})
return updated_all_pages, updated_grouped_pages, newly_created_pages
def main():
"""Main function to execute the script"""
parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
@@ -417,6 +602,30 @@ def main():
# Save results
success = save_results(recent_changes, args.dry_run)
# Check for newly created French pages
logger.info("Checking for newly created French pages...")
all_pages, grouped_pages, last_updated = load_unavailable_pages()
created_pages, created_last_updated = load_created_pages()
if all_pages and grouped_pages:
# Check for newly created pages
updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
# If we found newly created pages, update both files
if newly_created:
logger.info(f"Found {len(newly_created)} newly created French pages")
# Add the newly created pages to the existing list
created_pages.extend(newly_created)
# Save the updated files
save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
save_created_pages(created_pages, args.dry_run)
else:
logger.info("No newly created French pages found")
else:
logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
if success:
logger.info("Script completed successfully")
else:
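To illustrate the detection logic, here is a minimal hypothetical run of check_for_newly_created_pages (both input structures are invented samples shaped like the files loaded above):

```python
# One English page still listed as unavailable in French:
all_pages = [{
    "title": "Key:cuisine",
    "url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
    "language_prefix": "En",
}]
grouped_pages = {"En": list(all_pages)}

# A recent change reporting the creation of its French counterpart:
recent_changes = [{
    "page_name": "FR:Key:cuisine",
    "page_url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
    "comment": "Page created",
    "timestamp": "2025-09-01T10:00:00",
    "user": "SomeUser",
}]

updated_all, updated_grouped, newly_created = check_for_newly_created_pages(
    recent_changes, all_pages, grouped_pages)
# updated_all is now [], the empty "En" group was dropped from
# updated_grouped, and newly_created holds one entry linking
# FR:Key:cuisine back to Key:cuisine.
```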

wiki_compare/find_pages_unavailable_in_french.py

@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
title = link.get_text()
url = WIKI_BASE_URL + link.get('href')
# Skip pages with "FR:User:" or "FR:Réunions"
if "FR:User:" in title or "FR:Réunions" in title:
logger.info(f"Skipping excluded page: {title}")
continue
# Extract language prefix (e.g., "En:", "De:", etc.)
language_prefix = "Other"
match = re.match(r'^([A-Za-z]{2}):', title)
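As a quick sketch of how this prefix extraction behaves (sample titles invented; note that "FR:User:" and "FR:Réunions" pages never reach this point because of the skip above):

```python
import re

for title in ["De:Key:highway", "En:Proposal process", "Key:highway"]:
    match = re.match(r'^([A-Za-z]{2}):', title)
    prefix = match.group(1) if match else "Other"
    print(f"{title} -> {prefix}")
# De:Key:highway -> De
# En:Proposal process -> En
# Key:highway -> Other   (no two-letter prefix, falls back to "Other")
```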

wiki_compare/wiki_compare.py

@@ -63,14 +63,15 @@ NUM_WIKI_PAGES = 1
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
SPECIFIC_PAGES = [
"Anatomie_des_étiquettes_osm",
"FR:Tag:leisure=children_club",
"FR:Tag:harassment_prevention=Dask_angela",
"Tag:leisure=children_club",
"Tag:harassment_prevention=ask_angela",
"Key:harassment_prevention",
"Proposal process",
"Automated_Edits_code_of_conduct",
"Key:cuisine",
"Libre_Charge_Map",
"OSM_Mon_Commerce"
"OSM_Mon_Commerce",
"Tag:amenity=charging_station"
]
def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -225,6 +226,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
Returns:
dict: Dictionary with page information or None if page doesn't exist
"""
# Skip pages with "FR:User:" or "FR:Réunions"
if "FR:User:" in key or "FR:Réunions" in key:
logger.info(f"Skipping excluded page: {key}")
return None
# Handle different URL formats
if is_specific_page:
# Case 1: Full URL
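With the skip in place, a hypothetical call like the following would log the exclusion and return None, so excluded pages are treated as nonexistent (a sketch, not output from a real run):

```python
page = fetch_wiki_page("FR:Réunions/2025-09", language='fr', is_specific_page=True)
assert page is None
```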