Tracking and exclusion of FR meeting pages
commit 466f9c773b (parent 471eab4cd0)
5 changed files with 231 additions and 4 deletions
wiki_compare/.gitignore (vendored, new file, +2)

@@ -0,0 +1,2 @@
+*.json
+.env
wiki_compare/README.md

@@ -5,7 +5,7 @@ …updates or translations, and publish suggestions on Mastodon to encourage

 ## Overview

-The project comprises ten main scripts:
+The project comprises eleven main scripts:

 1. **wiki_compare.py**: Fetches the 50 most-used OSM keys, compares their English and French wiki pages,
    and identifies those that need updating.
@@ -30,6 +30,9 @@ The project comprises ten main scripts:

 10. **fetch_osm_fr_groups.py**: Fetches information about the OSM-FR working groups and local groups
     from the #Pages_des_groupes_locaux section and saves it to a JSON file for display on the website.
     Data is cached for one hour.
+11. **fetch_recent_changes.py**: Fetches recent changes from the OSM wiki's French namespace, detects newly
+    created pages that were previously on the list of pages unavailable in French, and saves them to a
+    JSON file for display on the website. Data is cached for one hour.

 ## Installation
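The file written by the new script in item 11 follows the envelope produced by save_created_pages() later in this commit. A minimal sketch of its expected shape — the field names come from the code below, the values are hypothetical:

```python
# Hypothetical contents of newly_created_french_pages.json (values invented for illustration)
example = {
    "last_updated": "2024-05-01T12:00:00",  # written via datetime.now().isoformat()
    "created_pages": [
        {
            "title": "FR:Key:cuisine",       # the newly created French page
            "url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
            "en_title": "Key:cuisine",       # the English page it was matched against
            "en_url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
            "created_at": "2024-05-01T11:42:00",
            "created_by": "SomeUser",
            "comment": "Page created",
        }
    ],
}
```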
@@ -286,6 +289,9 @@ Example cron configuration for publishing suggestions and updating the
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_untranslated_french_pages.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_pages_unavailable_in_french.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./fetch_osm_fr_groups.py
+
+# Fetch recent changes and detect newly created pages (every hour)
+0 * * * * cd /chemin/vers/wiki_compare && ./fetch_recent_changes.py
 ```

 Note: the data-update scripts for the website already include a cache-freshness check (1 hour),
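The freshness check mentioned in that note is not itself shown in this diff. A minimal sketch of how such a check can work, assuming the ISO-8601 last_updated timestamps and the one-hour CACHE_DURATION defined in fetch_recent_changes.py below:

```python
from datetime import datetime, timedelta

CACHE_DURATION = timedelta(hours=1)  # matches the constant in fetch_recent_changes.py

def cache_is_fresh(last_updated):
    """Return True if the stored ISO-8601 timestamp is younger than CACHE_DURATION."""
    if not last_updated:
        return False
    return datetime.now() - datetime.fromisoformat(last_updated) < CACHE_DURATION

# cache_is_fresh("2024-05-01T12:00:00") -> True or False depending on the current time
```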
wiki_compare/fetch_recent_changes.py

@@ -25,6 +25,7 @@ import argparse
 import logging
 import os
 import re
+import shutil
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
 # Use the directory of this script to determine the output file path
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
+UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
+CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
 RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org"
 CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
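RECENT_CHANGES_URL is easier to read decomposed into its query parameters. A sketch that rebuilds a functionally equivalent URL — parameter values are taken verbatim from the constant; the note that namespace 202 is the French namespace follows from the script's own description:

```python
from urllib.parse import urlencode

params = {
    "hidebots": 1, "hidepreviousrevisions": 1, "hidecategorization": 1,
    "hideWikibase": 1, "hidelog": 1, "hidenewuserlog": 1,
    "namespace": 202,          # the wiki's French namespace, per this script's description
    "limit": 500, "days": 30,  # up to 500 changes over the last 30 days
    "enhanced": 1, "urlversion": 2,
    "title": "Special:RecentChanges",
}
url = "https://wiki.openstreetmap.org/w/index.php?" + urlencode(params)
```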
@@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
         logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
         return False
+
+
+def load_unavailable_pages():
+    """
+    Load the list of pages unavailable in French
+
+    Returns:
+        tuple: (all_pages, grouped_pages, last_updated)
+    """
+    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
+        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
+        return [], {}, None
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            all_pages = data.get('all_pages', [])
+            grouped_pages = data.get('grouped_pages', {})
+            last_updated = data.get('last_updated')
+            return all_pages, grouped_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading unavailable pages file: {e}")
+        return [], {}, None
+
+
+def load_created_pages():
+    """
+    Load the list of newly created French pages
+
+    Returns:
+        tuple: (created_pages, last_updated)
+    """
+    if not os.path.exists(CREATED_PAGES_FILE):
+        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
+        return [], None
+
+    try:
+        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            created_pages = data.get('created_pages', [])
+            last_updated = data.get('last_updated')
+            return created_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading created pages file: {e}")
+        return [], None
+
+
+def save_created_pages(created_pages, dry_run=False):
+    """
+    Save the list of newly created French pages
+
+    Args:
+        created_pages (list): List of newly created French pages
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved created pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "created_pages": created_pages
+    }
+
+    try:
+        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
+        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
+        shutil.copy2(CREATED_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
+        return False
+
+
+def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
+    """
+    Save the updated list of pages unavailable in French
+
+    Args:
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "all_pages": all_pages,
+        "grouped_pages": grouped_pages
+    }
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
+        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
+        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
+        return False
+
+
+def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
+    """
+    Check if any of the recent changes are newly created French pages that were
+    previously in the list of pages unavailable in French
+
+    Args:
+        recent_changes (list): List of recent change dictionaries
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+
+    Returns:
+        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
+    """
+    newly_created_pages = []
+    updated_all_pages = all_pages.copy()
+    updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}
+
+    # Check each recent change
+    for change in recent_changes:
+        page_name = change['page_name']
+        page_url = change['page_url']
+        comment = change['comment'].lower()
+
+        # Check if this is a new page creation
+        is_new_page = "page created" in comment or "nouvelle page" in comment
+
+        if is_new_page and page_name.startswith("FR:"):
+            logger.info(f"Found newly created French page: {page_name}")
+
+            # Check if this page was previously in the list of unavailable pages
+            # We need to check if the English version of this page is in the list
+            en_page_name = page_name.replace("FR:", "")
+
+            # Find the English page in the list of unavailable pages
+            found_en_page = None
+            for page in all_pages:
+                if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
+                    found_en_page = page
+                    break
+
+            if found_en_page:
+                logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
+
+                # Remove the English page from the list of unavailable pages
+                updated_all_pages.remove(found_en_page)
+
+                # Remove the English page from the grouped pages
+                lang_prefix = found_en_page['language_prefix']
+                if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
+                    updated_grouped_pages[lang_prefix].remove(found_en_page)
+
+                    # If the group is now empty, remove it
+                    if not updated_grouped_pages[lang_prefix]:
+                        del updated_grouped_pages[lang_prefix]
+
+                # Add the newly created page to the list
+                newly_created_pages.append({
+                    "title": page_name,
+                    "url": page_url,
+                    "en_title": found_en_page['title'],
+                    "en_url": found_en_page['url'],
+                    "created_at": change['timestamp'],
+                    "created_by": change['user'],
+                    "comment": change['comment']
+                })
+
+    return updated_all_pages, updated_grouped_pages, newly_created_pages
+
+
 def main():
     """Main function to execute the script"""
     parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
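One detail worth noting in the two save functions above: the public copy lands two directory levels up from the JSON file, in a sibling public/ directory. A worked example with a hypothetical install path:

```python
import os

# Hypothetical install location, for illustration only
CREATED_PAGES_FILE = "/srv/site/wiki_compare/newly_created_french_pages.json"

public_file = os.path.join(
    os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)),  # "/srv/site"
    'public',
    os.path.basename(CREATED_PAGES_FILE),                  # "newly_created_french_pages.json"
)
print(public_file)  # -> /srv/site/public/newly_created_french_pages.json
```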
@@ -417,6 +602,30 @@ def main():
     # Save results
     success = save_results(recent_changes, args.dry_run)
+
+    # Check for newly created French pages
+    logger.info("Checking for newly created French pages...")
+    all_pages, grouped_pages, last_updated = load_unavailable_pages()
+    created_pages, created_last_updated = load_created_pages()
+
+    if all_pages and grouped_pages:
+        # Check for newly created pages
+        updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
+
+        # If we found newly created pages, update both files
+        if newly_created:
+            logger.info(f"Found {len(newly_created)} newly created French pages")
+
+            # Add the newly created pages to the existing list
+            created_pages.extend(newly_created)
+
+            # Save the updated files
+            save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
+            save_created_pages(created_pages, args.dry_run)
+        else:
+            logger.info("No newly created French pages found")
+    else:
+        logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
+
     if success:
         logger.info("Script completed successfully")
     else:
wiki_compare/find_pages_unavailable_in_french.py

@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
         title = link.get_text()
         url = WIKI_BASE_URL + link.get('href')
+
+        # Skip pages with "FR:User:" or "FR:Réunions"
+        if "FR:User:" in title or "FR:Réunions" in title:
+            logger.info(f"Skipping excluded page: {title}")
+            continue

         # Extract language prefix (e.g., "En:", "De:", etc.)
         language_prefix = "Other"
         match = re.match(r'^([A-Za-z]{2}):', title)
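The context lines above classify titles by a two-letter language prefix. A standalone illustration of that regex, with hypothetical sample titles:

```python
import re

for title in ["En:Key:building", "De:Stammtisch", "FR:Réunions/Paris", "Proposal process"]:
    match = re.match(r'^([A-Za-z]{2}):', title)
    language_prefix = match.group(1) if match else "Other"
    print(f"{title!r} -> {language_prefix}")
# 'En:Key:building' -> En, 'De:Stammtisch' -> De,
# 'FR:Réunions/Paris' -> FR (though such pages are now skipped above),
# 'Proposal process' -> Other
```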
wiki_compare/wiki_compare.py

@@ -63,14 +63,15 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
-    "FR:Tag:leisure=children_club",
-    "FR:Tag:harassment_prevention=Dask_angela",
+    "Tag:leisure=children_club",
+    "Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
     "Key:cuisine",
     "Libre_Charge_Map",
-    "OSM_Mon_Commerce"
+    "OSM_Mon_Commerce",
+    "Tag:amenity=charging_station"
 ]

 def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -225,6 +226,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
+    # Skip pages with "FR:User:" or "FR:Réunions"
+    if "FR:User:" in key or "FR:Réunions" in key:
+        logger.info(f"Skipping excluded page: {key}")
+        return None
     # Handle different URL formats
     if is_specific_page:
         # Case 1: Full URL
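The same guard now appears in both scripts, which is what the commit title refers to. A hypothetical standalone version, useful for seeing exactly which titles are excluded:

```python
# Hypothetical helper mirroring the guard added in both scripts (not part of the commit)
def is_excluded(title):
    """True for user pages and meeting ("Réunions") pages, which the scripts now skip."""
    return "FR:User:" in title or "FR:Réunions" in title

assert is_excluded("FR:User:Example")
assert is_excluded("FR:Réunions/2024-05-14")
assert not is_excluded("FR:Key:cuisine")
```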