diff --git a/wiki_compare/.gitignore b/wiki_compare/.gitignore
new file mode 100644
index 00000000..823fc79d
--- /dev/null
+++ b/wiki_compare/.gitignore
@@ -0,0 +1,2 @@
+*.json
+.env
\ No newline at end of file
diff --git a/wiki_compare/README.md b/wiki_compare/README.md
index 98df2eee..0c6d9627 100644
--- a/wiki_compare/README.md
+++ b/wiki_compare/README.md
@@ -5,7 +5,7 @@ jour ou de traductions, et publier des suggestions sur Mastodon pour encourager
 
 ## Vue d'ensemble
 
-Le projet comprend dix scripts principaux :
+Le projet comprend onze scripts principaux :
 
 1. **wiki_compare.py** : Récupère les 50 clés OSM les plus utilisées, compare leurs pages wiki en anglais et en français, et
    identifie celles qui ont besoin de mises à jour.
@@ -30,6 +30,9 @@ Le projet comprend dix scripts principaux :
 
 10. **fetch_osm_fr_groups.py** : Récupère les informations sur les groupes de travail et les groupes locaux d'OSM-FR depuis la
     section #Pages_des_groupes_locaux et les enregistre dans un fichier JSON pour affichage sur le site web. Les données sont
     mises en cache pendant une heure.
+11. **fetch_recent_changes.py** : Récupère les changements récents du wiki OSM pour l'espace de noms français, détecte les pages
+    nouvellement créées qui étaient auparavant dans la liste des pages non disponibles en français, et les enregistre dans un
+    fichier JSON pour affichage sur le site web. Les données sont mises en cache pendant une heure.
 
 ## Installation
@@ -286,6 +289,9 @@ Exemple de configuration cron pour publier des suggestions et mettre à jour les
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_untranslated_french_pages.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_pages_unavailable_in_french.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./fetch_osm_fr_groups.py
+
+# Récupérer les changements récents et détecter les pages nouvellement créées (toutes les heures)
+0 * * * * cd /chemin/vers/wiki_compare && ./fetch_recent_changes.py
 ```
 
 Note : Les scripts de mise à jour des données pour le site web intègrent déjà une vérification de fraîcheur du cache (1 heure),
diff --git a/wiki_compare/fetch_recent_changes.py b/wiki_compare/fetch_recent_changes.py
index f2c13dc6..17625bff 100644
--- a/wiki_compare/fetch_recent_changes.py
+++ b/wiki_compare/fetch_recent_changes.py
@@ -25,6 +25,7 @@ import argparse
 import logging
 import os
 import re
+import shutil
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
 # Use the directory of this script to determine the output file path
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
+UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
+CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
 RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org"
 CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
@@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
         logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
         return False
 
+def load_unavailable_pages():
+    """
+    Load the list of pages unavailable in French
+
+    Returns:
+        tuple: (all_pages, grouped_pages, last_updated)
+    """
+    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
+        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
+        return [], {}, None
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        all_pages = data.get('all_pages', [])
+        grouped_pages = data.get('grouped_pages', {})
+        last_updated = data.get('last_updated')
+        return all_pages, grouped_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading unavailable pages file: {e}")
+        return [], {}, None
+
+def load_created_pages():
+    """
+    Load the list of newly created French pages
+
+    Returns:
+        tuple: (created_pages, last_updated)
+    """
+    if not os.path.exists(CREATED_PAGES_FILE):
+        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
+        return [], None
+
+    try:
+        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        created_pages = data.get('created_pages', [])
+        last_updated = data.get('last_updated')
+        return created_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading created pages file: {e}")
+        return [], None
+
+def save_created_pages(created_pages, dry_run=False):
+    """
+    Save the list of newly created French pages
+
+    Args:
+        created_pages (list): List of newly created French pages
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved created pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "created_pages": created_pages
+    }
+
+    try:
+        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
+        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
+        shutil.copy2(CREATED_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
+        return False
+
+def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
+    """
+    Save the updated list of pages unavailable in French
+
+    Args:
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "all_pages": all_pages,
+        "grouped_pages": grouped_pages
+    }
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
+        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
+        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
+        return False
+
+def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
+    """
+    Check if any of the recent changes are newly created French pages that were previously in the list of pages unavailable in French
+
+    Args:
+        recent_changes (list): List of recent change dictionaries
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+
+    Returns:
+        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
+    """
+    newly_created_pages = []
+    updated_all_pages = all_pages.copy()
+    updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}
+
+    # Check each recent change
+    for change in recent_changes:
+        page_name = change['page_name']
+        page_url = change['page_url']
+        comment = change['comment'].lower()
+
+        # Check if this is a new page creation
+        is_new_page = "page created" in comment or "nouvelle page" in comment
+
+        if is_new_page and page_name.startswith("FR:"):
+            logger.info(f"Found newly created French page: {page_name}")
+
+            # Check if this page was previously in the list of unavailable pages
+            # We need to check if the English version of this page is in the list
+            en_page_name = page_name.replace("FR:", "")
+
+            # Find the English page in the list of unavailable pages
+            found_en_page = None
+            for page in all_pages:
+                if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
+                    found_en_page = page
+                    break
+
+            if found_en_page:
+                logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
+
+                # Remove the English page from the list of unavailable pages
+                updated_all_pages.remove(found_en_page)
+
+                # Remove the English page from the grouped pages
+                lang_prefix = found_en_page['language_prefix']
+                if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
+                    updated_grouped_pages[lang_prefix].remove(found_en_page)
+
+                    # If the group is now empty, remove it
+                    if not updated_grouped_pages[lang_prefix]:
+                        del updated_grouped_pages[lang_prefix]
+
+                # Add the newly created page to the list
+                newly_created_pages.append({
+                    "title": page_name,
+                    "url": page_url,
+                    "en_title": found_en_page['title'],
+                    "en_url": found_en_page['url'],
+                    "created_at": change['timestamp'],
+                    "created_by": change['user'],
+                    "comment": change['comment']
+                })
+
+    return updated_all_pages, updated_grouped_pages, newly_created_pages
+
 def main():
     """Main function to execute the script"""
     parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
@@ -417,6 +602,30 @@ def main():
     # Save results
     success = save_results(recent_changes, args.dry_run)
 
+    # Check for newly created French pages
+    logger.info("Checking for newly created French pages...")
+    all_pages, grouped_pages, last_updated = load_unavailable_pages()
+    created_pages, created_last_updated = load_created_pages()
+
+    if all_pages and grouped_pages:
+        # Check for newly created pages
+        updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
+
+        # If we found newly created pages, update both files
+        if newly_created:
+            logger.info(f"Found {len(newly_created)} newly created French pages")
+
+            # Add the newly created pages to the existing list
+            created_pages.extend(newly_created)
+
+            # Save the updated files
+            save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
+            save_created_pages(created_pages, args.dry_run)
+        else:
+            logger.info("No newly created French pages found")
+    else:
+        logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
+
     if success:
         logger.info("Script completed successfully")
     else:
diff --git a/wiki_compare/find_pages_unavailable_in_french.py b/wiki_compare/find_pages_unavailable_in_french.py
index aae17b7e..0d048fe6 100755
--- a/wiki_compare/find_pages_unavailable_in_french.py
+++ b/wiki_compare/find_pages_unavailable_in_french.py
@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
             title = link.get_text()
             url = WIKI_BASE_URL + link.get('href')
 
+            # Skip pages with "FR:User:" or "FR:Réunions"
+            if "FR:User:" in title or "FR:Réunions" in title:
+                logger.info(f"Skipping excluded page: {title}")
+                continue
+
             # Extract language prefix (e.g., "En:", "De:", etc.)
             language_prefix = "Other"
             match = re.match(r'^([A-Za-z]{2}):', title)
diff --git a/wiki_compare/wiki_compare.py b/wiki_compare/wiki_compare.py
index f0d0fe92..e45097f5 100755
--- a/wiki_compare/wiki_compare.py
+++ b/wiki_compare/wiki_compare.py
@@ -63,14 +63,15 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
-    "FR:Tag:leisure=children_club",
-    "FR:Tag:harassment_prevention=Dask_angela",
+    "Tag:leisure=children_club",
+    "Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
     "Key:cuisine",
     "Libre_Charge_Map",
-    "OSM_Mon_Commerce"
+    "OSM_Mon_Commerce",
+    "Tag:amenity=charging_station"
 ]
 
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -225,6 +226,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
+    # Skip pages with "FR:User:" or "FR:Réunions"
+    if "FR:User:" in key or "FR:Réunions" in key:
+        logger.info(f"Skipping excluded page: {key}")
+        return None
     # Handle different URL formats
     if is_specific_page:
         # Case 1: Full URL