Tracking and exclusion of FR meeting pages

Tykayn 2025-09-01 12:38:43 +02:00 committed by tykayn
parent 471eab4cd0
commit 466f9c773b
5 changed files with 231 additions and 4 deletions

wiki_compare/.gitignore

@@ -0,0 +1,2 @@
*.json
.env

wiki_compare/README.md

@@ -5,7 +5,7 @@ updates or translations, and publish suggestions on Mastodon to encourage
## Overview
-The project includes ten main scripts:
+The project includes eleven main scripts:
1. **wiki_compare.py**: Fetches the 50 most-used OSM keys, compares their English and French wiki pages, and
identifies those that need updating.
@@ -30,6 +30,9 @@ The project includes ten main scripts:
10. **fetch_osm_fr_groups.py**: Fetches information about OSM-FR working groups and local groups from the
#Pages_des_groupes_locaux section and saves it to a JSON file for display on the website.
The data is cached for one hour.
11. **fetch_recent_changes.py**: Fetches recent changes from the OSM wiki French namespace, detects newly created
pages that were previously in the list of pages unavailable in French, and saves them to a JSON file for display
on the website. The data is cached for one hour.
## Installation
@@ -286,6 +289,9 @@ Example cron configuration for publishing suggestions and updating the
0 */6 * * * cd /chemin/vers/wiki_compare && ./find_untranslated_french_pages.py
0 */6 * * * cd /chemin/vers/wiki_compare && ./find_pages_unavailable_in_french.py
0 */6 * * * cd /chemin/vers/wiki_compare && ./fetch_osm_fr_groups.py
# Fetch recent changes and detect newly created pages (every hour)
0 * * * * cd /chemin/vers/wiki_compare && ./fetch_recent_changes.py
```
Note: The scripts that update data for the website already include a cache-freshness check (1 hour),
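A minimal sketch of such a freshness check, assuming the `last_updated` ISO timestamp that the scripts write to their JSON output (the helper name is hypothetical):

```python
from datetime import datetime, timedelta

CACHE_DURATION = timedelta(hours=1)

def is_cache_fresh(last_updated_iso):
    """Return True if the cached data is younger than CACHE_DURATION."""
    if not last_updated_iso:
        return False
    # The scripts store datetime.now().isoformat(), so
    # fromisoformat() can parse the value back directly.
    last_updated = datetime.fromisoformat(last_updated_iso)
    return datetime.now() - last_updated < CACHE_DURATION
```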

wiki_compare/fetch_recent_changes.py

@@ -25,6 +25,7 @@ import argparse
import logging
import os
import re
import shutil
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
# Use the directory of this script to determine the output file path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
@@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
return False
def load_unavailable_pages():
"""
Load the list of pages unavailable in French
Returns:
tuple: (all_pages, grouped_pages, last_updated)
"""
if not os.path.exists(UNAVAILABLE_PAGES_FILE):
logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
return [], {}, None
try:
with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
all_pages = data.get('all_pages', [])
grouped_pages = data.get('grouped_pages', {})
last_updated = data.get('last_updated')
return all_pages, grouped_pages, last_updated
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Error loading unavailable pages file: {e}")
return [], {}, None
def load_created_pages():
"""
Load the list of newly created French pages
Returns:
tuple: (created_pages, last_updated)
"""
if not os.path.exists(CREATED_PAGES_FILE):
logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
return [], None
try:
with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
created_pages = data.get('created_pages', [])
last_updated = data.get('last_updated')
return created_pages, last_updated
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Error loading created pages file: {e}")
return [], None
def save_created_pages(created_pages, dry_run=False):
"""
Save the list of newly created French pages
Args:
created_pages (list): List of newly created French pages
dry_run (bool): If True, don't actually save to file
Returns:
bool: True if saving was successful or dry run, False otherwise
"""
if dry_run:
logger.info("DRY RUN: Would have saved created pages to file")
return True
data = {
"last_updated": datetime.now().isoformat(),
"created_pages": created_pages
}
try:
with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
# Copy the file to the public directory
public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
shutil.copy2(CREATED_PAGES_FILE, public_file)
return True
except IOError as e:
logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
return False
def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
"""
Save the updated list of pages unavailable in French
Args:
all_pages (list): List of all unavailable pages
grouped_pages (dict): Dictionary of pages grouped by language prefix
dry_run (bool): If True, don't actually save to file
Returns:
bool: True if saving was successful or dry run, False otherwise
"""
if dry_run:
logger.info("DRY RUN: Would have saved updated unavailable pages to file")
return True
data = {
"last_updated": datetime.now().isoformat(),
"all_pages": all_pages,
"grouped_pages": grouped_pages
}
try:
with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
# Copy the file to the public directory
public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
return True
except IOError as e:
logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
return False
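For reference, a hypothetical example of what newly_created_french_pages.json ends up containing after save_created_pages runs (field names taken from the code in this commit; all values below are invented for illustration):

```python
example = {
    "last_updated": "2025-09-01T12:38:43",
    "created_pages": [
        {
            "title": "FR:Key:cuisine",
            "url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
            "en_title": "Key:cuisine",
            "en_url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
            "created_at": "2025-09-01T10:00:00",
            "created_by": "SomeUser",
            "comment": "page created",
        }
    ],
}
```

pages_unavailable_in_french.json follows the same pattern, with `all_pages` and `grouped_pages` instead of `created_pages`.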
def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
"""
Check if any of the recent changes are newly created French pages that were previously in the list of pages unavailable in French
Args:
recent_changes (list): List of recent change dictionaries
all_pages (list): List of all unavailable pages
grouped_pages (dict): Dictionary of pages grouped by language prefix
Returns:
tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
"""
newly_created_pages = []
updated_all_pages = all_pages.copy()
updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}
# Check each recent change
for change in recent_changes:
page_name = change['page_name']
page_url = change['page_url']
comment = change['comment'].lower()
# Check if this is a new page creation
is_new_page = "page created" in comment or "nouvelle page" in comment
if is_new_page and page_name.startswith("FR:"):
logger.info(f"Found newly created French page: {page_name}")
# Check if this page was previously in the list of unavailable pages
# We need to check if the English version of this page is in the list
en_page_name = page_name.replace("FR:", "")
# Find the English page in the list of unavailable pages
found_en_page = None
for page in all_pages:
if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
found_en_page = page
break
if found_en_page:
logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
# Remove the English page from the list of unavailable pages
updated_all_pages.remove(found_en_page)
# Remove the English page from the grouped pages
lang_prefix = found_en_page['language_prefix']
if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
updated_grouped_pages[lang_prefix].remove(found_en_page)
# If the group is now empty, remove it
if not updated_grouped_pages[lang_prefix]:
del updated_grouped_pages[lang_prefix]
# Add the newly created page to the list
newly_created_pages.append({
"title": page_name,
"url": page_url,
"en_title": found_en_page['title'],
"en_url": found_en_page['url'],
"created_at": change['timestamp'],
"created_by": change['user'],
"comment": change['comment']
})
return updated_all_pages, updated_grouped_pages, newly_created_pages
def main():
"""Main function to execute the script"""
parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
@@ -417,6 +602,30 @@ def main():
# Save results
success = save_results(recent_changes, args.dry_run)
# Check for newly created French pages
logger.info("Checking for newly created French pages...")
all_pages, grouped_pages, last_updated = load_unavailable_pages()
created_pages, created_last_updated = load_created_pages()
if all_pages and grouped_pages:
# Check for newly created pages
updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
# If we found newly created pages, update both files
if newly_created:
logger.info(f"Found {len(newly_created)} newly created French pages")
# Add the newly created pages to the existing list
created_pages.extend(newly_created)
# Save the updated files
save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
save_created_pages(created_pages, args.dry_run)
else:
logger.info("No newly created French pages found")
else:
logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
if success:
logger.info("Script completed successfully")
else:
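To illustrate the detection logic, here is a minimal hypothetical run of check_for_newly_created_pages (both input structures are invented samples shaped like the files loaded above):

```python
# One English page still listed as unavailable in French:
all_pages = [{
    "title": "Key:cuisine",
    "url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
    "language_prefix": "En",
}]
grouped_pages = {"En": list(all_pages)}

# A recent change reporting the creation of its French counterpart:
recent_changes = [{
    "page_name": "FR:Key:cuisine",
    "page_url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
    "comment": "Page created",
    "timestamp": "2025-09-01T10:00:00",
    "user": "SomeUser",
}]

updated_all, updated_grouped, newly_created = check_for_newly_created_pages(
    recent_changes, all_pages, grouped_pages)
# updated_all is now [], the empty "En" group was dropped from
# updated_grouped, and newly_created holds one entry linking
# FR:Key:cuisine back to Key:cuisine.
```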

wiki_compare/find_pages_unavailable_in_french.py

@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
title = link.get_text()
url = WIKI_BASE_URL + link.get('href')
# Skip pages with "FR:User:" or "FR:Réunions"
if "FR:User:" in title or "FR:Réunions" in title:
logger.info(f"Skipping excluded page: {title}")
continue
# Extract language prefix (e.g., "En:", "De:", etc.)
language_prefix = "Other"
match = re.match(r'^([A-Za-z]{2}):', title)
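As a quick sketch of how this prefix extraction behaves (sample titles invented; note that "FR:User:" and "FR:Réunions" pages never reach this point because of the skip above):

```python
import re

for title in ["De:Key:highway", "En:Proposal process", "Key:highway"]:
    match = re.match(r'^([A-Za-z]{2}):', title)
    prefix = match.group(1) if match else "Other"
    print(f"{title} -> {prefix}")
# De:Key:highway -> De
# En:Proposal process -> En
# Key:highway -> Other   (no two-letter prefix, falls back to "Other")
```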

wiki_compare/wiki_compare.py

@@ -63,14 +63,15 @@ NUM_WIKI_PAGES = 1
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
SPECIFIC_PAGES = [
"Anatomie_des_étiquettes_osm",
"FR:Tag:leisure=children_club",
"FR:Tag:harassment_prevention=Dask_angela",
"Tag:leisure=children_club",
"Tag:harassment_prevention=ask_angela",
"Key:harassment_prevention",
"Proposal process",
"Automated_Edits_code_of_conduct",
"Key:cuisine",
"Libre_Charge_Map",
"OSM_Mon_Commerce"
"OSM_Mon_Commerce",
"Tag:amenity=charging_station"
]
def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -225,6 +226,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
Returns:
dict: Dictionary with page information or None if page doesn't exist
"""
# Skip pages with "FR:User:" or "FR:Réunions"
if "FR:User:" in key or "FR:Réunions" in key:
logger.info(f"Skipping excluded page: {key}")
return None
# Handle different URL formats
if is_specific_page:
# Case 1: Full URL
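With the skip in place, a hypothetical call like the following would log the exclusion and return None, so excluded pages are treated as nonexistent (a sketch, not output from a real run):

```python
page = fetch_wiki_page("FR:Réunions/2025-09", language='fr', is_specific_page=True)
assert page is None
```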