suivi et exclusion de pages fr des réunions

This commit is contained in:
Tykayn 2025-09-01 12:38:43 +02:00 committed by tykayn
parent 471eab4cd0
commit 466f9c773b
5 changed files with 231 additions and 4 deletions

View file

@ -25,6 +25,7 @@ import argparse
import logging
import os
import re
import shutil
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
# Use the directory of this script to determine the output file path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
return False
def load_unavailable_pages():
    """
    Load the list of pages unavailable in French.

    Returns:
        tuple: (all_pages, grouped_pages, last_updated); ([], {}, None) when
        the file is missing or cannot be read/parsed.
    """
    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
        return [], {}, None
    try:
        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
    except (IOError, json.JSONDecodeError) as exc:
        logger.error(f"Error loading unavailable pages file: {exc}")
        return [], {}, None
    # Missing keys degrade to empty containers / None rather than raising.
    return (
        payload.get('all_pages', []),
        payload.get('grouped_pages', {}),
        payload.get('last_updated'),
    )
def load_created_pages():
    """
    Load the list of newly created French pages.

    Returns:
        tuple: (created_pages, last_updated); ([], None) when the file is
        absent (first run) or unreadable.
    """
    if not os.path.exists(CREATED_PAGES_FILE):
        # First run: the tracking file will be written by save_created_pages.
        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
        return [], None
    try:
        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
    except (IOError, json.JSONDecodeError) as exc:
        logger.error(f"Error loading created pages file: {exc}")
        return [], None
    return payload.get('created_pages', []), payload.get('last_updated')
def save_created_pages(created_pages, dry_run=False):
    """
    Save the list of newly created French pages and mirror the file into
    the sibling 'public' directory.

    Args:
        created_pages (list): List of newly created French page dicts.
        dry_run (bool): If True, don't actually save to file.

    Returns:
        bool: True if the primary save succeeded (or dry run), False otherwise.
        The copy into public/ is best-effort: a failure there (e.g. the
        directory does not exist) is logged but does not fail the save.
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved created pages to file")
        return True
    data = {
        "last_updated": datetime.now().isoformat(),
        "created_pages": created_pages
    }
    try:
        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
    except IOError as e:
        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
        return False
    # Mirror to the public directory. A copy failure must not invalidate the
    # successful primary save above, so it is handled separately.
    public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
    try:
        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
        shutil.copy2(CREATED_PAGES_FILE, public_file)
    except OSError as e:
        logger.warning(f"Could not copy {CREATED_PAGES_FILE} to {public_file}: {e}")
    return True
def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
    """
    Save the updated list of pages unavailable in French and mirror the file
    into the sibling 'public' directory.

    Args:
        all_pages (list): List of all unavailable pages.
        grouped_pages (dict): Dictionary of pages grouped by language prefix.
        dry_run (bool): If True, don't actually save to file.

    Returns:
        bool: True if the primary save succeeded (or dry run), False otherwise.
        The copy into public/ is best-effort: a failure there (e.g. the
        directory does not exist) is logged but does not fail the save.
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
        return True
    data = {
        "last_updated": datetime.now().isoformat(),
        "all_pages": all_pages,
        "grouped_pages": grouped_pages
    }
    try:
        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
    except IOError as e:
        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
        return False
    # Mirror to the public directory. A copy failure must not invalidate the
    # successful primary save above, so it is handled separately.
    public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
    try:
        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
    except OSError as e:
        logger.warning(f"Could not copy {UNAVAILABLE_PAGES_FILE} to {public_file}: {e}")
    return True
def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
    """
    Check if any recent changes are newly created French pages that were
    previously listed as unavailable in French, and remove their English
    counterparts from the unavailable lists.

    Args:
        recent_changes (list): List of recent change dicts with keys
            'page_name', 'page_url', 'comment', 'timestamp', 'user'.
        all_pages (list): List of all unavailable page dicts (with 'title',
            'url', 'language_prefix').
        grouped_pages (dict): Pages grouped by language prefix.

    Returns:
        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages).
        The inputs are not mutated; shallow copies are returned.
    """
    newly_created_pages = []
    updated_all_pages = all_pages.copy()
    updated_grouped_pages = {prefix: pages.copy() for prefix, pages in grouped_pages.items()}
    for change in recent_changes:
        page_name = change['page_name']
        comment = change['comment'].lower()
        # Heuristic: MediaWiki edit summaries flag creations in English or French.
        is_new_page = "page created" in comment or "nouvelle page" in comment
        if not (is_new_page and page_name.startswith("FR:")):
            continue
        logger.info(f"Found newly created French page: {page_name}")
        # Strip only the leading "FR:" prefix; str.replace("FR:", "") would
        # also drop any later "FR:" occurrence inside the title.
        en_page_name = page_name[len("FR:"):]
        # Search the *updated* list so that two changes resolving to the same
        # English entry cannot trigger a ValueError on the second remove().
        found_en_page = None
        for page in updated_all_pages:
            title = page['title']
            if title == en_page_name or (title.startswith("En:") and title[3:] == en_page_name):
                found_en_page = page
                break
        if found_en_page is None:
            continue
        logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
        updated_all_pages.remove(found_en_page)
        # Also drop it from its language group, deleting the group if emptied.
        lang_prefix = found_en_page['language_prefix']
        group = updated_grouped_pages.get(lang_prefix)
        if group is not None and found_en_page in group:
            group.remove(found_en_page)
            if not group:
                del updated_grouped_pages[lang_prefix]
        newly_created_pages.append({
            "title": page_name,
            "url": change['page_url'],
            "en_title": found_en_page['title'],
            "en_url": found_en_page['url'],
            "created_at": change['timestamp'],
            "created_by": change['user'],
            "comment": change['comment']
        })
    return updated_all_pages, updated_grouped_pages, newly_created_pages
def main():
"""Main function to execute the script"""
parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
@ -417,6 +602,30 @@ def main():
# Save results
success = save_results(recent_changes, args.dry_run)
# Check for newly created French pages
logger.info("Checking for newly created French pages...")
all_pages, grouped_pages, last_updated = load_unavailable_pages()
created_pages, created_last_updated = load_created_pages()
if all_pages and grouped_pages:
# Check for newly created pages
updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
# If we found newly created pages, update both files
if newly_created:
logger.info(f"Found {len(newly_created)} newly created French pages")
# Add the newly created pages to the existing list
created_pages.extend(newly_created)
# Save the updated files
save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
save_created_pages(created_pages, args.dry_run)
else:
logger.info("No newly created French pages found")
else:
logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
if success:
logger.info("Script completed successfully")
else: