Commit 466f9c773b: tracking of FR pages and exclusion of meeting pages ("suivi et exclusion de pages fr des réunions")
Parent: 471eab4cd0
5 changed files with 231 additions and 4 deletions
wiki_compare/.gitignore (vendored, new file, +2)

@@ -0,0 +1,2 @@
+*.json
+.env
wiki_compare/README.md

@@ -5,7 +5,7 @@ jour ou de traductions, et publier des suggestions sur Mastodon pour encourager
 
 ## Vue d'ensemble
 
-Le projet comprend dix scripts principaux :
+Le projet comprend onze scripts principaux :
 
 1. **wiki_compare.py** : Récupère les 50 clés OSM les plus utilisées, compare leurs pages wiki en anglais et en
    français, et identifie celles qui ont besoin de mises à jour.
@@ -30,6 +30,9 @@ Le projet comprend dix scripts principaux :
 10. **fetch_osm_fr_groups.py** : Récupère les informations sur les groupes de travail et les groupes locaux d'OSM-FR
     depuis la section #Pages_des_groupes_locaux et les enregistre dans un fichier JSON pour affichage sur le site web.
     Les données sont mises en cache pendant une heure.
+11. **fetch_recent_changes.py** : Récupère les changements récents du wiki OSM pour l'espace de noms français, détecte les pages
+    nouvellement créées qui étaient auparavant dans la liste des pages non disponibles en français, et les enregistre dans un
+    fichier JSON pour affichage sur le site web. Les données sont mises en cache pendant une heure.
 
 ## Installation
 
@@ -286,6 +289,9 @@ Exemple de configuration cron pour publier des suggestions et mettre à jour les
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_untranslated_french_pages.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_pages_unavailable_in_french.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./fetch_osm_fr_groups.py
+
+# Récupérer les changements récents et détecter les pages nouvellement créées (toutes les heures)
+0 * * * * cd /chemin/vers/wiki_compare && ./fetch_recent_changes.py
 ```
 
 Note : Les scripts de mise à jour des données pour le site web intègrent déjà une vérification de fraîcheur du cache (1 heure),
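In English: the README now counts eleven main scripts, documents fetch_recent_changes.py (which fetches recent changes in the wiki's French namespace, detects newly created pages that were previously listed as unavailable in French, and caches its output for one hour), and schedules it hourly via cron; the closing note says the website-data scripts already check cache freshness before refetching. A minimal sketch of such a freshness check, assuming the `last_updated` ISO timestamp these scripts write into their JSON files; the helper name `is_cache_fresh` is illustrative, not necessarily the repo's:

```python
import json
import os
from datetime import datetime, timedelta

CACHE_DURATION = timedelta(hours=1)  # mirrors the constant the scripts define

def is_cache_fresh(path):
    """Return True if the JSON file at `path` was written less than an hour ago."""
    if not os.path.exists(path):
        return False
    try:
        with open(path, 'r', encoding='utf-8') as f:
            last_updated = json.load(f).get('last_updated')
        if not last_updated:
            return False
        return datetime.now() - datetime.fromisoformat(last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError):
        return False
```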
wiki_compare/fetch_recent_changes.py

@@ -25,6 +25,7 @@ import argparse
 import logging
 import os
 import re
+import shutil
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
 # Use the directory of this script to determine the output file path
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
+UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
+CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
 RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org"
 CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
@@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
         logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
         return False
 
+def load_unavailable_pages():
+    """
+    Load the list of pages unavailable in French
+
+    Returns:
+        tuple: (all_pages, grouped_pages, last_updated)
+    """
+    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
+        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
+        return [], {}, None
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            all_pages = data.get('all_pages', [])
+            grouped_pages = data.get('grouped_pages', {})
+            last_updated = data.get('last_updated')
+            return all_pages, grouped_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading unavailable pages file: {e}")
+        return [], {}, None
+
+def load_created_pages():
+    """
+    Load the list of newly created French pages
+
+    Returns:
+        tuple: (created_pages, last_updated)
+    """
+    if not os.path.exists(CREATED_PAGES_FILE):
+        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
+        return [], None
+
+    try:
+        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            created_pages = data.get('created_pages', [])
+            last_updated = data.get('last_updated')
+            return created_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading created pages file: {e}")
+        return [], None
+
+def save_created_pages(created_pages, dry_run=False):
+    """
+    Save the list of newly created French pages
+
+    Args:
+        created_pages (list): List of newly created French pages
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved created pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "created_pages": created_pages
+    }
+
+    try:
+        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
+        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
+        shutil.copy2(CREATED_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
+        return False
+
+def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
+    """
+    Save the updated list of pages unavailable in French
+
+    Args:
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "all_pages": all_pages,
+        "grouped_pages": grouped_pages
+    }
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
+        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
+        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
+        return False
+
+def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
+    """
+    Check if any of the recent changes are newly created French pages that were previously in the list of pages unavailable in French
+
+    Args:
+        recent_changes (list): List of recent change dictionaries
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+
+    Returns:
+        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
+    """
+    newly_created_pages = []
+    updated_all_pages = all_pages.copy()
+    updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}
+
+    # Check each recent change
+    for change in recent_changes:
+        page_name = change['page_name']
+        page_url = change['page_url']
+        comment = change['comment'].lower()
+
+        # Check if this is a new page creation
+        is_new_page = "page created" in comment or "nouvelle page" in comment
+
+        if is_new_page and page_name.startswith("FR:"):
+            logger.info(f"Found newly created French page: {page_name}")
+
+            # Check if this page was previously in the list of unavailable pages
+            # We need to check if the English version of this page is in the list
+            en_page_name = page_name.replace("FR:", "")
+
+            # Find the English page in the list of unavailable pages
+            found_en_page = None
+            for page in all_pages:
+                if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
+                    found_en_page = page
+                    break
+
+            if found_en_page:
+                logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
+
+                # Remove the English page from the list of unavailable pages
+                updated_all_pages.remove(found_en_page)
+
+                # Remove the English page from the grouped pages
+                lang_prefix = found_en_page['language_prefix']
+                if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
+                    updated_grouped_pages[lang_prefix].remove(found_en_page)
+
+                    # If the group is now empty, remove it
+                    if not updated_grouped_pages[lang_prefix]:
+                        del updated_grouped_pages[lang_prefix]
+
+                # Add the newly created page to the list
+                newly_created_pages.append({
+                    "title": page_name,
+                    "url": page_url,
+                    "en_title": found_en_page['title'],
+                    "en_url": found_en_page['url'],
+                    "created_at": change['timestamp'],
+                    "created_by": change['user'],
+                    "comment": change['comment']
+                })
+
+    return updated_all_pages, updated_grouped_pages, newly_created_pages
+
 def main():
     """Main function to execute the script"""
     parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
@@ -417,6 +602,30 @@ def main():
     # Save results
     success = save_results(recent_changes, args.dry_run)
 
+    # Check for newly created French pages
+    logger.info("Checking for newly created French pages...")
+    all_pages, grouped_pages, last_updated = load_unavailable_pages()
+    created_pages, created_last_updated = load_created_pages()
+
+    if all_pages and grouped_pages:
+        # Check for newly created pages
+        updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
+
+        # If we found newly created pages, update both files
+        if newly_created:
+            logger.info(f"Found {len(newly_created)} newly created French pages")
+
+            # Add the newly created pages to the existing list
+            created_pages.extend(newly_created)
+
+            # Save the updated files
+            save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
+            save_created_pages(created_pages, args.dry_run)
+        else:
+            logger.info("No newly created French pages found")
+    else:
+        logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
+
     if success:
         logger.info("Script completed successfully")
     else:
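To make the matching logic concrete, here is a hypothetical call to check_for_newly_created_pages; the dict keys match those the function reads, but every value (page names, URLs, timestamp, user) is invented for illustration:

```python
# Sketch only: assumes the functions above are importable from the script.
from fetch_recent_changes import check_for_newly_created_pages

sample_changes = [{
    "page_name": "FR:Key:cuisine",
    "page_url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
    "comment": "Page created with translation from English",
    "timestamp": "2024-01-01T12:00:00",
    "user": "ExampleUser",
}]
sample_pages = [{
    "title": "Key:cuisine",
    "url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
    "language_prefix": "Other",
}]
sample_groups = {"Other": list(sample_pages)}

remaining, groups, created = check_for_newly_created_pages(
    sample_changes, sample_pages, sample_groups)

# The English page leaves the "unavailable" list, its now-empty group is
# dropped, and the new FR page is recorded with a back-reference:
assert remaining == [] and groups == {}
assert created[0]["title"] == "FR:Key:cuisine"
assert created[0]["en_title"] == "Key:cuisine"
```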
wiki_compare/find_pages_unavailable_in_french.py

@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
         title = link.get_text()
         url = WIKI_BASE_URL + link.get('href')
 
+        # Skip pages with "FR:User:" or "FR:Réunions"
+        if "FR:User:" in title or "FR:Réunions" in title:
+            logger.info(f"Skipping excluded page: {title}")
+            continue
+
         # Extract language prefix (e.g., "En:", "De:", etc.)
         language_prefix = "Other"
         match = re.match(r'^([A-Za-z]{2}):', title)
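The lines after the new skip block group titles by a two-letter language prefix. A small illustration of that regex, with sample titles; the handling of the match result is an assumption, since the hunk ends just after the re.match line, but the "Other" fallback is taken from the code shown:

```python
import re

# Sample titles: two carry a two-letter language prefix, one falls back to "Other".
for title in ["En:Key:cuisine", "De:Tag:amenity=bench", "Key:cuisine"]:
    language_prefix = "Other"
    match = re.match(r'^([A-Za-z]{2}):', title)
    if match:
        language_prefix = match.group(1)
    print(f"{title} -> {language_prefix}")
# En:Key:cuisine -> En
# De:Tag:amenity=bench -> De
# Key:cuisine -> Other
```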
wiki_compare/wiki_compare.py

@@ -63,14 +63,15 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
     "FR:Tag:leisure=children_club",
     "FR:Tag:harassment_prevention=Dask_angela",
     "Tag:leisure=children_club",
     "Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
     "Key:cuisine",
     "Libre_Charge_Map",
-    "OSM_Mon_Commerce"
+    "OSM_Mon_Commerce",
+    "Tag:amenity=charging_station"
 ]
 
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -225,6 +226,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
+    # Skip pages with "FR:User:" or "FR:Réunions"
+    if "FR:User:" in key or "FR:Réunions" in key:
+        logger.info(f"Skipping excluded page: {key}")
+        return None
     # Handle different URL formats
     if is_specific_page:
         # Case 1: Full URL
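Both exclusion hunks in this commit test the same two substrings, once in find_pages_unavailable_in_french.py and once in wiki_compare.py, so the rule could live in a single shared predicate to keep the two filters in sync. A standalone sketch; the helper name and the sample meeting page are hypothetical:

```python
# Standalone version of the exclusion rule added in this commit: user pages
# and meeting ("Réunions") pages in the FR namespace are skipped.
EXCLUDED_MARKERS = ("FR:User:", "FR:Réunions")

def is_excluded(title: str) -> bool:
    """Return True if the page title matches one of the excluded markers."""
    return any(marker in title for marker in EXCLUDED_MARKERS)

assert is_excluded("FR:User:Alice")           # user page
assert is_excluded("FR:Réunions/2024-05-01")  # hypothetical meeting page
assert not is_excluded("FR:Key:cuisine")      # regular FR page is kept
```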