Tracking and exclusion of FR meeting pages
commit 466f9c773b (parent 471eab4cd0)
5 changed files with 231 additions and 4 deletions
wiki_compare/.gitignore (vendored, new file, +2)

@@ -0,0 +1,2 @@
+*.json
+.env
wiki_compare/README.md

@@ -5,7 +5,7 @@ …updates or translations, and publish suggestions on Mastodon to encourage

 ## Overview

-The project comprises ten main scripts:
+The project comprises eleven main scripts:

 1. **wiki_compare.py**: Fetches the 50 most-used OSM keys, compares their English and French wiki pages,
    and identifies those that need updating.
@@ -30,6 +30,9 @@ The project comprises ten main scripts:

 10. **fetch_osm_fr_groups.py**: Fetches information about the OSM-FR working groups and local groups
     from the #Pages_des_groupes_locaux section and saves it to a JSON file for display on the website.
     Data is cached for one hour.
+11. **fetch_recent_changes.py**: Fetches recent changes from the OSM wiki's French namespace, detects newly
+    created pages that were previously on the list of pages unavailable in French, and saves them to a
+    JSON file for display on the website. Data is cached for one hour.

 ## Installation
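The file written by the new script in item 11 follows the envelope produced by save_created_pages() later in this commit. A minimal sketch of its expected shape — the field names come from the code below, the values are hypothetical:

```python
# Hypothetical contents of newly_created_french_pages.json (values invented for illustration)
example = {
    "last_updated": "2024-05-01T12:00:00",  # written via datetime.now().isoformat()
    "created_pages": [
        {
            "title": "FR:Key:cuisine",       # the newly created French page
            "url": "https://wiki.openstreetmap.org/wiki/FR:Key:cuisine",
            "en_title": "Key:cuisine",       # the English page it was matched against
            "en_url": "https://wiki.openstreetmap.org/wiki/Key:cuisine",
            "created_at": "2024-05-01T11:42:00",
            "created_by": "SomeUser",
            "comment": "Page created",
        }
    ],
}
```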
@@ -286,6 +289,9 @@ Example cron configuration for publishing suggestions and updating the
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_untranslated_french_pages.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./find_pages_unavailable_in_french.py
 0 */6 * * * cd /chemin/vers/wiki_compare && ./fetch_osm_fr_groups.py
+
+# Fetch recent changes and detect newly created pages (every hour)
+0 * * * * cd /chemin/vers/wiki_compare && ./fetch_recent_changes.py
 ```

 Note: the data-update scripts for the website already include a cache-freshness check (1 hour),
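The freshness check mentioned in that note is not itself shown in this diff. A minimal sketch of how such a check can work, assuming the ISO-8601 last_updated timestamps and the one-hour CACHE_DURATION defined in fetch_recent_changes.py below:

```python
from datetime import datetime, timedelta

CACHE_DURATION = timedelta(hours=1)  # matches the constant in fetch_recent_changes.py

def cache_is_fresh(last_updated):
    """Return True if the stored ISO-8601 timestamp is younger than CACHE_DURATION."""
    if not last_updated:
        return False
    return datetime.now() - datetime.fromisoformat(last_updated) < CACHE_DURATION

# cache_is_fresh("2024-05-01T12:00:00") -> True or False depending on the current time
```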
wiki_compare/fetch_recent_changes.py

@@ -25,6 +25,7 @@ import argparse
 import logging
 import os
 import re
+import shutil
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -41,6 +42,8 @@ logger = logging.getLogger(__name__)
 # Use the directory of this script to determine the output file path
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
+UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
+CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
 RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org"
 CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
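RECENT_CHANGES_URL is easier to read decomposed into its query parameters. A sketch that rebuilds a functionally equivalent URL — parameter values are taken verbatim from the constant; the note that namespace 202 is the French namespace follows from the script's own description:

```python
from urllib.parse import urlencode

params = {
    "hidebots": 1, "hidepreviousrevisions": 1, "hidecategorization": 1,
    "hideWikibase": 1, "hidelog": 1, "hidenewuserlog": 1,
    "namespace": 202,          # the wiki's French namespace, per this script's description
    "limit": 500, "days": 30,  # up to 500 changes over the last 30 days
    "enhanced": 1, "urlversion": 2,
    "title": "Special:RecentChanges",
}
url = "https://wiki.openstreetmap.org/w/index.php?" + urlencode(params)
```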
@@ -340,6 +343,188 @@ def save_results(recent_changes, dry_run=False):
         logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
         return False
+
+
+def load_unavailable_pages():
+    """
+    Load the list of pages unavailable in French
+
+    Returns:
+        tuple: (all_pages, grouped_pages, last_updated)
+    """
+    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
+        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
+        return [], {}, None
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            all_pages = data.get('all_pages', [])
+            grouped_pages = data.get('grouped_pages', {})
+            last_updated = data.get('last_updated')
+            return all_pages, grouped_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading unavailable pages file: {e}")
+        return [], {}, None
+
+
+def load_created_pages():
+    """
+    Load the list of newly created French pages
+
+    Returns:
+        tuple: (created_pages, last_updated)
+    """
+    if not os.path.exists(CREATED_PAGES_FILE):
+        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
+        return [], None
+
+    try:
+        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            created_pages = data.get('created_pages', [])
+            last_updated = data.get('last_updated')
+            return created_pages, last_updated
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading created pages file: {e}")
+        return [], None
+
+
+def save_created_pages(created_pages, dry_run=False):
+    """
+    Save the list of newly created French pages
+
+    Args:
+        created_pages (list): List of newly created French pages
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved created pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "created_pages": created_pages
+    }
+
+    try:
+        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
+        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
+        shutil.copy2(CREATED_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
+        return False
+
+
+def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
+    """
+    Save the updated list of pages unavailable in French
+
+    Args:
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+        dry_run (bool): If True, don't actually save to file
+
+    Returns:
+        bool: True if saving was successful or dry run, False otherwise
+    """
+    if dry_run:
+        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
+        return True
+
+    data = {
+        "last_updated": datetime.now().isoformat(),
+        "all_pages": all_pages,
+        "grouped_pages": grouped_pages
+    }
+
+    try:
+        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
+        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
+        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
+
+        return True
+    except IOError as e:
+        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
+        return False
+
+
+def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
+    """
+    Check if any of the recent changes are newly created French pages that were
+    previously in the list of pages unavailable in French
+
+    Args:
+        recent_changes (list): List of recent change dictionaries
+        all_pages (list): List of all unavailable pages
+        grouped_pages (dict): Dictionary of pages grouped by language prefix
+
+    Returns:
+        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
+    """
+    newly_created_pages = []
+    updated_all_pages = all_pages.copy()
+    updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}
+
+    # Check each recent change
+    for change in recent_changes:
+        page_name = change['page_name']
+        page_url = change['page_url']
+        comment = change['comment'].lower()
+
+        # Check if this is a new page creation
+        is_new_page = "page created" in comment or "nouvelle page" in comment
+
+        if is_new_page and page_name.startswith("FR:"):
+            logger.info(f"Found newly created French page: {page_name}")
+
+            # Check if this page was previously in the list of unavailable pages
+            # We need to check if the English version of this page is in the list
+            en_page_name = page_name.replace("FR:", "")
+
+            # Find the English page in the list of unavailable pages
+            found_en_page = None
+            for page in all_pages:
+                if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
+                    found_en_page = page
+                    break
+
+            if found_en_page:
+                logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")
+
+                # Remove the English page from the list of unavailable pages
+                updated_all_pages.remove(found_en_page)
+
+                # Remove the English page from the grouped pages
+                lang_prefix = found_en_page['language_prefix']
+                if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
+                    updated_grouped_pages[lang_prefix].remove(found_en_page)
+
+                    # If the group is now empty, remove it
+                    if not updated_grouped_pages[lang_prefix]:
+                        del updated_grouped_pages[lang_prefix]
+
+                # Add the newly created page to the list
+                newly_created_pages.append({
+                    "title": page_name,
+                    "url": page_url,
+                    "en_title": found_en_page['title'],
+                    "en_url": found_en_page['url'],
+                    "created_at": change['timestamp'],
+                    "created_by": change['user'],
+                    "comment": change['comment']
+                })
+
+    return updated_all_pages, updated_grouped_pages, newly_created_pages
+
+
 def main():
     """Main function to execute the script"""
     parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
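One detail worth noting in the two save functions above: the public copy lands two directory levels up from the JSON file, in a sibling public/ directory. A worked example with a hypothetical install path:

```python
import os

# Hypothetical install location, for illustration only
CREATED_PAGES_FILE = "/srv/site/wiki_compare/newly_created_french_pages.json"

public_file = os.path.join(
    os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)),  # "/srv/site"
    'public',
    os.path.basename(CREATED_PAGES_FILE),                  # "newly_created_french_pages.json"
)
print(public_file)  # -> /srv/site/public/newly_created_french_pages.json
```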
@@ -417,6 +602,30 @@ def main():
     # Save results
     success = save_results(recent_changes, args.dry_run)
+
+    # Check for newly created French pages
+    logger.info("Checking for newly created French pages...")
+    all_pages, grouped_pages, last_updated = load_unavailable_pages()
+    created_pages, created_last_updated = load_created_pages()
+
+    if all_pages and grouped_pages:
+        # Check for newly created pages
+        updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)
+
+        # If we found newly created pages, update both files
+        if newly_created:
+            logger.info(f"Found {len(newly_created)} newly created French pages")
+
+            # Add the newly created pages to the existing list
+            created_pages.extend(newly_created)
+
+            # Save the updated files
+            save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
+            save_created_pages(created_pages, args.dry_run)
+        else:
+            logger.info("No newly created French pages found")
+    else:
+        logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")
+
     if success:
         logger.info("Script completed successfully")
     else:
wiki_compare/find_pages_unavailable_in_french.py

@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
         title = link.get_text()
         url = WIKI_BASE_URL + link.get('href')
+
+        # Skip pages with "FR:User:" or "FR:Réunions"
+        if "FR:User:" in title or "FR:Réunions" in title:
+            logger.info(f"Skipping excluded page: {title}")
+            continue

         # Extract language prefix (e.g., "En:", "De:", etc.)
         language_prefix = "Other"
         match = re.match(r'^([A-Za-z]{2}):', title)
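The context lines above classify titles by a two-letter language prefix. A standalone illustration of that regex, with hypothetical sample titles:

```python
import re

for title in ["En:Key:building", "De:Stammtisch", "FR:Réunions/Paris", "Proposal process"]:
    match = re.match(r'^([A-Za-z]{2}):', title)
    language_prefix = match.group(1) if match else "Other"
    print(f"{title!r} -> {language_prefix}")
# 'En:Key:building' -> En, 'De:Stammtisch' -> De,
# 'FR:Réunions/Paris' -> FR (though such pages are now skipped above),
# 'Proposal process' -> Other
```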
wiki_compare/wiki_compare.py

@@ -63,14 +63,15 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
-    "FR:Tag:leisure=children_club",
-    "FR:Tag:harassment_prevention=Dask_angela",
+    "Tag:leisure=children_club",
+    "Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
     "Key:cuisine",
     "Libre_Charge_Map",
-    "OSM_Mon_Commerce"
+    "OSM_Mon_Commerce",
+    "Tag:amenity=charging_station"
 ]

 def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -225,6 +226,10 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
+    # Skip pages with "FR:User:" or "FR:Réunions"
+    if "FR:User:" in key or "FR:Réunions" in key:
+        logger.info(f"Skipping excluded page: {key}")
+        return None
     # Handle different URL formats
     if is_specific_page:
         # Case 1: Full URL
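The same guard now appears in both scripts, which is what the commit title refers to. A hypothetical standalone version, useful for seeing exactly which titles are excluded:

```python
# Hypothetical helper mirroring the guard added in both scripts (not part of the commit)
def is_excluded(title):
    """True for user pages and meeting ("Réunions") pages, which the scripts now skip."""
    return "FR:User:" in title or "FR:Réunions" in title

assert is_excluded("FR:User:Example")
assert is_excluded("FR:Réunions/2024-05-14")
assert not is_excluded("FR:Key:cuisine")
```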