add wiki freshness comparison, English vs French

Tykayn 2025-08-21 16:07:49 +02:00 committed by tykayn
parent 0aaddb44c5
commit 83d1972589
12 changed files with 1332 additions and 0 deletions

wiki_compare/wiki_compare.py (new executable file, 348 lines added)

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_compare.py
This script fetches the 10 most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.
Usage:
python wiki_compare.py
Output:
- top_keys.json: JSON file containing the 10 most used OSM keys
- wiki_pages.csv: CSV file with information about each wiki page
- outdated_pages.json: JSON file containing pages that need updating
- A console output listing the 10 wiki pages that need updating
"""
import json
import csv
import requests
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging
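# Third-party dependencies used above: requests and beautifulsoup4 (bs4);
# everything else comes from the Python standard library.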
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
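# With this configuration, log lines look like:
#   2025-08-21 16:07:49 - INFO - Fetching top 10 OSM keys from TagInfo API...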
logger = logging.getLogger(__name__)
# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
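# Wiki URLs are built by simple concatenation, e.g. for the key "building":
#   EN: https://wiki.openstreetmap.org/wiki/Key:building
#   FR: https://wiki.openstreetmap.org/wiki/FR:Key:building
# The three output files above are written to the current working directory.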

def fetch_top_keys(limit=50):
    """
    Fetch the most used OSM keys from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }
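    # TagInfo paginates its results: 'rp' is the number of results per page, while
    # 'sortname'/'sortorder' request the keys ranked by total usage, most used first.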
    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()

        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def fetch_wiki_page(key, language='en'):
    """
    Fetch wiki page for a given key

    Args:
        key (str): OSM key
        language (str): Language code ('en' or 'fr')

    Returns:
        dict: Dictionary with page information or None if page doesn't exist
    """
    base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
    url = f"{base_url}{key}"

    logger.info(f"Fetching {language} wiki page for key '{key}': {url}")

    try:
        response = requests.get(url)

        # Check if page exists
        if response.status_code == 404:
            logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
            return None

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get last modification date
        last_modified = None
        footer_info = soup.select_one('#footer-info-lastmod')
        if footer_info:
            date_text = footer_info.text
            # Extract date using regex
            date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
            if date_match:
                date_str = date_match.group(1)
                try:
                    # Parse date (format may vary based on wiki language)
                    last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                except ValueError:
                    logger.warning(f"Could not parse date: {date_str}")
        # Count sections (h2, h3, h4)
        sections = len(soup.select('h2, h3, h4'))

        # Count words in the content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())

            # Count links
            links = content.select('a')
            link_count = len(links)
        else:
            word_count = 0
            link_count = 0

        return {
            'key': key,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'word_count': word_count,
            'link_count': link_count
        }
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
        return None

def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []

    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # French page is missing
                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'priority': 100  # High priority for missing pages
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate date difference in days
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']

        # Calculate priority score (higher means needs more urgent update)
        # Weight factors can be adjusted
        priority = (
            abs(date_diff) * 0.4 +         # Date difference
            abs(word_diff) / 100 * 0.25 +  # Word count difference (normalized)
            abs(section_diff) * 0.2 +      # Section difference
            abs(link_diff) / 10 * 0.15     # Link count difference (normalized)
        )
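        # Worked example with made-up numbers: date_diff=90, word_diff=500,
        # section_diff=3, link_diff=40 gives
        #   90*0.4 + (500/100)*0.25 + 3*0.2 + (40/10)*0.15 = 36 + 1.25 + 0.6 + 0.6 = 38.45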
        if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
            reason = []
            if date_diff > 30:
                reason.append(f"French page outdated by {date_diff} days")
            if word_diff > 200:
                reason.append(f"English page has {word_diff} more words")
            if section_diff > 2:
                reason.append(f"English page has {section_diff} more sections")
            if link_diff > 20:
                reason.append(f"English page has {link_diff} more links")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} of English content")

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'priority': priority
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update
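
# The dictionaries collected above are what end up in outdated_pages.json;
# a single entry looks roughly like this (illustrative values only):
# {
#   "key": "building",
#   "reason": "French page outdated by 90 days, English page has 500 more words",
#   "date_diff": 90,
#   "word_diff": 500,
#   "section_diff": 3,
#   "link_diff": 40,
#   "priority": 38.45,
#   "en_page": {...},
#   "fr_page": {...}
# }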

def main():
    """Main function to execute the script"""
    logger.info("Starting wiki_compare.py")

    # Make sure the script's directory exists (the output files themselves are
    # written to the current working directory)
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(10)
    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr')
        if fr_page:
            wiki_pages.append(fr_page)

    # Save wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for page in wiki_pages:
                if page:  # Skip None values
                    writer.writerow(page)
        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")
    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return
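    # The CSV written above has one row per fetched page; an illustrative row
    # (values are invented) would be:
    #   building,en,https://wiki.openstreetmap.org/wiki/Key:building,2025-07-01,12,2400,180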
    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Save pages that need updating to JSON
    save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

    # Print the top 10 pages needing updates
    print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")
    for i, page in enumerate(pages_to_update[:10], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f" Reason: {reason}")
        print(f" English: {en_url}")
        print(f" French: {fr_url}")
        print()

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()
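# For reference, the console summary printed by main() looks roughly like this
# (key and reason are illustrative):
#   ===== TOP 10 WIKI PAGES NEEDING UPDATES =====
#   1. Key: building
#   Reason: French page outdated by 90 days
#   English: https://wiki.openstreetmap.org/wiki/Key:building
#   French: https://wiki.openstreetmap.org/wiki/FR:Key:building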