add wiki freshness comparison English/French
parent 0aaddb44c5
commit 83d1972589
12 changed files with 1332 additions and 0 deletions
348
wiki_compare/wiki_compare.py
Executable file
@@ -0,0 +1,348 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_compare.py

This script fetches the 10 most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the 10 most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - A console output listing the 10 wiki pages that need updating
"""

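# Example of the console summary this script prints (illustrative values only;
# real keys, dates and URLs depend on live TagInfo and wiki data):
#
#   1. Key: building
#      Reason: French page outdated by 90 days
#      English: https://wiki.openstreetmap.org/wiki/Key:building
#      French: https://wiki.openstreetmap.org/wiki/FR:Key:building
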
import json
import csv
import requests
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"

def fetch_top_keys(limit=50):
    """
    Fetch the most used OSM keys from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []
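
# The TagInfo /api/4/keys/all response is assumed to look roughly like
# {"data": [{"key": "building", "count_all": 123456789, ...}, ...]} (counts are
# illustrative), so each entry written to top_keys.json comes out as
# {"key": "building", "count": 123456789}.
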
def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def fetch_wiki_page(key, language='en'):
    """
    Fetch wiki page for a given key

    Args:
        key (str): OSM key
        language (str): Language code ('en' or 'fr')

    Returns:
        dict: Dictionary with page information or None if page doesn't exist
    """
    base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
    url = f"{base_url}{key}"

    logger.info(f"Fetching {language} wiki page for key '{key}': {url}")

    try:
        response = requests.get(url)

        # Check if page exists
        if response.status_code == 404:
            logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
            return None

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get last modification date
        last_modified = None
        footer_info = soup.select_one('#footer-info-lastmod')
        if footer_info:
            date_text = footer_info.text
            # Extract date using regex
            date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
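            # On the OSM wiki this footer normally reads something like
            # "This page was last edited on 12 June 2025, at 14:03." (date shown
            # here is illustrative, and the wording can differ on FR pages);
            # the regex above captures only the day-month-year portion.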
            if date_match:
                date_str = date_match.group(1)
                try:
                    # Parse date (format may vary based on wiki language)
                    last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                except ValueError:
                    logger.warning(f"Could not parse date: {date_str}")

        # Count sections (h2, h3, h4)
        sections = len(soup.select('h2, h3, h4'))

        # Count words in the content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())

            # Count links
            links = content.select('a')
            link_count = len(links)
        else:
            word_count = 0
            link_count = 0

        return {
            'key': key,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'word_count': word_count,
            'link_count': link_count
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
        return None

def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []

    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # French page is missing
                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'priority': 100  # High priority for missing pages
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate date difference in days
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']

        # Calculate priority score (higher means needs more urgent update)
        # Weight factors can be adjusted
        priority = (
            abs(date_diff) * 0.4 +         # Date difference
            abs(word_diff) / 100 * 0.25 +  # Word count difference (normalized)
            abs(section_diff) * 0.2 +      # Section difference
            abs(link_diff) / 10 * 0.15     # Link count difference (normalized)
        )
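        # Illustrative example with made-up numbers: date_diff=90, word_diff=500,
        # section_diff=3, link_diff=40 gives a priority of
        # 90*0.4 + 500/100*0.25 + 3*0.2 + 40/10*0.15 = 36 + 1.25 + 0.6 + 0.6 = 38.45.
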
        if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
            reason = []
            if date_diff > 30:
                reason.append(f"French page outdated by {date_diff} days")
            if word_diff > 200:
                reason.append(f"English page has {word_diff} more words")
            if section_diff > 2:
                reason.append(f"English page has {section_diff} more sections")
            if link_diff > 20:
                reason.append(f"English page has {link_diff} more links")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} of English content")

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'priority': priority
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update
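
# Each entry in outdated_pages.json therefore carries the key, a human-readable
# reason, the full en_page/fr_page dictionaries, the raw date/word/section/link
# differences and the computed priority score.
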
def main():
    """Main function to execute the script"""
    logger.info("Starting wiki_compare.py")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(10)

    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []

    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr')
        if fr_page:
            wiki_pages.append(fr_page)

    # Save wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
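            # A typical row would then look like this (values are illustrative):
            # building,en,https://wiki.openstreetmap.org/wiki/Key:building,2025-06-12,14,2500,320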
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()
            for page in wiki_pages:
                if page:  # Skip None values
                    writer.writerow(page)

        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")

    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Save pages that need updating to JSON
    save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

    # Print the top 10 pages needing updates
    print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")

    for i, page in enumerate(pages_to_update[:10], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()