#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_compare.py

This script fetches the 10 most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the 10 most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - A console output listing the 10 wiki pages that need updating
"""

import json
import csv
import requests
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"

# French month names, used to normalise dates on FR wiki pages before parsing,
# since strptime('%B') only understands English month names under the default locale
FRENCH_MONTHS = {
    'janvier': 'January', 'février': 'February', 'mars': 'March',
    'avril': 'April', 'mai': 'May', 'juin': 'June',
    'juillet': 'July', 'août': 'August', 'septembre': 'September',
    'octobre': 'October', 'novembre': 'November', 'décembre': 'December'
}


def fetch_top_keys(limit=50):
    """
    Fetch the most used OSM keys from the TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []
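
# Illustrative sketch of the data shapes handled by fetch_top_keys(). The key names and
# counts below are made-up placeholders; only the field names ('data', 'key', 'count_all')
# reflect what the function actually reads from the TagInfo response, and the second
# structure mirrors what ends up in top_keys.json.
_EXAMPLE_TAGINFO_RESPONSE = {
    "data": [
        {"key": "building", "count_all": 123456789},
        {"key": "source", "count_all": 98765432},
    ]
}
_EXAMPLE_TOP_KEYS = [
    {"key": "building", "count": 123456789},
    {"key": "source", "count": 98765432},
]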
def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")


def fetch_wiki_page(key, language='en'):
    """
    Fetch wiki page for a given key

    Args:
        key (str): OSM key
        language (str): Language code ('en' or 'fr')

    Returns:
        dict: Dictionary with page information, or None if the page doesn't exist
    """
    base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
    url = f"{base_url}{key}"

    logger.info(f"Fetching {language} wiki page for key '{key}': {url}")

    try:
        response = requests.get(url)

        # Check if the page exists
        if response.status_code == 404:
            logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
            return None

        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the last modification date from the page footer
        last_modified = None
        footer_info = soup.select_one('#footer-info-lastmod')
        if footer_info:
            date_text = footer_info.text
            # Extract the date using a regex (e.g. "15 January 2024" or "15 janvier 2024")
            date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
            if date_match:
                date_str = date_match.group(1)
                # French pages use French month names; translate them so that
                # strptime('%d %B %Y') can parse the date regardless of locale
                for fr_month, en_month in FRENCH_MONTHS.items():
                    date_str = date_str.replace(fr_month, en_month)
                try:
                    last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                except ValueError:
                    logger.warning(f"Could not parse date: {date_str}")

        # Count sections (h2, h3, h4)
        sections = len(soup.select('h2, h3, h4'))

        # Count words in the content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())

            # Count links
            links = content.select('a')
            link_count = len(links)
        else:
            word_count = 0
            link_count = 0

        return {
            'key': key,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'word_count': word_count,
            'link_count': link_count
        }
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
        return None


def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue
        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}
        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []
    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # The French page is missing
                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'priority': 100  # High priority for missing pages
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate the date difference in days (positive = English page edited more recently)
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']

        # Calculate a priority score (higher means a more urgent update)
        # Weight factors can be adjusted
        priority = (
            abs(date_diff) * 0.4 +         # Date difference
            abs(word_diff) / 100 * 0.25 +  # Word count difference (normalized)
            abs(section_diff) * 0.2 +      # Section difference
            abs(link_diff) / 10 * 0.15     # Link count difference (normalized)
        )

        if (date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20
                or fr_page['word_count'] < en_page['word_count'] * 0.7):
            reason = []
            if date_diff > 30:
                reason.append(f"French page outdated by {date_diff} days")
            if word_diff > 200:
                reason.append(f"English page has {word_diff} more words")
            if section_diff > 2:
                reason.append(f"English page has {section_diff} more sections")
            if link_diff > 20:
                reason.append(f"English page has {link_diff} more links")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(
                    f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} "
                    "of English content"
                )

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'priority': priority
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update
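
# Worked example of the priority score above, with made-up numbers chosen purely to show
# the weighting (these are not measurements from real pages): a French page that is
# 90 days behind, 400 words shorter, 3 sections shorter and 25 links shorter scores
#     90 * 0.4  +  400 / 100 * 0.25  +  3 * 0.2  +  25 / 10 * 0.15
#   = 36.0      +  1.0               +  0.6      +  0.375
#   = 37.975
# so the date lag dominates unless the content gap is very large; a missing French page
# always outranks it, since it gets a fixed priority of 100.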
def main():
    """Main function to execute the script"""
    logger.info("Starting wiki_compare.py")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(10)

    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr')
        if fr_page:
            wiki_pages.append(fr_page)

    # Save wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for page in wiki_pages:
                if page:  # Skip None values
                    writer.writerow(page)
        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")
    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Save pages that need updating to JSON
    save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

    # Print the top 10 pages needing updates
    print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")
    for i, page in enumerate(pages_to_update[:10], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
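
# For reference, a sketch of one entry in outdated_pages.json as produced by
# analyze_wiki_pages(). The values are invented placeholders that are merely consistent
# with each other; only the field names match what the code above actually writes.
#
# {
#   "key": "building",
#   "reason": "French page outdated by 90 days, English page has 400 more words, ...",
#   "en_page": {"key": "building", "language": "en",
#               "url": "https://wiki.openstreetmap.org/wiki/Key:building",
#               "last_modified": "2024-01-15", "sections": 12,
#               "word_count": 2500, "link_count": 180},
#   "fr_page": {"key": "building", "language": "fr",
#               "url": "https://wiki.openstreetmap.org/wiki/FR:Key:building",
#               "last_modified": "2023-10-17", "sections": 9,
#               "word_count": 2100, "link_count": 155},
#   "date_diff": 90,
#   "word_diff": 400,
#   "section_diff": 3,
#   "link_diff": 25,
#   "priority": 37.975
# }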