#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_compare.py

This script fetches the 10 most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the 10 most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - A console output listing the 10 wiki pages that need updating
"""

import json
import csv
import requests
import re
import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"

# French month names, used to normalise dates on FR wiki pages before parsing,
# since strptime('%B') only understands English month names under the default locale
FRENCH_MONTHS = {
    'janvier': 'January', 'février': 'February', 'mars': 'March',
    'avril': 'April', 'mai': 'May', 'juin': 'June',
    'juillet': 'July', 'août': 'August', 'septembre': 'September',
    'octobre': 'October', 'novembre': 'November', 'décembre': 'December'
}


def fetch_top_keys(limit=50):
    """
    Fetch the most used OSM keys from the TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []
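
# Illustrative sketch of the data shapes handled by fetch_top_keys(). The key names and
# counts below are made-up placeholders; only the field names ('data', 'key', 'count_all')
# reflect what the function actually reads from the TagInfo response, and the second
# structure mirrors what ends up in top_keys.json.
_EXAMPLE_TAGINFO_RESPONSE = {
    "data": [
        {"key": "building", "count_all": 123456789},
        {"key": "source", "count_all": 98765432},
    ]
}
_EXAMPLE_TOP_KEYS = [
    {"key": "building", "count": 123456789},
    {"key": "source", "count": 98765432},
]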
def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")


def fetch_wiki_page(key, language='en'):
    """
    Fetch wiki page for a given key

    Args:
        key (str): OSM key
        language (str): Language code ('en' or 'fr')

    Returns:
        dict: Dictionary with page information, or None if the page doesn't exist
    """
    base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
    url = f"{base_url}{key}"

    logger.info(f"Fetching {language} wiki page for key '{key}': {url}")

    try:
        response = requests.get(url)

        # Check if the page exists
        if response.status_code == 404:
            logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
            return None

        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the last modification date from the page footer
        last_modified = None
        footer_info = soup.select_one('#footer-info-lastmod')
        if footer_info:
            date_text = footer_info.text
            # Extract the date using a regex (e.g. "15 January 2024" or "15 janvier 2024")
            date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
            if date_match:
                date_str = date_match.group(1)
                # French pages use French month names; translate them so that
                # strptime('%d %B %Y') can parse the date regardless of locale
                for fr_month, en_month in FRENCH_MONTHS.items():
                    date_str = date_str.replace(fr_month, en_month)
                try:
                    last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
                except ValueError:
                    logger.warning(f"Could not parse date: {date_str}")

        # Count sections (h2, h3, h4)
        sections = len(soup.select('h2, h3, h4'))

        # Count words in the content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())

            # Count links
            links = content.select('a')
            link_count = len(links)
        else:
            word_count = 0
            link_count = 0

        return {
            'key': key,
            'language': language,
            'url': url,
            'last_modified': last_modified,
            'sections': sections,
            'word_count': word_count,
            'link_count': link_count
        }
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
        return None


def analyze_wiki_pages(pages):
    """
    Analyze wiki pages to determine which ones need updating

    Args:
        pages (list): List of dictionaries containing page information

    Returns:
        list: List of pages that need updating, sorted by priority
    """
    logger.info("Analyzing wiki pages to identify those needing updates...")

    # Group pages by key
    pages_by_key = {}
    for page in pages:
        if page is None:
            continue
        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}
        pages_by_key[key][page['language']] = page

    # Analyze each key's pages
    needs_update = []
    for key, lang_pages in pages_by_key.items():
        # Skip if either language is missing
        if 'en' not in lang_pages or 'fr' not in lang_pages:
            if 'en' in lang_pages:
                # The French page is missing
                needs_update.append({
                    'key': key,
                    'reason': 'French page missing',
                    'en_page': lang_pages['en'],
                    'fr_page': None,
                    'date_diff': 0,
                    'word_diff': lang_pages['en']['word_count'],
                    'section_diff': lang_pages['en']['sections'],
                    'link_diff': lang_pages['en']['link_count'],
                    'priority': 100  # High priority for missing pages
                })
            continue

        en_page = lang_pages['en']
        fr_page = lang_pages['fr']

        # Skip if dates are missing
        if not en_page['last_modified'] or not fr_page['last_modified']:
            continue

        # Calculate the date difference in days (positive = English page edited more recently)
        en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
        fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
        date_diff = (en_date - fr_date).days

        # Calculate content differences
        word_diff = en_page['word_count'] - fr_page['word_count']
        section_diff = en_page['sections'] - fr_page['sections']
        link_diff = en_page['link_count'] - fr_page['link_count']

        # Calculate a priority score (higher means a more urgent update)
        # Weight factors can be adjusted
        priority = (
            abs(date_diff) * 0.4 +         # Date difference
            abs(word_diff) / 100 * 0.25 +  # Word count difference (normalized)
            abs(section_diff) * 0.2 +      # Section difference
            abs(link_diff) / 10 * 0.15     # Link count difference (normalized)
        )

        if (date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20
                or fr_page['word_count'] < en_page['word_count'] * 0.7):
            reason = []
            if date_diff > 30:
                reason.append(f"French page outdated by {date_diff} days")
            if word_diff > 200:
                reason.append(f"English page has {word_diff} more words")
            if section_diff > 2:
                reason.append(f"English page has {section_diff} more sections")
            if link_diff > 20:
                reason.append(f"English page has {link_diff} more links")
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(
                    f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} "
                    "of English content"
                )

            needs_update.append({
                'key': key,
                'reason': ', '.join(reason),
                'en_page': en_page,
                'fr_page': fr_page,
                'date_diff': date_diff,
                'word_diff': word_diff,
                'section_diff': section_diff,
                'link_diff': link_diff,
                'priority': priority
            })

    # Sort by priority (descending)
    needs_update.sort(key=lambda x: x['priority'], reverse=True)

    return needs_update
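
# Worked example of the priority score above, with made-up numbers chosen purely to show
# the weighting (these are not measurements from real pages): a French page that is
# 90 days behind, 400 words shorter, 3 sections shorter and 25 links shorter scores
#     90 * 0.4  +  400 / 100 * 0.25  +  3 * 0.2  +  25 / 10 * 0.15
#   = 36.0      +  1.0               +  0.6      +  0.375
#   = 37.975
# so the date lag dominates unless the content gap is very large; a missing French page
# always outranks it, since it gets a fixed priority of 100.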
def main():
    """Main function to execute the script"""
    logger.info("Starting wiki_compare.py")

    # Create output directory if it doesn't exist
    os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

    # Fetch top keys
    top_keys = fetch_top_keys(10)

    if not top_keys:
        logger.error("Failed to fetch top keys. Exiting.")
        return

    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch wiki pages for each key
    wiki_pages = []
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if en_page:
            wiki_pages.append(en_page)

        # Fetch French page
        fr_page = fetch_wiki_page(key, 'fr')
        if fr_page:
            wiki_pages.append(fr_page)

    # Save wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for page in wiki_pages:
                if page:  # Skip None values
                    writer.writerow(page)
        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")
    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Save pages that need updating to JSON
    save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

    # Print the top 10 pages needing updates
    print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")
    for i, page in enumerate(pages_to_update[:10], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f"   Reason: {reason}")
        print(f"   English: {en_url}")
        print(f"   French: {fr_url}")
        print()

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
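
# For reference, a sketch of one entry in outdated_pages.json as produced by
# analyze_wiki_pages(). The values are invented placeholders that are merely consistent
# with each other; only the field names match what the code above actually writes.
#
# {
#   "key": "building",
#   "reason": "French page outdated by 90 days, English page has 400 more words, ...",
#   "en_page": {"key": "building", "language": "en",
#               "url": "https://wiki.openstreetmap.org/wiki/Key:building",
#               "last_modified": "2024-01-15", "sections": 12,
#               "word_count": 2500, "link_count": 180},
#   "fr_page": {"key": "building", "language": "fr",
#               "url": "https://wiki.openstreetmap.org/wiki/FR:Key:building",
#               "last_modified": "2023-10-17", "sections": 9,
#               "word_count": 2100, "link_count": 155},
#   "date_diff": 90,
#   "word_diff": 400,
#   "section_diff": 3,
#   "link_diff": 25,
#   "priority": 37.975
# }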