auto translation with ollama
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
BIN  public/logo-osm.png  (new file)
Binary file not shown. Size: 12 KiB
BIN  wiki_compare/__pycache__/wiki_compare.cpython-313.pyc  (new file)
Binary file not shown.
wiki_compare/wiki_compare.py
@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"

@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')

+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []

+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file

@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")

+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)
+
 def check_grammar_with_grammalecte(text):
     """

@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'grammar_suggestions': grammar_suggestions,
             'html_content': html_content
         }

-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
-        return None
-
 def generate_staleness_histogram(wiki_pages):
     """

@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }

-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)

     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

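As a quick illustration of the history layout that the new save_with_history() builds up (a minimal sketch, separate from the diff above; the file name outdated_demo.json is only an example):

# Sketch: what the output file looks like after two runs of save_with_history().
# Each run keeps the latest regular_pages/specific_pages at the top level and
# archives the same payload under history[<ISO timestamp>].
from wiki_compare import save_with_history

save_with_history({'regular_pages': [{'key': 'highway'}], 'specific_pages': []}, 'outdated_demo.json')
save_with_history({'regular_pages': [{'key': 'amenity'}], 'specific_pages': []}, 'outdated_demo.json')

# outdated_demo.json now roughly contains:
# {
#   "history": {
#     "<timestamp of run 1>": {"regular_pages": [{"key": "highway"}], "specific_pages": []},
#     "<timestamp of run 2>": {"regular_pages": [{"key": "amenity"}], "specific_pages": []}
#   },
#   "regular_pages": [{"key": "amenity"}],
#   "specific_pages": [],
#   "last_updated": "<timestamp of run 2>"
# }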
wiki_compare/wiki_translate.py  (new file, 329 lines)
@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_translate.py

This script translates wiki pages that don't have translations using the Ollama server
with the mistral:7b model. It saves the translations in a JSON file that is ignored by
.gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page will be translated.
If no key is provided, all pages missing translations will be processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"

def extract_main_content(html_content):
    """
    Extract the main content from a wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)

    return clean_text

def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')

        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""

def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info

def save_translation(translation_info):
    """
    Save translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save translations
    save_to_json(translations, TRANSLATIONS_FILE)

    logger.info(f"Translation saved for key '{key}'")
    return True

def update_translation(key):
    """
    Update a translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)

    return False

def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations

def get_available_translations():
    """
    Get a list of available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}

    return translations.get('translations', {})

def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")

if __name__ == "__main__":
    main()
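The new script assumes a local Ollama server on localhost:11434 with the mistral:7b model already pulled. A minimal sketch (separate from the commit) to check that the /api/generate endpoint used by translate_text() answers before launching a full run:

# Sketch: verify the Ollama endpoint responds with the same payload shape
# that translate_text() sends. Assumes a local Ollama with mistral:7b pulled.
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "mistral:7b", "prompt": "Translate to French: hello", "stream": False},
    timeout=120,
)
resp.raise_for_status()
print(resp.json().get("response", ""))

If this prints a French sentence, running `python wiki_translate.py <key>` should be able to translate a single page, and running it without arguments will process all pages missing a French translation.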