auto translation ollama
This commit is contained in:
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
329 wiki_compare/wiki_translate.py Normal file
@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_translate.py

This script uses an Ollama server with the mistral:7b model to translate wiki
pages that do not yet have a French translation. It saves the translations in
a JSON file that is excluded from version control via .gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page is translated.
If no key is provided, all pages missing a translation are processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"
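
# 11434 is Ollama's default local port; adjust OLLAMA_API_URL if the server
# listens on a different host or port.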


def extract_main_content(html_content):
    """
    Extract the main content from a wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)

    return clean_text
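
# Quick illustration (hypothetical HTML fragment, for the sketch only):
#   extract_main_content('<div id="mw-content-text"><p>Hello</p></div>')
# returns "Hello".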


def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using the Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')

        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""


def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info


def save_translation(translation_info):
    """
    Save translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save translations
    save_to_json(translations, TRANSLATIONS_FILE)

    logger.info(f"Translation saved for key '{key}'")
    return True
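
# Resulting shape of translations.json (sketch; values illustrative):
# {
#   "translations": {
#     "<key>": {
#       "key": "<key>",
#       "en_page": {"url": "...", "last_modified": "...", "word_count": 0},
#       "translated_content": "...",
#       "translated_at": "<ISO 8601 timestamp>",
#       "model": "mistral:7b",
#       "is_specific_page": false
#     }
#   },
#   "last_updated": "<ISO 8601 timestamp>"
# }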


def update_translation(key):
    """
    Update a translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)

    return False


def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations


def get_available_translations():
    """
    Get the available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}

    return translations.get('translations', {})


def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")


if __name__ == "__main__":
    main()