Automatic translation via Ollama
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
BIN  public/logo-osm.png  (new file)
Binary file not shown.  After: Size 12 KiB
BIN  wiki_compare/__pycache__/wiki_compare.cpython-313.pyc  (new file)
Binary file not shown.
wiki_compare/wiki_compare.py

@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"

@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')

+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []

+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file

@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
+
+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)

 def check_grammar_with_grammalecte(text):
     """
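As an illustrative sketch (hypothetical timestamps, not shown in the diff), after two runs save_with_history() leaves OUTDATED_PAGES_FILE shaped like:

    {
      "regular_pages": [...],
      "specific_pages": [...],
      "last_updated": "2025-01-02T12:00:00",
      "history": {
        "2025-01-01T12:00:00": {"regular_pages": [...], "specific_pages": [...]},
        "2025-01-02T12:00:00": {"regular_pages": [...], "specific_pages": [...]}
      }
    }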
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
            'grammar_suggestions': grammar_suggestions,
            'html_content': html_content
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
        return None

def generate_staleness_histogram(wiki_pages):
    """

@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }

-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)

     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

329  wiki_compare/wiki_translate.py  (new file)

@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_translate.py

This script translates wiki pages that don't yet have a French translation, using the
Ollama server with the mistral:7b model. It saves the translations in a JSON file that
is ignored by .gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page will be translated.
If no key is provided, all pages missing translations will be processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"

def extract_main_content(html_content):
    """
    Extract the main content from a wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)

    return clean_text

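# Illustrative example (hypothetical input, not part of the commit): given a minimal
# MediaWiki fragment, extract_main_content() keeps only the readable text:
#   >>> html = '<div id="mw-content-text"><p>A road.</p><span class="mw-editsection">edit</span></div>'
#   >>> extract_main_content(html)
#   'A road.'
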
def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using the Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')

        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""

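# Illustrative sketch of the exchange with Ollama (shapes inferred from the call above):
#   request:  POST http://localhost:11434/api/generate
#             {"model": "mistral:7b", "prompt": "Translate the following...", "stream": false}
#   response: {"response": "<French translation>", "done": true, ...}
# With "stream": False the whole answer arrives as a single JSON object, and the
# 'response' field read above contains the generated translation.
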
def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info

def save_translation(translation_info):
    """
    Save translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save translations
    save_to_json(translations, TRANSLATIONS_FILE)

    logger.info(f"Translation saved for key '{key}'")
    return True

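# Illustrative shape of TRANSLATIONS_FILE after one page has been saved
# (all values hypothetical):
#   {
#     "translations": {
#       "highway": {
#         "key": "highway",
#         "en_page": {"url": "...", "last_modified": "...", "word_count": 1234},
#         "translated_content": "...",
#         "translated_at": "2025-01-01T12:00:00",
#         "model": "mistral:7b",
#         "is_specific_page": false
#       }
#     },
#     "last_updated": "2025-01-01T12:00:00"
#   }
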
def update_translation(key):
    """
    Update a translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)

    return False

def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations

def get_available_translations():
    """
    Get a list of available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}

    return translations.get('translations', {})

def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")


if __name__ == "__main__":
    main()
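# Example invocations (assuming a local Ollama server with mistral:7b available;
# 'highway' is just an example key):
#   python wiki_translate.py highway    # translate only the page for that key
#   python wiki_translate.py            # translate every page missing a French version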