#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_translate.py

Translate wiki pages that have no French translation, using an Ollama server
with the mistral:7b model. The translations are saved in a JSON file that is
ignored by .gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page is translated.
If no key is provided, all pages missing translations are processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page as original_fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"
# (connect, read) timeouts in seconds. The read timeout is generous because a
# non-streaming generation only answers once the whole translation is ready.
OLLAMA_TIMEOUT = (10, 600)


def fetch_wiki_page(key, language='en', is_specific_page=False):
    """
    Wrapper around wiki_compare.fetch_wiki_page that drops grammar suggestions.

    The original function is called unchanged, then any 'grammar_suggestions'
    entry in its result is replaced by an empty list. This satisfies the
    requirement "do not use Grammalecte, we only want to translate".

    Args:
        key (str): OSM key or specific page title/URL
        language (str): Language code ('en' or 'fr')
        is_specific_page (bool): Whether this is a specific page rather than a key

    Returns:
        dict: Dictionary with page information, or None if the original
            function returned None
    """
    page_info = original_fetch_wiki_page(key, language, is_specific_page)
    if page_info is None:
        return None

    # Only overwrite the entry when it exists, so the result keeps its shape.
    if 'grammar_suggestions' in page_info:
        page_info['grammar_suggestions'] = []

    return page_info


def extract_main_content(html_content):
    """
    Extract the main content text from a wiki page's HTML.

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Text of the '#mw-content-text' element with scripts, styles and
            navigation elements removed, or "" when there is nothing to extract
    """
    if not html_content:
        logger.warning("Could not find main content div")
        return ""

    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script/style elements and navigation elements (language links,
    # edit-section links, table of contents) so only article text remains.
    for element in content.select('script, style'):
        element.extract()
    for element in content.select('.languages, .mw-editsection, #toc, .toc'):
        element.extract()

    return content.get_text(separator='\n', strip=True)


def translate_text(text, model=OLLAMA_MODEL, timeout=OLLAMA_TIMEOUT):
    """
    Translate English text to French using the Ollama API.

    Args:
        text (str): Text to translate
        model (str): Ollama model to use
        timeout (float or tuple): Timeout in seconds passed to requests.post,
            either a single value or a (connect, read) pair

    Returns:
        str: Translated text, or "" if the input is empty, the request fails,
            or the server returns no usable translation
    """
    if not text or not text.strip():
        logger.warning("No text provided for translation")
        return ""

    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # "stream": False asks for a single JSON object instead of chunks.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.post(OLLAMA_API_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""
    except ValueError as e:
        # Older requests versions raise a plain ValueError on invalid JSON.
        logger.error(f"Error translating text: invalid JSON response: {e}")
        return ""

    # Extract the translated text
    translated_text = result.get('response', '') if isinstance(result, dict) else ''
    if not translated_text:
        logger.warning("Ollama returned an empty translation")
        return ""

    logger.info(f"Translation successful, received {len(translated_text)} characters")
    return translated_text


def translate_wiki_page(key, force=False):
    """
    Translate the English wiki page for a key into French.

    Args:
        key (str): Key or page title
        force (bool): Translate even if a French page already exists

    Returns:
        dict: Translation information (key, English page metadata, translated
            content, timestamp, model, is_specific_page), or None when the
            page is missing, already translated, or the translation failed
    """
    logger.info(f"Translating wiki page for key: {key!r} (type: {type(key)})")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith(('http', 'FR:'))

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # The French page only matters when not forcing, so skip the request
    # entirely when force is set.
    if not force:
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
        if fr_page:
            logger.info(f"French page for key '{key}' already exists (use force=True to translate anyway)")
            return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info


def save_translation(translation_info):
    """
    Save a translation to the translations file and verify it was written.

    Args:
        translation_info (dict): Translation information; must contain 'key'

    Returns:
        bool: True if the translation was saved and found again when the file
            was reloaded, False otherwise
    """
    if not translation_info:
        logger.warning("No translation info provided, cannot save")
        return False

    key = translation_info.get('key')
    if key is None:
        logger.warning("Translation info has no key, cannot save")
        return False

    # Load existing translations, initializing the structure if there is none
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        logger.info("No existing translations found, initializing new translations object")
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }
    elif not isinstance(translations, dict):
        # Refuse to overwrite a file whose content is not what we expect.
        logger.error(f"Unexpected content in {TRANSLATIONS_FILE}: {type(translations)}")
        return False

    entries = translations.setdefault('translations', {})
    if not isinstance(entries, dict):
        logger.error(f"Unexpected 'translations' entry in {TRANSLATIONS_FILE}: {type(entries)}")
        return False

    logger.info(f"Loaded existing translations: {list(entries.keys())}")

    # Add or update translation
    logger.info(f"Adding/updating translation for key '{key!r}' (type: {type(key)})")
    logger.debug(f"Translations structure before adding: {type(translations)}, keys: {list(translations.keys())}")

    entries[key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    logger.debug(f"Translations['translations'] keys after adding: {list(entries.keys())}")

    # Save translations
    logger.info(f"Saving translations to {TRANSLATIONS_FILE}")
    try:
        save_to_json(translations, TRANSLATIONS_FILE)
        logger.info("save_to_json completed successfully")
    except Exception as e:
        logger.error(f"Error saving translations: {e}")
        return False

    # Reload the file to confirm the entry was really persisted; a failed
    # verification is reported as a failure rather than silently ignored.
    try:
        verify_translations = load_json_data(TRANSLATIONS_FILE) or {}
        saved_keys = list(verify_translations.get('translations', {}).keys())
    except Exception as e:
        logger.error(f"Error verifying translation: {e}")
        return False

    logger.debug(f"Verify translations loaded, keys: {saved_keys}")
    if key not in saved_keys:
        logger.warning(f"Failed to verify translation for key '{key!r}' was saved")
        logger.warning(f"Keys in verify_translations: {saved_keys}")
        return False

    logger.info(f"Verified translation for key '{key!r}' was saved")
    logger.info(f"Translation saved for key '{key}'")
    return True


def update_translation(key, force=True):
    """
    Translate a page and save the result.

    Args:
        key (str): Key or page title
        force (bool): Force translation even if French page exists

    Returns:
        bool: True if the page was translated and saved, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    translation_info = translate_wiki_page(key, force=force)
    if not translation_info:
        return False

    return save_translation(translation_info)


def _is_missing_french(key, is_specific_page=False):
    """Return True when the English page exists but the French one does not."""
    if not fetch_wiki_page(key, 'en', is_specific_page=is_specific_page):
        return False
    return not fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)


def get_missing_translations():
    """
    Get a list of pages missing translations.

    Both the top keys and the entries of SPECIFIC_PAGES are checked.

    Returns:
        list: Keys of pages that have an English page but no French page,
            without duplicates, in discovery order
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    for key_info in fetch_top_keys(NUM_WIKI_PAGES):
        key = key_info['key']
        if _is_missing_french(key):
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, the key is the last path segment
        if page.startswith('http'):
            key = page.rstrip('/').split('/')[-1]
            # Skip if it's a French page, or if no title could be extracted
            if not key or 'FR:' in key:
                continue
        else:
            key = page

        if _is_missing_french(key, is_specific_page=True):
            missing_translations.append(key)

    # A key can be reached through both sources; keep the first occurrence
    # only so that it is not translated twice.
    return list(dict.fromkeys(missing_translations))


def get_available_translations():
    """
    Get the translations already stored in the translations file.

    Returns:
        dict: Dictionary of available translations, empty if there are none
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}
    return translations.get('translations', {})


def main():
    """
    Main function to execute the script.

    With a command-line argument, only that key is translated. Otherwise every
    page missing a French translation and not yet present in the translations
    file is translated.
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        if not update_translation(key):
            logger.warning(f"No translation saved for key '{key}'")
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Get available translations
        available_translations = get_available_translations()
        logger.info(f"Found {len(available_translations)} existing translations")

        # Filter out pages that already have translations
        pages_to_translate = [
            key for key in missing_translations
            if key not in available_translations
        ]
        logger.info(f"After filtering, {len(pages_to_translate)} pages need translation")

        # Translate each missing page that doesn't already have a translation
        for key in pages_to_translate:
            logger.info(f"Processing key: {key}")
            if not update_translation(key):
                logger.warning(f"No translation saved for key '{key}'")

    logger.info("Translation process completed")


if __name__ == "__main__":
    main()