qualiwiki/wiki_compare/wiki_translate.py
2025-09-04 00:54:24 +02:00

400 lines
No EOL
12 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_translate.py

Translate wiki pages that have no French translation yet, using a local Ollama
server with the mistral:7b model. The translations are saved in a JSON file that
is excluded from version control through .gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page is translated.
If no key is provided, every page missing a translation is processed.

Output:
    - translations.json: JSON file containing the translations
"""
import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup
# Import functions from wiki_compare.py
from wiki_compare import (
fetch_wiki_page as original_fetch_wiki_page,
load_json_data,
save_to_json,
save_with_history,
SPECIFIC_PAGES,
logger
)
def fetch_wiki_page(key, language='en', is_specific_page=False):
    """
    Fetch a wiki page via wiki_compare and drop its grammar suggestions.

    Delegates to the fetch_wiki_page function imported from wiki_compare and
    empties the 'grammar_suggestions' entry of the returned dictionary, because
    this script only needs the page for translation (requirement: do not use
    Grammalecte, we only want to translate).

    NOTE(review): the underlying function is called unchanged; whether it still
    runs the grammar check internally is not visible from this file.

    Args:
        key (str): OSM key or specific page title/URL
        language (str): Language code ('en' or 'fr')
        is_specific_page (bool): Whether this is a specific page rather than a key
    Returns:
        dict: Page information with 'grammar_suggestions' emptied when that entry
        is present, or None if the underlying fetch returned None
    """
    result = original_fetch_wiki_page(key, language, is_specific_page)
    # Only touch the entry when there is a result that actually carries it
    if result is not None and 'grammar_suggestions' in result:
        result['grammar_suggestions'] = []
    return result
# Constants
# Name of the JSON file that save_translation writes and get_available_translations reads
TRANSLATIONS_FILE = "translations.json"
# HTTP endpoint of the Ollama server that translate_text posts generation requests to
OLLAMA_API_URL = "http://localhost:11434/api/generate"
# Default Ollama model used by translate_text and recorded in each saved translation
OLLAMA_MODEL = "mistral:7b"
def extract_main_content(html_content):
    """
    Return the visible text of a wiki page's main content area.

    Parses the HTML, takes the element with id 'mw-content-text', removes
    script/style elements and navigation elements (language links, edit-section
    links, table of contents) and returns the remaining text, one text fragment
    per line.

    Args:
        html_content (str): HTML content of the wiki page
    Returns:
        str: Main content text, or "" when the main content element is not found
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    body = parsed.select_one('#mw-content-text')
    if not body:
        logger.warning("Could not find main content div")
        return ""

    # Same removal order as before: script/style first, then navigation elements
    unwanted_selectors = (
        'script, style',
        '.languages, .mw-editsection, #toc, .toc',
    )
    for selector in unwanted_selectors:
        for element in body.select(selector):
            element.extract()

    return body.get_text(separator='\n', strip=True)
def translate_text(text, model=OLLAMA_MODEL, timeout=(10, 600)):
    """
    Translate English text to French using the Ollama API.

    Sends one non-streaming generation request to OLLAMA_API_URL and returns the
    'response' field of the JSON reply.

    Args:
        text (str): Text to translate
        model (str): Ollama model to use
        timeout (float or tuple or None): Timeout handed to requests.post, either
            a number of seconds or a (connect, read) tuple. The default allows
            10 s to connect and 600 s for the model to answer; pass None to wait
            indefinitely.
    Returns:
        str: Translated text, or "" when there is nothing to translate, the
        request fails or times out, or the reply is not the expected JSON object
    """
    # Nothing to translate: avoid sending an empty prompt to the model
    if not text or not text.strip():
        logger.warning("No text to translate, skipping Ollama request")
        return ""

    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt (continuation lines stay unindented so the prompt text is unchanged)
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.
English text:
{text}
French translation:"""

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }

    try:
        # Without a timeout a stalled server would block this call forever
        response = requests.post(OLLAMA_API_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decoding error is not a RequestException subclass
        logger.error(f"Error translating text: {e}")
        return ""

    if not isinstance(result, dict):
        logger.error(f"Error translating text: unexpected response type {type(result).__name__}")
        return ""

    # Extract the translated text; treat a missing or null field as no translation
    translated_text = result.get('response') or ''
    logger.info(f"Translation successful, received {len(translated_text)} characters")
    return translated_text
def translate_wiki_page(key, force=False):
    """
    Translate the English wiki page for a key into French.

    Fetches the English page, extracts its main text with extract_main_content
    and translates it with translate_text. Unless force is set, the page is
    skipped when a French page already exists.

    Args:
        key (str): Key or page title
        force (bool): Force translation even if French page exists
    Returns:
        dict: Translation information ('key', 'en_page', 'translated_content',
        'translated_at', 'model', 'is_specific_page'), or None when the English
        page is missing, a French page exists and force is False, no content
        could be extracted, or the translation failed
    """
    logger.info(f"Translating wiki page for key: {key!r} (type: {type(key)})")

    # A key counts as a specific page when it is listed explicitly, is a URL,
    # or carries the FR: prefix
    is_specific_page = key in SPECIFIC_PAGES or key.startswith(('http', 'FR:'))

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # The French page is only needed to decide whether to skip, so it is not
    # fetched at all when the translation is forced
    if not force:
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
        if fr_page:
            logger.info(f"French page for key '{key}' already exists (use force=True to translate anyway)")
            return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content (translate_text returns "" on failure)
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0),
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        # translate_text was called with its default model, which is OLLAMA_MODEL
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page,
    }
    logger.info(f"Translation completed for key '{key}'")
    return translation_info
def save_translation(translation_info):
    """
    Save a translation to the translations file.

    Loads the current content of TRANSLATIONS_FILE, inserts or replaces the
    entry stored under translation_info['key'], refreshes 'last_updated',
    writes the data back with save_to_json and reloads the file to confirm the
    entry is present.

    Args:
        translation_info (dict): Translation information; must contain 'key'
    Returns:
        bool: True if the translation was written and found again on reload,
        False if no translation info was given, saving raised an error, or the
        saved entry could not be verified
    """
    if not translation_info:
        logger.warning("No translation info provided, cannot save")
        return False

    # Load existing translations; an empty/missing result becomes an empty dict
    translations = load_json_data(TRANSLATIONS_FILE) or {}
    logger.info(f"Loaded existing translations: {list(translations.get('translations', {}).keys())}")

    if not translations:
        logger.info("No existing translations found, initializing new translations object")

    # setdefault also covers a file that has content but no 'translations'
    # section, which would otherwise raise KeyError on insertion below
    entries = translations.setdefault('translations', {})

    # Add or update translation
    key = translation_info['key']
    logger.info(f"Adding/updating translation for key '{key!r}' (type: {type(key)})")
    logger.debug(f"Translations structure before adding: {type(translations)}, keys: {list(translations.keys())}")
    logger.debug(f"Translations['translations'] type: {type(entries)}")

    entries[key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    logger.debug(f"Translations structure after adding: keys: {list(translations.keys())}")
    logger.debug(f"Translations['translations'] keys after adding: {list(entries.keys())}")

    # Save translations
    logger.info(f"Saving translations to {TRANSLATIONS_FILE}")
    try:
        save_to_json(translations, TRANSLATIONS_FILE)
    except Exception as e:
        # save_to_json is a project helper whose failure modes are not visible
        # here, hence the broad catch at this boundary
        logger.error(f"Error saving translations: {e}")
        return False
    logger.info("save_to_json completed successfully")

    # Verify the translation was saved by reading the file back
    try:
        verify_translations = load_json_data(TRANSLATIONS_FILE) or {}
        saved_keys = list(verify_translations.get('translations', {}).keys())
    except Exception as e:
        logger.error(f"Error verifying translation: {e}")
        return False

    logger.info(f"Verify translations loaded, keys: {saved_keys}")
    if key not in saved_keys:
        # Report the failure to the caller instead of claiming success
        logger.warning(f"Failed to verify translation for key '{key!r}' was saved")
        logger.warning(f"Keys in verify_translations: {saved_keys}")
        return False

    logger.info(f"Verified translation for key '{key!r}' was saved")
    logger.info(f"Translation saved for key '{key}'")
    return True
def update_translation(key, force=True):
    """
    Translate the page for a key and store the result.

    Runs translate_wiki_page and, when it yields translation information,
    passes that to save_translation.

    Args:
        key (str): Key or page title
        force (bool): Force translation even if French page exists
    Returns:
        bool: Result of save_translation, or False when no translation was produced
    """
    logger.info(f"Updating translation for key: {key}")
    produced = translate_wiki_page(key, force=force)
    if not produced:
        # Nothing to save: translate_wiki_page already logged the reason
        return False
    return save_translation(produced)
def get_missing_translations():
    """
    Collect the pages that have an English wiki page but no French one.

    Checks the top keys returned by fetch_top_keys(NUM_WIKI_PAGES) first, then
    the entries of SPECIFIC_PAGES. Specific pages that are French (FR: prefix,
    or a URL whose last path segment contains 'FR:') are ignored; for URLs the
    last path segment is used as the key.

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    def lacks_french_page(name, specific):
        # English is looked up first; the French lookup only happens when the
        # English page exists
        if not fetch_wiki_page(name, 'en', is_specific_page=specific):
            return False
        return not fetch_wiki_page(name, 'fr', is_specific_page=specific)

    missing = []

    # Top keys
    for entry in fetch_top_keys(NUM_WIKI_PAGES):
        name = entry['key']
        if lacks_french_page(name, False):
            missing.append(name)

    # Specific pages
    for page in SPECIFIC_PAGES:
        if page.startswith('FR:'):
            continue
        if page.startswith('http'):
            # Use the last URL path segment as the key
            name = page.rsplit('/', 1)[-1]
            if 'FR:' in name:
                continue
        else:
            name = page
        if lacks_french_page(name, True):
            missing.append(name)

    return missing
def get_available_translations():
    """
    Return the translations currently stored in TRANSLATIONS_FILE.

    Returns:
        dict: The 'translations' section of the stored data, or an empty dict
        when nothing could be loaded or the section is absent
    """
    stored = load_json_data(TRANSLATIONS_FILE)
    return stored.get('translations', {}) if stored else {}
def main():
    """
    Entry point of the script.

    With a command-line argument, translates only that key. Without one,
    translates every page reported by get_missing_translations that does not
    already have an entry in the stored translations.
    """
    logger.info("Starting wiki_translate.py")

    cli_args = sys.argv[1:]
    if not cli_args:
        # Batch mode: work out which pages still need a translation
        missing = get_missing_translations()
        logger.info(f"Found {len(missing)} pages missing translations")

        existing = get_available_translations()
        logger.info(f"Found {len(existing)} existing translations")

        # Skip pages that were already translated in an earlier run
        pending = [name for name in missing if name not in existing]
        logger.info(f"After filtering, {len(pending)} pages need translation")

        for name in pending:
            logger.info(f"Processing key: {name}")
            update_translation(name)
    else:
        # Single-key mode: only the first argument is used
        requested = cli_args[0]
        logger.info(f"Translating specific key: {requested}")
        update_translation(requested)

    logger.info("Translation process completed")
# Run the translation workflow only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()