auto translation ollama
This commit is contained in:
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
329 wiki_compare/wiki_translate.py Normal file
@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_translate.py

This script uses an Ollama server with the mistral:7b model to translate wiki
pages that do not yet have a French translation. It saves the translations in
a JSON file that is excluded from version control via .gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page is translated.
If no key is provided, all pages missing a translation are processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"
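
# 11434 is Ollama's default local port; adjust OLLAMA_API_URL if the server
# listens on a different host or port.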


def extract_main_content(html_content):
    """
    Extract the main content from a wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)

    return clean_text
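
# Quick illustration (hypothetical HTML fragment, for the sketch only):
#   extract_main_content('<div id="mw-content-text"><p>Hello</p></div>')
# returns "Hello".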


def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using the Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')

        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""


def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info


def save_translation(translation_info):
    """
    Save translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save translations
    save_to_json(translations, TRANSLATIONS_FILE)

    logger.info(f"Translation saved for key '{key}'")
    return True
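
# Resulting shape of translations.json (sketch; values illustrative):
# {
#   "translations": {
#     "<key>": {
#       "key": "<key>",
#       "en_page": {"url": "...", "last_modified": "...", "word_count": 0},
#       "translated_content": "...",
#       "translated_at": "<ISO 8601 timestamp>",
#       "model": "mistral:7b",
#       "is_specific_page": false
#     }
#   },
#   "last_updated": "<ISO 8601 timestamp>"
# }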


def update_translation(key):
    """
    Update a translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)

    return False


def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations


def get_available_translations():
    """
    Get the available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}

    return translations.get('translations', {})


def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")


if __name__ == "__main__":
    main()