Automatic translation via Ollama

Tykayn 2025-09-04 00:14:55 +02:00 committed by tykayn
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions

BIN  public/logo-osm.png — new binary file, 12 KiB (content not shown)

wiki_compare.py (modified)

@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"
@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
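
For context: on newer NLTK releases, sent_tokenize looks up the punkt_tab resource rather than the old pickled punkt data, so downloading punkt alone is no longer enough. A minimal sketch of the call that triggers the lookup (illustrative, assuming NLTK is installed):

    from nltk.tokenize import sent_tokenize
    print(sent_tokenize("First sentence. Second one."))
    # ['First sentence.', 'Second one.']  (raises LookupError if punkt_tab is missing)
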
@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []
 
+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file, or an empty dict if the file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file
@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
 
+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)
+
 def check_grammar_with_grammalecte(text):
     """
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'grammar_suggestions': grammar_suggestions,
             'html_content': html_content
         }
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
-        return None
-
 
 def generate_staleness_histogram(wiki_pages):
     """
@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }
 
-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)
 
     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

wiki_translate.py (new file)

@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
wiki_translate.py

This script translates wiki pages that don't have a French translation yet, using an
Ollama server with the mistral:7b model. It saves the translations in a JSON file
that is ignored via .gitignore.

Usage:
    python wiki_translate.py [key]

    If a key is provided, only that page will be translated.
    If no key is provided, all pages missing translations will be processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"

def extract_main_content(html_content):
    """
    Extract the main content from a wiki page's HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)
    return clean_text

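# Illustrative sketch (not part of the original script): exercising
# extract_main_content on a hand-made MediaWiki-like fragment; the HTML
# below is invented for the example.
#
#     sample = ('<div id="mw-content-text">'
#               '<p>The building key marks buildings.</p>'
#               '<div class="toc">Contents</div></div>')
#     extract_main_content(sample)   # -> 'The building key marks buildings.'
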
def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using the Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')
        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""

def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information, or None if the page was skipped
              or the translation failed
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if the French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info

def save_translation(translation_info):
    """
    Save a translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update the translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save the translations
    save_to_json(translations, TRANSLATIONS_FILE)
    logger.info(f"Translation saved for key '{key}'")
    return True

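# For reference, translations.json ends up shaped like this
# (field values invented for illustration):
#
#     {
#       "translations": {
#         "building": {
#           "key": "building",
#           "en_page": {"url": "...", "last_modified": "...", "word_count": 0},
#           "translated_content": "...",
#           "translated_at": "2025-09-04T00:14:55",
#           "model": "mistral:7b",
#           "is_specific_page": false
#         }
#       },
#       "last_updated": "2025-09-04T00:14:55"
#     }
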
def update_translation(key):
    """
    Update the translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)
    return False

def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch the English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if the French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with the FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key from the last path segment
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch the English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if the French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations

def get_available_translations():
    """
    Get the available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}
    return translations.get('translations', {})

def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get the missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")


if __name__ == "__main__":
    main()
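
Typical invocations (per the module docstring), followed by a small, illustrative Python snippet to list what ended up in translations.json after a run:

    python wiki_translate.py              # translate every page missing a French version
    python wiki_translate.py building     # translate only the 'building' key

    import json
    with open("translations.json", encoding="utf-8") as f:
        data = json.load(f)
    for key, info in data.get("translations", {}).items():
        print(key, info["translated_at"], info["model"])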