Automatic translation via Ollama
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
BIN  public/logo-osm.png  (new file)
Binary file not shown.  After: Size 12 KiB
BIN  wiki_compare/__pycache__/wiki_compare.cpython-313.pyc  (new file)
Binary file not shown.
wiki_compare/wiki_compare.py

@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"

@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')

+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []

+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file

@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
+
+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)

 def check_grammar_with_grammalecte(text):
     """
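As an illustrative sketch (hypothetical timestamps, not shown in the diff), after two runs save_with_history() leaves OUTDATED_PAGES_FILE shaped like:

    {
      "regular_pages": [...],
      "specific_pages": [...],
      "last_updated": "2025-01-02T12:00:00",
      "history": {
        "2025-01-01T12:00:00": {"regular_pages": [...], "specific_pages": [...]},
        "2025-01-02T12:00:00": {"regular_pages": [...], "specific_pages": [...]}
      }
    }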
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
            'grammar_suggestions': grammar_suggestions,
            'html_content': html_content
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
        return None

def generate_staleness_histogram(wiki_pages):
    """

@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }

-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)

     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

329  wiki_compare/wiki_translate.py  (new file)

@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_translate.py

This script translates wiki pages that don't yet have a French translation, using the
Ollama server with the mistral:7b model. It saves the translations in a JSON file that
is ignored by .gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page will be translated.
If no key is provided, all pages missing translations will be processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"

def extract_main_content(html_content):
    """
    Extract the main content from a wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)

    return clean_text

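# Illustrative example (hypothetical input, not part of the commit): given a minimal
# MediaWiki fragment, extract_main_content() keeps only the readable text:
#   >>> html = '<div id="mw-content-text"><p>A road.</p><span class="mw-editsection">edit</span></div>'
#   >>> extract_main_content(html)
#   'A road.'
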
def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using the Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')

        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""

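# Illustrative sketch of the exchange with Ollama (shapes inferred from the call above):
#   request:  POST http://localhost:11434/api/generate
#             {"model": "mistral:7b", "prompt": "Translate the following...", "stream": false}
#   response: {"response": "<French translation>", "done": true, ...}
# With "stream": False the whole answer arrives as a single JSON object, and the
# 'response' field read above contains the generated translation.
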
def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info

def save_translation(translation_info):
    """
    Save translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save translations
    save_to_json(translations, TRANSLATIONS_FILE)

    logger.info(f"Translation saved for key '{key}'")
    return True

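# Illustrative shape of TRANSLATIONS_FILE after one page has been saved
# (all values hypothetical):
#   {
#     "translations": {
#       "highway": {
#         "key": "highway",
#         "en_page": {"url": "...", "last_modified": "...", "word_count": 1234},
#         "translated_content": "...",
#         "translated_at": "2025-01-01T12:00:00",
#         "model": "mistral:7b",
#         "is_specific_page": false
#       }
#     },
#     "last_updated": "2025-01-01T12:00:00"
#   }
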
def update_translation(key):
    """
    Update a translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)

    return False

def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations

def get_available_translations():
    """
    Get a list of available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}

    return translations.get('translations', {})

def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")


if __name__ == "__main__":
    main()
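# Example invocations (assuming a local Ollama server with mistral:7b available;
# 'highway' is just an example key):
#   python wiki_translate.py highway    # translate only the page for that key
#   python wiki_translate.py            # translate every page missing a French version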