auto translation with ollama
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
BIN  public/logo-osm.png  (new file)
Binary file not shown. Size: 12 KiB
BIN  wiki_compare/__pycache__/wiki_compare.cpython-313.pyc  (new file)
Binary file not shown.
wiki_compare/wiki_compare.py
@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"

@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')

+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []

+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file

@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")

+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)
+
 def check_grammar_with_grammalecte(text):
     """

@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'grammar_suggestions': grammar_suggestions,
             'html_content': html_content
         }

-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
-        return None
-
 def generate_staleness_histogram(wiki_pages):
     """

@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }

-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)

     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

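As a quick illustration of the history layout that the new save_with_history() builds up (a minimal sketch, separate from the diff above; the file name outdated_demo.json is only an example):

# Sketch: what the output file looks like after two runs of save_with_history().
# Each run keeps the latest regular_pages/specific_pages at the top level and
# archives the same payload under history[<ISO timestamp>].
from wiki_compare import save_with_history

save_with_history({'regular_pages': [{'key': 'highway'}], 'specific_pages': []}, 'outdated_demo.json')
save_with_history({'regular_pages': [{'key': 'amenity'}], 'specific_pages': []}, 'outdated_demo.json')

# outdated_demo.json now roughly contains:
# {
#   "history": {
#     "<timestamp of run 1>": {"regular_pages": [{"key": "highway"}], "specific_pages": []},
#     "<timestamp of run 2>": {"regular_pages": [{"key": "amenity"}], "specific_pages": []}
#   },
#   "regular_pages": [{"key": "amenity"}],
#   "specific_pages": [],
#   "last_updated": "<timestamp of run 2>"
# }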
wiki_compare/wiki_translate.py  (new file, 329 lines)
@@ -0,0 +1,329 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_translate.py

This script translates wiki pages that don't have translations using the Ollama server
with the mistral:7b model. It saves the translations in a JSON file that is ignored by
.gitignore.

Usage:
    python wiki_translate.py [key]

If a key is provided, only that page will be translated.
If no key is provided, all pages missing translations will be processed.

Output:
    - translations.json: JSON file containing the translations
"""

import json
import os
import sys
import logging
import requests
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Import functions from wiki_compare.py
from wiki_compare import (
    fetch_wiki_page,
    load_json_data,
    save_to_json,
    save_with_history,
    SPECIFIC_PAGES,
    logger
)

# Constants
TRANSLATIONS_FILE = "translations.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"

def extract_main_content(html_content):
    """
    Extract the main content from a wiki page HTML

    Args:
        html_content (str): HTML content of the wiki page

    Returns:
        str: Main content text
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning("Could not find main content div")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove navigation elements
    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
        nav.extract()

    # Get text
    clean_text = content.get_text(separator='\n', strip=True)

    return clean_text

def translate_text(text, model=OLLAMA_MODEL):
    """
    Translate text using Ollama API

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    logger.info(f"Translating text using Ollama model {model}")

    # Prepare the prompt
    prompt = f"""Translate the following English text to French.
Maintain the original formatting as much as possible.
Keep technical terms intact when appropriate.
Preserve mediawiki formatting if present.

English text:
{text}

French translation:"""

    # Prepare the request
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(OLLAMA_API_URL, json=data)
        response.raise_for_status()
        result = response.json()

        # Extract the translated text
        translated_text = result.get('response', '')

        logger.info(f"Translation successful, received {len(translated_text)} characters")
        return translated_text

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating text: {e}")
        return ""

def translate_wiki_page(key):
    """
    Translate a wiki page

    Args:
        key (str): Key or page title

    Returns:
        dict: Translation information
    """
    logger.info(f"Translating wiki page for key: {key}")

    # Check if the key is a specific page
    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')

    # Fetch the English page
    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
    if not en_page:
        logger.warning(f"English page for key '{key}' not found")
        return None

    # Check if French page already exists
    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
    if fr_page:
        logger.info(f"French page for key '{key}' already exists")
        return None

    # Extract the main content from the English page
    html_content = en_page.get('html_content', '')
    if not html_content:
        logger.warning(f"No HTML content found for key '{key}'")
        return None

    main_content = extract_main_content(html_content)
    if not main_content:
        logger.warning(f"No main content extracted for key '{key}'")
        return None

    # Translate the main content
    translated_content = translate_text(main_content)
    if not translated_content:
        logger.warning(f"Translation failed for key '{key}'")
        return None

    # Create translation information
    translation_info = {
        'key': key,
        'en_page': {
            'url': en_page.get('url', ''),
            'last_modified': en_page.get('last_modified', ''),
            'word_count': en_page.get('word_count', 0)
        },
        'translated_content': translated_content,
        'translated_at': datetime.now().isoformat(),
        'model': OLLAMA_MODEL,
        'is_specific_page': is_specific_page
    }

    logger.info(f"Translation completed for key '{key}'")
    return translation_info

def save_translation(translation_info):
    """
    Save translation to the translations file

    Args:
        translation_info (dict): Translation information

    Returns:
        bool: True if successful, False otherwise
    """
    if not translation_info:
        return False

    # Load existing translations
    translations = load_json_data(TRANSLATIONS_FILE)

    # Initialize if empty
    if not translations:
        translations = {
            'translations': {},
            'last_updated': datetime.now().isoformat()
        }

    # Add or update translation
    key = translation_info['key']
    translations['translations'][key] = translation_info
    translations['last_updated'] = datetime.now().isoformat()

    # Save translations
    save_to_json(translations, TRANSLATIONS_FILE)

    logger.info(f"Translation saved for key '{key}'")
    return True

def update_translation(key):
    """
    Update a translation for a specific key

    Args:
        key (str): Key or page title

    Returns:
        bool: True if successful, False otherwise
    """
    logger.info(f"Updating translation for key: {key}")

    # Translate the page
    translation_info = translate_wiki_page(key)

    # Save the translation
    if translation_info:
        return save_translation(translation_info)

    return False

def get_missing_translations():
    """
    Get a list of pages missing translations

    Returns:
        list: List of keys for pages missing translations
    """
    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES

    missing_translations = []

    # Process top keys
    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
    for key_info in top_keys:
        key = key_info['key']

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en')
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr')
        if not fr_page:
            missing_translations.append(key)

    # Process specific pages
    for page in SPECIFIC_PAGES:
        # Skip pages with FR: prefix
        if page.startswith('FR:'):
            continue

        # For full URLs, extract the key
        if page.startswith('http'):
            page_title = page.split('/')[-1]
            # Skip if it's a French page
            if 'FR:' in page_title:
                continue
            key = page_title
        else:
            key = page

        # Fetch English page
        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
        if not en_page:
            continue

        # Check if French page exists
        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
        if not fr_page:
            missing_translations.append(key)

    return missing_translations

def get_available_translations():
    """
    Get a list of available translations

    Returns:
        dict: Dictionary of available translations
    """
    translations = load_json_data(TRANSLATIONS_FILE)
    if not translations:
        return {}

    return translations.get('translations', {})

def main():
    """
    Main function to execute the script
    """
    logger.info("Starting wiki_translate.py")

    # Check if a specific key was provided
    if len(sys.argv) > 1:
        key = sys.argv[1]
        logger.info(f"Translating specific key: {key}")
        update_translation(key)
    else:
        # Get missing translations
        missing_translations = get_missing_translations()
        logger.info(f"Found {len(missing_translations)} pages missing translations")

        # Translate each missing page
        for key in missing_translations:
            logger.info(f"Processing key: {key}")
            update_translation(key)

    logger.info("Translation process completed")

if __name__ == "__main__":
    main()
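The new script assumes a local Ollama server on localhost:11434 with the mistral:7b model already pulled. A minimal sketch (separate from the commit) to check that the /api/generate endpoint used by translate_text() answers before launching a full run:

# Sketch: verify the Ollama endpoint responds with the same payload shape
# that translate_text() sends. Assumes a local Ollama with mistral:7b pulled.
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "mistral:7b", "prompt": "Translate to French: hello", "stream": False},
    timeout=120,
)
resp.raise_for_status()
print(resp.json().get("response", ""))

If this prints a French sentence, running `python wiki_translate.py <key>` should be able to translate a single page, and running it without arguments will process all pages missing a French translation.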