recup sources
This commit is contained in:
parent 86622a19ea
commit 65fe2a35f9
155 changed files with 50969 additions and 0 deletions
233 wiki_compare/propose_translation.py (Executable file)
@@ -0,0 +1,233 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
propose_translation.py

This script reads the outdated_pages.json file, selects a wiki page (by default the first one),
and uses Ollama with the "mistral:7b" model to propose a translation of the page.
The translation is saved in the "proposed_translation" property of the JSON file.

Usage:
    python propose_translation.py [--page KEY]

Options:
    --page KEY    Specify the key of the page to translate (default: first page in the file)

Output:
    - Updated outdated_pages.json file with proposed translations
"""

import json
import argparse
import logging
import requests
import os
import sys
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTDATED_PAGES_FILE = "outdated_pages.json"
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "mistral:7b"
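
# Note (assumption): a local Ollama server must be running on localhost:11434 and the
# "mistral:7b" model already pulled (e.g. `ollama pull mistral:7b`); this script does
# not check for either. With "stream": False the /api/generate endpoint returns one
# JSON object and the translation is read from its "response" field (see
# translate_with_ollama below).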

def load_outdated_pages():
    """
    Load the outdated pages from the JSON file

    Returns:
        list: List of dictionaries containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            pages = json.load(f)
        logger.info(f"Successfully loaded {len(pages)} pages from {OUTDATED_PAGES_FILE}")
        return pages
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return []

def save_to_json(data, filename):
    """
    Save data to a JSON file

    Args:
        data: Data to save
        filename (str): Name of the file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def fetch_wiki_page_content(url):
    """
    Fetch the content of a wiki page

    Args:
        url (str): URL of the wiki page

    Returns:
        str: Content of the wiki page
    """
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Remove .languages elements
            for languages_elem in content.select('.languages'):
                languages_elem.extract()

            # Get text
            text = content.get_text(separator=' ', strip=True)
            return text
        else:
            logger.warning(f"Could not find content in page: {url}")
            return ""

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

def translate_with_ollama(text, model=OLLAMA_MODEL):
    """
    Translate text using Ollama

    Args:
        text (str): Text to translate
        model (str): Ollama model to use

    Returns:
        str: Translated text
    """
    prompt = f"""
Tu es un traducteur professionnel spécialisé dans la traduction de documentation technique de l'anglais vers le français.
Traduis le texte suivant de l'anglais vers le français. Conserve le formatage et la structure du texte original.
Ne traduis pas les noms propres, les URLs, et les termes techniques spécifiques à OpenStreetMap.

Texte à traduire:
{text}
"""

    try:
        logger.info(f"Sending request to Ollama with model {model}")

        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False
        }

        response = requests.post(OLLAMA_API_URL, json=payload)
        response.raise_for_status()

        result = response.json()
        translation = result.get('response', '')

        logger.info(f"Successfully received translation from Ollama")
        return translation

    except requests.exceptions.RequestException as e:
        logger.error(f"Error translating with Ollama: {e}")
        return ""

def select_page_for_translation(pages, key=None):
    """
    Select a page for translation

    Args:
        pages (list): List of dictionaries containing page information
        key (str): Key of the page to select (if None, select the first page)

    Returns:
        dict: Selected page or None if no suitable page found
    """
    if not pages:
        logger.warning("No pages found that need translation")
        return None

    if key:
        # Find the page with the specified key
        for page in pages:
            if page.get('key') == key:
                logger.info(f"Selected page for key '{key}' for translation")
                return page

        logger.warning(f"No page found with key '{key}'")
        return None
    else:
        # Select the first page
        selected_page = pages[0]
        logger.info(f"Selected first page (key '{selected_page['key']}') for translation")
        return selected_page
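
# Behaviour recap: select_page_for_translation(pages) returns pages[0];
# select_page_for_translation(pages, "Key:highway") returns the entry whose "key" equals
# "Key:highway" (a hypothetical example key), or None if no such entry exists.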

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Propose a translation for an OSM wiki page using Ollama")
    parser.add_argument("--page", help="Key of the page to translate (default: first page in the file)")
    args = parser.parse_args()

    logger.info("Starting propose_translation.py")

    # Load pages
    pages = load_outdated_pages()
    if not pages:
        logger.error("No pages found. Run wiki_compare.py first.")
        sys.exit(1)

    # Select a page for translation
    selected_page = select_page_for_translation(pages, args.page)
    if not selected_page:
        logger.error("Could not select a page for translation.")
        sys.exit(1)

    # Get the English page URL
    en_url = selected_page.get('en_page', {}).get('url')
    if not en_url:
        logger.error(f"No English page URL found for key '{selected_page['key']}'")
        sys.exit(1)

    # Fetch the content of the English page
    logger.info(f"Fetching content from {en_url}")
    content = fetch_wiki_page_content(en_url)
    if not content:
        logger.error(f"Could not fetch content from {en_url}")
        sys.exit(1)

    # Translate the content
    logger.info(f"Translating content for key '{selected_page['key']}'")
    translation = translate_with_ollama(content)
    if not translation:
        logger.error("Could not translate content")
        sys.exit(1)

    # Save the translation in the JSON file
    logger.info(f"Saving translation for key '{selected_page['key']}'")
    selected_page['proposed_translation'] = translation

    # Save the updated data back to the file
    save_to_json(pages, OUTDATED_PAGES_FILE)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
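
# Example run (illustrative; assumes outdated_pages.json was produced by wiki_compare.py):
#
#   python propose_translation.py --page "Key:highway"
#
# fetches the English page for that key, asks Ollama for a French translation, stores it
# in the entry's "proposed_translation" field and rewrites outdated_pages.json.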