diff --git a/public/logo-osm.png b/public/logo-osm.png
new file mode 100644
index 0000000..4388167
Binary files /dev/null and b/public/logo-osm.png differ
diff --git a/wiki_compare/__pycache__/wiki_compare.cpython-313.pyc b/wiki_compare/__pycache__/wiki_compare.cpython-313.pyc
new file mode 100644
index 0000000..fc98714
Binary files /dev/null and b/wiki_compare/__pycache__/wiki_compare.cpython-313.pyc differ
diff --git a/wiki_compare/wiki_compare.py b/wiki_compare/wiki_compare.py
index dbf0806..60156f9 100755
--- a/wiki_compare/wiki_compare.py
+++ b/wiki_compare/wiki_compare.py
@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"
@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
+
+# Also download the punkt_tab resource, which is needed by sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
 
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []
 
+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file, or an empty dict if the file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file
@@ -138,6 +167,52 @@
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
+
+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads the existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize the history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add the current regular_pages and specific_pages to the history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to the history, keyed by timestamp
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fall back to a regular save if there's an error
+        save_to_json(data, filename)
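For illustration, a minimal round-trip sketch (not part of the changeset) of the layout save_with_history writes, assuming wiki_compare.py is importable and using hypothetical data:

    from wiki_compare import save_with_history, load_json_data

    run = {'regular_pages': [{'key': 'building'}], 'specific_pages': []}
    save_with_history(run, 'outdated_pages.json')

    data = load_json_data('outdated_pages.json')
    print(data['regular_pages'])  # latest snapshot of regular pages
    print(data['last_updated'])   # ISO timestamp of the latest save
    print(list(data['history']))  # one ISO-timestamp key per save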
 
 def check_grammar_with_grammalecte(text):
     """
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         'grammar_suggestions': grammar_suggestions,
         'html_content': html_content
     }
-
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
-        return None
 
 def generate_staleness_histogram(wiki_pages):
     """
@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }
 
-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)
 
     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")
diff --git a/wiki_compare/wiki_translate.py b/wiki_compare/wiki_translate.py
new file mode 100644
index 0000000..2b5d8fe
--- /dev/null
+++ b/wiki_compare/wiki_translate.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+wiki_translate.py
+
+This script translates wiki pages that don't yet have a French translation, using the
+Ollama server with the mistral:7b model. It saves the translations in a JSON file that
+is excluded from version control via .gitignore.
+
+Usage:
+    python wiki_translate.py [key]
+
+    If a key is provided, only that page will be translated.
+    If no key is provided, all pages missing translations will be processed.
+
+Output:
+    - translations.json: JSON file containing the translations
+"""
+
+import json
+import os
+import sys
+import logging
+import requests
+from pathlib import Path
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+# Import functions from wiki_compare.py
+from wiki_compare import (
+    fetch_wiki_page,
+    load_json_data,
+    save_to_json,
+    save_with_history,
+    SPECIFIC_PAGES,
+    logger
+)
+
+# Constants
+TRANSLATIONS_FILE = "translations.json"
+OLLAMA_API_URL = "http://localhost:11434/api/generate"
+OLLAMA_MODEL = "mistral:7b"
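Before running the script, it is worth confirming that the endpoint assumed by these constants is reachable. A minimal smoke-test sketch, assuming a local Ollama server with mistral:7b already pulled (the payload shape matches what translate_text sends below):

    import requests

    payload = {"model": "mistral:7b", "prompt": "Reply with: ok", "stream": False}
    try:
        r = requests.post("http://localhost:11434/api/generate", json=payload, timeout=120)
        r.raise_for_status()
        # With "stream": False, Ollama returns a single JSON object whose
        # 'response' field holds the generated text
        print(r.json().get("response", ""))
    except requests.exceptions.RequestException as e:
        print(f"Ollama server not reachable: {e}")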
+
+def extract_main_content(html_content):
+    """
+    Extract the main content from a wiki page HTML
+
+    Args:
+        html_content (str): HTML content of the wiki page
+
+    Returns:
+        str: Main content text
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Find the main content div
+    content = soup.select_one('#mw-content-text')
+    if not content:
+        logger.warning("Could not find main content div")
+        return ""
+
+    # Remove script and style elements
+    for script in content.select('script, style'):
+        script.extract()
+
+    # Remove navigation elements
+    for nav in content.select('.languages, .mw-editsection, #toc, .toc'):
+        nav.extract()
+
+    # Get text
+    clean_text = content.get_text(separator='\n', strip=True)
+
+    return clean_text
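A quick sketch of what extract_main_content keeps and drops, run on a hypothetical MediaWiki-like fragment (assumes the function above is importable):

    html = '''
    <div id="mw-content-text">
      <script>var x = 1;</script>
      <div class="languages">Deutsch | English | Français</div>
      <p>A building is a man-made structure with a roof and walls.</p>
    </div>
    '''
    print(extract_main_content(html))
    # -> A building is a man-made structure with a roof and walls.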
+
+def translate_text(text, model=OLLAMA_MODEL):
+    """
+    Translate text using the Ollama API
+
+    Args:
+        text (str): Text to translate
+        model (str): Ollama model to use
+
+    Returns:
+        str: Translated text
+    """
+    logger.info(f"Translating text using Ollama model {model}")
+
+    # Prepare the prompt
+    prompt = f"""Translate the following English text to French.
+Maintain the original formatting as much as possible.
+Keep technical terms intact when appropriate.
+Preserve MediaWiki formatting if present.
+
+English text:
+{text}
+
+French translation:"""
+
+    # Prepare the request
+    data = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False
+    }
+
+    try:
+        response = requests.post(OLLAMA_API_URL, json=data)
+        response.raise_for_status()
+        result = response.json()
+
+        # Extract the translated text
+        translated_text = result.get('response', '')
+
+        logger.info(f"Translation successful, received {len(translated_text)} characters")
+        return translated_text
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error translating text: {e}")
+        return ""
+
+def translate_wiki_page(key):
+    """
+    Translate a wiki page
+
+    Args:
+        key (str): Key or page title
+
+    Returns:
+        dict: Translation information
+    """
+    logger.info(f"Translating wiki page for key: {key}")
+
+    # Check if the key is a specific page
+    is_specific_page = key in SPECIFIC_PAGES or key.startswith('http') or key.startswith('FR:')
+
+    # Fetch the English page
+    en_page = fetch_wiki_page(key, 'en', is_specific_page=is_specific_page)
+    if not en_page:
+        logger.warning(f"English page for key '{key}' not found")
+        return None
+
+    # Check if the French page already exists
+    fr_page = fetch_wiki_page(key, 'fr', is_specific_page=is_specific_page)
+    if fr_page:
+        logger.info(f"French page for key '{key}' already exists")
+        return None
+
+    # Extract the main content from the English page
+    html_content = en_page.get('html_content', '')
+    if not html_content:
+        logger.warning(f"No HTML content found for key '{key}'")
+        return None
+
+    main_content = extract_main_content(html_content)
+    if not main_content:
+        logger.warning(f"No main content extracted for key '{key}'")
+        return None
+
+    # Translate the main content
+    translated_content = translate_text(main_content)
+    if not translated_content:
+        logger.warning(f"Translation failed for key '{key}'")
+        return None
+
+    # Create translation information
+    translation_info = {
+        'key': key,
+        'en_page': {
+            'url': en_page.get('url', ''),
+            'last_modified': en_page.get('last_modified', ''),
+            'word_count': en_page.get('word_count', 0)
+        },
+        'translated_content': translated_content,
+        'translated_at': datetime.now().isoformat(),
+        'model': OLLAMA_MODEL,
+        'is_specific_page': is_specific_page
+    }
+
+    logger.info(f"Translation completed for key '{key}'")
+    return translation_info
+
+def save_translation(translation_info):
+    """
+    Save a translation to the translations file
+
+    Args:
+        translation_info (dict): Translation information
+
+    Returns:
+        bool: True if successful, False otherwise
+    """
+    if not translation_info:
+        return False
+
+    # Load existing translations
+    translations = load_json_data(TRANSLATIONS_FILE)
+
+    # Initialize if empty
+    if not translations:
+        translations = {
+            'translations': {},
+            'last_updated': datetime.now().isoformat()
+        }
+
+    # Add or update the translation
+    key = translation_info['key']
+    translations['translations'][key] = translation_info
+    translations['last_updated'] = datetime.now().isoformat()
+
+    # Save translations
+    save_to_json(translations, TRANSLATIONS_FILE)
+
+    logger.info(f"Translation saved for key '{key}'")
+    return True
+
+def update_translation(key):
+    """
+    Update the translation for a specific key
+
+    Args:
+        key (str): Key or page title
+
+    Returns:
+        bool: True if successful, False otherwise
+    """
+    logger.info(f"Updating translation for key: {key}")
+
+    # Translate the page
+    translation_info = translate_wiki_page(key)
+
+    # Save the translation
+    if translation_info:
+        return save_translation(translation_info)
+
+    return False
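For reference, the shape of translations.json that save_translation builds up, shown as the Python dict it serializes (all values hypothetical):

    {
        'translations': {
            'building': {
                'key': 'building',
                'en_page': {'url': '...', 'last_modified': '...', 'word_count': 1234},
                'translated_content': '... texte traduit ...',
                'translated_at': '2025-01-01T12:00:00',
                'model': 'mistral:7b',
                'is_specific_page': False
            }
        },
        'last_updated': '2025-01-01T12:00:00'
    }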
+
+def get_missing_translations():
+    """
+    Get a list of pages missing translations
+
+    Returns:
+        list: List of keys for pages missing translations
+    """
+    from wiki_compare import fetch_top_keys, NUM_WIKI_PAGES
+
+    missing_translations = []
+
+    # Process the top keys
+    top_keys = fetch_top_keys(NUM_WIKI_PAGES)
+    for key_info in top_keys:
+        key = key_info['key']
+
+        # Fetch the English page
+        en_page = fetch_wiki_page(key, 'en')
+        if not en_page:
+            continue
+
+        # Check if the French page exists
+        fr_page = fetch_wiki_page(key, 'fr')
+        if not fr_page:
+            missing_translations.append(key)
+
+    # Process specific pages
+    for page in SPECIFIC_PAGES:
+        # Skip pages with the FR: prefix
+        if page.startswith('FR:'):
+            continue
+
+        # For full URLs, extract the key
+        if page.startswith('http'):
+            page_title = page.split('/')[-1]
+            # Skip if it's a French page
+            if 'FR:' in page_title:
+                continue
+            key = page_title
+        else:
+            key = page
+
+        # Fetch the English page
+        en_page = fetch_wiki_page(key, 'en', is_specific_page=True)
+        if not en_page:
+            continue
+
+        # Check if the French page exists
+        fr_page = fetch_wiki_page(key, 'fr', is_specific_page=True)
+        if not fr_page:
+            missing_translations.append(key)
+
+    return missing_translations
+
+def get_available_translations():
+    """
+    Get the available translations
+
+    Returns:
+        dict: Dictionary of available translations
+    """
+    translations = load_json_data(TRANSLATIONS_FILE)
+    if not translations:
+        return {}
+
+    return translations.get('translations', {})
+
+def main():
+    """
+    Main function to execute the script
+    """
+    logger.info("Starting wiki_translate.py")
+
+    # Check if a specific key was provided
+    if len(sys.argv) > 1:
+        key = sys.argv[1]
+        logger.info(f"Translating specific key: {key}")
+        update_translation(key)
+    else:
+        # Get the missing translations
+        missing_translations = get_missing_translations()
+        logger.info(f"Found {len(missing_translations)} pages missing translations")
+
+        # Translate each missing page
+        for key in missing_translations:
+            logger.info(f"Processing key: {key}")
+            update_translation(key)
+
+    logger.info("Translation process completed")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
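Usage sketch, assuming wiki_compare.py sits alongside this script and a local Ollama server is running:

    python wiki_translate.py building    # translate the English page for a single key
    python wiki_translate.py             # translate every page found to be missing a French version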