automatic ollama translation

Tykayn 2025-09-04 00:14:55 +02:00 committed by tykayn
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions


@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"
@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
+
+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
 
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
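
Side note on the hunk above: recent NLTK releases (3.9+) resolve sent_tokenize through the punkt_tab resource rather than punkt, which is why the extra download is needed. A minimal sketch of the call that triggers the lookup:

    from nltk.tokenize import sent_tokenize

    # On recent NLTK versions this raises LookupError if 'punkt_tab' is absent.
    print(sent_tokenize("First sentence. Second sentence."))
    # -> ['First sentence.', 'Second sentence.']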
@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []
 
+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file
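
For orientation, a quick usage sketch of the new load_json_data helper (the filename below is hypothetical): it returns an empty dict instead of raising when the file is missing or unparseable, which lets callers treat first runs and repeat runs uniformly.

    # Hypothetical usage; 'some_state.json' is a placeholder filename.
    state = load_json_data("some_state.json")
    print(state.get("last_updated", "never"))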
@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
 
+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)
+
 def check_grammar_with_grammalecte(text):
     """
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'grammar_suggestions': grammar_suggestions,
             'html_content': html_content
         }
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
-        return None
-
 
 def generate_staleness_histogram(wiki_pages):
     """
@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }
 
-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)
 
     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")
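
Finally, a sketch of reading the history back to compare runs, using the helpers added in this commit (assumes the file has been written by save_with_history at least once):

    data = load_json_data(OUTDATED_PAGES_FILE)
    for timestamp, snapshot in sorted(data.get('history', {}).items()):
        print(timestamp, len(snapshot['regular_pages']), 'regular pages')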