auto translation ollama
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions
@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"

@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')

+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
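The added block repeats the existing punkt bootstrap pattern for punkt_tab: probe with nltk.data.find() and download only on LookupError, so repeated runs stay idempotent. A minimal sketch of what this enables (illustrative, not part of the commit):

# With punkt and punkt_tab present, sentence tokenization no longer raises LookupError.
from nltk.tokenize import sent_tokenize

print(sent_tokenize("First sentence. Second one."))
# -> ['First sentence.', 'Second one.']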
@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []

+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file
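Because load_json_data returns an empty dict on a missing or unparseable file, callers can chain .get() with defaults instead of wrapping every read in try/except. A hedged usage sketch (the file name comes from the constants above, the key from save_with_history below):

state = load_json_data(OUTDATED_PAGES_FILE)   # {} on first run or bad JSON
regular = state.get('regular_pages', [])      # safe default, no KeyError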
@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")

+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)
+
 def check_grammar_with_grammalecte(text):
     """
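save_with_history snapshots each run's regular_pages and specific_pages under an ISO-8601 timestamp key, then overwrites the top-level fields, so code that only knows the old flat layout keeps working. The resulting shape of outdated_pages.json, sketched with invented timestamps and elided page lists:

{
    "regular_pages": [...],
    "specific_pages": [...],
    "last_updated": "2025-01-02T10:00:00",
    "history": {
        "2025-01-01T10:00:00": {"regular_pages": [...], "specific_pages": [...]},
        "2025-01-02T10:00:00": {"regular_pages": [...], "specific_pages": [...]}
    }
}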
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'grammar_suggestions': grammar_suggestions,
             'html_content': html_content
         }

     except requests.exceptions.RequestException as e:
         logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
         return None

 def generate_staleness_histogram(wiki_pages):
     """
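The hunk above shows that fetch_wiki_page returns None when the HTTP request fails, so callers must guard the result; a minimal illustrative call ('highway' is just an example OSM key, not taken from the diff):

page = fetch_wiki_page('highway', language='fr')
if page is None:
    # network failure or bad response; the function already logged the error
    print("fetch failed; see log for details")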
@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }

-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)

     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")
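With history keyed by datetime.now().isoformat() strings, which sort lexicographically in chronological order, the latest snapshot can be recovered with a plain max(); a hedged sketch built on the functions introduced in this commit:

data = load_json_data(OUTDATED_PAGES_FILE)
if data.get('history'):
    latest = max(data['history'])   # ISO-8601 keys sort chronologically
    snapshot = data['history'][latest]
    print(f"{latest}: {len(snapshot['regular_pages'])} regular pages")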