automatic ollama translation

Tykayn 2025-09-04 00:14:55 +02:00 committed by tykayn
parent 2ad98b5864
commit eb662fab5a
4 changed files with 407 additions and 7 deletions


@@ -57,7 +57,7 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 50
+NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"
@@ -66,6 +66,12 @@ try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
     nltk.download('punkt')
+
+# Also download punkt_tab resource which is needed for sent_tokenize
+try:
+    nltk.data.find('tokenizers/punkt_tab')
+except LookupError:
+    nltk.download('punkt_tab')
 
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
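
Side note on the hunk above: recent NLTK releases (3.9+) resolve sent_tokenize through the punkt_tab resource rather than punkt, which is why the extra download is needed. A minimal sketch of the call that triggers the lookup:

    from nltk.tokenize import sent_tokenize

    # On recent NLTK versions this raises LookupError if 'punkt_tab' is absent.
    print(sent_tokenize("First sentence. Second sentence."))
    # -> ['First sentence.', 'Second sentence.']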
@@ -124,6 +130,29 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []
 
+def load_json_data(filename):
+    """
+    Load data from a JSON file
+
+    Args:
+        filename (str): Name of the file
+
+    Returns:
+        dict: Data loaded from the file or empty dict if file doesn't exist
+    """
+    try:
+        if os.path.exists(filename):
+            with open(filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            logger.info(f"Data loaded from {filename}")
+            return data
+        else:
+            logger.info(f"File {filename} doesn't exist, returning empty dict")
+            return {}
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error loading data from {filename}: {e}")
+        return {}
+
 def save_to_json(data, filename):
     """
     Save data to a JSON file
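
For orientation, a quick usage sketch of the new load_json_data helper (the filename below is hypothetical): it returns an empty dict instead of raising when the file is missing or unparseable, which lets callers treat first runs and repeat runs uniformly.

    # Hypothetical usage; 'some_state.json' is a placeholder filename.
    state = load_json_data("some_state.json")
    print(state.get("last_updated", "never"))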
@@ -138,6 +167,52 @@ def save_to_json(data, filename):
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
 
+def save_with_history(data, filename):
+    """
+    Save data to a JSON file while preserving history
+
+    This function loads existing data from the file (if it exists),
+    adds the new data to the history, and saves the updated data back to the file.
+
+    Args:
+        data: New data to save
+        filename (str): Name of the file
+    """
+    try:
+        # Load existing data
+        existing_data = load_json_data(filename)
+
+        # Create a timestamp for the current data
+        current_timestamp = datetime.now().isoformat()
+
+        # Initialize history if it doesn't exist
+        if 'history' not in existing_data:
+            existing_data['history'] = {}
+
+        # Add current regular_pages and specific_pages to history
+        history_entry = {
+            'regular_pages': data.get('regular_pages', []),
+            'specific_pages': data.get('specific_pages', [])
+        }
+
+        # Add the entry to history with timestamp as key
+        existing_data['history'][current_timestamp] = history_entry
+
+        # Update the current data
+        existing_data['regular_pages'] = data.get('regular_pages', [])
+        existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['last_updated'] = current_timestamp
+
+        # Save the updated data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(existing_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Data with history saved to {filename}")
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving data with history to {filename}: {e}")
+        # Fallback to regular save if there's an error
+        save_to_json(data, filename)
+
 def check_grammar_with_grammalecte(text):
     """
@@ -604,10 +679,6 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'grammar_suggestions': grammar_suggestions,
             'html_content': html_content
         }
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
-        return None
-
 
 def generate_staleness_histogram(wiki_pages):
     """
@@ -1183,8 +1254,8 @@ def main():
         "last_updated": datetime.now().isoformat()
     }
 
-    # Save pages that need updating to JSON
-    save_to_json(output_data, OUTDATED_PAGES_FILE)
+    # Save pages that need updating to JSON with history
+    save_with_history(output_data, OUTDATED_PAGES_FILE)
 
     # Print the top pages needing updates
     print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")
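
Finally, a sketch of reading the history back to compare runs, using the helpers added in this commit (assumes the file has been written by save_with_history at least once):

    data = load_json_data(OUTDATED_PAGES_FILE)
    for timestamp, snapshot in sorted(data.get('history', {}).items()):
        print(timestamp, len(snapshot['regular_pages']), 'regular pages')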