Retrieve sources
This commit is contained in:
parent 86622a19ea
commit 65fe2a35f9
155 changed files with 50969 additions and 0 deletions
242 wiki_compare/fix_grammar_suggestions.py Normal file
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fix_grammar_suggestions.py

This script adds grammar suggestions to the "type" page in the outdated_pages.json file.
It fetches the French content for the page, runs the grammar checker, and updates the file.
"""

import json
import logging
import os
import subprocess
import tempfile

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTDATED_PAGES_FILE = "outdated_pages.json"
TARGET_KEY = "type"

def load_outdated_pages():
    """
    Load the outdated pages from the JSON file

    Returns:
        dict: Dictionary containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Successfully loaded outdated pages from {OUTDATED_PAGES_FILE}")
        return data
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return None

def save_outdated_pages(data):
    """
    Save the outdated pages to the JSON file

    Args:
        data (dict): Dictionary containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved outdated pages to {OUTDATED_PAGES_FILE}")
    except IOError as e:
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")

def fetch_wiki_page_content(url):
    """
    Fetch the content of a wiki page

    Args:
        url (str): URL of the wiki page

    Returns:
        str: Content of the wiki page
    """
    try:
        logger.info(f"Fetching content from {url}")
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content
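        # (MediaWiki places the rendered page body in the #mw-content-text container)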
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Remove .languages elements
            for languages_elem in content.select('.languages'):
                languages_elem.extract()

            # Get text
            text = content.get_text(separator=' ', strip=True)
            logger.info(f"Successfully fetched content ({len(text)} characters)")
            return text
        else:
            logger.warning(f"Could not find content in page: {url}")
            return ""

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    temp_file_path = None
    try:
        # Create a temporary file with the text
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # Run grammalecte-cli on the temporary file
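        # (flags: -f = input file, -j = JSON output, -ctx = include the text
        # surrounding each error, -wss = include spelling suggestions)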
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Parse the JSON output
        grammar_data = json.loads(result.stdout)

        # Extract grammar errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            # Process grammar errors
            for error in paragraph.get('lGrammarErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': error.get('sType', ''),
                    'message': error.get('sMessage', ''),
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

            # Process spelling errors
            for error in paragraph.get('lSpellingErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': 'spelling',
                    'message': 'Erreur d\'orthographe',
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

    finally:
        # Clean up the temporary file even when grammalecte-cli fails
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)

def main():
    """Main function to execute the script"""
    logger.info("Starting fix_grammar_suggestions.py")

    # Load outdated pages
    data = load_outdated_pages()
    if not data:
        logger.error("Failed to load outdated pages")
        return

    # Find the "type" page in the regular_pages array
    type_page = None
    type_page_index = None
    for i, page in enumerate(data.get('regular_pages', [])):
        if page.get('key') == TARGET_KEY:
            type_page = page
            type_page_index = i
            break

    if not type_page:
        logger.error(f"Could not find page with key '{TARGET_KEY}'")
        return

    # Get the French page URL
    fr_page = type_page.get('fr_page')
    if not fr_page:
        logger.error(f"No French page found for key '{TARGET_KEY}'")
        return

    fr_url = fr_page.get('url')
    if not fr_url:
        logger.error(f"No URL found for French page of key '{TARGET_KEY}'")
        return

    # Fetch the content of the French page
    content = fetch_wiki_page_content(fr_url)
    if not content:
        logger.error(f"Could not fetch content from {fr_url}")
        return

    # Check grammar
    logger.info(f"Checking grammar for key '{TARGET_KEY}'")
    suggestions = check_grammar_with_grammalecte(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Add the grammar suggestions to the page
    type_page['grammar_suggestions'] = suggestions

    # Update the page in the data
    data['regular_pages'][type_page_index] = type_page

    # Save the updated data
    save_outdated_pages(data)

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()
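
For reference, a minimal sketch of the outdated_pages.json shape this script expects, inferred from the accessors above (the key names come from the code; the example values, including the URL, are hypothetical):

    {
      "regular_pages": [
        {
          "key": "type",
          "fr_page": {
            "url": "https://wiki.example.org/FR:Type"
          },
          "grammar_suggestions": []
        }
      ]
    }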