Add grammalecte
This commit is contained in:
parent e61d932565
commit 471eab4cd0
8 changed files with 45296 additions and 283 deletions
@@ -28,6 +28,8 @@ import csv
 import requests
 import re
 import os
+import subprocess
+import tempfile
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -61,12 +63,14 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
     "FR:Tag:leisure%3Dchildren_club",
     "FR:Tag:harassment_prevention%3Dask_angela",
     "FR:Tag:leisure=children_club",
     "FR:Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
-    "Key:cuisine"
+    "Key:cuisine",
+    "Libre_Charge_Map",
+    "OSM_Mon_Commerce"
 ]

 def fetch_top_keys(limit=NUM_WIKI_PAGES):
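Note on the list above: the %3D entries are simply the percent-encoded spelling of "=", so each encoded/literal pair names the same wiki page; presumably both spellings are kept so a lookup matches whichever form a title arrives in. A quick check:

    from urllib.parse import unquote
    assert unquote("FR:Tag:leisure%3Dchildren_club") == "FR:Tag:leisure=children_club"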
@@ -118,6 +122,90 @@ def save_to_json(data, filename):
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")

+def check_grammar_with_grammalecte(text):
+    """
+    Check grammar in French text using grammalecte-cli
+
+    Args:
+        text (str): French text to check
+
+    Returns:
+        list: List of grammar suggestions
+    """
+    if not text or len(text.strip()) == 0:
+        logger.warning("Empty text provided for grammar checking")
+        return []
+
+    logger.info("Checking grammar with grammalecte-cli...")
+
+    try:
+        # Create a temporary file with the text
+        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
+            temp_file.write(text)
+            temp_file_path = temp_file.name
+
+        # Run grammalecte-cli on the temporary file
+        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+        # Parse the JSON output
+        grammar_data = json.loads(result.stdout)
+
+        # Extract grammar errors from all paragraphs
+        grammar_suggestions = []
+        for paragraph in grammar_data.get('data', []):
+            paragraph_index = paragraph.get('iParagraph', 0)
+
+            # Process grammar errors
+            for error in paragraph.get('lGrammarErrors', []):
+                suggestion = {
+                    'paragraph': paragraph_index,
+                    'start': error.get('nStart', 0),
+                    'end': error.get('nEnd', 0),
+                    'type': error.get('sType', ''),
+                    'message': error.get('sMessage', ''),
+                    'suggestions': error.get('aSuggestions', []),
+                    'text': error.get('sUnderlined', ''),
+                    'before': error.get('sBefore', ''),
+                    'after': error.get('sAfter', '')
+                }
+                grammar_suggestions.append(suggestion)
+
+            # Process spelling errors
+            for error in paragraph.get('lSpellingErrors', []):
+                suggestion = {
+                    'paragraph': paragraph_index,
+                    'start': error.get('nStart', 0),
+                    'end': error.get('nEnd', 0),
+                    'type': 'spelling',
+                    'message': "Erreur d'orthographe",
+                    'suggestions': error.get('aSuggestions', []),
+                    'text': error.get('sUnderlined', ''),
+                    'before': error.get('sBefore', ''),
+                    'after': error.get('sAfter', '')
+                }
+                grammar_suggestions.append(suggestion)
+
+        # Clean up the temporary file
+        os.unlink(temp_file_path)
+
+        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
+        return grammar_suggestions
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error running grammalecte-cli: {e}")
+        logger.error(f"stdout: {e.stdout}")
+        logger.error(f"stderr: {e.stderr}")
+        return []
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Error parsing grammalecte-cli output: {e}")
+        return []
+
+    except Exception as e:
+        logger.error(f"Unexpected error during grammar checking: {e}")
+        return []

 def fetch_wiki_page(key, language='en', is_specific_page=False):
     """
     Fetch wiki page for a given key or specific page
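A minimal way to smoke-test the new helper in isolation (a hypothetical snippet, not part of the commit; it assumes grammalecte-cli is installed and on PATH, and that the module-level logger is already configured):

    if __name__ == '__main__':
        # Sample sentence with a deliberate agreement error.
        sample = "Les chat noir dort sur le canapé."
        for s in check_grammar_with_grammalecte(sample):
            print(f"[{s['type']}] {s['text']!r} -> {s['suggestions']}: {s['message']}")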
@@ -225,6 +313,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Count words in the content
         content = soup.select_one('#mw-content-text')
+        clean_text = ""
         if content:
             # Remove script and style elements
             for script in content.select('script, style'):
@@ -235,8 +324,14 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
                 languages_elem.extract()

             # Get text and count words
-            text = content.get_text(separator=' ', strip=True)
-            word_count = len(text.split())
+            clean_text = content.get_text(separator=' ', strip=True)
+            word_count = len(clean_text.split())
+
+            # Check grammar for French pages
+            grammar_suggestions = []
+            if language == 'fr':
+                logger.info(f"Checking grammar for French page: {key}")
+                grammar_suggestions = check_grammar_with_grammalecte(clean_text)

             # Extract links
             links = content.select('a')
@@ -433,6 +528,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             media_count = 0
             media_details = []
             categories = []
+            grammar_suggestions = []

         return {
             'key': key,
@@ -449,7 +545,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'media_details': media_details,
             'categories': categories,
             'description_img_url': description_img_url,
-            'is_specific_page': is_specific_page
+            'is_specific_page': is_specific_page,
+            'grammar_suggestions': grammar_suggestions
         }

     except requests.exceptions.RequestException as e:
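With these changes, every page record returned by fetch_wiki_page (and so every record saved by save_to_json) carries a grammar_suggestions list. Based on the parser above, a spelling entry looks like this (values are illustrative; the keys are exactly those built in check_grammar_with_grammalecte):

    {
        'paragraph': 0,
        'start': 4,
        'end': 9,
        'type': 'spelling',
        'message': "Erreur d'orthographe",
        'suggestions': ['chats'],
        'text': 'chatt',
        'before': 'Les ',
        'after': ' noirs'
    }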