add grammalecte

Tykayn 2025-09-01 11:38:19 +02:00 committed by tykayn
parent e61d932565
commit 471eab4cd0
8 changed files with 45296 additions and 283 deletions


@@ -28,6 +28,8 @@ import csv
 import requests
 import re
 import os
+import subprocess
+import tempfile
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -61,12 +63,14 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
-    "FR:Tag:leisure%3Dchildren_club",
-    "FR:Tag:harassment_prevention%3Dask_angela",
+    "FR:Tag:leisure=children_club",
+    "FR:Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
-    "Key:cuisine"
+    "Key:cuisine",
+    "Libre_Charge_Map",
+    "OSM_Mon_Commerce"
 ]
 
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
@@ -118,6 +122,90 @@ def save_to_json(data, filename):
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
 
+def check_grammar_with_grammalecte(text):
+    """
+    Check grammar in French text using grammalecte-cli.
+
+    Args:
+        text (str): French text to check
+
+    Returns:
+        list: List of grammar suggestions
+    """
+    if not text or len(text.strip()) == 0:
+        logger.warning("Empty text provided for grammar checking")
+        return []
+
+    logger.info("Checking grammar with grammalecte-cli...")
+
+    try:
+        # Write the text to a temporary file for grammalecte-cli to read
+        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
+            temp_file.write(text)
+            temp_file_path = temp_file.name
+
+        try:
+            # Run grammalecte-cli on the temporary file
+            cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
+            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        finally:
+            # Remove the temporary file even if the CLI call fails
+            os.unlink(temp_file_path)
+
+        # Parse the JSON output
+        grammar_data = json.loads(result.stdout)
+
+        # Extract grammar and spelling errors from all paragraphs
+        grammar_suggestions = []
+        for paragraph in grammar_data.get('data', []):
+            paragraph_index = paragraph.get('iParagraph', 0)
+
+            # Process grammar errors
+            for error in paragraph.get('lGrammarErrors', []):
+                suggestion = {
+                    'paragraph': paragraph_index,
+                    'start': error.get('nStart', 0),
+                    'end': error.get('nEnd', 0),
+                    'type': error.get('sType', ''),
+                    'message': error.get('sMessage', ''),
+                    'suggestions': error.get('aSuggestions', []),
+                    'text': error.get('sUnderlined', ''),
+                    'before': error.get('sBefore', ''),
+                    'after': error.get('sAfter', '')
+                }
+                grammar_suggestions.append(suggestion)
+
+            # Process spelling errors
+            for error in paragraph.get('lSpellingErrors', []):
+                suggestion = {
+                    'paragraph': paragraph_index,
+                    'start': error.get('nStart', 0),
+                    'end': error.get('nEnd', 0),
+                    'type': 'spelling',
+                    'message': 'Erreur d\'orthographe',
+                    'suggestions': error.get('aSuggestions', []),
+                    'text': error.get('sUnderlined', ''),
+                    'before': error.get('sBefore', ''),
+                    'after': error.get('sAfter', '')
+                }
+                grammar_suggestions.append(suggestion)
+
+        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
+        return grammar_suggestions
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error running grammalecte-cli: {e}")
+        logger.error(f"stdout: {e.stdout}")
+        logger.error(f"stderr: {e.stderr}")
+        return []
+    except json.JSONDecodeError as e:
+        logger.error(f"Error parsing grammalecte-cli output: {e}")
+        return []
+    except Exception as e:
+        logger.error(f"Unexpected error during grammar checking: {e}")
+        return []
+
 def fetch_wiki_page(key, language='en', is_specific_page=False):
     """
     Fetch wiki page for a given key or specific page
@@ -225,6 +313,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Count words in the content
         content = soup.select_one('#mw-content-text')
+        clean_text = ""
         if content:
             # Remove script and style elements
             for script in content.select('script, style'):
@@ -235,8 +324,14 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
                 languages_elem.extract()
 
             # Get text and count words
-            text = content.get_text(separator=' ', strip=True)
-            word_count = len(text.split())
+            clean_text = content.get_text(separator=' ', strip=True)
+            word_count = len(clean_text.split())
+
+            # Check grammar for French pages
+            grammar_suggestions = []
+            if language == 'fr':
+                logger.info(f"Checking grammar for French page: {key}")
+                grammar_suggestions = check_grammar_with_grammalecte(clean_text)
 
             # Extract links
             links = content.select('a')
@@ -433,6 +528,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             media_count = 0
             media_details = []
             categories = []
+            grammar_suggestions = []
 
         return {
             'key': key,
@@ -449,7 +545,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'media_details': media_details,
             'categories': categories,
             'description_img_url': description_img_url,
-            'is_specific_page': is_specific_page
+            'is_specific_page': is_specific_page,
+            'grammar_suggestions': grammar_suggestions
         }
     except requests.exceptions.RequestException as e:
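
For reference, below is a minimal standalone sketch of the same grammalecte-cli round trip the new helper performs. It assumes grammalecte-cli is installed and on PATH; the sample sentence is illustrative only, and the JSON field names (data, lGrammarErrors, sType, sMessage, aSuggestions) are the ones the function above reads from the CLI's output.

    #!/usr/bin/env python3
    # Standalone sketch of the grammalecte-cli round trip used in this
    # commit. Assumes grammalecte-cli is on PATH; the sample sentence
    # is illustrative only.
    import json
    import os
    import subprocess
    import tempfile

    text = "Quand nous arriverons, je visiterais le musée."

    # grammalecte-cli reads from a file, so write the text to a temp file
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8',
                                     suffix='.txt', delete=False) as f:
        f.write(text)
        path = f.name

    try:
        # Same flags as the commit: -f input file, -j JSON output,
        # -ctx error context, -wss spelling suggestions
        result = subprocess.run(
            ['grammalecte-cli', '-f', path, '-j', '-ctx', '-wss'],
            capture_output=True, text=True, check=True,
        )
    finally:
        os.unlink(path)

    report = json.loads(result.stdout)
    for paragraph in report.get('data', []):
        for error in paragraph.get('lGrammarErrors', []):
            print(error.get('sType'), error.get('sMessage'),
                  error.get('aSuggestions'))

Spawning one CLI process per page keeps the integration simple; if many French pages are checked in one run, grammalecte's own Python package also offers an in-process checker that would avoid the per-call process startup cost.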