Add grammalecte
This commit is contained in:
parent e61d932565
commit 471eab4cd0
8 changed files with 45296 additions and 283 deletions
@@ -28,6 +28,8 @@ import csv
 import requests
 import re
 import os
+import subprocess
+import tempfile
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -61,12 +63,14 @@ NUM_WIKI_PAGES = 1
 # 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
 SPECIFIC_PAGES = [
     "Anatomie_des_étiquettes_osm",
     "FR:Tag:leisure%3Dchildren_club",
     "FR:Tag:harassment_prevention%3Dask_angela",
     "FR:Tag:leisure=children_club",
     "FR:Tag:harassment_prevention=ask_angela",
     "Key:harassment_prevention",
     "Proposal process",
     "Automated_Edits_code_of_conduct",
-    "Key:cuisine"
+    "Key:cuisine",
+    "Libre_Charge_Map",
+    "OSM_Mon_Commerce"
 ]

 def fetch_top_keys(limit=NUM_WIKI_PAGES):
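Note on the list above: the %3D entries are simply the percent-encoded spelling of "=", so each encoded/literal pair names the same wiki page; presumably both spellings are kept so a lookup matches whichever form a title arrives in. A quick check:

    from urllib.parse import unquote
    assert unquote("FR:Tag:leisure%3Dchildren_club") == "FR:Tag:leisure=children_club"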
@@ -118,6 +122,90 @@ def save_to_json(data, filename):
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")

+def check_grammar_with_grammalecte(text):
+    """
+    Check grammar in French text using grammalecte-cli
+
+    Args:
+        text (str): French text to check
+
+    Returns:
+        list: List of grammar suggestions
+    """
+    if not text or len(text.strip()) == 0:
+        logger.warning("Empty text provided for grammar checking")
+        return []
+
+    logger.info("Checking grammar with grammalecte-cli...")
+
+    try:
+        # Create a temporary file with the text
+        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
+            temp_file.write(text)
+            temp_file_path = temp_file.name
+
+        # Run grammalecte-cli on the temporary file
+        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+        # Parse the JSON output
+        grammar_data = json.loads(result.stdout)
+
+        # Extract grammar errors from all paragraphs
+        grammar_suggestions = []
+        for paragraph in grammar_data.get('data', []):
+            paragraph_index = paragraph.get('iParagraph', 0)
+
+            # Process grammar errors
+            for error in paragraph.get('lGrammarErrors', []):
+                suggestion = {
+                    'paragraph': paragraph_index,
+                    'start': error.get('nStart', 0),
+                    'end': error.get('nEnd', 0),
+                    'type': error.get('sType', ''),
+                    'message': error.get('sMessage', ''),
+                    'suggestions': error.get('aSuggestions', []),
+                    'text': error.get('sUnderlined', ''),
+                    'before': error.get('sBefore', ''),
+                    'after': error.get('sAfter', '')
+                }
+                grammar_suggestions.append(suggestion)
+
+            # Process spelling errors
+            for error in paragraph.get('lSpellingErrors', []):
+                suggestion = {
+                    'paragraph': paragraph_index,
+                    'start': error.get('nStart', 0),
+                    'end': error.get('nEnd', 0),
+                    'type': 'spelling',
+                    'message': "Erreur d'orthographe",
+                    'suggestions': error.get('aSuggestions', []),
+                    'text': error.get('sUnderlined', ''),
+                    'before': error.get('sBefore', ''),
+                    'after': error.get('sAfter', '')
+                }
+                grammar_suggestions.append(suggestion)
+
+        # Clean up the temporary file
+        os.unlink(temp_file_path)
+
+        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
+        return grammar_suggestions
+
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Error running grammalecte-cli: {e}")
+        logger.error(f"stdout: {e.stdout}")
+        logger.error(f"stderr: {e.stderr}")
+        return []
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Error parsing grammalecte-cli output: {e}")
+        return []
+
+    except Exception as e:
+        logger.error(f"Unexpected error during grammar checking: {e}")
+        return []

 def fetch_wiki_page(key, language='en', is_specific_page=False):
     """
     Fetch wiki page for a given key or specific page
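A minimal way to smoke-test the new helper in isolation (a hypothetical snippet, not part of the commit; it assumes grammalecte-cli is installed and on PATH, and that the module-level logger is already configured):

    if __name__ == '__main__':
        # Sample sentence with a deliberate agreement error.
        sample = "Les chat noir dort sur le canapé."
        for s in check_grammar_with_grammalecte(sample):
            print(f"[{s['type']}] {s['text']!r} -> {s['suggestions']}: {s['message']}")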
@@ -225,6 +313,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Count words in the content
         content = soup.select_one('#mw-content-text')
+        clean_text = ""
         if content:
             # Remove script and style elements
             for script in content.select('script, style'):
@@ -235,8 +324,14 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
                 languages_elem.extract()

             # Get text and count words
-            text = content.get_text(separator=' ', strip=True)
-            word_count = len(text.split())
+            clean_text = content.get_text(separator=' ', strip=True)
+            word_count = len(clean_text.split())
+
+            # Check grammar for French pages
+            grammar_suggestions = []
+            if language == 'fr':
+                logger.info(f"Checking grammar for French page: {key}")
+                grammar_suggestions = check_grammar_with_grammalecte(clean_text)

             # Extract links
             links = content.select('a')
@@ -433,6 +528,7 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             media_count = 0
             media_details = []
             categories = []
+            grammar_suggestions = []

         return {
             'key': key,
@@ -449,7 +545,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             'media_details': media_details,
             'categories': categories,
             'description_img_url': description_img_url,
-            'is_specific_page': is_specific_page
+            'is_specific_page': is_specific_page,
+            'grammar_suggestions': grammar_suggestions
         }

     except requests.exceptions.RequestException as e:
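With these changes, every page record returned by fetch_wiki_page (and so every record saved by save_to_json) carries a grammar_suggestions list. Based on the parser above, a spelling entry looks like this (values are illustrative; the keys are exactly those built in check_grammar_with_grammalecte):

    {
        'paragraph': 0,
        'start': 4,
        'end': 9,
        'type': 'spelling',
        'message': "Erreur d'orthographe",
        'suggestions': ['chats'],
        'text': 'chatt',
        'before': 'Les ',
        'after': ' noirs'
    }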