osm-labo/wiki_compare/fix_grammar_suggestions.py
2025-09-01 15:41:31 +02:00

242 lines
No EOL
7.8 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fix_grammar_suggestions.py
This script adds grammar suggestions to the "type" page in the outdated_pages.json file.
It fetches the French content for the page, runs the grammar checker, and updates the file.
"""
import json
import logging
import os
import subprocess
import tempfile
import requests
from bs4 import BeautifulSoup
# Module-level constants
OUTDATED_PAGES_FILE = "outdated_pages.json"  # JSON file holding the outdated-page data
TARGET_KEY = "type"                          # wiki key whose page receives grammar suggestions

# Timestamped, INFO-level logging for the whole script
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)
def load_outdated_pages():
    """
    Load the outdated pages from the JSON file.

    Returns:
        dict: Parsed outdated-page data, or None when the file cannot be
            read or its contents are not valid JSON.
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return None
    logger.info(f"Successfully loaded outdated pages from {OUTDATED_PAGES_FILE}")
    return data
def save_outdated_pages(data):
    """
    Save the outdated pages to the JSON file.

    Args:
        data (dict): Dictionary containing outdated page information.
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
    except IOError as e:
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")
    else:
        logger.info(f"Successfully saved outdated pages to {OUTDATED_PAGES_FILE}")
def fetch_wiki_page_content(url):
    """
    Fetch the plain-text content of a wiki page.

    Args:
        url (str): URL of the wiki page

    Returns:
        str: Text of the page's main content area, or "" on any failure
            (network error, HTTP error status, or missing content element).
    """
    try:
        logger.info(f"Fetching content from {url}")
        # Timeout added so a stalled server cannot hang the script forever;
        # requests.exceptions.Timeout is a RequestException, so the existing
        # handler below already covers it.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # MediaWiki renders the article body inside #mw-content-text.
        content = soup.select_one('#mw-content-text')
        if not content:
            logger.warning(f"Could not find content in page: {url}")
            return ""
        # Strip script/style tags and the language-switcher box so that only
        # the article prose is passed to the grammar checker.
        for unwanted in content.select('script, style'):
            unwanted.extract()
        for languages_elem in content.select('.languages'):
            languages_elem.extract()
        text = content.get_text(separator=' ', strip=True)
        logger.info(f"Successfully fetched content ({len(text)} characters)")
        return text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""
def _build_suggestion(error, paragraph_index, err_type, message):
    """Build one suggestion dict from a grammalecte error entry."""
    return {
        'paragraph': paragraph_index,
        'start': error.get('nStart', 0),
        'end': error.get('nEnd', 0),
        'type': err_type,
        'message': message,
        'suggestions': error.get('aSuggestions', []),
        'text': error.get('sUnderlined', ''),
        'before': error.get('sBefore', ''),
        'after': error.get('sAfter', '')
    }


def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli.

    Args:
        text (str): French text to check

    Returns:
        list: List of suggestion dicts (grammar and spelling errors);
            empty when the text is blank or the checker fails.
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")
    temp_file_path = None
    try:
        # grammalecte-cli reads its input from a file, so dump the text
        # into a temporary one.
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # -j: JSON output, -ctx: include error context, -wss: suggestions
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        grammar_data = json.loads(result.stdout)

        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)
            for error in paragraph.get('lGrammarErrors', []):
                grammar_suggestions.append(_build_suggestion(
                    error, paragraph_index,
                    error.get('sType', ''), error.get('sMessage', '')))
            for error in paragraph.get('lSpellingErrors', []):
                grammar_suggestions.append(_build_suggestion(
                    error, paragraph_index,
                    'spelling', 'Erreur d\'orthographe'))

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions
    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []
    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []
    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []
    finally:
        # Remove the temp file on every path; the original only deleted it
        # on success, leaking a file whenever the CLI or parsing failed.
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
def main():
    """Main function to execute the script"""
    logger.info("Starting fix_grammar_suggestions.py")

    data = load_outdated_pages()
    if not data:
        logger.error("Failed to load outdated pages")
        return

    # Locate the target page inside the regular_pages array.
    type_page_index = next(
        (i for i, page in enumerate(data.get('regular_pages', []))
         if page.get('key') == TARGET_KEY),
        None,
    )
    if type_page_index is None:
        logger.error(f"Could not find page with key '{TARGET_KEY}'")
        return
    type_page = data['regular_pages'][type_page_index]

    # Resolve the URL of the French version of the page.
    fr_page = type_page.get('fr_page')
    if not fr_page:
        logger.error(f"No French page found for key '{TARGET_KEY}'")
        return
    fr_url = fr_page.get('url')
    if not fr_url:
        logger.error(f"No URL found for French page of key '{TARGET_KEY}'")
        return

    content = fetch_wiki_page_content(fr_url)
    if not content:
        logger.error(f"Could not fetch content from {fr_url}")
        return

    logger.info(f"Checking grammar for key '{TARGET_KEY}'")
    suggestions = check_grammar_with_grammalecte(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Attach the (possibly empty) suggestions and persist the update.
    type_page['grammar_suggestions'] = suggestions
    data['regular_pages'][type_page_index] = type_page
    save_outdated_pages(data)
    logger.info("Script completed successfully")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()