#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fix_grammar_suggestions.py

This script adds grammar suggestions to the "type" page in the
outdated_pages.json file. It fetches the French content for the page,
runs the grammar checker, and updates the file.
"""

import json
import logging
import os
import subprocess
import tempfile

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTDATED_PAGES_FILE = "outdated_pages.json"
TARGET_KEY = "type"

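# Assumed structure of outdated_pages.json, inferred from how this script
# reads and writes it below (an illustrative sketch, not an authoritative
# schema; the URL is a placeholder):
#
#   {
#     "regular_pages": [
#       {
#         "key": "type",
#         "fr_page": {"url": "https://example.org/wiki/fr/type"},
#         "grammar_suggestions": []    # added/overwritten by this script
#       }
#     ]
#   }

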
def load_outdated_pages():
    """
    Load the outdated pages from the JSON file.

    Returns:
        dict: Dictionary containing outdated page information, or None on error
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Successfully loaded outdated pages from {OUTDATED_PAGES_FILE}")
        return data
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return None


def save_outdated_pages(data):
    """
    Save the outdated pages to the JSON file.

    Args:
        data (dict): Dictionary containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved outdated pages to {OUTDATED_PAGES_FILE}")
    except IOError as e:
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")


def fetch_wiki_page_content(url):
    """
    Fetch the text content of a wiki page.

    Args:
        url (str): URL of the wiki page

    Returns:
        str: Text content of the wiki page, or an empty string on failure
    """
    try:
        logger.info(f"Fetching content from {url}")
        # A timeout keeps the script from hanging on an unresponsive server
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content (MediaWiki renders page content in #mw-content-text)
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Remove .languages elements
            for languages_elem in content.select('.languages'):
                languages_elem.extract()

            # Get text
            text = content.get_text(separator=' ', strip=True)
            logger.info(f"Successfully fetched content ({len(text)} characters)")
            return text
        else:
            logger.warning(f"Could not find content in page: {url}")
            return ""

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""


def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli.

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions (empty on failure)
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    temp_file_path = None
    try:
        # Write the text to a temporary file for grammalecte-cli to read
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # Run grammalecte-cli on the temporary file
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

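        # Expected shape of the grammalecte JSON output, inferred from the
        # fields read below (illustrative, not the tool's full schema):
        #   {"data": [{"iParagraph": 0,
        #              "lGrammarErrors": [{"nStart": ..., "nEnd": ..., "sType": ...,
        #                                  "sMessage": ..., "aSuggestions": [...],
        #                                  "sUnderlined": ..., "sBefore": ..., "sAfter": ...}],
        #              "lSpellingErrors": [...]}]}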
        # Parse the JSON output
        grammar_data = json.loads(result.stdout)

        # Extract grammar and spelling errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            # Process grammar errors
            for error in paragraph.get('lGrammarErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': error.get('sType', ''),
                    'message': error.get('sMessage', ''),
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

            # Process spelling errors
            for error in paragraph.get('lSpellingErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': 'spelling',
                    'message': "Erreur d'orthographe",
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

    finally:
        # Remove the temporary file even if grammar checking failed
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)


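# Each suggestion entry produced above looks roughly like this (illustrative
# values, inferred from the dict construction in check_grammar_with_grammalecte):
#   {"paragraph": 0, "start": 12, "end": 18, "type": "spelling",
#    "message": "Erreur d'orthographe", "suggestions": ["exemple"],
#    "text": "exmple", "before": "un ", "after": " de texte"}

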
def main():
    """Main function to execute the script."""
    logger.info("Starting fix_grammar_suggestions.py")

    # Load outdated pages
    data = load_outdated_pages()
    if not data:
        logger.error("Failed to load outdated pages")
        return

    # Find the "type" page in the regular_pages array
    type_page = None
    type_page_index = None
    for i, page in enumerate(data.get('regular_pages', [])):
        if page.get('key') == TARGET_KEY:
            type_page = page
            type_page_index = i
            break

    if type_page is None:
        logger.error(f"Could not find page with key '{TARGET_KEY}'")
        return

    # Get the French page URL
    fr_page = type_page.get('fr_page')
    if not fr_page:
        logger.error(f"No French page found for key '{TARGET_KEY}'")
        return

    fr_url = fr_page.get('url')
    if not fr_url:
        logger.error(f"No URL found for French page of key '{TARGET_KEY}'")
        return

    # Fetch the content of the French page
    content = fetch_wiki_page_content(fr_url)
    if not content:
        logger.error(f"Could not fetch content from {fr_url}")
        return

    # Check grammar
    logger.info(f"Checking grammar for key '{TARGET_KEY}'")
    suggestions = check_grammar_with_grammalecte(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Add the grammar suggestions to the page and update it in the data
    type_page['grammar_suggestions'] = suggestions
    data['regular_pages'][type_page_index] = type_page

    # Save the updated data
    save_outdated_pages(data)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
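
# Usage sketch (assumes grammalecte-cli is on PATH and outdated_pages.json
# sits in the working directory):
#   python3 fix_grammar_suggestions.py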