242 lines
		
	
	
		
			No EOL
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			242 lines
		
	
	
		
			No EOL
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python3
 | |
| # -*- coding: utf-8 -*-
 | |
| 
 | |
| """
 | |
| fix_grammar_suggestions.py
 | |
| 
 | |
| This script adds grammar suggestions to the "type" page in the outdated_pages.json file.
 | |
| It fetches the French content for the page, runs the grammar checker, and updates the file.
 | |
| """
 | |
| 
 | |
| import json
 | |
| import logging
 | |
| import os
 | |
| import subprocess
 | |
| import tempfile
 | |
| import requests
 | |
| from bs4 import BeautifulSoup
 | |
| 
 | |
# Route INFO-and-above records through the root logger with a timestamped
# "<time> - <level> - <message>" layout.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Path of the JSON file that is read, updated and written back by this script.
OUTDATED_PAGES_FILE = "outdated_pages.json"
# Value of a page's "key" field that selects the page to update.
TARGET_KEY = "type"
 | |
| 
 | |
def load_outdated_pages():
    """
    Read and parse the JSON document stored in OUTDATED_PAGES_FILE.

    Returns:
        The decoded JSON content on success, or None when the file cannot
        be opened/read or does not contain valid JSON (the error is logged).
    """
    try:
        with open(OUTDATED_PAGES_FILE, mode='r', encoding='utf-8') as handle:
            pages = json.load(handle)
    except (OSError, json.JSONDecodeError) as exc:
        # OSError is the Python 3 name of IOError; both failures are reported
        # the same way and signalled to the caller with None.
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {exc}")
        return None

    logger.info(f"Successfully loaded outdated pages from {OUTDATED_PAGES_FILE}")
    return pages
 | |
| 
 | |
def save_outdated_pages(data):
    """
    Save the outdated pages to the JSON file

    The data is serialized to a string *before* the target file is opened.
    Opening with mode 'w' truncates the file immediately, so serializing
    first guarantees that a non-serializable value cannot leave
    OUTDATED_PAGES_FILE empty or half-written.

    Args:
        data (dict): Dictionary containing outdated page information

    Returns:
        bool: True when the file was written, False when serialization or
        the write failed (the error is logged). Earlier versions returned
        None unconditionally, so callers that ignore the result still work.
    """
    try:
        serialized = json.dumps(data, indent=2, ensure_ascii=False)
    except (TypeError, ValueError) as e:
        # TypeError: unsupported object type; ValueError: e.g. circular reference.
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")
        return False

    try:
        with open(OUTDATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except IOError as e:
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")
        return False

    logger.info(f"Successfully saved outdated pages to {OUTDATED_PAGES_FILE}")
    return True
 | |
| 
 | |
def fetch_wiki_page_content(url, timeout=30):
    """
    Fetch the content of a wiki page

    Downloads the page, takes the element with id "mw-content-text", drops
    its <script>/<style> elements and any element with class "languages",
    and returns the remaining text joined with single spaces.

    Args:
        url (str): URL of the wiki page
        timeout (float): Seconds passed to requests.get as its timeout.
            Without a timeout requests waits indefinitely, so a stalled
            server would hang the whole script.

    Returns:
        str: Text of the page content, or "" when the request fails or the
        content element is not present in the page.
    """
    # Only the network call can raise RequestException, so keep the try small.
    try:
        logger.info(f"Fetching content from {url}")
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the main content
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning(f"Could not find content in page: {url}")
        return ""

    # Remove script/style elements and .languages elements before extracting
    # text, so none of their text ends up in the grammar check input.
    for unwanted in content.select('script, style, .languages'):
        unwanted.extract()

    # Get text
    text = content.get_text(separator=' ', strip=True)
    logger.info(f"Successfully fetched content ({len(text)} characters)")
    return text
 | |
| 
 | |
def _build_suggestion(paragraph_index, error, error_type=None, message=None):
    """Convert one grammalecte error record into the suggestion dict format.

    When error_type / message are None the values are read from the record's
    'sType' / 'sMessage' fields; otherwise the given fixed values are used.
    """
    return {
        'paragraph': paragraph_index,
        'start': error.get('nStart', 0),
        'end': error.get('nEnd', 0),
        'type': error.get('sType', '') if error_type is None else error_type,
        'message': error.get('sMessage', '') if message is None else message,
        'suggestions': error.get('aSuggestions', []),
        'text': error.get('sUnderlined', ''),
        'before': error.get('sBefore', ''),
        'after': error.get('sAfter', ''),
    }


def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli

    The text is written to a temporary file, grammalecte-cli is run on it
    with JSON output, and every entry of each paragraph's 'lGrammarErrors'
    and 'lSpellingErrors' lists is converted into a suggestion dict. The
    temporary file is removed whether or not the check succeeds.

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions (dicts with the keys 'paragraph',
        'start', 'end', 'type', 'message', 'suggestions', 'text', 'before',
        'after'). Empty when the text is empty/blank or when running the
        tool or parsing its output fails (the error is logged).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    # Stays None until the temporary file exists, so the cleanup in `finally`
    # knows whether there is anything to delete.
    temp_file_path = None
    try:
        # Create a temporary file with the text. delete=False keeps it on
        # disk after the with-block so the external process can read it.
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(text)

        # Run grammalecte-cli on the temporary file
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Parse the JSON output
        grammar_data = json.loads(result.stdout)

        # Extract grammar and spelling errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            for error in paragraph.get('lGrammarErrors', []):
                grammar_suggestions.append(_build_suggestion(paragraph_index, error))

            # Spelling records get a fixed type and message instead of the
            # record's own 'sType' / 'sMessage' fields.
            for error in paragraph.get('lSpellingErrors', []):
                grammar_suggestions.append(
                    _build_suggestion(
                        paragraph_index,
                        error,
                        error_type='spelling',
                        message='Erreur d\'orthographe',
                    )
                )

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        # Best-effort boundary: e.g. the executable is not installed or the
        # temporary file could not be created.
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

    finally:
        # Clean up the temporary file on every path; a failed delete must not
        # discard results that were already computed.
        if temp_file_path is not None:
            try:
                os.unlink(temp_file_path)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_file_path}: {e}")
 | |
| 
 | |
def main():
    """Run the whole update: load, locate the target page, check, save.

    Each step that fails logs an error and ends the run early without
    touching the JSON file. An empty suggestion list is not a failure: it is
    stored on the page like any other result.
    """
    logger.info("Starting fix_grammar_suggestions.py")

    # Step 1: read the JSON document.
    data = load_outdated_pages()
    if not data:
        logger.error("Failed to load outdated pages")
        return

    # Step 2: first entry of 'regular_pages' whose key matches TARGET_KEY.
    target_page = next(
        (
            candidate
            for candidate in data.get('regular_pages', [])
            if candidate.get('key') == TARGET_KEY
        ),
        None,
    )
    if not target_page:
        logger.error(f"Could not find page with key '{TARGET_KEY}'")
        return

    # Step 3: resolve the URL of the French version of that page.
    french_page = target_page.get('fr_page')
    if not french_page:
        logger.error(f"No French page found for key '{TARGET_KEY}'")
        return

    french_url = french_page.get('url')
    if not french_url:
        logger.error(f"No URL found for French page of key '{TARGET_KEY}'")
        return

    # Step 4: download the page text.
    page_text = fetch_wiki_page_content(french_url)
    if not page_text:
        logger.error(f"Could not fetch content from {french_url}")
        return

    # Step 5: run the grammar checker.
    logger.info(f"Checking grammar for key '{TARGET_KEY}'")
    found = check_grammar_with_grammalecte(page_text)
    if not found:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Step 6: attach the result. target_page is the very dict object stored
    # in data['regular_pages'], so mutating it updates `data` in place.
    target_page['grammar_suggestions'] = found

    # Step 7: write everything back.
    save_outdated_pages(data)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
