#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fix_grammar_suggestions.py

This script adds grammar suggestions to the "type" page in the
outdated_pages.json file. It fetches the French content for the page,
runs the grammar checker, and updates the file.
"""

import json
import logging
import os
import subprocess
import tempfile

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTDATED_PAGES_FILE = "outdated_pages.json"
TARGET_KEY = "type"

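# Assumed structure of outdated_pages.json, inferred from how this script
# reads and writes it below (an illustrative sketch, not an authoritative
# schema; the URL is a placeholder):
#
#   {
#     "regular_pages": [
#       {
#         "key": "type",
#         "fr_page": {"url": "https://example.org/wiki/fr/type"},
#         "grammar_suggestions": []    # added/overwritten by this script
#       }
#     ]
#   }

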
def load_outdated_pages():
    """
    Load the outdated pages from the JSON file.

    Returns:
        dict: Dictionary containing outdated page information, or None on error
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Successfully loaded outdated pages from {OUTDATED_PAGES_FILE}")
        return data
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return None


def save_outdated_pages(data):
    """
    Save the outdated pages to the JSON file.

    Args:
        data (dict): Dictionary containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved outdated pages to {OUTDATED_PAGES_FILE}")
    except IOError as e:
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")


def fetch_wiki_page_content(url):
    """
    Fetch the text content of a wiki page.

    Args:
        url (str): URL of the wiki page

    Returns:
        str: Text content of the wiki page, or an empty string on failure
    """
    try:
        logger.info(f"Fetching content from {url}")
        # A timeout keeps the script from hanging on an unresponsive server
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content (MediaWiki renders page content in #mw-content-text)
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Remove .languages elements
            for languages_elem in content.select('.languages'):
                languages_elem.extract()

            # Get text
            text = content.get_text(separator=' ', strip=True)
            logger.info(f"Successfully fetched content ({len(text)} characters)")
            return text
        else:
            logger.warning(f"Could not find content in page: {url}")
            return ""

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""


def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli.

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions (empty on failure)
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    temp_file_path = None
    try:
        # Write the text to a temporary file for grammalecte-cli to read
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # Run grammalecte-cli on the temporary file
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

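        # Expected shape of the grammalecte JSON output, inferred from the
        # fields read below (illustrative, not the tool's full schema):
        #   {"data": [{"iParagraph": 0,
        #              "lGrammarErrors": [{"nStart": ..., "nEnd": ..., "sType": ...,
        #                                  "sMessage": ..., "aSuggestions": [...],
        #                                  "sUnderlined": ..., "sBefore": ..., "sAfter": ...}],
        #              "lSpellingErrors": [...]}]}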
        # Parse the JSON output
        grammar_data = json.loads(result.stdout)

        # Extract grammar and spelling errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            # Process grammar errors
            for error in paragraph.get('lGrammarErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': error.get('sType', ''),
                    'message': error.get('sMessage', ''),
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

            # Process spelling errors
            for error in paragraph.get('lSpellingErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': 'spelling',
                    'message': "Erreur d'orthographe",
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

    finally:
        # Remove the temporary file even if grammar checking failed
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)


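# Each suggestion entry produced above looks roughly like this (illustrative
# values, inferred from the dict construction in check_grammar_with_grammalecte):
#   {"paragraph": 0, "start": 12, "end": 18, "type": "spelling",
#    "message": "Erreur d'orthographe", "suggestions": ["exemple"],
#    "text": "exmple", "before": "un ", "after": " de texte"}

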
def main():
    """Main function to execute the script."""
    logger.info("Starting fix_grammar_suggestions.py")

    # Load outdated pages
    data = load_outdated_pages()
    if not data:
        logger.error("Failed to load outdated pages")
        return

    # Find the "type" page in the regular_pages array
    type_page = None
    type_page_index = None
    for i, page in enumerate(data.get('regular_pages', [])):
        if page.get('key') == TARGET_KEY:
            type_page = page
            type_page_index = i
            break

    if type_page is None:
        logger.error(f"Could not find page with key '{TARGET_KEY}'")
        return

    # Get the French page URL
    fr_page = type_page.get('fr_page')
    if not fr_page:
        logger.error(f"No French page found for key '{TARGET_KEY}'")
        return

    fr_url = fr_page.get('url')
    if not fr_url:
        logger.error(f"No URL found for French page of key '{TARGET_KEY}'")
        return

    # Fetch the content of the French page
    content = fetch_wiki_page_content(fr_url)
    if not content:
        logger.error(f"Could not fetch content from {fr_url}")
        return

    # Check grammar
    logger.info(f"Checking grammar for key '{TARGET_KEY}'")
    suggestions = check_grammar_with_grammalecte(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Add the grammar suggestions to the page and update it in the data
    type_page['grammar_suggestions'] = suggestions
    data['regular_pages'][type_page_index] = type_page

    # Save the updated data
    save_outdated_pages(data)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()
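
# Usage sketch (assumes grammalecte-cli is on PATH and outdated_pages.json
# sits in the working directory):
#   python3 fix_grammar_suggestions.py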