qualiwiki/wiki_compare/suggest_grammar_improvements.py
2025-09-01 18:28:23 +02:00

381 lines
No EOL
13 KiB
Python
Executable file

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
suggest_grammar_improvements.py
This script reads the outdated_pages.json file, selects a wiki page (by default the first one that has a French version),
and uses grammalecte to check the grammar of the French page content.
The grammar suggestions are saved in the "grammar_suggestions" property of the JSON file.
The script is compatible with different versions of the grammalecte API:
- For newer versions where GrammarChecker is directly in the grammalecte module
- For older versions where GrammarChecker is in the grammalecte.fr module
Usage:
python suggest_grammar_improvements.py [--page KEY]
Options:
--page KEY Specify the key of the page to check (default: first page with a French version)
Output:
- Updated outdated_pages.json file with grammar suggestions
"""
import argparse
import json
import logging
import os
import subprocess
import sys
import tempfile

import requests
from bs4 import BeautifulSoup
# Optional-dependency probe for grammalecte.
# GRAMMALECTE_AVAILABLE tells the rest of the module whether a usable
# GrammarChecker was imported; GRAMMALECTE_DIRECT_API tells which import
# location provided it (package level vs. the grammalecte.fr submodule).
GRAMMALECTE_AVAILABLE = False
GRAMMALECTE_DIRECT_API = False
try:
    import grammalecte
    import grammalecte.text as txt
    try:
        # GrammarChecker exposed directly by the grammalecte package
        from grammalecte import GrammarChecker
    except ImportError:
        # Otherwise look for it through the fr submodule
        try:
            import grammalecte.fr as gr_fr
        except ImportError:
            # Neither location works: let the outer handler mark grammalecte as unavailable
            raise ImportError("Could not import GrammarChecker from grammalecte")
    else:
        GRAMMALECTE_DIRECT_API = True
    GRAMMALECTE_AVAILABLE = True
except ImportError:
    # Both flags keep their False defaults
    pass
# Root logging setup: INFO level, timestamped "time - level - message" lines
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger shared by every function in this script
logger = logging.getLogger(__name__)

# JSON file that is read at start-up and rewritten with the grammar suggestions
OUTDATED_PAGES_FILE = "outdated_pages.json"
def load_outdated_pages():
    """
    Read and parse the JSON document stored in OUTDATED_PAGES_FILE.

    A file that cannot be opened or does not contain valid JSON is reported
    through the logger and treated as "no pages".

    Returns:
        The parsed JSON content (expected to be a list of page dictionaries),
        or an empty list when the file could not be read or decoded.
    """
    try:
        with open(OUTDATED_PAGES_FILE, encoding='utf-8') as source:
            loaded = json.load(source)
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return []

    logger.info(f"Successfully loaded {len(loaded)} pages from {OUTDATED_PAGES_FILE}")
    return loaded
def save_to_json(data, filename):
    """
    Save data to a JSON file (UTF-8, 2-space indent, non-ASCII kept as-is).

    The data is serialized to a string before the target file is opened, so
    a value that cannot be serialized raises before the existing file is
    truncated instead of leaving a half-written document behind.

    Args:
        data: JSON-serializable data to save
        filename (str): Name of the file

    Raises:
        TypeError, ValueError: Propagated from json.dumps when data cannot be
            serialized. I/O errors are logged, not raised.
    """
    # Serialize first: opening with 'w' truncates the file immediately
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(serialized)
        logger.info(f"Data saved to {filename}")
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")
def fetch_wiki_page_content(url, timeout=30):
    """
    Fetch the text content of a wiki page.

    The page is downloaded, parsed with BeautifulSoup's html.parser, and the
    text of the '#mw-content-text' element is returned after removing
    script, style and '.languages' elements.

    Args:
        url (str): URL of the wiki page
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout requests can block indefinitely on a stalled
            connection.

    Returns:
        str: Text of the page content with fragments joined by single spaces,
        or an empty string when the request fails or the content element is
        missing.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    # Get the main content
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning(f"Could not find content in page: {url}")
        return ""

    # Drop elements whose text must not reach the grammar checker:
    # scripts, styles and the '.languages' elements
    for unwanted in content.select('script, style, .languages'):
        unwanted.extract()

    return content.get_text(separator=' ', strip=True)
def _suggestion_from_error(paragraph_number, paragraph, error):
    """Build one suggestion dict from a grammalecte error, or return None if it is filtered out."""
    # Skip entries without a type, spelling ("WORD") entries, and entries
    # explicitly flagged as not being errors
    if "sType" not in error or error["sType"] == "WORD" or not error.get("bError", True):
        return None
    start = error.get("nStart", 0)
    end = error.get("nEnd", 0)
    return {
        "paragraph": paragraph_number,
        "start": start,
        "end": end,
        "type": error["sType"],
        "message": error.get("sMessage", ""),
        "suggestions": error.get("aSuggestions", []),
        # Up to 20 characters of surrounding text on each side of the error
        "context": paragraph[max(0, start - 20):min(len(paragraph), end + 20)],
    }


def check_grammar_with_grammalecte(text):
    """
    Check grammar using the grammalecte Python package.

    The text is split into paragraphs with grammalecte's text helper and each
    non-blank paragraph is checked with whichever GrammarChecker was imported
    at module load time. Both API variants share the same filtering and the
    same suggestion format.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions (dicts with the keys paragraph,
        start, end, type, message, suggestions and context). Empty when
        grammalecte is not installed or when the check raises.
    """
    if not GRAMMALECTE_AVAILABLE:
        logger.error("Grammalecte is not installed. Please install it with: pip install grammalecte")
        return []
    try:
        logger.info("Checking grammar with grammalecte")
        # Initialize grammalecte based on which API version is available
        if GRAMMALECTE_DIRECT_API:
            logger.info("Using direct GrammarChecker API")
            gce = GrammarChecker("fr")
        else:
            logger.info("Using legacy grammalecte.fr.GrammarChecker API")
            gce = gr_fr.GrammarChecker("fr")

        suggestions = []
        for i, paragraph in enumerate(txt.getParagraph(text)):
            if not paragraph.strip():
                continue
            if GRAMMALECTE_DIRECT_API:
                errors = gce.getParagraphErrors(paragraph)
                # NOTE(review): upstream GrammarChecker.getParagraphErrors appears to
                # return a (grammar_errors, spelling_errors) tuple - confirm against
                # the installed version. Only the grammar part is wanted here;
                # a plain list of errors passes through unchanged.
                if isinstance(errors, tuple):
                    errors = errors[0]
            else:
                errors = gce.parse(paragraph, "FR", False)
            for error in errors:
                suggestion = _suggestion_from_error(i + 1, paragraph, error)
                if suggestion is not None:
                    suggestions.append(suggestion)

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions
    except Exception as e:
        # Best-effort boundary: any checker failure is reported as "no suggestions"
        logger.error(f"Error checking grammar with grammalecte: {e}")
        return []
def check_grammar_with_cli(text):
    """
    Check grammar using the grammalecte-cli command.

    The text is written to a uniquely named temporary file, which is passed
    to 'grammalecte-cli --json --file'. The temporary file is removed in all
    cases, including when the command cannot be started.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions (dicts with the keys paragraph,
        start, end, type, message, suggestions and context). Empty when the
        command fails, exits with a non-zero status, or prints invalid JSON.
    """
    temp_file = None
    try:
        logger.info("Checking grammar with grammalecte-cli")
        # A unique temporary file avoids clobbering or racing on a fixed name
        # in the current directory. delete=False because the file must be
        # closed before the external process reads it.
        with tempfile.NamedTemporaryFile(
            mode='w',
            encoding='utf-8',
            suffix='.txt',
            prefix='grammar_check_',
            delete=False,
        ) as handle:
            temp_file = handle.name
            handle.write(text)

        # Run grammalecte-cli
        cmd = ["grammalecte-cli", "--json", "--file", temp_file]
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')
        if result.returncode != 0:
            logger.error(f"Error running grammalecte-cli: {result.stderr}")
            return []

        # Parse JSON output
        output = json.loads(result.stdout)

        # Extract grammar suggestions
        suggestions = []
        for paragraph_data in output.get("data", []):
            paragraph_index = paragraph_data.get("iParagraph", 0)
            for error in paragraph_data.get("lGrammarErrors", []):
                suggestions.append({
                    "paragraph": paragraph_index + 1,
                    "start": error.get("nStart", 0),
                    "end": error.get("nEnd", 0),
                    "type": error.get("sType", ""),
                    "message": error.get("sMessage", ""),
                    "suggestions": error.get("aSuggestions", []),
                    "context": error.get("sContext", ""),
                })
        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions
    except Exception as e:
        # Best-effort boundary: any failure is reported as "no suggestions"
        logger.error(f"Error checking grammar with grammalecte-cli: {e}")
        return []
    finally:
        # Always clean up, even when subprocess.run or json.loads raised
        if temp_file is not None:
            try:
                os.remove(temp_file)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_file}: {e}")
def check_grammar(text):
    """
    Run a grammar check with whichever grammalecte front end is usable.

    The Python package is preferred. When it was not imported, the function
    probes for the grammalecte-cli executable and delegates to it.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions; empty when neither the package nor
        the command-line tool can be used.
    """
    # Preferred path: the Python library
    if GRAMMALECTE_AVAILABLE:
        return check_grammar_with_grammalecte(text)

    # Fallback path: make sure the executable can be launched, then use it
    probe_command = ["grammalecte-cli", "--help"]
    try:
        subprocess.run(probe_command, capture_output=True)
        cli_suggestions = check_grammar_with_cli(text)
    except (subprocess.SubprocessError, FileNotFoundError):
        logger.error("Neither grammalecte Python package nor grammalecte-cli is available.")
        logger.error("Please install grammalecte with: pip install grammalecte")
        cli_suggestions = []
    return cli_suggestions
def select_page_for_grammar_check(pages, key=None):
    """
    Pick the page whose French version should be grammar-checked.

    With a key, the first page carrying that key is chosen and must have a
    non-None 'fr_page' entry. Without a key, the first page with a non-None
    'fr_page' entry is chosen.

    Args:
        pages (list): List of dictionaries containing page information
        key (str): Key of the page to select (if None or empty, select the
            first page that has a French version)

    Returns:
        dict: Selected page or None if no suitable page found
    """
    if not pages:
        logger.warning("No pages found that need grammar checking")
        return None

    if not key:
        # Default: first page that has a French version
        first_french = next((entry for entry in pages if entry.get('fr_page') is not None), None)
        if first_french is None:
            logger.warning("No pages found with French versions")
            return None
        logger.info(f"Selected first page with French version (key '{first_french['key']}') for grammar checking")
        return first_french

    # Explicit key: only the first page with that key is considered
    found = next((entry for entry in pages if entry.get('key') == key), None)
    if found is None:
        logger.warning(f"No page found with key '{key}'")
        return None
    if found.get('fr_page') is None:
        logger.warning(f"Page with key '{key}' does not have a French version")
        return None
    logger.info(f"Selected page for key '{key}' for grammar checking")
    return found
def main():
    """
    Command-line entry point.

    Parses the --page option, loads the pages file, selects a page, fetches
    the text of its French wiki page, runs the grammar check and writes the
    result into the page's 'grammar_suggestions' entry before saving the
    whole list back to OUTDATED_PAGES_FILE.

    Exits with status 1 when no pages are loaded, no page can be selected,
    the French URL is missing, or the page content cannot be fetched.
    """
    def abort(message):
        # Log the fatal condition and stop the script with a failure status
        logger.error(message)
        sys.exit(1)

    parser = argparse.ArgumentParser(description="Suggest grammar improvements for an OSM wiki page using grammalecte")
    parser.add_argument("--page", help="Key of the page to check (default: first page with a French version)")
    args = parser.parse_args()

    logger.info("Starting suggest_grammar_improvements.py")

    # Step 1: load the page list
    pages = load_outdated_pages()
    if not pages:
        abort("No pages found. Run wiki_compare.py first.")

    # Step 2: choose the page to check
    selected_page = select_page_for_grammar_check(pages, args.page)
    if not selected_page:
        abort("Could not select a page for grammar checking.")

    # Step 3: resolve the URL of the French page
    fr_url = selected_page.get('fr_page', {}).get('url')
    if not fr_url:
        abort(f"No French page URL found for key '{selected_page['key']}'")

    # Step 4: download the page text
    logger.info(f"Fetching content from {fr_url}")
    content = fetch_wiki_page_content(fr_url)
    if not content:
        abort(f"Could not fetch content from {fr_url}")

    # Step 5: run the grammar check
    page_key = selected_page['key']
    logger.info(f"Checking grammar for key '{page_key}'")
    suggestions = check_grammar(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Step 6: attach the result to the page and persist the whole list
    # (selected_page is an element of pages, so saving pages includes the update)
    logger.info(f"Saving grammar suggestions for key '{page_key}'")
    selected_page['grammar_suggestions'] = suggestions
    save_to_json(pages, OUTDATED_PAGES_FILE)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()