381 lines
No EOL
13 KiB
Python
Executable file
381 lines
No EOL
13 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
suggest_grammar_improvements.py
|
|
|
|
This script reads the outdated_pages.json file, selects a wiki page (by default the first one that has a French version),
|
|
and uses grammalecte to check the grammar of the French page content.
|
|
The grammar suggestions are saved in the "grammar_suggestions" property of the JSON file.
|
|
|
|
The script is compatible with different versions of the grammalecte API:
|
|
- For newer versions where GrammarChecker is directly in the grammalecte module
|
|
- For older versions where GrammarChecker is in the grammalecte.fr module
|
|
|
|
Usage:
|
|
python suggest_grammar_improvements.py [--page KEY]
|
|
|
|
Options:
|
|
    --page KEY  Specify the key of the page to check (default: first page with a French version)
|
|
|
|
Output:
|
|
- Updated outdated_pages.json file with grammar suggestions
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import logging
|
|
import requests
|
|
import os
|
|
import sys
|
|
import subprocess
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Optional dependency: the grammalecte Python package.
# GRAMMALECTE_AVAILABLE tells whether a usable entry point was imported;
# GRAMMALECTE_DIRECT_API tells which of the two known layouts was found.
try:
    import grammalecte
    import grammalecte.text as txt

    try:
        # Newer layout: GrammarChecker sits directly in the grammalecte module
        from grammalecte import GrammarChecker
    except ImportError:
        # Older layout: go through the fr submodule. If this import fails as
        # well, the ImportError reaches the outer handler below.
        import grammalecte.fr as gr_fr
        GRAMMALECTE_DIRECT_API = False
    else:
        GRAMMALECTE_DIRECT_API = True
except ImportError:
    # Neither layout could be imported
    GRAMMALECTE_AVAILABLE = False
    GRAMMALECTE_DIRECT_API = False
else:
    GRAMMALECTE_AVAILABLE = True
|
|
|
|
# Logging setup: INFO level, each record prefixed with a timestamp and level
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# JSON file that is both read for page data and updated with the suggestions
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
|
|
|
def load_outdated_pages():
    """
    Read the page list stored in OUTDATED_PAGES_FILE.

    Returns:
        list: The decoded JSON content of the file, or an empty list when
            the file cannot be opened/read or does not hold valid JSON
    """
    try:
        with open(OUTDATED_PAGES_FILE, mode='r', encoding='utf-8') as handle:
            loaded = json.loads(handle.read())
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return []

    logger.info(f"Successfully loaded {len(loaded)} pages from {OUTDATED_PAGES_FILE}")
    return loaded
|
|
|
|
def save_to_json(data, filename):
    """
    Save data to a JSON file without ever leaving a half-written target.

    The data is serialized in memory first, written to a sibling temporary
    file (``<filename>.tmp``) and only then moved over ``filename``. If
    serialization or writing fails, an already existing ``filename`` keeps
    its previous content.

    Args:
        data: JSON-serializable data to save
        filename (str): Name of the file

    Raises:
        TypeError: If data contains values json cannot serialize (raised
            before any file is touched)
    """
    # Serialize before opening anything: a non-serializable value must not
    # be able to truncate the existing file.
    serialized = json.dumps(data, indent=2, ensure_ascii=False)

    temp_filename = f"{filename}.tmp"
    try:
        with open(temp_filename, 'w', encoding='utf-8') as f:
            f.write(serialized)
        # Same-directory rename: the target switches to the new content in
        # one step instead of being rewritten in place.
        os.replace(temp_filename, filename)
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")
        # Best-effort removal of the partial temporary file; it may not
        # exist at all if open() itself failed.
        try:
            os.remove(temp_filename)
        except OSError:
            pass
    else:
        logger.info(f"Data saved to {filename}")
|
|
|
|
def fetch_wiki_page_content(url, timeout=30):
    """
    Fetch the readable text of a wiki page

    Downloads the page, keeps only the element with id ``mw-content-text``,
    drops ``script``/``style`` elements and elements with class
    ``languages``, and returns the remaining text.

    Args:
        url (str): URL of the wiki page
        timeout (float): Seconds to wait for the server before giving up;
            without a timeout a stalled connection would block forever

    Returns:
        str: Text of the page content with fragments joined by single
            spaces, or an empty string if the request failed or the
            content element was not found
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the main content
    content = soup.select_one('#mw-content-text')
    if content is None:
        logger.warning(f"Could not find content in page: {url}")
        return ""

    # Remove script and style elements
    for script in content.select('script, style'):
        script.extract()

    # Remove .languages elements
    for languages_elem in content.select('.languages'):
        languages_elem.extract()

    return content.get_text(separator=' ', strip=True)
|
|
|
|
def _build_grammalecte_suggestion(error, paragraph, paragraph_number, context_margin=20):
    """Convert one grammalecte error dict into the suggestion dict stored in the JSON file."""
    start = error.get("nStart", 0)
    end = error.get("nEnd", 0)
    return {
        "paragraph": paragraph_number,
        "start": start,
        "end": end,
        "type": error.get("sType", ""),
        "message": error.get("sMessage", ""),
        "suggestions": error.get("aSuggestions", []),
        # Excerpt of the paragraph around the error, clamped to its bounds
        "context": paragraph[max(0, start - context_margin):min(len(paragraph), end + context_margin)],
    }


def check_grammar_with_grammalecte(text):
    """
    Check grammar using the grammalecte Python package

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions (dicts with the keys paragraph,
            start, end, type, message, suggestions, context); empty when
            grammalecte is not installed or the check fails
    """
    if not GRAMMALECTE_AVAILABLE:
        logger.error("Grammalecte is not installed. Please install it with: pip install grammalecte")
        return []

    try:
        logger.info("Checking grammar with grammalecte")

        # Pick the error source matching the API layout found at import time
        if GRAMMALECTE_DIRECT_API:
            logger.info("Using direct GrammarChecker API")
            gce = GrammarChecker("fr")

            def find_errors(paragraph):
                found = gce.getParagraphErrors(paragraph)
                # The grammalecte wrapper documents getParagraphErrors as
                # returning a (grammar errors, spelling errors) tuple.
                # Iterating that tuple directly would hand two lists to the
                # dict checks below and report nothing, so keep only the
                # grammar list. A plain list of errors is accepted as is.
                if isinstance(found, tuple):
                    return found[0] if found else []
                return found
        else:
            logger.info("Using legacy grammalecte.fr.GrammarChecker API")
            gce = gr_fr.GrammarChecker("fr")

            def find_errors(paragraph):
                return gce.parse(paragraph, "FR", False)

        suggestions = []
        for paragraph_number, paragraph in enumerate(txt.getParagraph(text), start=1):
            if not paragraph.strip():
                continue
            for error in find_errors(paragraph):
                if not isinstance(error, dict):
                    continue
                # Skip entries typed "WORD" (spelling) and entries
                # explicitly flagged as not being errors
                if "sType" in error and error["sType"] != "WORD" and error.get("bError", True):
                    suggestions.append(
                        _build_grammalecte_suggestion(error, paragraph, paragraph_number)
                    )

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions

    except Exception as e:
        # Boundary around a third-party engine: report and return no results
        logger.error(f"Error checking grammar with grammalecte: {e}")
        return []
|
|
|
|
def check_grammar_with_cli(text):
    """
    Check grammar using the grammalecte-cli command

    The text is written to a uniquely named temporary file that is passed
    to ``grammalecte-cli --json --file`` and removed again afterwards,
    whether or not the command succeeded.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions; empty when the command fails or
            its output cannot be parsed
    """
    # Local import: tempfile is only needed by this fallback path
    import tempfile

    temp_file = None
    try:
        logger.info("Checking grammar with grammalecte-cli")

        # A unique temporary file avoids clobbering an existing file in the
        # working directory and collisions between concurrent runs.
        with tempfile.NamedTemporaryFile(
            mode='w', encoding='utf-8', suffix='.txt', delete=False
        ) as f:
            temp_file = f.name
            f.write(text)

        # Run grammalecte-cli
        cmd = ["grammalecte-cli", "--json", "--file", temp_file]
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')

        if result.returncode != 0:
            logger.error(f"Error running grammalecte-cli: {result.stderr}")
            return []

        # Parse JSON output
        output = json.loads(result.stdout)

        # Extract grammar suggestions
        suggestions = []
        for paragraph_data in output.get("data", []):
            paragraph_index = paragraph_data.get("iParagraph", 0)
            for error in paragraph_data.get("lGrammarErrors", []):
                suggestions.append({
                    "paragraph": paragraph_index + 1,
                    "start": error.get("nStart", 0),
                    "end": error.get("nEnd", 0),
                    "type": error.get("sType", ""),
                    "message": error.get("sMessage", ""),
                    "suggestions": error.get("aSuggestions", []),
                    "context": error.get("sContext", ""),
                })

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions

    except Exception as e:
        logger.error(f"Error checking grammar with grammalecte-cli: {e}")
        return []

    finally:
        # Always clean up, including when the command could not be started
        if temp_file is not None:
            try:
                os.remove(temp_file)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_file}: {e}")
|
|
|
|
def check_grammar(text):
    """
    Run the grammar check with whichever backend is usable.

    The Python package is used when it was imported successfully; otherwise
    the grammalecte-cli command is probed and used.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions (empty if no backend is usable)
    """
    if not GRAMMALECTE_AVAILABLE:
        # No Python package: probe for the command line tool
        probe_command = ["grammalecte-cli", "--help"]
        try:
            subprocess.run(probe_command, capture_output=True)
            return check_grammar_with_cli(text)
        except (subprocess.SubprocessError, FileNotFoundError):
            logger.error("Neither grammalecte Python package nor grammalecte-cli is available.")
            logger.error("Please install grammalecte with: pip install grammalecte")
            return []

    # Preferred backend: the Python library
    return check_grammar_with_grammalecte(text)
|
|
|
|
def select_page_for_grammar_check(pages, key=None):
    """
    Pick the page whose French version will be grammar-checked.

    Args:
        pages (list): List of dictionaries containing page information
        key (str): Key of the page to select (if None or empty, the first
            page that has a French version is selected)

    Returns:
        dict: Selected page, or None if no suitable page was found
    """
    if not pages:
        logger.warning("No pages found that need grammar checking")
        return None

    if not key:
        # No key requested: take the first page that has a French version
        with_french = (candidate for candidate in pages if candidate.get('fr_page') is not None)
        chosen = next(with_french, None)
        if chosen is None:
            logger.warning("No pages found with French versions")
            return None
        logger.info(f"Selected first page with French version (key '{chosen['key']}') for grammar checking")
        return chosen

    # Key requested: only the first page carrying that key is considered
    matching = (candidate for candidate in pages if candidate.get('key') == key)
    chosen = next(matching, None)
    if chosen is None:
        logger.warning(f"No page found with key '{key}'")
        return None

    if chosen.get('fr_page') is None:
        logger.warning(f"Page with key '{key}' does not have a French version")
        return None

    logger.info(f"Selected page for key '{key}' for grammar checking")
    return chosen
|
|
|
|
def main():
    """
    Entry point: check one page's French text and store the suggestions.

    Loads the page list, selects a page (``--page KEY`` or the first page
    with a French version), fetches the French page text, runs the grammar
    check and writes the result into the page's "grammar_suggestions"
    property in OUTDATED_PAGES_FILE. Exits with status 1 when pages cannot
    be loaded, no page can be selected, or the page text cannot be fetched.
    """

    def abort(message):
        # All fatal conditions are handled alike: log the error, exit with 1
        logger.error(message)
        sys.exit(1)

    cli = argparse.ArgumentParser(description="Suggest grammar improvements for an OSM wiki page using grammalecte")
    cli.add_argument("--page", help="Key of the page to check (default: first page with a French version)")
    options = cli.parse_args()

    logger.info("Starting suggest_grammar_improvements.py")

    # Load pages
    pages = load_outdated_pages()
    if not pages:
        abort("No pages found. Run wiki_compare.py first.")

    # Select a page for grammar checking
    selected_page = select_page_for_grammar_check(pages, options.page)
    if not selected_page:
        abort("Could not select a page for grammar checking.")

    # Get the French page URL
    fr_page = selected_page.get('fr_page', {})
    fr_url = fr_page.get('url')
    if not fr_url:
        abort(f"No French page URL found for key '{selected_page['key']}'")

    # Fetch the content of the French page
    logger.info(f"Fetching content from {fr_url}")
    page_text = fetch_wiki_page_content(fr_url)
    if not page_text:
        abort(f"Could not fetch content from {fr_url}")

    # Check grammar
    logger.info(f"Checking grammar for key '{selected_page['key']}'")
    found = check_grammar(page_text)
    if not found:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Store the result on the page entry; this mutates the entry inside
    # `pages`, which is then written back to the file as a whole
    logger.info(f"Saving grammar suggestions for key '{selected_page['key']}'")
    selected_page['grammar_suggestions'] = found
    save_to_json(pages, OUTDATED_PAGES_FILE)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()