#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
suggest_grammar_improvements.py

This script reads the outdated_pages.json file, selects a wiki page (by
default the first one that has a French version), and uses grammalecte to
check the grammar of the French page content. The grammar suggestions are
saved in the "grammar_suggestions" property of the selected page in the JSON
file.

The script is compatible with different versions of the grammalecte API:
- For newer versions where GrammarChecker is directly in the grammalecte module
- For older versions where GrammarChecker is in the grammalecte.fr module

If the grammalecte Python package cannot be imported, the script falls back
to the grammalecte-cli command when it is available.

Usage:
    python suggest_grammar_improvements.py [--page KEY]

Options:
    --page KEY    Specify the key of the page to check
                  (default: first page in the file with a French version)

Output:
    - Updated outdated_pages.json file with grammar suggestions
"""

import argparse
import json
import logging
import os
import subprocess
import sys
import tempfile

import requests
from bs4 import BeautifulSoup

try:
    import grammalecte
    import grammalecte.text as txt

    # Check if GrammarChecker is available directly in the grammalecte module
    # (newer versions)
    try:
        from grammalecte import GrammarChecker
        GRAMMALECTE_DIRECT_API = True
    except ImportError:
        # Try the older API structure with fr submodule
        try:
            import grammalecte.fr as gr_fr
            GRAMMALECTE_DIRECT_API = False
        except ImportError:
            # Neither API is available
            raise ImportError("Could not import GrammarChecker from grammalecte")
    GRAMMALECTE_AVAILABLE = True
except ImportError:
    GRAMMALECTE_AVAILABLE = False
    GRAMMALECTE_DIRECT_API = False

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTDATED_PAGES_FILE = "outdated_pages.json"
# Seconds to wait for the wiki server; without a timeout requests can block
# forever on an unresponsive host.
REQUEST_TIMEOUT = 30
# Number of characters kept on each side of an error when building "context".
CONTEXT_MARGIN = 20


def load_outdated_pages():
    """
    Load the outdated pages from the JSON file

    Returns:
        list: List of dictionaries containing outdated page information,
            or an empty list if the file cannot be read or parsed
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            pages = json.load(f)
        logger.info(f"Successfully loaded {len(pages)} pages from {OUTDATED_PAGES_FILE}")
        return pages
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return []


def save_to_json(data, filename):
    """
    Save data to a JSON file

    The data is serialized in memory before the file is opened, so a
    serialization error propagates without truncating an existing file.

    Args:
        data: Data to save (must be JSON serializable)
        filename (str): Name of the file

    Returns:
        bool: True if the file was written, False if an I/O error occurred
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")
        return False
    logger.info(f"Data saved to {filename}")
    return True


def fetch_wiki_page_content(url, timeout=REQUEST_TIMEOUT):
    """
    Fetch the text content of a wiki page

    Only the text inside the '#mw-content-text' element is returned; script,
    style and '.languages' elements are removed first.

    Args:
        url (str): URL of the wiki page
        timeout (float): Maximum number of seconds to wait for the server

    Returns:
        str: Text content of the wiki page, or an empty string if the request
            failed or the content element was not found
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the main content
    content = soup.select_one('#mw-content-text')
    if not content:
        logger.warning(f"Could not find content in page: {url}")
        return ""

    # Remove script/style elements and the .languages elements
    for element in content.select('script, style, .languages'):
        element.extract()

    return content.get_text(separator=' ', strip=True)


def _is_grammar_error(error):
    """Return True if *error* is a grammar error dict (not a spelling one)."""
    return bool(
        isinstance(error, dict)
        and "sType" in error
        and error["sType"] != "WORD"
        and error.get("bError", True)
    )


def _build_suggestion(error, paragraph_number, paragraph):
    """Convert a grammalecte error dict into the suggestion dict we store."""
    start = error.get("nStart", 0)
    end = error.get("nEnd", 0)
    context_start = max(0, start - CONTEXT_MARGIN)
    context_end = min(len(paragraph), end + CONTEXT_MARGIN)
    return {
        "paragraph": paragraph_number,
        "start": start,
        "end": end,
        "type": error.get("sType", ""),
        "message": error.get("sMessage", ""),
        "suggestions": error.get("aSuggestions", []),
        "context": paragraph[context_start:context_end],
    }


def _direct_api_errors(gce, paragraph):
    """Return the grammar errors of *paragraph* using the direct API."""
    result = gce.getParagraphErrors(paragraph)
    # getParagraphErrors is expected to return (grammar errors, spelling
    # errors); only the grammar errors are wanted. A plain list is accepted
    # as well in case a version returns the errors directly.
    if isinstance(result, tuple):
        return result[0] if result else []
    return result


def _legacy_api_errors(gce, paragraph):
    """Return the grammar errors of *paragraph* using the legacy parse API."""
    return gce.parse(paragraph, "FR", False)


def check_grammar_with_grammalecte(text):
    """
    Check grammar using the grammalecte Python package

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions (dictionaries with the keys
            "paragraph", "start", "end", "type", "message", "suggestions"
            and "context"); empty if grammalecte is not installed or the
            check failed
    """
    if not GRAMMALECTE_AVAILABLE:
        logger.error("Grammalecte is not installed. Please install it with: pip install grammalecte")
        return []

    try:
        logger.info("Checking grammar with grammalecte")

        # Initialize grammalecte based on which API version is available
        if GRAMMALECTE_DIRECT_API:
            # New API: GrammarChecker is directly in grammalecte module
            logger.info("Using direct GrammarChecker API")
            gce = GrammarChecker("fr")
            get_errors = _direct_api_errors
        else:
            # Old API: GrammarChecker is in grammalecte.fr module
            logger.info("Using legacy grammalecte.fr.GrammarChecker API")
            gce = gr_fr.GrammarChecker("fr")
            get_errors = _legacy_api_errors

        suggestions = []
        # Paragraph numbers are 1-based and count blank paragraphs too, so
        # they match the position in the original text.
        for number, paragraph in enumerate(txt.getParagraph(text), start=1):
            if not paragraph.strip():
                continue
            for error in get_errors(gce, paragraph):
                # Filter out spelling errors
                if _is_grammar_error(error):
                    suggestions.append(_build_suggestion(error, number, paragraph))

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions
    except Exception as e:
        # Boundary with a third-party library whose API differs between
        # versions: report the problem instead of crashing the script.
        logger.error(f"Error checking grammar with grammalecte: {e}")
        return []


def check_grammar_with_cli(text):
    """
    Check grammar using the grammalecte-cli command

    The text is written to a uniquely named temporary file which is always
    removed afterwards, even if running the command fails.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions; empty if the command failed or
            its output could not be parsed
    """
    temp_path = None
    try:
        logger.info("Checking grammar with grammalecte-cli")

        # Create a temporary file with the text. delete=False because the
        # file must be closed before the external command reads it.
        with tempfile.NamedTemporaryFile(
            'w',
            encoding='utf-8',
            prefix='temp_text_for_grammar_check_',
            suffix='.txt',
            delete=False,
        ) as f:
            temp_path = f.name
            f.write(text)

        # Run grammalecte-cli
        cmd = ["grammalecte-cli", "--json", "--file", temp_path]
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')

        if result.returncode != 0:
            logger.error(f"Error running grammalecte-cli: {result.stderr}")
            return []

        # Parse JSON output
        output = json.loads(result.stdout)

        # Extract grammar suggestions
        suggestions = []
        for paragraph_data in output.get("data", []):
            paragraph_index = paragraph_data.get("iParagraph", 0)
            for error in paragraph_data.get("lGrammarErrors", []):
                suggestions.append({
                    "paragraph": paragraph_index + 1,
                    "start": error.get("nStart", 0),
                    "end": error.get("nEnd", 0),
                    "type": error.get("sType", ""),
                    "message": error.get("sMessage", ""),
                    "suggestions": error.get("aSuggestions", []),
                    "context": error.get("sContext", ""),
                })

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions
    except Exception as e:
        logger.error(f"Error checking grammar with grammalecte-cli: {e}")
        return []
    finally:
        # Remove the temporary file whatever happened above
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_path}: {e}")


def check_grammar(text):
    """
    Check grammar using the available method (Python library or CLI)

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions; empty if no checker is available
    """
    # Try using the Python library first
    if GRAMMALECTE_AVAILABLE:
        return check_grammar_with_grammalecte(text)

    # Fall back to CLI if available
    try:
        # Check if grammalecte-cli is available
        subprocess.run(["grammalecte-cli", "--help"], capture_output=True)
    except (subprocess.SubprocessError, OSError):
        # OSError covers FileNotFoundError (command missing) as well as
        # PermissionError (command present but not executable).
        logger.error("Neither grammalecte Python package nor grammalecte-cli is available.")
        logger.error("Please install grammalecte with: pip install grammalecte")
        return []
    return check_grammar_with_cli(text)


def select_page_for_grammar_check(pages, key=None):
    """
    Select a page for grammar checking

    Args:
        pages (list): List of dictionaries containing page information
        key (str): Key of the page to select (if None, select the first page
            that has a French version)

    Returns:
        dict: Selected page or None if no suitable page found
    """
    if not pages:
        logger.warning("No pages found that need grammar checking")
        return None

    if key:
        # Find the page with the specified key
        for page in pages:
            if page.get('key') != key:
                continue
            # Check if the page has a French version
            if page.get('fr_page') is None:
                logger.warning(f"Page with key '{key}' does not have a French version")
                return None
            logger.info(f"Selected page for key '{key}' for grammar checking")
            return page

        logger.warning(f"No page found with key '{key}'")
        return None

    # Select the first page that has a French version
    for page in pages:
        if page.get('fr_page') is not None:
            # .get() so that a page without a "key" entry cannot crash the log call
            logger.info(f"Selected first page with French version (key '{page.get('key')}') for grammar checking")
            return page

    logger.warning("No pages found with French versions")
    return None


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Suggest grammar improvements for an OSM wiki page using grammalecte")
    parser.add_argument("--page", help="Key of the page to check (default: first page with a French version)")
    args = parser.parse_args()

    logger.info("Starting suggest_grammar_improvements.py")

    # Load pages
    pages = load_outdated_pages()
    if not pages:
        logger.error("No pages found. Run wiki_compare.py first.")
        sys.exit(1)

    # Select a page for grammar checking
    selected_page = select_page_for_grammar_check(pages, args.page)
    if not selected_page:
        logger.error("Could not select a page for grammar checking.")
        sys.exit(1)

    page_key = selected_page.get('key')

    # Get the French page URL
    fr_url = selected_page.get('fr_page', {}).get('url')
    if not fr_url:
        logger.error(f"No French page URL found for key '{page_key}'")
        sys.exit(1)

    # Fetch the content of the French page
    logger.info(f"Fetching content from {fr_url}")
    content = fetch_wiki_page_content(fr_url)
    if not content:
        logger.error(f"Could not fetch content from {fr_url}")
        sys.exit(1)

    # Check grammar
    logger.info(f"Checking grammar for key '{page_key}'")
    suggestions = check_grammar(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Save the grammar suggestions in the JSON file
    logger.info(f"Saving grammar suggestions for key '{page_key}'")
    selected_page['grammar_suggestions'] = suggestions

    # Save the updated data back to the file; do not report success if the
    # file could not be written.
    if not save_to_json(pages, OUTDATED_PAGES_FILE):
        sys.exit(1)

    logger.info("Script completed successfully")


if __name__ == "__main__":
    main()