recup sources
This commit is contained in:
parent
86622a19ea
commit
65fe2a35f9
155 changed files with 50969 additions and 0 deletions
381
wiki_compare/suggest_grammar_improvements.py
Executable file
381
wiki_compare/suggest_grammar_improvements.py
Executable file
|
@ -0,0 +1,381 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
suggest_grammar_improvements.py
|
||||
|
||||
This script reads the outdated_pages.json file, selects a wiki page (by default the first one that has a French version),
|
||||
and uses grammalecte to check the grammar of the French page content.
|
||||
The grammar suggestions are saved in the "grammar_suggestions" property of the JSON file.
|
||||
|
||||
The script is compatible with different versions of the grammalecte API:
|
||||
- For newer versions where GrammarChecker is directly in the grammalecte module
|
||||
- For older versions where GrammarChecker is in the grammalecte.fr module
|
||||
|
||||
Usage:
|
||||
python suggest_grammar_improvements.py [--page KEY]
|
||||
|
||||
Options:
|
||||
--page KEY Specify the key of the page to check (default: first page with a French version)
|
||||
|
||||
Output:
|
||||
- Updated outdated_pages.json file with grammar suggestions
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
import requests
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
try:
|
||||
import grammalecte
|
||||
import grammalecte.text as txt
|
||||
|
||||
# Check if GrammarChecker is available directly in the grammalecte module (newer versions)
|
||||
try:
|
||||
from grammalecte import GrammarChecker
|
||||
GRAMMALECTE_DIRECT_API = True
|
||||
except ImportError:
|
||||
# Try the older API structure with fr submodule
|
||||
try:
|
||||
import grammalecte.fr as gr_fr
|
||||
GRAMMALECTE_DIRECT_API = False
|
||||
except ImportError:
|
||||
# Neither API is available
|
||||
raise ImportError("Could not import GrammarChecker from grammalecte")
|
||||
|
||||
GRAMMALECTE_AVAILABLE = True
|
||||
except ImportError:
|
||||
GRAMMALECTE_AVAILABLE = False
|
||||
GRAMMALECTE_DIRECT_API = False
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Constants
|
||||
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
||||
|
||||
def load_outdated_pages():
    """
    Read the page list stored in OUTDATED_PAGES_FILE.

    Returns:
        list: The decoded JSON content of the file, or an empty list when
            the file cannot be opened or does not contain valid JSON.
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
        logger.info(f"Successfully loaded {len(loaded)} pages from {OUTDATED_PAGES_FILE}")
    except (IOError, json.JSONDecodeError) as err:
        # Callers treat an empty list as "nothing to process".
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {err}")
        return []
    return loaded
|
||||
|
||||
def save_to_json(data, filename):
    """
    Serialize ``data`` as indented UTF-8 JSON into ``filename``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    An I/O failure is logged and not re-raised.

    Args:
        data: JSON-serializable object to write
        filename (str): Path of the destination file
    """
    try:
        with open(filename, 'w', encoding='utf-8') as sink:
            json.dump(data, sink, indent=2, ensure_ascii=False)
    except IOError as write_error:
        logger.error(f"Error saving data to {filename}: {write_error}")
    else:
        logger.info(f"Data saved to {filename}")
|
||||
|
||||
def fetch_wiki_page_content(url, timeout=30):
    """
    Fetch a wiki page and return the plain text of its main content area.

    The text is taken from the ``#mw-content-text`` element after removing
    ``script``, ``style`` and ``.languages`` elements.

    Args:
        url (str): URL of the wiki page
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block the script
            forever; 30 is an arbitrary but generous default.

    Returns:
        str: Space-separated text of the page content, or an empty string
            when the request fails or the content element is missing.
    """
    # Only the network call is guarded: parsing errors should not be
    # reported as fetch errors.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the main content
    content = soup.select_one('#mw-content-text')
    if content is None:
        logger.warning(f"Could not find content in page: {url}")
        return ""

    # Drop non-prose elements (scripts, styles, the language-switcher box)
    # so they do not reach the grammar checker.
    for element in content.select('script, style, .languages'):
        element.extract()

    return content.get_text(separator=' ', strip=True)
|
||||
|
||||
def check_grammar_with_grammalecte(text):
    """
    Check grammar using the grammalecte Python package.

    The text is split into paragraphs with ``grammalecte.text.getParagraph``
    and each non-blank paragraph is checked with whichever API was detected
    at import time (``GRAMMALECTE_DIRECT_API``). Errors of type ``"WORD"``
    are skipped.

    Args:
        text (str): Text to check

    Returns:
        list: One dict per retained error with the keys ``paragraph``
            (1-based), ``start``, ``end``, ``type``, ``message``,
            ``suggestions`` and ``context``. Empty when grammalecte is not
            installed or when checking raises any exception.
    """
    if not GRAMMALECTE_AVAILABLE:
        logger.error("Grammalecte is not installed. Please install it with: pip install grammalecte")
        return []

    try:
        logger.info("Checking grammar with grammalecte")

        # Pick the per-paragraph checking callable for the detected API.
        if GRAMMALECTE_DIRECT_API:
            # New API: GrammarChecker is directly in grammalecte module
            logger.info("Using direct GrammarChecker API")
            checker = GrammarChecker("fr")
            find_errors = checker.getParagraphErrors
        else:
            # Old API: GrammarChecker is in grammalecte.fr module
            logger.info("Using legacy grammalecte.fr.GrammarChecker API")
            checker = gr_fr.GrammarChecker("fr")

            def find_errors(paragraph):
                return checker.parse(paragraph, "FR", False)

        suggestions = []
        for index, paragraph in enumerate(txt.getParagraph(text)):
            if not paragraph.strip():
                continue
            for error in _grammar_errors_only(find_errors(paragraph)):
                if _is_grammar_error(error):
                    suggestions.append(
                        _suggestion_from_error(error, paragraph, index + 1)
                    )

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions

    except Exception as e:
        # Best-effort: any checker failure degrades to "no suggestions".
        logger.error(f"Error checking grammar with grammalecte: {e}")
        return []


def _grammar_errors_only(result):
    """Return the grammar-error sequence from a checker result.

    NOTE(review): GrammarChecker.getParagraphErrors appears to return a
    ``(grammar errors, spelling errors)`` tuple in Grammalecte; confirm
    against the installed version. Iterating that tuple directly would walk
    two lists instead of error dicts, so it is unwrapped here. A plain list
    of errors is passed through unchanged.
    """
    if isinstance(result, tuple):
        return result[0] if result else []
    return result


def _is_grammar_error(error):
    """Tell whether an error dict is a non-spelling error to report."""
    return (
        isinstance(error, dict)
        and "sType" in error
        and error["sType"] != "WORD"
        # A missing "bError" flag is treated as "is an error".
        and error.get("bError", True)
    )


def _suggestion_from_error(error, paragraph, paragraph_number):
    """Convert one grammalecte error dict into the stored suggestion record."""
    start = error.get("nStart", 0)
    end = error.get("nEnd", 0)
    return {
        "paragraph": paragraph_number,
        "start": start,
        "end": end,
        "type": error.get("sType", ""),
        "message": error.get("sMessage", ""),
        "suggestions": error.get("aSuggestions", []),
        # Up to 20 characters of surrounding text on each side of the error.
        "context": paragraph[max(0, start - 20):min(len(paragraph), end + 20)],
    }
|
||||
|
||||
def check_grammar_with_cli(text):
    """
    Check grammar using the grammalecte-cli command.

    The text is written to a uniquely named temporary file, which is passed
    to ``grammalecte-cli --json --file``. The temporary file is always
    removed afterwards, including when the command cannot be started or its
    output cannot be parsed.

    Args:
        text (str): Text to check

    Returns:
        list: One dict per entry of each paragraph's ``lGrammarErrors`` with
            the keys ``paragraph``, ``start``, ``end``, ``type``,
            ``message``, ``suggestions`` and ``context``. Empty when the
            command exits with a non-zero status or any exception occurs.
    """
    import tempfile  # local import: only this CLI fallback needs it

    temp_path = None
    try:
        logger.info("Checking grammar with grammalecte-cli")

        # delete=False so the file survives the close and the CLI can open
        # it by name; removal happens in the finally clause below.
        with tempfile.NamedTemporaryFile(
            mode='w',
            encoding='utf-8',
            prefix='grammar_check_',
            suffix='.txt',
            delete=False,
        ) as f:
            temp_path = f.name
            f.write(text)

        # Run grammalecte-cli
        cmd = ["grammalecte-cli", "--json", "--file", temp_path]
        result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8')

        if result.returncode != 0:
            logger.error(f"Error running grammalecte-cli: {result.stderr}")
            return []

        # Parse JSON output
        output = json.loads(result.stdout)

        # Extract grammar suggestions
        suggestions = []
        for paragraph_data in output.get("data", []):
            paragraph_index = paragraph_data.get("iParagraph", 0)
            for error in paragraph_data.get("lGrammarErrors", []):
                suggestions.append({
                    "paragraph": paragraph_index + 1,
                    "start": error.get("nStart", 0),
                    "end": error.get("nEnd", 0),
                    "type": error.get("sType", ""),
                    "message": error.get("sMessage", ""),
                    "suggestions": error.get("aSuggestions", []),
                    "context": error.get("sContext", ""),
                })

        logger.info(f"Found {len(suggestions)} grammar suggestions")
        return suggestions

    except Exception as e:
        # Best-effort: any failure degrades to "no suggestions".
        logger.error(f"Error checking grammar with grammalecte-cli: {e}")
        return []

    finally:
        # Remove the temporary file on every exit path.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {temp_path}: {cleanup_error}")
|
||||
|
||||
def check_grammar(text):
    """
    Check grammar with whichever backend is usable.

    The Python library is used when GRAMMALECTE_AVAILABLE is true.
    Otherwise ``grammalecte-cli --help`` is run as a presence probe and, if
    it can be started, the CLI backend is used.

    Args:
        text (str): Text to check

    Returns:
        list: List of grammar suggestions; empty when neither backend is
            available.
    """
    if not GRAMMALECTE_AVAILABLE:
        # No library: see whether the command-line tool can be launched.
        probe_command = ["grammalecte-cli", "--help"]
        try:
            subprocess.run(probe_command, capture_output=True)
            return check_grammar_with_cli(text)
        except (subprocess.SubprocessError, FileNotFoundError):
            logger.error("Neither grammalecte Python package nor grammalecte-cli is available.")
            logger.error("Please install grammalecte with: pip install grammalecte")
            return []

    # Preferred path: the in-process Python library.
    return check_grammar_with_grammalecte(text)
|
||||
|
||||
def select_page_for_grammar_check(pages, key=None):
    """
    Pick the page whose French version will be grammar-checked.

    Args:
        pages (list): List of dictionaries containing page information
        key (str): Key of the page to select; when falsy, the first page
            whose ``fr_page`` entry is not None is chosen

    Returns:
        dict: The selected page, or None when ``pages`` is empty, no page
            matches ``key``, the matching page has no French version, or no
            page has a French version.
    """
    if not pages:
        logger.warning("No pages found that need grammar checking")
        return None

    if not key:
        # Default mode: first page that has a French version.
        candidate = next(
            (entry for entry in pages if entry.get('fr_page') is not None),
            None,
        )
        if candidate is None:
            logger.warning("No pages found with French versions")
            return None
        logger.info(f"Selected first page with French version (key '{candidate['key']}') for grammar checking")
        return candidate

    # Explicit mode: only the first page carrying the requested key counts.
    found = next((entry for entry in pages if entry.get('key') == key), None)
    if found is None:
        logger.warning(f"No page found with key '{key}'")
        return None

    if found.get('fr_page') is None:
        logger.warning(f"Page with key '{key}' does not have a French version")
        return None

    logger.info(f"Selected page for key '{key}' for grammar checking")
    return found
|
||||
|
||||
def main():
    """Parse the command line, grammar-check one page and store the result.

    Steps: load the page list, select a page (``--page`` key or the first
    page with a French version), fetch the French page text, run the grammar
    check, store the suggestions under the page's ``grammar_suggestions``
    key and write the whole list back to OUTDATED_PAGES_FILE. Exits with
    status 1 when pages cannot be loaded, no page can be selected, the
    French URL is missing or the content cannot be fetched.
    """

    def abort(message):
        # Log the failure and terminate with a non-zero status.
        logger.error(message)
        sys.exit(1)

    cli = argparse.ArgumentParser(description="Suggest grammar improvements for an OSM wiki page using grammalecte")
    cli.add_argument("--page", help="Key of the page to check (default: first page with a French version)")
    options = cli.parse_args()

    logger.info("Starting suggest_grammar_improvements.py")

    # Load pages
    pages = load_outdated_pages()
    if not pages:
        abort("No pages found. Run wiki_compare.py first.")

    # Select a page for grammar checking
    selected_page = select_page_for_grammar_check(pages, options.page)
    if not selected_page:
        abort("Could not select a page for grammar checking.")

    # Get the French page URL
    fr_entry = selected_page.get('fr_page', {})
    fr_url = fr_entry.get('url')
    if not fr_url:
        abort(f"No French page URL found for key '{selected_page['key']}'")

    # Fetch the content of the French page
    logger.info(f"Fetching content from {fr_url}")
    page_text = fetch_wiki_page_content(fr_url)
    if not page_text:
        abort(f"Could not fetch content from {fr_url}")

    # Check grammar; an empty result is reported but still saved below
    logger.info(f"Checking grammar for key '{selected_page['key']}'")
    found_suggestions = check_grammar(page_text)
    if not found_suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Attach the suggestions to the selected page (mutates the entry in `pages`)
    logger.info(f"Saving grammar suggestions for key '{selected_page['key']}'")
    selected_page['grammar_suggestions'] = found_suggestions

    # Save the updated data back to the file
    save_to_json(pages, OUTDATED_PAGES_FILE)

    logger.info("Script completed successfully")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Add table
Add a link
Reference in a new issue