Retrieve sources
This commit is contained in:
parent 86622a19ea
commit 65fe2a35f9
155 changed files with 50969 additions and 0 deletions
242 wiki_compare/fix_grammar_suggestions.py Normal file
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
fix_grammar_suggestions.py

This script adds grammar suggestions to the "type" page in the outdated_pages.json file.
It fetches the French content for the page, runs the grammar checker, and updates the file.
"""

import json
import logging
import os
import subprocess
import tempfile

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTDATED_PAGES_FILE = "outdated_pages.json"
TARGET_KEY = "type"

def load_outdated_pages():
    """
    Load the outdated pages from the JSON file

    Returns:
        dict: Dictionary containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Successfully loaded outdated pages from {OUTDATED_PAGES_FILE}")
        return data
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading pages from {OUTDATED_PAGES_FILE}: {e}")
        return None

def save_outdated_pages(data):
    """
    Save the outdated pages to the JSON file

    Args:
        data (dict): Dictionary containing outdated page information
    """
    try:
        with open(OUTDATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved outdated pages to {OUTDATED_PAGES_FILE}")
    except IOError as e:
        logger.error(f"Error saving pages to {OUTDATED_PAGES_FILE}: {e}")

def fetch_wiki_page_content(url):
    """
    Fetch the content of a wiki page

    Args:
        url (str): URL of the wiki page

    Returns:
        str: Content of the wiki page
    """
    try:
        logger.info(f"Fetching content from {url}")
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Get the main content
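        # (MediaWiki places the rendered page body in the #mw-content-text container)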
        content = soup.select_one('#mw-content-text')
        if content:
            # Remove script and style elements
            for script in content.select('script, style'):
                script.extract()

            # Remove .languages elements
            for languages_elem in content.select('.languages'):
                languages_elem.extract()

            # Get text
            text = content.get_text(separator=' ', strip=True)
            logger.info(f"Successfully fetched content ({len(text)} characters)")
            return text
        else:
            logger.warning(f"Could not find content in page: {url}")
            return ""

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching wiki page content: {e}")
        return ""

def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    temp_file_path = None
    try:
        # Create a temporary file with the text
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # Run grammalecte-cli on the temporary file
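        # (flags: -f = input file, -j = JSON output, -ctx = include the text
        # surrounding each error, -wss = include spelling suggestions)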
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Parse the JSON output
        grammar_data = json.loads(result.stdout)

        # Extract grammar errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            # Process grammar errors
            for error in paragraph.get('lGrammarErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': error.get('sType', ''),
                    'message': error.get('sMessage', ''),
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

            # Process spelling errors
            for error in paragraph.get('lSpellingErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': 'spelling',
                    'message': 'Erreur d\'orthographe',
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

    finally:
        # Clean up the temporary file even when grammalecte-cli fails
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)

def main():
    """Main function to execute the script"""
    logger.info("Starting fix_grammar_suggestions.py")

    # Load outdated pages
    data = load_outdated_pages()
    if not data:
        logger.error("Failed to load outdated pages")
        return

    # Find the "type" page in the regular_pages array
    type_page = None
    type_page_index = None
    for i, page in enumerate(data.get('regular_pages', [])):
        if page.get('key') == TARGET_KEY:
            type_page = page
            type_page_index = i
            break

    if not type_page:
        logger.error(f"Could not find page with key '{TARGET_KEY}'")
        return

    # Get the French page URL
    fr_page = type_page.get('fr_page')
    if not fr_page:
        logger.error(f"No French page found for key '{TARGET_KEY}'")
        return

    fr_url = fr_page.get('url')
    if not fr_url:
        logger.error(f"No URL found for French page of key '{TARGET_KEY}'")
        return

    # Fetch the content of the French page
    content = fetch_wiki_page_content(fr_url)
    if not content:
        logger.error(f"Could not fetch content from {fr_url}")
        return

    # Check grammar
    logger.info(f"Checking grammar for key '{TARGET_KEY}'")
    suggestions = check_grammar_with_grammalecte(content)
    if not suggestions:
        logger.warning("No grammar suggestions found or grammar checker not available")

    # Add the grammar suggestions to the page
    type_page['grammar_suggestions'] = suggestions

    # Update the page in the data
    data['regular_pages'][type_page_index] = type_page

    # Save the updated data
    save_outdated_pages(data)

    logger.info("Script completed successfully")

if __name__ == "__main__":
    main()
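
For reference, a minimal sketch of the outdated_pages.json shape this script expects, inferred from the accessors above (the key names come from the code; the example values, including the URL, are hypothetical):

    {
      "regular_pages": [
        {
          "key": "type",
          "fr_page": {
            "url": "https://wiki.example.org/FR:Type"
          },
          "grammar_suggestions": []
        }
      ]
    }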