# book-generator-orgmode/analyse_orthographe_grammaire.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script pour analyser les fautes d'orthographe et de grammaire dans un fichier livre.org
et générer un rapport par chapitre.

Ce script:
1. Lit le fichier livre.org
2. Extrait le texte par chapitre
3. Analyse les fautes d'orthographe et de grammaire dans chaque chapitre
4. Génère un rapport détaillé des erreurs trouvées
"""

import re
import os
import csv
import argparse
from spellchecker import SpellChecker
import language_tool_python

# Command-line interface: one optional positional argument naming the folder
# that contains livre.org; the current working directory is used by default.
parser = argparse.ArgumentParser(
    description="Analyser les fautes d'orthographe et de grammaire dans un fichier Org-mode.",
)
parser.add_argument(
    "dossier",
    nargs="?",
    default=os.getcwd(),
    help=(
        "Le chemin du dossier contenant le fichier livre.org. "
        "Si aucun dossier n'est spécifié, le dossier courant sera utilisé."
    ),
)
args = parser.parse_args()

# Path of the Org-mode book inside the selected folder.
fichier_livre = f"{args.dossier}/livre.org"

def extract_chapters(file_path):
    """
    Extract the chapters of an Org-mode file.

    A chapter starts at a level-2 heading (a line beginning with "** ") and
    runs until the next level-2 heading or the end of the file. Text located
    before the first level-2 heading is ignored.

    Args:
        file_path: Path of the Org-mode file; it is read as UTF-8.

    Returns:
        A dict mapping each cleaned chapter title to its cleaned text, in
        file order. When several chapters share the same cleaned title, the
        later ones are stored under "<title> (2)", "<title> (3)", ... so that
        no chapter silently overwrites another.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Group 1 is the heading line, group 2 everything up to the next "** "
    # heading (or the end of the file).
    chapter_pattern = r'^\*\* (.*?)$(.*?)(?=^\*\* |\Z)'
    chapters = re.findall(chapter_pattern, text, re.MULTILINE | re.DOTALL)

    chapter_dict = {}
    for raw_title, body in chapters:
        # Drop a ":title:" marker from the heading if there is one.
        clean_title = re.sub(r'\s*:\s*title\s*:', '', raw_title).strip()

        # Dict keys must be unique: disambiguate repeated titles instead of
        # losing the earlier chapter's text.
        unique_title = clean_title
        occurrence = 1
        while unique_title in chapter_dict:
            occurrence += 1
            unique_title = f"{clean_title} ({occurrence})"

        chapter_dict[unique_title] = clean_chapter_content(body)

    return chapter_dict

def clean_chapter_content(content):
    """
    Strip Org-mode markup from a chapter body.

    The substitutions below are applied in order, then the result is stripped
    of leading and trailing whitespace:

    1. "#+begin_comment ... #+end_comment" blocks (case-insensitive).
    2. Remaining lines whose first non-blank characters are "#+".
    3. Lines whose first non-blank characters are "***" (sub-headings).
    4. "[[...]]" links, together with a "[...]" group directly following
       them; the link text is removed, not kept.
    5. Runs of blank lines, collapsed into a single blank line.
    """
    # (pattern, replacement, flags) triples; the order is significant.
    substitutions = (
        (r'#\+begin_comment.*?#\+end_comment', '', re.DOTALL | re.IGNORECASE),
        (r'^\s*#\+.*$', '', re.MULTILINE),
        (r'^\s*\*\*\*.*$', '', re.MULTILINE),
        (r'\[\[.*?\]\](?:\[.*?\])?', '', 0),
        (r'\n\s*\n', '\n\n', 0),
    )

    cleaned = content
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

def load_custom_dictionary(file_path):
    """
    Load the custom dictionary from a text file.

    Each line holds one entry. Lines are stripped of surrounding whitespace;
    empty lines and lines starting with "#" are skipped. Entries are
    lower-cased.

    Returns:
        The set of entries, or an empty set when file_path does not exist.
    """
    if not os.path.exists(file_path):
        return set()

    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return {
            entry.lower()
            for entry in stripped_lines
            if entry and not entry.startswith('#')
        }

def check_spelling(text, lang='fr', custom_dict_path='dictionnaire_personnalise.txt'):
    """
    Spell-check a text and return the unknown words with suggestions.

    Args:
        text: Text to check; it is lower-cased and split on word boundaries.
        lang: Language code passed to SpellChecker.
        custom_dict_path: Path of a custom dictionary file whose words are
            treated as correct (loaded with load_custom_dictionary).

    Returns:
        A dict mapping each unknown word to a list of at most five suggested
        corrections. Words are inserted in alphabetical order and suggestions
        are ordered by decreasing dictionary frequency (ties alphabetically),
        so the result is identical from one run to the next.
    """
    spell = SpellChecker(language=lang)

    # Words from the custom dictionary are added to the checker's vocabulary.
    custom_words = load_custom_dictionary(custom_dict_path)
    if custom_words:
        spell.word_frequency.load_words(custom_words)

    words = re.findall(r'\b\w+\b', text.lower())

    spelling_errors = {}
    # unknown() returns a set, whose iteration order is arbitrary; sorting
    # keeps the report reproducible.
    for word in sorted(spell.unknown(words)):
        if word in custom_words:
            continue

        # candidates() may return None when nothing is found.
        candidates = spell.candidates(word) or ()
        # Rank before truncating so the five kept suggestions are the most
        # frequent ones rather than an arbitrary subset; the word itself is
        # not a useful suggestion.
        ranked = sorted(
            (candidate for candidate in candidates if candidate != word),
            key=lambda candidate: (-spell.word_frequency[candidate], candidate),
        )
        spelling_errors[word] = ranked[:5]

    return spelling_errors

def check_grammar(text, lang='fr'):
    """
    Grammar-check a text with LanguageTool.

    Args:
        text: Text to check.
        lang: Language code passed to language_tool_python.LanguageTool.

    Returns:
        A list of dicts, one per reported problem, with the keys 'message',
        'context', 'suggestions', 'offset' (position in text), 'length',
        'rule' and 'offset_in_context' (position of the problem inside
        'context', or None if the match does not expose it). Spelling
        matches are left out because check_spelling handles them.
    """
    tool = language_tool_python.LanguageTool(lang)
    try:
        matches = tool.check(text)
    finally:
        # Always release the tool, even when the check fails.
        tool.close()

    grammar_errors = []
    for match in matches:
        # Skip spelling problems, which the spell checker already reports.
        if match.ruleId.startswith('MORFOLOGIK_RULE'):
            continue
        # NOTE(review): spelling rule ids appear to vary per language and may
        # not all start with MORFOLOGIK_RULE (to be confirmed for French), so
        # the issue type is checked as well.
        if getattr(match, 'ruleIssueType', None) == 'misspelling':
            continue

        grammar_errors.append({
            'message': match.message,
            'context': match.context,
            'suggestions': match.replacements,
            'offset': match.offset,
            'length': match.errorLength,
            'rule': match.ruleId,
            # 'offset' indexes the whole text; this one indexes 'context'.
            'offset_in_context': getattr(match, 'offsetInContext', None),
        })

    return grammar_errors

def _highlight_in_context(error, source_text):
    """Return error['context'] with the flagged span wrapped in '**'."""
    context = error['context']
    length = error['length']

    # error['offset'] is relative to the whole chapter, not to the context
    # snippet, so it cannot be used to slice the snippet. Prefer the
    # in-context offset when the error dict carries one; otherwise look up
    # the flagged text in the chapter and locate it in the snippet.
    start = error.get('offset_in_context')
    if start is None:
        flagged = source_text[error['offset']:error['offset'] + length]
        start = context.find(flagged) if flagged else -1

    end = start + length
    if length <= 0 or start < 0 or end > len(context):
        # Nothing reliable to highlight: leave the snippet untouched.
        return context

    # Slice instead of str.replace so that only this occurrence is marked.
    return f"{context[:start]}**{context[start:end]}**{context[end:]}"


def generate_error_report(chapters, output_path):
    """
    Write a Markdown report of spelling and grammar errors per chapter.

    Args:
        chapters: Dict mapping chapter titles to chapter text.
        output_path: Path of the report file, written as UTF-8.

    For each chapter the report lists the unknown words with their
    suggestions, then the grammar problems with message, highlighted context
    and up to five suggestions. A summary with the totals closes the report,
    and a confirmation line is printed once the file is written.
    """
    with open(output_path, 'w', encoding='utf-8') as report_file:
        report_file.write("# Rapport d'analyse orthographique et grammaticale\n\n")

        total_spelling_errors = 0
        total_grammar_errors = 0

        for chapter_title, chapter_content in chapters.items():
            report_file.write(f"## Chapitre: {chapter_title}\n\n")

            spelling_errors = check_spelling(chapter_content)
            grammar_errors = check_grammar(chapter_content)

            total_spelling_errors += len(spelling_errors)
            total_grammar_errors += len(grammar_errors)

            # Spelling section.
            report_file.write("### Erreurs d'orthographe\n\n")
            if spelling_errors:
                for word, suggestions in spelling_errors.items():
                    suggestions_str = ", ".join(suggestions) if suggestions else "Aucune suggestion"
                    report_file.write(f"- **{word}**: {suggestions_str}\n")
            else:
                report_file.write("Aucune erreur d'orthographe détectée.\n")

            report_file.write("\n")

            # Grammar section.
            report_file.write("### Erreurs grammaticales\n\n")
            if grammar_errors:
                for error in grammar_errors:
                    suggestions_str = ", ".join(error['suggestions'][:5]) if error['suggestions'] else "Aucune suggestion"
                    context = _highlight_in_context(error, chapter_content)
                    report_file.write(f"- **Erreur**: {error['message']}\n")
                    report_file.write(f"  - **Contexte**: {context}\n")
                    report_file.write(f"  - **Suggestions**: {suggestions_str}\n\n")
            else:
                report_file.write("Aucune erreur grammaticale détectée.\n")

            report_file.write("\n---\n\n")

        # Summary.
        report_file.write("## Résumé\n\n")
        report_file.write(f"- **Nombre total de chapitres analysés**: {len(chapters)}\n")
        report_file.write(f"- **Nombre total d'erreurs d'orthographe**: {total_spelling_errors}\n")
        report_file.write(f"- **Nombre total d'erreurs grammaticales**: {total_grammar_errors}\n")

    print(f"Rapport d'erreurs généré: {output_path}")

def save_to_csv(chapters, output_path):
    """
    Write a per-chapter error summary to a CSV file.

    Args:
        chapters: Dict mapping chapter titles to chapter text.
        output_path: Path of the CSV file, written as UTF-8.

    After a header row, each chapter yields one row: title, number of
    spelling errors, number of grammar errors, and their sum. A confirmation
    line is printed once the file is written.
    """
    header = ["Chapitre", "Erreurs d'orthographe", "Erreurs grammaticales", "Total"]

    def summary_rows():
        # Rows are produced lazily, one chapter at a time, as they are written.
        for title, body in chapters.items():
            spelling_count = len(check_spelling(body))
            grammar_count = len(check_grammar(body))
            yield [title, spelling_count, grammar_count, spelling_count + grammar_count]

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(header)
        out.writerows(summary_rows())

    print(f"Résumé des erreurs sauvegardé dans {output_path}")

def main():
    """
    Run the whole analysis for the book selected on the command line.

    Extracts the chapters of the book file, then writes the Markdown report
    and the CSV summary into the same folder, printing progress messages
    along the way.
    """
    print(f"Analyse du fichier: {fichier_livre}")

    chapters = extract_chapters(fichier_livre)
    print(f"Nombre de chapitres trouvés: {len(chapters)}")

    # Each producer receives the chapters and its output path; the report is
    # generated first, then the CSV summary.
    outputs = (
        (generate_error_report, "rapport_orthographe_grammaire.md"),
        (save_to_csv, "resume_erreurs.csv"),
    )
    for produce, filename in outputs:
        produce(chapters, f"{args.dossier}/{filename}")

    print("Analyse orthographique et grammaticale terminée avec succès!")


if __name__ == "__main__":
    main()