# orgmode-to-gemini-blog/export_to_epub.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script pour exporter un ou plusieurs blogs au format EPUB en utilisant Calibre.
"""

import os
import re
import argparse
import subprocess
import shutil
import tempfile
from datetime import datetime
import locale

try:
    import pypandoc
    HAS_PYPANDOC = True
except ImportError:
    HAS_PYPANDOC = False
    print("Attention: pypandoc n'est pas installé. Installation requise: pip install pypandoc")

from utils.utils import (
    find_extract_in_content_org,
    find_first_level1_title,
    find_year_and_slug_on_filename,
    get_blog_template_conf
)
from website_config import configs_sites


def verifier_calibre():
    """Report whether Calibre's ``ebook-convert`` command can be executed.

    Returns:
        True when ``ebook-convert --version`` exits with status 0, False when
        the executable cannot be found or exits with a non-zero status.
    """
    probe = ['ebook-convert', '--version']
    try:
        # Output is captured only to keep the console quiet; it is not read.
        subprocess.run(probe, capture_output=True, check=True)
    except FileNotFoundError:
        return False
    except subprocess.CalledProcessError:
        return False
    return True


def extraire_date_du_contenu(content):
    """Extract the creation date declared in the keywords of an Org document.

    ``#+CREATED`` is tried first, then ``#+post_date_published``. Keyword
    names are matched case-insensitively (Org keywords are case-insensitive),
    and the date may be written bare (``2024-01-02``) or as an Org timestamp
    (``<2024-01-02 ...>`` or ``[2024-01-02 ...]``). Only the ``YYYY-MM-DD``
    part is used; any time of day is ignored.

    Args:
        content: Full text of the Org file.

    Returns:
        A naive ``datetime`` at midnight of the declared day, or ``None`` when
        neither keyword carries a valid calendar date.
    """
    for keyword in ('CREATED', 'post_date_published'):
        # Optional "<" or "[" accepts Org active/inactive timestamps.
        pattern = r'#\+' + keyword + r':\s*[<\[]?(\d{4}-\d{2}-\d{2})'
        match = re.search(pattern, content, re.IGNORECASE)
        if match is None:
            continue
        try:
            return datetime.strptime(match.group(1), '%Y-%m-%d')
        except ValueError:
            # Well-formed digits but impossible date (e.g. month 13):
            # fall through to the next keyword.
            continue

    return None


def extraire_date_du_fichier(filename):
    """Derive a date from an article file name.

    The name is handed to ``find_year_and_slug_on_filename``; the first of the
    three values it returns is read as ``YYYYMMDD`` (only its first eight
    characters are used).

    Args:
        filename: Base name of the article file.

    Returns:
        A naive ``datetime`` at midnight of that day, or ``None`` when the
        helper fails, yields no usable date string, or the digits do not form
        a valid calendar date.
    """
    try:
        date_str, _annee, _slug = find_year_and_slug_on_filename(filename)
        if date_str and len(date_str) >= 8:
            return datetime(
                int(date_str[:4]),
                int(date_str[4:6]),
                int(date_str[6:8]),
            )
    except Exception:
        # Deliberately best-effort: any parsing problem just means "no date
        # in the file name". Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    return None


def collecter_articles(blog_path):
    """Collect every ``.org`` article of a blog, oldest first.

    Only the ``lang_fr`` and ``lang_en`` sub-folders of ``blog_path`` are
    scanned, without recursion. Files without a level-1 title are ignored;
    files that cannot be read or parsed are reported on stdout and skipped.

    The date of an article is taken, in order of preference, from its Org
    keywords, from its file name, then from the file's modification time.

    Args:
        blog_path: Path of the blog's source folder.

    Returns:
        A list of dicts with the keys ``title``, ``content``, ``date`` and
        ``filename``, sorted by ``date``. Articles sharing the same date keep
        a reproducible order (language folder, then file name).
    """
    articles = []

    for lang_dir in ['lang_fr', 'lang_en']:
        lang_path = os.path.join(blog_path, lang_dir)
        # isdir rather than exists: a regular file with the folder's name
        # would otherwise make os.listdir raise NotADirectoryError.
        if not os.path.isdir(lang_path):
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # result (and therefore the generated book) reproducible.
        for filename in sorted(os.listdir(lang_path)):
            if not filename.endswith('.org'):
                continue

            filepath = os.path.join(lang_path, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()

                title = find_first_level1_title(content)
                if not title:
                    continue

                # Strip Org emphasis/heading asterisks from the title.
                title = title.replace('*', '').strip()

                date = (
                    extraire_date_du_contenu(content)
                    or extraire_date_du_fichier(filename)
                    or datetime.fromtimestamp(os.path.getmtime(filepath))
                )

                articles.append({
                    'title': title,
                    'content': content,
                    'date': date,
                    'filename': filename
                })
            except Exception as e:
                # One broken file must not abort the whole export.
                print(f"  Erreur lors de la lecture de {filepath}: {e}")

    # list.sort is stable, so ties keep the folder/file-name order above.
    articles.sort(key=lambda x: x['date'])

    return articles


def convertir_org_en_html(org_content):
    """Render an Org document as an HTML fragment through pandoc.

    The text is first passed through ``find_extract_in_content_org`` and the
    result is converted from Org to HTML by ``pypandoc``.

    Args:
        org_content: Full text of the Org file.

    Returns:
        The HTML produced by pandoc. If the conversion itself fails, the error
        is printed and a ``<p>`` element describing it is returned instead.

    Raises:
        RuntimeError: If pypandoc could not be imported.
    """
    if HAS_PYPANDOC:
        org_body = find_extract_in_content_org(org_content)
        try:
            return pypandoc.convert_text(org_body, 'html', format='org')
        except Exception as err:
            print(f"  Erreur lors de la conversion: {err}")
            return f"<p>Erreur lors de la conversion: {err}</p>"

    raise RuntimeError("pypandoc n'est pas installé")


def generer_html_epub(articles, blog_config):
    """Build the single XHTML page that is later converted to EPUB.

    The page holds a header (blog title, subtitle, author, generation time),
    then for each article a ``<h1>`` title, its publication date and its
    converted body, and finally a footer with the blog domain and description.

    Titles and configuration values are HTML-escaped before insertion so that
    characters such as ``&`` or ``<`` cannot produce malformed markup.
    NOTE(review): this assumes the configuration values are plain text, not
    HTML snippets - confirm against the site configuration.

    Args:
        articles: Iterable of dicts providing ``title`` (str), ``date`` (an
            object with ``strftime``) and ``content`` (Org source text).
        blog_config: Mapping read for ``BLOG_TITLE``, ``BLOG_SUBTITLE``,
            ``AUTHOR``, ``NDD`` and ``DESCRIPTION``; every key is optional.

    Returns:
        The complete document as one string.
    """
    # Local import keeps this function self-contained.
    from html import escape

    def texte(valeur):
        # quote=False: values only land in element content, so quotes and
        # apostrophes can stay readable as they are.
        return escape(str(valeur), quote=False)

    blog_title = texte(blog_config.get('BLOG_TITLE', 'Blog'))

    html_parts = []

    # HTML header
    html_parts.append('''<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="fr">
<head>
    <meta charset="UTF-8"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
    <title>{}</title>
    <style>
        body {{
            font-family: serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            line-height: 1.6;
        }}
        h1 {{
            border-bottom: 2px solid #333;
            padding-bottom: 10px;
            margin-top: 40px;
        }}
        h2 {{
            color: #555;
            margin-top: 30px;
        }}
        .article-date {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
        }}
        img {{
            max-width: 100%;
            height: auto;
        }}
        a {{
            color: #0066cc;
        }}
        code {{
            background-color: #f4f4f4;
            padding: 2px 4px;
            border-radius: 3px;
        }}
        pre {{
            background-color: #f4f4f4;
            padding: 10px;
            border-radius: 5px;
            overflow-x: auto;
        }}
    </style>
</head>
<body>
    <h1>{}</h1>
    <p><em>{}</em></p>
    <p>Auteur: {}</p>
    <p>Export généré le: {}</p>
'''.format(
        blog_title,
        blog_title,
        texte(blog_config.get('BLOG_SUBTITLE', '')),
        texte(blog_config.get('AUTHOR', 'Auteur inconnu')),
        datetime.now().strftime('%d/%m/%Y à %H:%M:%S')
    ))

    # One section per article
    for article in articles:
        titre_article = texte(article['title'])
        date_publication = article['date'].strftime('%d/%m/%Y')
        html_parts.append(f'<h1>{titre_article}</h1>')
        html_parts.append(f'<div class="article-date">Publié le: {date_publication}</div>')

        # The converted body is already HTML and must not be escaped again.
        html_parts.append(convertir_org_en_html(article['content']))

        html_parts.append('<hr/>')

    # Footer
    html_parts.append('''
    <div style="margin-top: 50px; padding-top: 20px; border-top: 1px solid #ccc;">
        <p><em>Export généré depuis: {}</em></p>
        <p>{}</p>
    </div>
</body>
</html>'''.format(
        texte(blog_config.get('NDD', '')),
        texte(blog_config.get('DESCRIPTION', ''))
    ))

    return '\n'.join(html_parts)


def exporter_epub(blogs, output_dir='exports/epub', titre=None):
    """Export one or several blogs into a single EPUB file using Calibre.

    Articles of every requested blog found under ``sources/`` are merged,
    sorted by date, rendered into one temporary HTML page and converted with
    ``ebook-convert``. Progress and errors are reported on stdout; the
    function never raises for a failed conversion.

    Args:
        blogs: Names of the blog folders to export (looked up in ``sources/``,
            relative to the current working directory). Names that do not
            exist are reported and skipped; duplicates are exported once.
        output_dir: Folder receiving the EPUB file; created if missing.
        titre: Optional custom book title. Defaults to the blog title, or to
            the titles of all exported blogs joined with " - ".

    Returns:
        None.
    """
    # Preconditions: external tools and source folder.
    if not verifier_calibre():
        print("Erreur: Calibre n'est pas installé ou ebook-convert n'est pas dans le PATH.")
        print("Installation: https://calibre-ebook.com/download")
        return

    if not HAS_PYPANDOC:
        print("Erreur: pypandoc n'est pas installé.")
        print("Installation: pip install pypandoc")
        return

    sources_dir = "sources"
    if not os.path.exists(sources_dir):
        print(f"Erreur: Le dossier {sources_dir} n'existe pas")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Gather the articles of every blog that actually exists.
    all_articles = []
    blog_configs = {}  # insertion-ordered: only the blogs that were found
    blog_titles = []

    # dict.fromkeys drops repeated names while keeping the requested order,
    # so a blog listed twice does not get its articles duplicated.
    for blog_name in dict.fromkeys(blogs):
        blog_path = os.path.join(sources_dir, blog_name)
        if not os.path.exists(blog_path):
            print(f"⚠️  Blog '{blog_name}' introuvable dans {sources_dir}")
            continue

        print(f"📚 Collecte des articles de {blog_name}...")
        blog_config = get_blog_template_conf(blog_name)
        blog_configs[blog_name] = blog_config

        articles = collecter_articles(blog_path)
        print(f"  {len(articles)} articles trouvés")

        for article in articles:
            article['blog_name'] = blog_name
            all_articles.append(article)

        blog_titles.append(blog_config.get('BLOG_TITLE', blog_name))

    if not all_articles:
        print("Aucun article trouvé.")
        return

    all_articles.sort(key=lambda x: x['date'])

    print(f"\n📖 Total: {len(all_articles)} articles")

    # Book metadata is derived from the blogs that were FOUND, not from the
    # requested names: indexing blog_configs with a skipped name would raise
    # KeyError.
    main_config = next(iter(blog_configs.values()))

    epub_title = titre or ' - '.join(blog_titles)

    # Distinct, non-empty authors in a stable order. Calibre expects several
    # authors to be separated by ampersands.
    auteurs = [
        auteur
        for auteur in dict.fromkeys(
            cfg.get('AUTHOR', '') for cfg in blog_configs.values()
        )
        if auteur
    ]
    epub_author = ' & '.join(auteurs) or 'Auteur inconnu'

    print("\n🔄 Génération du fichier HTML...")
    html_content = generer_html_epub(all_articles, main_config)

    # File-system-friendly name derived from the title.
    safe_title = re.sub(r'[^\w\s-]', '', epub_title).strip()
    safe_title = re.sub(r'[-\s]+', '-', safe_title) or 'export'
    epub_path = os.path.join(output_dir, f"{safe_title}.epub")

    tmp_html_path = None
    try:
        # delete=False: the file must outlive the "with" so that Calibre can
        # read it; it is removed in the "finally" clause below.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.html',
                                         encoding='utf-8', delete=False) as tmp_html:
            tmp_html_path = tmp_html.name
            tmp_html.write(html_content)

        print("\n📦 Conversion en EPUB avec Calibre...")
        print(f"  Titre: {epub_title}")
        print(f"  Auteur: {epub_author}")
        print(f"  Fichier: {epub_path}")

        cmd = [
            'ebook-convert',
            tmp_html_path,
            epub_path,
            '--title', epub_title,
            '--authors', epub_author,
            '--language', 'fr',
            '--page-breaks-before', '/',
            '--insert-blank-line',
            '--smarten-punctuation',
            '--margin-top', '50',
            '--margin-bottom', '50',
            '--margin-left', '50',
            '--margin-right', '50',
        ]

        description = main_config.get('DESCRIPTION')
        if description:
            cmd.extend(['--comments', description])

        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print("\n❌ Erreur lors de la conversion:")
        print(f"  {e.stderr}")
    except Exception as e:
        print(f"\n❌ Erreur: {e}")
    else:
        print(f"\n✅ EPUB généré avec succès: {epub_path}")
    finally:
        # Single cleanup point for the temporary HTML file, whatever happened.
        if tmp_html_path and os.path.exists(tmp_html_path):
            try:
                os.unlink(tmp_html_path)
            except OSError as e:
                print(f"⚠️  Impossible de supprimer le fichier temporaire {tmp_html_path}: {e}")


def main():
    """Command-line entry point: parse the arguments and run the export."""
    cli = argparse.ArgumentParser(
        description='Exporte un ou plusieurs blogs au format EPUB avec Calibre'
    )

    # (flags, options) pairs, registered in declaration order.
    argument_specs = (
        (
            ('blogs',),
            {
                'nargs': '+',
                'help': 'Noms des blogs à exporter (ex: tykayn_blog cipherbliss_blog)',
            },
        ),
        (
            ('--output', '-o'),
            {
                'default': 'exports/epub',
                'help': 'Dossier de sortie pour les fichiers EPUB (défaut: exports/epub)',
            },
        ),
        (
            ('--titre', '-t'),
            {
                'help': "Titre personnalisé pour l'EPUB (par défaut: titre du blog ou combinaison)",
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    parsed = cli.parse_args()
    exporter_epub(parsed.blogs, parsed.output, parsed.titre)


# Run the command-line export only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()