convert to gemini avec md2gemini, conversion de plusieurs langues

2025-11-19 23:00:35 +01:00 · 2025-02-27 16:18:47 +01:00 · 2025-02-27 16:18:47 +01:00 · bba1df0377
commit bba1df0377
parent 255e8fdc04
10 changed files with 462 additions and 202 deletions
--- a/gather_tags_in_json.py
+++ b/gather_tags_in_json.py
@ -67,6 +67,9 @@ def group_files_by_tags(org_files, excluded_tags):

        for tag in tags:
            tag_to_files[tag].add(slug)
+    # Sauvegarder les fichiers sans tags
+    save_untagged_files(output_file=f"sources/{blog_folder}/build/articles_without_tags.json")
+
    return tag_to_files


--- a/install.sh
+++ b/install.sh
@ -1,4 +1,4 @@
 #!/bin/bash
 sudo apt install pandoc python3-pip npm
-pip install uuid argparse os md2gemini
+pip install uuid argparse os md2gemini pytest pypandoc
 npm install -g sass
--- a/linking_articles_prev_next.py
+++ b/linking_articles_prev_next.py
@ -42,7 +42,12 @@ def get_basename(file_name):
    return os.path.splitext(file_name)[0]

 # Chemin du dossier contenant les fichiers orgmode
-directory = f'sources/{args.blog}/lang_fr'
+directory_pages = f'sources/{args.blog}/'
+directory_fr = f'sources/{args.blog}/lang_fr'
+directory_en = f'sources/{args.blog}/lang_en'
+
+directories_to_scan = [directory_pages, directory_fr, directory_en]
+
 destination_json = f'sources/{args.blog}/build'
 destination_html = f'html-websites/{args.blog}/'
 destination_gmi = f'gemini-capsules/{args.blog}/'
@ -64,68 +69,32 @@ else:
    files_dict = {}


-def get_first_picture_url(content):
-    # Utiliser une expression régulière pour trouver la première URL d'image dans le contenu
-    pattern = r'\[\[(.*?)\]\]'
-    match = re.search(pattern, content)
-    if match:
-        return match.group(1)
-    else:
-        return None


-def org_to_gmi(org_text: str, output_filename_slug: str) -> str:
-    """
-    Convertit un texte au format Org en un fichier au format GMI (Gemini)
-    en utilisant pypandoc.
+count_articles = count_files_in_directories(directories_to_scan)

-    Args:
-    - org_text (str): Le texte au format Org à convertir.
-    - output_file (str): Chemin du fichier de sortie au format GMI, sans avoir à préciser l'extension.
-    """
-    output = """
-# mock land output
-===========
-
-blah blah blah
-
-----------------
-Tykayn blog mock content
-----------------
-
-Navigation:
-
-=> accueil.gmi Accueil 
-=> a-propos.gmi à propos
-    """
-    # Conversion du texte Org en GMI via Pandoc
-    try:
-        output = pypandoc.convert_text(org_text, 'markdown', format='org')
-    except RuntimeError as e:
-        print(f"Erreur de conversion : {e}")
-        return
-
-    # Sauvegarde du contenu GMI dans un fichier
-    try:
-        with open(destination_gmi+'/'+output_filename_slug+'.gmi', 'w', encoding='utf-8') as f:
-            f.write(output)
-        print(f"Fichier GMI sauvegardé avec succès : {output_filename_slug}")
-    except OSError as e:
-        print(f"Erreur lors de la sauvegarde du fichier : {e}")
-    return output
-
-count_articles = len(os.listdir(directory))
 counter=0
 rebuild_counter = 0
 pandoc_runs_counter = 0
+lang_folder = global_config.get('lang_default', 'fr')

 if generate_linkings_json :
    
    print(f"Génération des liens entre articles pour {count_articles} articles")
    print(f"run_pandoc: {run_pandoc}")
    print(f"run_gemini: {run_gemini}")
-
+    article_type = "article"
    # Parcourir les fichiers du dossier
+    
+    for index, directory in enumerate(directories_to_scan):
+        # Déterminer le type d'article en fonction du chemin
+        if directory == '/':
+            article_type = "page"
+        else:
+            article_type = "article"
+        # Extraire la langue du dossier si elle commence par "lang_"
+        if directory.split('/')[-1].startswith('lang_'):
+            lang_folder = directory.split('/')[-1][5:]  # Prend les caractères après "lang_"
        for file_name in os.listdir(directory):
            if file_name.endswith('.org'):
                counter+=1
@ -209,7 +178,8 @@ if generate_linkings_json :

                    if run_pandoc and rebuild_this_article_html or force_html_regen:
                        # convertir le contenu d'article org vers html
-                    print(f"\033[91mBRRRRRRRRRRRRR pandoc time {time.strftime('%H:%M:%S')} : Conversion de {file_name} en html\033[0m")
+                        # print(f"\033[91mBRRRRRRRRRRRRR pandoc time {time.strftime('%H:%M:%S')} : Conversion de {file_name} en html\033[0m")
+                        print(f"\033[91m.\033[0m", end='', flush=True)
                        
                        html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
                        pandoc_runs_counter += 1
@ -219,6 +189,7 @@ if generate_linkings_json :
                    if run_gemini and rebuild_this_article_gemini:
                        os.makedirs(destination_gmi, exist_ok=True)
                        # convertir le contenu d'article org vers gmi pour la capsule gemini
+                        print(f"Conversion de {file_name} en gemini")
                        gemini_content = org_to_gmi(content_without_h1, slug)


@ -232,6 +203,8 @@ if generate_linkings_json :
                        'slug': f"{slug}/",
                        'slug_with_year': f"{annee}/{slug}",
                        'date': boom[0],
+                        'lang': lang_folder,
+                        'article_type': article_type,
                        'date_modified' : date_modified,
                        'first_picture_url' : get_first_picture_url(content),
                        'date_formattee': datetime.strptime(date_str, '%Y%m%d%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 14 else datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 15 else datetime.strptime(date_str, '%Y-%m-%d').strftime('%d %B %Y'),
@ -244,7 +217,8 @@ if generate_linkings_json :
                        'last_gemini_build': last_gemini_build,
                        'org_content': content,  # Contenu Org original
                        'html_content_without_h1': html_content_without_h1,  # Contenu HTML converti sans le titre de premier niveau
-                    'html_content': html_content  # Contenu first_picture_urlHTML converti
+                        'html_content': html_content,  # Contenu HTML converti
+                        'gemini_content': gemini_content,  # Contenu gemini
                    }

    print(f"======= Nombre d'articles reconstruits: {rebuild_counter}")
--- a/new_article.py
+++ b/new_article.py
@ -89,34 +89,63 @@ def create_uuid_property():
    uuid_value = uuid.uuid4()
    return uuid_value

-# Écriture du fichier org
-with open(filename, "w") as f: 
-    uuid = create_uuid_property()
-    f.write(f"""
-:PROPERTIES:
-:ID:       {uuid}
+def make_article(config):
+    """
+    Crée le contenu d'un nouvel article avec les propriétés spécifiées.
+
+    Args:
+        config (dict): Dictionnaire contenant les paramètres de l'article:
+            - uuid (str): Identifiant unique de l'article
+            - slug (str): Slug de l'URL de l'article 
+            - title (str): Titre de l'article
+            - date_string_full (str): Date complète au format YYYY-MM-DD HH:MM:SS
+            - date_string (str): Date au format YYYYMMDDHHMMSS
+            - schema_slug (str): Slug avec ou sans préfixe année selon la config
+            - blog_dir (str): Dossier du blog
+
+    Returns:
+        str: Contenu formaté de l'article avec les propriétés et métadonnées
+    """
+
+
+    return f""":PROPERTIES:
+:ID:       {config.get('uuid')}    
 :END:

-#+title: {args.title}
+#+title: {config.get('title')}
 #+post_ID: 
-#+post_slug: {slug}
+#+post_slug: {config.get('slug')}


-#+post_url: https://www.ciperbliss.com/{schema_slug}
-#+post_title: {args.title}
+#+post_url: https://www.ciperbliss.com/{config.get('schema_slug')}
+#+post_title: {config.get('title')}
 #+post_tags: 
 #+post_series: 
 #+post_type: post
 #+post_status: publish
 #+post_picture: 
-#+post_date_published: <{date_string_full}>
-#+post_date_modified: <{date_string_full}>
-#+post_index_page_roam_id: {uuid}
-#+BLOG: {args.blog_dir}
+#+post_date_published: <{config.get('date_string_full')}>
+#+post_date_modified: <{config.get('date_string_full')}>
+#+post_index_page_roam_id: {config.get('uuid')}
+#+BLOG: {config.get('blog_dir')}

-* {args.title}
+* {config.get('title')}


-""")
+"""
+
+# Écriture du fichier org
+with open(filename, "w") as f: 
+    uuid = create_uuid_property()
+    config={
+        'uuid': uuid,
+        'slug': slug,
+        'title': args.title,
+        'date_string_full': date_string_full,
+        'date_string': date_string,
+        'schema_slug': schema_slug,
+        'blog_dir': args.blog_dir,
+    }
+    f.write(make_article(config))

 print(f"Le fichier '{filename}' a été créé avec succès.")
--- a/sources/dragonfeu_blog/lang_en/2024-11-02-hello-gemini.org
+++ b/sources/dragonfeu_blog/lang_en/2024-11-02-hello-gemini.org
@ -1,2 +0,0 @@
-* Hi, giminiciens
-hop hop hello in English
--- a/sources/dragonfeu_blog/lang_en/20250227154104__coucou-gemini-en-2025.org
+++ b/sources/dragonfeu_blog/lang_en/20250227154104__coucou-gemini-en-2025.org
@ -0,0 +1,25 @@
+:PROPERTIES:
+:ID:       41decd55-85b9-43a6-9c24-2da4985f2d87    
+:END:
+
+#+title: Coucou gemini en 2025
+#+post_ID: 
+#+post_slug: coucou-gemini-en-2025
+
+
+#+post_url: https://www.ciperbliss.com/2025/coucou-gemini-en-2025
+#+post_title: Coucou gemini en 2025
+#+post_tags: 
+#+post_series: 
+#+post_type: post
+#+post_status: publish
+#+post_picture: 
+#+post_date_published: <2025-02-27 15:41:04>
+#+post_date_modified: <2025-02-27 15:41:04>
+#+post_index_page_roam_id: 41decd55-85b9-43a6-9c24-2da4985f2d87
+#+BLOG: dragonfeu_blog
+
+* Hey gemini in 2025
+
+hey yoooooooooooo
+
--- a/sources/dragonfeu_blog/lang_en/20250227154659__coucou-gemini-en-2025.org
+++ b/sources/dragonfeu_blog/lang_en/20250227154659__coucou-gemini-en-2025.org
@ -0,0 +1,24 @@
+:PROPERTIES:
+:ID:       7a77f219-b581-4a67-be7a-c66588b7e3f7    
+:END:
+
+#+title: Coucou gemini en 2025
+#+post_ID: 
+#+post_slug: coucou-gemini-en-2025
+
+
+#+post_url: https://www.ciperbliss.com/2025/coucou-gemini-en-2025
+#+post_title: Coucou gemini en 2025
+#+post_tags: 
+#+post_series: 
+#+post_type: post
+#+post_status: publish
+#+post_picture: 
+#+post_date_published: <2025-02-27 15:46:59>
+#+post_date_modified: <2025-02-27 15:46:59>
+#+post_index_page_roam_id: 7a77f219-b581-4a67-be7a-c66588b7e3f7
+#+BLOG: dragonfeu_blog
+
+* Coucou gemini en 2025
+
+
--- a/test_org_conversion.py
+++ b/test_org_conversion.py
@ -0,0 +1,72 @@
+import pytest
+from utils import convert_org_to_gemini
+
+def test_org_to_gemini_conversion():
+    # Exemple de contenu org
+    org_content = """#+TITLE: Test Article
+#+AUTHOR: John Doe
+#+DATE: 2024-03-14
+
+* Premier titre
+Voici du texte simple.
+
+** Sous-titre
+- Liste item 1
+- Liste item 2
+
+* Deuxième titre
+Un lien [[https://example.com][Example]]
+Et du *texte en gras* avec /italique/."""
+
+    # Convertir le contenu directement
+    result = convert_org_to_gemini(org_content)
+    result = result.strip()
+    print(f"result: {result}")
+    # Vérifier les éléments clés de la conversion
+    assert "# Premier titre" in result
+    assert "## Sous-titre" in result
+    assert "* Liste item 1" in result
+    assert "* Liste item 2" in result
+    assert "=> https://example.com Example" in result
+
+def test_org_to_gemini_tags():
+    """Test de la détection des tags"""
+    org_content = """#+TITLE: Test Article
+#+TAGS: chaton, mignon, félin
+
+* Un article sur les chatons
+Du contenu sur les chatons..."""
+
+    result = find_tags_in_org_content(org_content)
+    assert "chaton" in result, "Le tag 'chaton' devrait être présent dans le résultat"
+
+
+# def test_org_to_gemini_code_blocks():
+#     """Test de la conversion des blocs de code"""
+#     org_content = """#+BEGIN_SRC python
+# def hello():
+#     print("Hello, World!")
+# #+END_SRC"""
+
+#     result = convert_org_to_gemini(org_content)
+#     assert "```python" in result
+#     assert "def hello():" in result
+#     assert 'print("Hello, World!")' in result
+#     assert "```" in result
+
+# def test_org_to_gemini_tables():
+#     """Test de la conversion des tableaux"""
+#     org_content = """| Colonne 1 | Colonne 2 |
+# |-----------|-----------|
+# | Valeur 1  | Valeur 2  |
+# | Valeur 3  | Valeur 4  |"""
+
+#     result = convert_org_to_gemini(org_content)
+#     # Vérifier que le tableau est converti en texte lisible
+#     assert "Colonne 1" in result
+#     assert "Colonne 2" in result
+#     assert "Valeur 1" in result
+#     assert "Valeur 2" in result
+
+if __name__ == '__main__':
+    pytest.main([__file__]) 
--- a/utils.py
+++ b/utils.py
@ -5,6 +5,9 @@ import shutil
 from datetime import datetime
 import unicodedata
 import pypandoc
+import subprocess
+import tempfile
+from md2gemini import md2gemini

 from website_config import *

@ -82,22 +85,23 @@ def get_blog_template_conf(blogname) -> dict:
    else:
        return configs_sites[blogname]

-def find_year_and_slug_on_filename(fichier):
-    fichier = fichier.replace('..', '.') 
-    slug = ''
-    annee = datetime.now().year
-    date_str = f'{annee}-00-00'
-    date = f'{annee}-00-00'
-    boom = fichier.split('__')
+def find_year_and_slug_on_filename(filename):
+    print(f"Traitement du fichier: {filename}")  # Debug
+    try:
+        # Supposons que le format attendu est "YYYYMMDDHHMMSS-slug.org"
+        date_str = filename[:14]  # Prend les 14 premiers caractères pour la date
+        annee = date_str[:4]      # Prend les 4 premiers caractères pour l'année
        
-    if boom :
-        date_str = boom[0]
-        annee = date_str[:4]
-        slug = boom[1].replace('.org', '')
-        if "-" in date_str:
-            slug = enlever_premier_tiret_ou_underscore(slug)
-        return [date_str, annee, slug]
-    return [date_str, annee, fichier.replace(' ', '-').replace('.org', '')]
+        # Gestion plus robuste du slug
+        if '-' in filename:
+            slug = filename.split('-', 1)[1].replace('.org', '')
+        else:
+            slug = filename.replace('.org', '')
+            
+        return date_str, annee, slug
+    except Exception as e:
+        print(f"Format de fichier non standard: {filename}")
+        return None, None, filename.replace('.org', '')


 def enlever_premier_tiret_ou_underscore(chaîne):
@ -212,18 +216,39 @@ def add_tags_from_content(tags=None, file_content="", words_to_check=None):
                tags.add(word)

    return tags
+# Variable globale pour stocker les fichiers sans tags
+untagged_files = []

-def extract_tags_from_file(file_path, excluded_tags):
+def save_untagged_files(output_file="sources/site_web/build/articles_without_tags.json"):
+    """
+    Sauvegarde la liste des fichiers sans tags dans un fichier JSON.
+    
+    :param output_file: Chemin du fichier JSON de sortie
+    """
+    import json
+    import os
+    
+    # Créer le dossier de sortie si nécessaire
+    os.makedirs(os.path.dirname(output_file), exist_ok=True)
+    
+    print('save_untagged_files', len(untagged_files))
+    # Sauvegarder la liste dans le fichier JSON
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(untagged_files, f, ensure_ascii=False, indent=4)
+
+
+def extract_tags_from_file(file_path, excluded_tags, auto_detected_tags_list=global_config['auto_tag_terms']):
    tags = set()
    with open(file_path, 'r', encoding='utf-8') as file_content:
        tag_found = False
        for line in file_content:
-            if global_config['automatic_tagging_enabled']:
-                tags = add_tags_from_content(tags, line, global_config['auto_tag_terms'])
+            if global_config['automatic_tagging_enabled'] and len(auto_detected_tags_list) > 0:
+                tags = add_tags_from_content(tags, line, auto_detected_tags_list)
            # Check for orgmode tags :tag1:tag2:
+            if global_config.get('automatic_tagging_org_files', True):
                if ':' in line:
                    for word in line.split():
-                    if len(word) and word.startswith(':') and word.endswith(':'):
+                        if len(word) > 1 and word.startswith(':') and word.endswith(':'):
                            tag = word[1:-1]
                            if tag not in excluded_tags:
                                tags.add(tag)
@ -236,7 +261,8 @@ def extract_tags_from_file(file_path, excluded_tags):
                        tags.add(tag)
                        tag_found = True

-    # if not tag_found:
+    if not tag_found:
+        untagged_files.append(file_path)
    #     print('no tag in the article', file_path)
    return tags

@ -380,3 +406,110 @@ def convert_org_to_html(org_file, output_html_file):
        print(f"Conversion réussie : {org_file} -> {output_html_file}")
    except Exception as e:
        print(f"Erreur lors de la conversion de {org_file} : {e}")
+
+
+
+def get_first_picture_url(content):
+    # Utiliser une expression régulière pour
+    # trouver la première URL d'image dans le contenu
+    pattern = r'\[\[(.*?)\]\]'
+    match = re.search(pattern, content)
+    if match:
+        return match.group(1)
+    else:
+        return None
+
+
+def org_to_gmi(org_text: str, output_filename_slug: str) -> str:
+    """
+    Convertit un texte au format Org en un fichier au format GMI (Gemini)
+    en utilisant pypandoc.
+
+    Args:
+    - org_text (str): Le texte au format Org à convertir.
+    - output_filename_slug (str): Chemin du fichier de sortie au format GMI, sans avoir à préciser l'extension.
+    """
+    output = """
+# mock land output
+===========
+
+blah blah blah
+
+-----------------
+Tykayn blog mock content
+-----------------
+
+Navigation:
+
+=> accueil.gmi Accueil 
+=> a-propos.gmi à propos
+    """
+    # Conversion du texte Org en Markdown via Pandoc
+    try:
+        output = pypandoc.convert_text(org_text, 'markdown', format='org')
+    except RuntimeError as e:
+        print(f"Erreur de conversion : {e}")
+        return
+
+    # Sauvegarde du contenu GMI dans un fichier
+    try:
+        with open(destination_gmi+'/'+output_filename_slug+'.gmi', 'w', encoding='utf-8') as f:
+            f.write(output)
+        print(f"Fichier GMI sauvegardé avec succès : {output_filename_slug}")
+    except OSError as e:
+        print(f"Erreur lors de la sauvegarde du fichier : {e}")
+    return output
+
+def count_files_in_directories(directories):
+    total_count = 0
+    for directory in directories:
+        for root, dirs, files in os.walk(directory):
+            total_count += len(files)
+    return total_count
+
+
+def convert_org_to_gemini(org_content):
+    """
+    Convertit un contenu org en gemini en utilisant pandoc et md2gemini
+    
+    Args:
+        org_content (str): Contenu au format org
+    
+    Returns:
+        str: Contenu converti en format gemini
+    """
+    try:
+        # Créer un fichier temporaire avec le contenu org
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.org', encoding='utf-8') as temp_org:
+            temp_org.write(org_content)
+            temp_org.flush()
+            
+            # Première étape : conversion org vers markdown avec pandoc
+            pandoc_cmd = [
+                'pandoc',
+                '-f', 'org',
+                '-t', 'markdown',
+                temp_org.name
+            ]
+            
+            markdown_content = subprocess.check_output(
+                pandoc_cmd,
+                text=True,
+                stderr=subprocess.PIPE
+            )
+        
+        # Deuxième étape : conversion markdown vers gemini avec md2gemini
+        gemini_content = md2gemini(
+            markdown_content,
+            frontmatter=True,
+            links='inline',
+        )
+        
+        return gemini_content.strip()
+        
+    except subprocess.CalledProcessError as e:
+        print(f"Erreur lors de la conversion avec pandoc: {e.stderr}")
+        raise
+    except Exception as e:
+        print(f"Erreur lors de la conversion: {str(e)}")
+        raise
--- a/website_config.py
+++ b/website_config.py
@ -4,7 +4,9 @@ global_config = {
    "slug_with_year": True,
    # "show_logs": False,
    "show_logs": True,
+    "lang_default": "fr",
    "automatic_tagging_enabled": True,
+    "automatic_tagging_org_files": True,
    "rebuild_files_filter": 2024,
    "posts_per_page": 10,
    "source_files_extension": "org",