convert to gemini avec md2gemini, conversion de plusieurs langues

2025-06-20 09:04:42 +02:00 · 2025-02-27 16:18:47 +01:00 · 2025-02-27 16:18:47 +01:00 · bba1df0377
commit bba1df0377
parent 255e8fdc04
10 changed files with 462 additions and 202 deletions
--- a/utils.py
+++ b/utils.py
@ -5,6 +5,9 @@ import shutil
 from datetime import datetime
 import unicodedata
 import pypandoc
+import subprocess
+import tempfile
+from md2gemini import md2gemini

 from website_config import *

@ -82,22 +85,23 @@ def get_blog_template_conf(blogname) -> dict:
    else:
        return configs_sites[blogname]

-def find_year_and_slug_on_filename(fichier):
-    fichier = fichier.replace('..', '.') 
-    slug = ''
-    annee = datetime.now().year
-    date_str = f'{annee}-00-00'
-    date = f'{annee}-00-00'
-    boom = fichier.split('__')
-
-    if boom :
-        date_str = boom[0]
-        annee = date_str[:4]
-        slug = boom[1].replace('.org', '')
-        if "-" in date_str:
-            slug = enlever_premier_tiret_ou_underscore(slug)
-        return [date_str, annee, slug]
-    return [date_str, annee, fichier.replace(' ', '-').replace('.org', '')]
+def find_year_and_slug_on_filename(filename):
+    """
+    Split an article filename into its date prefix, year and slug.
+
+    The expected shape is "YYYYMMDDHHMMSS-slug.org". The first 14 characters
+    are taken as the date and the first 4 of those as the year, without any
+    validation. The slug is whatever follows the first "-" (or the whole
+    filename when there is no "-"), with every ".org" occurrence removed.
+
+    :param filename: name of the org file to analyse
+    :return: a (date_str, annee, slug) tuple; (None, None, slug) when the
+        parsing raised an exception
+    """
+    print(f"Traitement du fichier: {filename}")  # Debug
+    try:
+        date_str = filename[:14]
+        annee = date_str[:4]
+        # partition() yields an empty separator when there is no "-";
+        # in that case the whole filename is used to build the slug.
+        _, separator, remainder = filename.partition('-')
+        slug_source = remainder if separator else filename
+        return date_str, annee, slug_source.replace('.org', '')
+    except Exception:
+        print(f"Format de fichier non standard: {filename}")
+        return None, None, filename.replace('.org', '')


 def enlever_premier_tiret_ou_underscore(chaîne):
@ -212,21 +216,42 @@ def add_tags_from_content(tags=None, file_content="", words_to_check=None):
                tags.add(word)

    return tags
+# Variable globale pour stocker les fichiers sans tags
+untagged_files = []

-def extract_tags_from_file(file_path, excluded_tags):
+def save_untagged_files(output_file="sources/site_web/build/articles_without_tags.json"):
+    """
+    Write the list of files collected in `untagged_files` to a JSON file.
+
+    The parent folder of the output file is created when it does not exist.
+
+    :param output_file: path of the JSON file to write
+    """
+    import json
+    import os
+
+    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
+    # raises FileNotFoundError, so only create the folder when there is one.
+    output_dir = os.path.dirname(output_file)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    print('save_untagged_files', len(untagged_files))
+    # Keep non-ASCII characters of the file paths readable in the JSON output
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(untagged_files, f, ensure_ascii=False, indent=4)
+
+
+def extract_tags_from_file(file_path, excluded_tags, auto_detected_tags_list=global_config['auto_tag_terms']):
    tags = set()
    with open(file_path, 'r', encoding='utf-8') as file_content:
        tag_found = False
        for line in file_content:
-            if global_config['automatic_tagging_enabled']:
-                tags = add_tags_from_content(tags, line, global_config['auto_tag_terms'])
+            if global_config['automatic_tagging_enabled'] and len(auto_detected_tags_list) > 0:
+                tags = add_tags_from_content(tags, line, auto_detected_tags_list)
            # Check for orgmode tags :tag1:tag2:
-            if ':' in line:
-                for word in line.split():
-                    if len(word) and word.startswith(':') and word.endswith(':'):
-                        tag = word[1:-1]
-                        if tag not in excluded_tags:
-                            tags.add(tag)
+            if global_config.get('automatic_tagging_org_files', True):
+                if ':' in line:
+                    for word in line.split():
+                        if len(word) > 1 and word.startswith(':') and word.endswith(':'):
+                            tag = word[1:-1]
+                            if tag not in excluded_tags:
+                                tags.add(tag)
                            tag_found = True
            # Check for #+tags: tag1,tag2
            if line.startswith('#+tags:'):
@ -236,7 +261,8 @@ def extract_tags_from_file(file_path, excluded_tags):
                        tags.add(tag)
                        tag_found = True

-    # if not tag_found:
+    if not tag_found:
+        untagged_files.append(file_path)
    #     print('no tag in the article', file_path)
    return tags

@ -380,3 +406,110 @@ def convert_org_to_html(org_file, output_html_file):
        print(f"Conversion réussie : {org_file} -> {output_html_file}")
    except Exception as e:
        print(f"Erreur lors de la conversion de {org_file} : {e}")
+
+
+
+def get_first_picture_url(content):
+    """
+    Return the target of the first org-mode link ("[[...]]") in the content.
+
+    Both bare links ("[[target]]") and described links
+    ("[[target][description]]") are recognised; only the target is returned.
+    No check is made that the target actually points to an image.
+
+    Args:
+        content (str): org text to search
+
+    Returns:
+        str | None: the link target, or None when the content has no link
+    """
+    # The optional "[description]" group keeps the description out of the
+    # captured target for links written as [[target][description]].
+    pattern = r'\[\[(.*?)\](?:\[.*?\])?\]'
+    match = re.search(pattern, content)
+    return match.group(1) if match else None
+
+
+def org_to_gmi(org_text: str, output_filename_slug: str) -> str:
+    """
+    Convert Org text with pypandoc and save the result as a ".gmi" file.
+
+    The text is converted from Org to pandoc's "markdown" format; that output
+    is written unchanged to "<destination_gmi>/<output_filename_slug>.gmi".
+
+    Args:
+    - org_text (str): Org-formatted text to convert.
+    - output_filename_slug (str): name of the output file inside
+      `destination_gmi`, without the ".gmi" extension.
+
+    Returns:
+    - The converted text, or None when pypandoc raised a RuntimeError.
+      A failure to write the file is only printed; the converted text is
+      still returned in that case.
+    """
+    # Convert the Org text through Pandoc
+    try:
+        output = pypandoc.convert_text(org_text, 'markdown', format='org')
+    except RuntimeError as e:
+        print(f"Erreur de conversion : {e}")
+        return None
+
+    # Save the converted content; a write error is reported, not raised
+    gmi_path = destination_gmi + '/' + output_filename_slug + '.gmi'
+    try:
+        with open(gmi_path, 'w', encoding='utf-8') as f:
+            f.write(output)
+        print(f"Fichier GMI sauvegardé avec succès : {output_filename_slug}")
+    except OSError as e:
+        print(f"Erreur lors de la sauvegarde du fichier : {e}")
+    return output
+
+def count_files_in_directories(directories):
+    """
+    Count the files found under each of the given directories, recursively.
+
+    Args:
+        directories: iterable of directory paths to walk
+
+    Returns:
+        int: total number of files seen across all the walks (a directory
+        listed twice is counted twice)
+    """
+    return sum(
+        len(filenames)
+        for directory in directories
+        for _root, _dirs, filenames in os.walk(directory)
+    )
+
+
+def convert_org_to_gemini(org_content):
+    """
+    Convert org content to gemini using pandoc and md2gemini.
+
+    Args:
+        org_content (str): org-formatted content
+
+    Returns:
+        str: content converted to gemini format, stripped of leading and
+        trailing whitespace
+
+    Raises:
+        subprocess.CalledProcessError: pandoc exited with a non-zero status.
+        Exception: any other failure is printed and re-raised unchanged.
+    """
+    try:
+        # Step 1: org -> markdown with pandoc. The content is piped through
+        # stdin instead of a temporary file: a NamedTemporaryFile that is
+        # still open cannot be re-opened by another process on Windows.
+        # pandoc reads and writes UTF-8, so the encoding is set explicitly
+        # rather than left to the locale.
+        pandoc_result = subprocess.run(
+            ['pandoc', '-f', 'org', '-t', 'markdown'],
+            input=org_content,
+            capture_output=True,
+            encoding='utf-8',
+            check=True,
+        )
+        markdown_content = pandoc_result.stdout
+
+        # Step 2: markdown -> gemini with md2gemini
+        gemini_content = md2gemini(
+            markdown_content,
+            frontmatter=True,
+            links='inline',
+        )
+
+        return gemini_content.strip()
+
+    except subprocess.CalledProcessError as e:
+        print(f"Erreur lors de la conversion avec pandoc: {e.stderr}")
+        raise
+    except Exception as e:
+        print(f"Erreur lors de la conversion: {str(e)}")
+        raise