convert to gemini avec md2gemini, conversion de plusieurs langues

2025-10-09 17:02:45 +02:00 · 2025-02-27 16:18:47 +01:00 · 2025-02-27 16:18:47 +01:00 · bba1df0377
commit bba1df0377
parent 255e8fdc04
10 changed files with 462 additions and 202 deletions
--- a/linking_articles_prev_next.py
+++ b/linking_articles_prev_next.py
@ -42,7 +42,12 @@ def get_basename(file_name):
    return os.path.splitext(file_name)[0]

 # Chemin du dossier contenant les fichiers orgmode
-directory = f'sources/{args.blog}/lang_fr'
+directory_pages = f'sources/{args.blog}/'
+directory_fr = f'sources/{args.blog}/lang_fr'
+directory_en = f'sources/{args.blog}/lang_en'
+
+directories_to_scan = [directory_pages, directory_fr, directory_en]
+
 destination_json = f'sources/{args.blog}/build'
 destination_html = f'html-websites/{args.blog}/'
 destination_gmi = f'gemini-capsules/{args.blog}/'
@ -64,188 +69,157 @@ else:
    files_dict = {}


-def get_first_picture_url(content):
-    # Utiliser une expression régulière pour trouver la première URL d'image dans le contenu
-    pattern = r'\[\[(.*?)\]\]'
-    match = re.search(pattern, content)
-    if match:
-        return match.group(1)
-    else:
-        return None


-def org_to_gmi(org_text: str, output_filename_slug: str) -> str:
-    """
-    Convertit un texte au format Org en un fichier au format GMI (Gemini)
-    en utilisant pypandoc.
+count_articles = count_files_in_directories(directories_to_scan)

-    Args:
-    - org_text (str): Le texte au format Org à convertir.
-    - output_file (str): Chemin du fichier de sortie au format GMI, sans avoir à préciser l'extension.
-    """
-    output = """
-# mock land output
-===========
-
-blah blah blah
-
-----------------
-Tykayn blog mock content
-----------------
-
-Navigation:
-
-=> accueil.gmi Accueil 
-=> a-propos.gmi à propos
-    """
-    # Conversion du texte Org en GMI via Pandoc
-    try:
-        output = pypandoc.convert_text(org_text, 'markdown', format='org')
-    except RuntimeError as e:
-        print(f"Erreur de conversion : {e}")
-        return
-
-    # Sauvegarde du contenu GMI dans un fichier
-    try:
-        with open(destination_gmi+'/'+output_filename_slug+'.gmi', 'w', encoding='utf-8') as f:
-            f.write(output)
-        print(f"Fichier GMI sauvegardé avec succès : {output_filename_slug}")
-    except OSError as e:
-        print(f"Erreur lors de la sauvegarde du fichier : {e}")
-    return output
-
-count_articles = len(os.listdir(directory))
 counter=0
 rebuild_counter = 0
 pandoc_runs_counter = 0
+lang_folder = global_config.get('lang_default', 'fr')

 if generate_linkings_json :
    
    print(f"Génération des liens entre articles pour {count_articles} articles")
    print(f"run_pandoc: {run_pandoc}")
    print(f"run_gemini: {run_gemini}")
-
+    article_type = "article"
    # Parcourir les fichiers du dossier
-    for file_name in os.listdir(directory):
-        if file_name.endswith('.org'):
-            counter+=1
-            if force_html_regen and counter % 10 == 0:
-                print(f"{time.strftime('%H:%M:%S')} : Articles traités : {counter}/{count_articles}")
-            file_path = os.path.join(directory, file_name)
-            with open(file_path, "r", encoding="utf-8") as f:
-                content = f.read()
-                date_modified = time.ctime(os.path.getmtime(file_path))
+    
+    for index, directory in enumerate(directories_to_scan):
+        # Decide the entry type from the scanned folder: files sitting directly in
+        # the blog root folder are standalone pages, files in lang_* folders are articles.
+        # (Comparing against '/' never matched any scanned path, so nothing was ever a "page".)
+        if directory == directory_pages:
+            article_type = "page"
+        else:
+            article_type = "article"
+        # Take the language from a "lang_xx" folder name; for any other folder fall back
+        # to the configured default so the value does not leak from the previous folder.
+        folder_name = directory.split('/')[-1]
+        if folder_name.startswith('lang_'):
+            lang_folder = folder_name[len('lang_'):]  # characters after "lang_"
+        else:
+            lang_folder = global_config.get('lang_default', 'fr')
+        for file_name in os.listdir(directory):
+            if file_name.endswith('.org'):
+                counter+=1
+                if force_html_regen and counter % 10 == 0:
+                    print(f"{time.strftime('%H:%M:%S')} : Articles traités : {counter}/{count_articles}")
+                file_path = os.path.join(directory, file_name)
+                with open(file_path, "r", encoding="utf-8") as f:
+                    content = f.read()
+                    date_modified = time.ctime(os.path.getmtime(file_path))

-                basename = get_basename(file_name)
-                date_str, annee, slug = find_year_and_slug_on_filename(basename)
-                tags = extract_tags_from_file(file_path, global_config['excluded_tags'])
-                
-                # Convertir les tags en liste si c'est un set
-                if isinstance(tags, set):
-                    tags = list(tags)
-                boom = basename.split('__')
-                # Convertir le contenu Org en HTML
-                title = find_first_level1_title(content)
-
-                # Désactiver les warning d'identifiant dupliqué dans la conversion pandoc
-                content_without_h1 = re.sub(r'^\*.*?$', '', content, count=1, flags=re.MULTILINE)
-
-                gemini_content = ''
-                html_content = ''
-                html_content_without_h1 = ''
-                # Vérifier l'existence du fichier HTML pour déterminer last_html_build
-                html_path = f"html-websites/{args.blog}/{annee}/{slug}/index.html"
-                last_html_build_time = None
-                if os.path.exists(html_path):
-                    # Obtenir la date de création du fichier HTML
-                    last_html_build_time = os.path.getctime(html_path)
-
-                    # print(f"last_html_build: {last_html_build_time} : {html_path}")
-                else:
-                    print(f"----------- last_html_build html_path: {html_path} n'existe pas")
-                # Vérifier l'existence du fichier Gemini pour déterminer last_gemini_build
-                gemini_path = f"gemini-capsules/{args.blog}/{slug}.gmi"
-                last_gemini_build = None
-                rebuild_this_article_gemini = False
-                if os.path.exists(gemini_path):
-                    last_gemini_build = time.ctime(os.path.getmtime(gemini_path))
-                # Vérifier si l'article doit être reconstruit en comparant les dates de modification
-                if last_gemini_build:
-                    file_modified_time = os.path.getmtime(file_path)
-                    last_build_time = time.mktime(time.strptime(last_gemini_build))
-                    rebuild_this_article_gemini = file_modified_time > last_build_time
-                else:
+                    basename = get_basename(file_name)
+                    date_str, annee, slug = find_year_and_slug_on_filename(basename)
+                    tags = extract_tags_from_file(file_path, global_config['excluded_tags'])
                    
-                    rebuild_this_article_gemini = True
+                    # Convertir les tags en liste si c'est un set
+                    if isinstance(tags, set):
+                        tags = list(tags)
+                    boom = basename.split('__')
+                    # Convertir le contenu Org en HTML
+                    title = find_first_level1_title(content)

-                # Vérifier si l'article doit être reconstruit en comparant les dates de modification
-                rebuild_this_article_html = False
-                if last_html_build_time:
-                    file_modified_time = os.path.getmtime(file_path)
-                    # print(f"--------- file_modified_time: {file_path} : {file_modified_time}")
-                    # Obtenir l'heure de dernière modification du fichier HTML
-                    
-                    rebuild_this_article_html = file_modified_time > last_html_build_time
-                    # print(f"--------- article modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}")
-                else:
-                    # si il n'y a pas de fichier html, on le construit pour la première fois
-                    print('on reconstruit le html de l\'article', file_name)
-                    
-                    rebuild_this_article_html = True
-                
-                if rebuild_this_article_html:
-                    rebuild_counter += 1
+                    # Désactiver les warning d'identifiant dupliqué dans la conversion pandoc
+                    content_without_h1 = re.sub(r'^\*.*?$', '', content, count=1, flags=re.MULTILINE)

+                    gemini_content = ''
+                    html_content = ''
+                    html_content_without_h1 = ''
+                    # Vérifier l'existence du fichier HTML pour déterminer last_html_build
+                    html_path = f"html-websites/{args.blog}/{annee}/{slug}/index.html"
+                    last_html_build_time = None
+                    if os.path.exists(html_path):
+                        # Obtenir la date de création du fichier HTML
+                        last_html_build_time = os.path.getctime(html_path)

-                # Garder le contenu HTML existant si déjà présent
-                if f"{annee}/{slug}" in files_dict and 'html_content' in files_dict[f"{annee}/{slug}"]:
-                    print('on reprend le contenu html existant')
-                    if len(files_dict[f"{annee}/{slug}"]['html_content']) > 0:
-                        html_content = files_dict[f"{annee}/{slug}"]['html_content']
-                    if len(files_dict[f"{annee}/{slug}"]['html_content_without_h1']) > 0:
-                        html_content_without_h1 = files_dict[f"{annee}/{slug}"]['html_content_without_h1']
+                        # print(f"last_html_build: {last_html_build_time} : {html_path}")
                    else:
-                        html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
+                        print(f"----------- last_html_build html_path: {html_path} n'existe pas")
+                    # Vérifier l'existence du fichier Gemini pour déterminer last_gemini_build
+                    gemini_path = f"gemini-capsules/{args.blog}/{slug}.gmi"
+                    last_gemini_build = None
+                    rebuild_this_article_gemini = False
+                    if os.path.exists(gemini_path):
+                        last_gemini_build = time.ctime(os.path.getmtime(gemini_path))
+                    # Vérifier si l'article doit être reconstruit en comparant les dates de modification
+                    if last_gemini_build:
+                        file_modified_time = os.path.getmtime(file_path)
+                        last_build_time = time.mktime(time.strptime(last_gemini_build))
+                        rebuild_this_article_gemini = file_modified_time > last_build_time
+                    else:
+                        
+                        rebuild_this_article_gemini = True

-                if run_pandoc and rebuild_this_article_html or force_html_regen:
-                    # convertir le contenu d'article org vers html
-                    print(f"\033[91mBRRRRRRRRRRRRR pandoc time {time.strftime('%H:%M:%S')} : Conversion de {file_name} en html\033[0m")
+                    # Vérifier si l'article doit être reconstruit en comparant les dates de modification
+                    rebuild_this_article_html = False
+                    if last_html_build_time:
+                        file_modified_time = os.path.getmtime(file_path)
+                        # print(f"--------- file_modified_time: {file_path} : {file_modified_time}")
+                        # Obtenir l'heure de dernière modification du fichier HTML
+                        
+                        rebuild_this_article_html = file_modified_time > last_html_build_time
+                        # print(f"--------- article modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}")
+                    else:
+                        # si il n'y a pas de fichier html, on le construit pour la première fois
+                        print('on reconstruit le html de l\'article', file_name)
+                        
+                        rebuild_this_article_html = True
                    
-                    html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
-                    pandoc_runs_counter += 1
-                else:
-                    html_content = content_without_h1
+                    if rebuild_this_article_html:
+                        rebuild_counter += 1

-                if run_gemini and rebuild_this_article_gemini:
-                    os.makedirs(destination_gmi, exist_ok=True)
-                    # convertir le contenu d'article org vers gmi pour la capsule gemini
-                    gemini_content = org_to_gmi(content_without_h1, slug)
+
+                    # Garder le contenu HTML existant si déjà présent
+                    if f"{annee}/{slug}" in files_dict and 'html_content' in files_dict[f"{annee}/{slug}"]:
+                        print('on reprend le contenu html existant')
+                        if len(files_dict[f"{annee}/{slug}"]['html_content']) > 0:
+                            html_content = files_dict[f"{annee}/{slug}"]['html_content']
+                        if len(files_dict[f"{annee}/{slug}"]['html_content_without_h1']) > 0:
+                            html_content_without_h1 = files_dict[f"{annee}/{slug}"]['html_content_without_h1']
+                        else:
+                            html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
+
+                    # Convert when pandoc is enabled and the Org source is newer than the built
+                    # HTML, or unconditionally when a full regeneration is forced.
+                    if (run_pandoc and rebuild_this_article_html) or force_html_regen:
+                        # Print one red dot per conversion as a compact progress indicator.
+                        print(f"\033[91m.\033[0m", end='', flush=True)
+
+                        html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
+                        pandoc_runs_counter += 1
+                    elif not html_content:
+                        # No conversion ran and no previously stored HTML was recovered from
+                        # files_dict: fall back to the raw Org text. When stored HTML exists it
+                        # is kept instead of being overwritten with unconverted Org markup.
+                        html_content = content_without_h1
+
+                    if run_gemini and rebuild_this_article_gemini:
+                        os.makedirs(destination_gmi, exist_ok=True)
+                        # convertir le contenu d'article org vers gmi pour la capsule gemini
+                        print(f"Conversion de {file_name} en gemini")
+                        gemini_content = org_to_gmi(content_without_h1, slug)





-                files_dict[f"{annee}/{slug}"] = {
-                    'path': file_path,
-                    'basename': basename,
-                    'roam_id': find_org_roam_id(content),
-                    'slug': f"{slug}/",
-                    'slug_with_year': f"{annee}/{slug}",
-                    'date': boom[0],
-                    'date_modified' : date_modified,
-                    'first_picture_url' : get_first_picture_url(content),
-                    'date_formattee': datetime.strptime(date_str, '%Y%m%d%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 14 else datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 15 else datetime.strptime(date_str, '%Y-%m-%d').strftime('%d %B %Y'),
-                    'annee': annee,
-                    'tags': tags,
-                    'title': title,
-                    'next': None,
-                    'previous': None,
-                    'last_html_build': last_html_build_time,
-                    'last_gemini_build': last_gemini_build,
-                    'org_content': content,  # Contenu Org original
-                    'html_content_without_h1': html_content_without_h1,  # Contenu HTML converti sans le titre de premier niveau
-                    'html_content': html_content  # Contenu first_picture_urlHTML converti
-                }
+                    # Store this file's metadata and content under its "year/slug" key;
+                    # an existing entry with the same key is replaced.
+                    files_dict[f"{annee}/{slug}"] = {
+                        'path': file_path,
+                        'basename': basename,
+                        'roam_id': find_org_roam_id(content),
+                        'slug': f"{slug}/",
+                        'slug_with_year': f"{annee}/{slug}",
+                        'date': boom[0],
+                        'lang': lang_folder,
+                        'article_type': article_type,
+                        'date_modified' : date_modified,
+                        'first_picture_url' : get_first_picture_url(content),
+                        # date_str is parsed by length: 14 chars as YYYYmmddHHMMSS, 15 chars as YYYYmmddTHHMMSS, anything else as YYYY-mm-dd.
+                        'date_formattee': datetime.strptime(date_str, '%Y%m%d%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 14 else datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 15 else datetime.strptime(date_str, '%Y-%m-%d').strftime('%d %B %Y'),
+                        'annee': annee,
+                        'tags': tags,
+                        'title': title,
+                        # Initialised empty here; presumably filled in by a later linking step (not visible in this hunk).
+                        'next': None,
+                        'previous': None,
+                        'last_html_build': last_html_build_time,
+                        'last_gemini_build': last_gemini_build,
+                        'org_content': content,  # original Org content
+                        'html_content_without_h1': html_content_without_h1,  # HTML content without the level-1 title
+                        'html_content': html_content,  # HTML content of the article
+                        'gemini_content': gemini_content,  # Gemini content
+                    }

    print(f"======= Nombre d'articles reconstruits: {rebuild_counter}")
    print(f"======= Nombre de runs de pandoc: {pandoc_runs_counter}")