remake detection already built

2025-06-20 09:04:42 +02:00 · 2025-02-28 13:12:02 +01:00 · 2025-02-28 13:12:02 +01:00 · 3b9061514f
commit 3b9061514f
parent a029792664
24 changed files with 248 additions and 1696 deletions
--- a/linking_articles_prev_next.py
+++ b/linking_articles_prev_next.py
@@ -100,172 +100,197 @@ if generate_linkings_json :
    # Parcourir les fichiers du dossier
    
    for index, directory in enumerate(directories_to_scan):
-        print(f"Traitement du dossier {directory}, {index}/{len(directories_to_scan)}") 
-        for index, subdir in enumerate(os.listdir(directory)):
-            print(f"Traitement du dossier {directory}/{subdir}, {index}/{len(os.listdir(directory))}")
-            if subdir == ".." or subdir == "build" or subdir == "templates" or subdir == "img":
+        print(f"Traitement du dossier {directory}, {index+1}/{len(directories_to_scan)}")
+        try:
+            # Vérifier si le répertoire existe
+            if not os.path.exists(directory):
+                print(f"Le répertoire {directory} n'existe pas, on passe au suivant")
                continue
-            # Déterminer le type d'article en fonction du chemin
-            if directory == '/':
-                article_type = "page"
-            else:
-                article_type = "article"
-            # Extraire la langue du dossier si elle commence par "lang_"
-            if directory.split('/')[-1].startswith('lang_'):
-                lang_folder = directory.split('/')[-1][5:]  # Prend les caractères après "lang_"
-            for index, file_name in enumerate(os.listdir(f'{directory}/{subdir}')):
-                print(f"directory: {subdir}, {article_type}, {file_name}, {index}/{len(os.listdir(f'{directory}/{subdir}'))}")
-                # Vérifier si le fichier se termine par une extension supportée
-                if not (file_name.endswith('.org') or file_name.endswith('.md') or file_name.endswith('.gmi')):
-                    print(f"Fichier {file_name} non supporté")
-                    continue
-                if file_name.endswith('.org'):
-                    counter+=1
-                    # print(f"Traitement de l'article {counter}/{count_articles} {file_name}")
-                    file_path = os.path.join(directory, subdir, file_name)
-                    if force_html_regen and counter % 10 == 0:
-                        print(f"{time.strftime('%H:%M:%S')} : Articles traités : {counter}/{count_articles}")
-                        
-                    # on ouvre chacun des fichiers interprétables
-                    # pour déterminer les informations qu'il contient
-                    # afin de les stocker dans un json pour la génération des pages html et gemini
-                    with open(file_path, "r", encoding="utf-8") as f:
-                        print(f"----- Traitement de l'article {counter}/{count_articles} {file_name}")
-                        content = f.read()
-                        # Convertir le contenu Org en HTML
-                        title = find_first_level1_title(content)
+            
+            subdirs = [d for d in os.listdir(directory) 
+                       if os.path.isdir(os.path.join(directory, d)) 
+                       and d not in ["..", "build", "templates", "img"]]
+            
+            for subdir_index, subdir in enumerate(subdirs):
+                print(f"Traitement du sous-dossier {subdir}, {subdir_index+1}/{len(subdirs)}")
+                subdir_path = os.path.join(directory, subdir)
+                
+                try:
+                    # Liste tous les fichiers du sous-dossier avec les extensions supportées
+                    files = [f for f in os.listdir(subdir_path)
+                            if f.endswith(('.org', '.md', '.gmi'))]
+                    
+                    for file_index, file_name in enumerate(files):
+                        print(f"Traitement du fichier {file_name}, {file_index+1}/{len(files)}")
+                        # Vérifier si le fichier se termine par une extension supportée
+                        if not (file_name.endswith('.org') or file_name.endswith('.md') or file_name.endswith('.gmi')):
+                            print(f"Fichier {file_name} non supporté")
+                            continue
+                        if file_name.endswith('.org'):
+                            counter+=1
+                            # print(f"Traitement de l'article {counter}/{count_articles} {file_name}")
+                            file_path = os.path.join(directory, subdir, file_name)
+                            if force_html_regen and counter % 10 == 0:
+                                print(f"{time.strftime('%H:%M:%S')} : Articles traités : {counter}/{count_articles}")
+                                
+                            # on ouvre chacun des fichiers interprétables
+                            # pour déterminer les informations qu'il contient
+                            # afin de les stocker dans un json pour la génération des pages html et gemini
+                            with open(file_path, "r", encoding="utf-8") as f:
+                                print(f"----- Traitement de l'article {counter}/{count_articles} {file_name}")
+                                content = f.read()
+                                # Convertir le contenu Org en HTML
+                                title = find_first_level1_title(content)

-                        date_modified = time.ctime(os.path.getmtime(file_path))
+                                date_modified = time.ctime(os.path.getmtime(file_path))

-                        rebuild_this_article_gemini = False
-                        rebuild_this_article_html = False
-
-                        basename = get_basename(file_name)
-                        date_str, annee, slug = find_year_and_slug_on_filename(basename)
-                        slug = slugify_title(title)
-                        tags = extract_tags_from_file(file_path, global_config['excluded_tags'])
-                        
-                        # Convertir les tags en liste si c'est un set
-                        if isinstance(tags, set):
-                            tags = list(tags)
-                        boom = basename.split('__')
-                        
-
-                        # Désactiver les warning d'identifiant dupliqué dans la conversion pandoc
-                        content_without_h1 = re.sub(r'^\*.*?$', '', content, count=1, flags=re.MULTILINE)
-
-                        gemini_content = ''
-                        html_content = ''
-                        html_content_without_h1 = ''
-                        # Vérifier l'existence du fichier HTML pour déterminer last_html_build
-                        html_path = f"html-websites/{args.blog}/{annee}/{slug}/index.html"
-                        print(f"html_path existe il? : {html_path}")
-                        last_html_build_time = None
-                        if os.path.exists(html_path):
-                            # Obtenir la date de création du fichier HTML
-                            last_html_build_time = os.path.getctime(html_path)
-
-                            print(f"----- last_html_build EXISTE: {last_html_build_time} : {html_path}")
-                        else:
-                            print(f"html_path n'existe pas: on va le créer")
-                            #print(f"----------- last_html_build html_path: {html_path} n'existe pas")
-                        # Vérifier l'existence du fichier Gemini pour déterminer last_gemini_build
-                        
-                        gemini_path = f"./gemini-capsules/{args.blog}/{annee}/{slug}.gmi"
-                        last_gemini_build = None
-                        if os.path.exists(gemini_path):
-                            last_gemini_build = time.ctime(os.path.getmtime(gemini_path))
-                        # Vérifier si l'article doit être reconstruit en comparant les dates de modification
-                        if last_gemini_build:
-                            file_modified_time = os.path.getmtime(file_path)
-                            last_build_time = time.mktime(time.strptime(last_gemini_build))
-                            rebuild_this_article_gemini = file_modified_time > last_build_time
-                        else:
-                            rebuild_this_article_gemini = True
-                        # print(f"rebuild_this_article_gemini: {rebuild_this_article_gemini}")
-                        
-                        # Vérifier si l'article doit être reconstruit en comparant les dates de modification
-                        
-                        if last_html_build_time:
-                            file_modified_time = os.path.getmtime(file_path)
-                            print(f"--------- file_modified_time: {file_path} : {file_modified_time}")
-                            print(f"--------- last_html_build_time: {last_html_build_time}")
-                            # Obtenir l'heure de dernière modification du fichier HTML
-                            
-                            rebuild_this_article_html = file_modified_time > last_html_build_time
-                            if rebuild_this_article_html:
-                                print(f"--------- article modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}")
-                            else:
-                                print(f"--------- article non modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}, on ne recrée pas")
+                                rebuild_this_article_gemini = False
                                rebuild_this_article_html = False
-                        else:
-                            # si il n'y a pas de fichier html, on le construit pour la première fois
-                            print('on reconstruit le html de l\'article', file_name)
-                            rebuild_this_article_html = True
-                        
-                        if rebuild_this_article_html:
-                            rebuild_counter += 1
+
+                                basename = get_basename(file_name)
+                                date_str, annee, slug = find_year_and_slug_on_filename(basename)
+                                slug = slugify_title(title)
+                                tags = extract_tags_from_file(file_path, global_config['excluded_tags'])
+                                
+                                # Convertir les tags en liste si c'est un set
+                                if isinstance(tags, set):
+                                    tags = list(tags)
+                                boom = basename.split('__')
+                                
+
+                                # Désactiver les warning d'identifiant dupliqué dans la conversion pandoc
+                                content_without_h1 = re.sub(r'^\*.*?$', '', content, count=1, flags=re.MULTILINE)
+
+                                gemini_content = ''
+                                html_content = ''
+                                html_content_without_h1 = ''
+                                # Vérifier l'existence du fichier HTML pour déterminer last_html_build
+                                html_path = f"html-websites/{args.blog}/{annee}/{slug}/index.html"
+                                print(f"html_path existe il? : {html_path}")
+                                last_html_build_time = None
+                                if os.path.exists(html_path):
+                                    # Obtenir la date de création du fichier HTML
+                                    last_html_build_time = os.path.getctime(html_path)
+
+                                    print(f"----- last_html_build EXISTE: {last_html_build_time} : {html_path}")
+                                else:
+                                    print(f"html_path n'existe pas: on va le créer")
+                                # Vérifier l'existence du fichier Gemini pour déterminer last_gemini_build
+                                
+                                gemini_path = f"./gemini-capsules/{args.blog}/{annee}/{slug}.gmi"
+                                last_gemini_build = None
+                                if os.path.exists(gemini_path):
+                                    try:
+                                        # Obtenir directement le timestamp au lieu de la chaîne de date
+                                        last_gemini_build_time = os.path.getmtime(gemini_path)
+                                        last_gemini_build = time.ctime(last_gemini_build_time)
+                                        
+                                        # Vérifier si l'article doit être reconstruit
+                                        file_modified_time = os.path.getmtime(file_path)
+                                        rebuild_this_article_gemini = file_modified_time > last_gemini_build_time
+                                    except Exception as e:
+                                        print(f"Erreur lors de la vérification des dates pour {gemini_path}: {e}")
+                                        rebuild_this_article_gemini = True
+                                else:
+                                    rebuild_this_article_gemini = True
+                                # print(f"rebuild_this_article_gemini: {rebuild_this_article_gemini}")
+                                
+                                # Vérifier si l'article doit être reconstruit en comparant les dates de modification
+                                
+                                if last_html_build_time:
+                                    file_modified_time = os.path.getmtime(file_path)
+                                    print(f"--------- file_modified_time: {file_path} : {file_modified_time}")
+                                    print(f"--------- last_html_build_time: {last_html_build_time}")
+                                    # Obtenir l'heure de dernière modification du fichier HTML
+                                    
+                                    rebuild_this_article_html = file_modified_time > last_html_build_time
+                                    if rebuild_this_article_html:
+                                        print(f"--------- article modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}")
+                                    else:
+                                        print(f"--------- article non modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}, on ne recrée pas")
+                                        rebuild_this_article_html = False
+                                else:
+                                    # si il n'y a pas de fichier html, on le construit pour la première fois
+                                    print('on reconstruit le html de l\'article', file_name)
+                                    rebuild_this_article_html = True
+                                
+                                if rebuild_this_article_html:
+                                    rebuild_counter += 1


-                        # Garder le contenu HTML existant si déjà présent
-                        if f"{annee}/{slug}" in files_dict and 'html_content' in files_dict[f"{annee}/{slug}"]:
-                            #print('on reprend le contenu html existant')
-                            if len(files_dict[f"{annee}/{slug}"]['html_content']) > 0:
-                                html_content = files_dict[f"{annee}/{slug}"]['html_content']
-                            if len(files_dict[f"{annee}/{slug}"]['html_content_without_h1']) > 0:
-                                html_content_without_h1 = files_dict[f"{annee}/{slug}"]['html_content_without_h1']
-                            else:
-                                html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
+                                # Garder le contenu HTML existant si déjà présent
+                                if f"{annee}/{slug}" in files_dict and 'html_content' in files_dict[f"{annee}/{slug}"]:
+                                    #print('on reprend le contenu html existant')
+                                    if len(files_dict[f"{annee}/{slug}"]['html_content']) > 0 :
+                                        html_content = files_dict[f"{annee}/{slug}"]['html_content']
+                                    if len(files_dict[f"{annee}/{slug}"]['html_content_without_h1']) > 0 :
+                                        html_content_without_h1 = files_dict[f"{annee}/{slug}"]['html_content_without_h1']
+                                    else:
+                                        html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
+                                else:
+                                    print(f"\033[91m {time.strftime('%H:%M:%S')} BRRRRRRRRRRRRR pandoc : {title} en html\033[0m")
+                                    pandoc_runs_counter += 1
+                                    html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
+                                    html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
+                                    

+                                    
+                                if run_pandoc and rebuild_this_article_html or force_html_regen:
+                                    # convertir le contenu d'article org vers html
+                                    print(f"\033[91m {time.strftime('%H:%M:%S')} BRRRRRRRRRRRRR pandoc : {title} en html\033[0m")
+                                    # print(f"\033[91m.\033[0m", end='', flush=True)
+                                    if not html_content:
+                                        html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
+                                        html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
+                                        pandoc_runs_counter += 1
+                                # else:
+                                #     html_content = content_without_h1

-                            
-                        if run_pandoc and rebuild_this_article_html or force_html_regen:
-                            # convertir le contenu d'article org vers html
-                            print(f"\033[91m {time.strftime('%H:%M:%S')} BRRRRRRRRRRRRR pandoc : {title} en html\033[0m")
-                            # print(f"\033[91m.\033[0m", end='', flush=True)
-                            
-                            html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
-                            pandoc_runs_counter += 1
-                        else:
-                            html_content = content_without_h1
+                                # if run_gemini and rebuild_this_article_gemini:
+                                #     #print('-----------on régénère le gemini')
+                                #     # convertir le contenu d'article org vers gmi pour la capsule gemini
+                                #     gemini_content = org_to_gmi(content_without_h1)
+                                #     #print('len(gemini_content)', len(gemini_content))
+                                # else:
+                                #     print('-----------on ne régénère pas le gemini')
+                                if rebuild_this_article_gemini:
+                                    print(f"\033[91m {time.strftime('%H:%M:%S')} BRRRRRRRRRRRRR gemini : {title} en gmi\033[0m")
+                                    gemini_content = org_to_gmi(content_without_h1)

-                        # if run_gemini and rebuild_this_article_gemini:
-                        #     #print('-----------on régénère le gemini')
-                        #     # convertir le contenu d'article org vers gmi pour la capsule gemini
-                        #     gemini_content = org_to_gmi(content_without_h1)
-                        #     #print('len(gemini_content)', len(gemini_content))
-                        # else:
-                        #     print('-----------on ne régénère pas le gemini')
-                        if rebuild_this_article_gemini:
-                            print(f"\033[91m {time.strftime('%H:%M:%S')} BRRRRRRRRRRRRR gemini : {title} en gmi\033[0m")
-                            gemini_content = org_to_gmi(content_without_h1)
+                                files_dict[f"{annee}/{slug}"] = {
+                                    'path': file_path,
+                                    'basename': basename,
+                                    'roam_id': find_org_roam_id(content),
+                                    'slug': f"{slug}/",
+                                    'slug_with_year': f"{annee}/{slug}",
+                                    'date': boom[0],
+                                    'lang': lang_folder,
+                                    'article_type': article_type,
+                                    'date_modified' : date_modified,
+                                    'first_picture_url' : get_first_picture_url(content),
+                                    'date_formattee': format_date_str(date_str),
+                                    'annee': annee,
+                                    'tags': tags,
+                                    'title': title,
+                                    'next': None,
+                                    'previous': None,
+                                    'last_html_build': last_html_build_time,
+                                    'last_gemini_build': last_gemini_build,
+                                    'org_content': content,  # Contenu Org original
+                                    'html_content_without_h1': html_content_without_h1,  # Contenu HTML converti sans le titre de premier niveau
+                                    'html_content': html_content,  # Contenu first_picture_urlHTML converti
+                                    'gemini_content': gemini_content, 
+                                    'gemini_file_path': f"{annee}/{slug}.gmi",
+                                        # Contenu gemini
+                                }

-                        files_dict[f"{annee}/{slug}"] = {
-                            'path': file_path,
-                            'basename': basename,
-                            'roam_id': find_org_roam_id(content),
-                            'slug': f"{slug}/",
-                            'slug_with_year': f"{annee}/{slug}",
-                            'date': boom[0],
-                            'lang': lang_folder,
-                            'article_type': article_type,
-                            'date_modified' : date_modified,
-                            'first_picture_url' : get_first_picture_url(content),
-                            'date_formattee': format_date_str(date_str),
-                            'annee': annee,
-                            'tags': tags,
-                            'title': title,
-                            'next': None,
-                            'previous': None,
-                            'last_html_build': last_html_build_time,
-                            'last_gemini_build': last_gemini_build,
-                            'org_content': content,  # Contenu Org original
-                            'html_content_without_h1': html_content_without_h1,  # Contenu HTML converti sans le titre de premier niveau
-                            'html_content': html_content,  # Contenu first_picture_urlHTML converti
-                            'gemini_content': gemini_content, 
-                            'gemini_file_path': f"{annee}/{slug}.gmi",
-                                # Contenu gemini
-                        }
+                except OSError as e:
+                    print(f"Erreur lors de la lecture du sous-dossier {subdir_path}: {e}")
+                    continue
+                
+        except OSError as e:
+            print(f"Erreur lors de la lecture du dossier {directory}: {e}")
+            continue

    print(f"======= Nombre d'articles reconstruits: {rebuild_counter}")
    print(f"======= Nombre de runs de pandoc: {pandoc_runs_counter}")
@@ -497,41 +522,3 @@ generate_article_pages(destination_json + '/articles_info.json', 'templates/html
 # À la fin du script, calculer et afficher le temps d'exécution
 execution_time = time.time() - start_time
 #print(f"Temps d'exécution : {execution_time:.2f} secondes")
-
-def format_date_str(date_str):
-    """
-    Formate une chaîne de date dans différents formats possibles
-    """
-    try:
-        # Format YYYYMMDDHHMMSS (14 caractères)
-        if len(date_str) == 14:
-            return datetime.strptime(date_str, '%Y%m%d%H%M%S').strftime('%d %B %Y à %H:%M:%S')
-        
-        # Format YYYYMMDDTHHMMSS (15 caractères avec T)
-        elif len(date_str) == 15 and 'T' in date_str:
-            return datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S')
-        
-        # Format YYYYMMDDTHHMM (13 caractères avec T)
-        elif len(date_str) == 13 and 'T' in date_str:
-            return datetime.strptime(date_str, '%Y%m%dT%H%M').strftime('%d %B %Y à %H:%M')
-        
-        # Format YYYY-MM-DD
-        elif len(date_str) == 10 and '-' in date_str:
-            return datetime.strptime(date_str, '%Y-%m-%d').strftime('%d %B %Y')
-        
-        # Format YYYYMMDDTHHMMS (14 caractères avec T, manque un chiffre pour les secondes)
-        elif len(date_str) == 14 and 'T' in date_str:
-            # Ajouter un '0' pour compléter les secondes
-            date_str = date_str + '0'
-            return datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S')
-        
-        else:
-            print(f"Format de date non reconnu: {date_str}")
-            return date_str
-            
-    except ValueError as e:
-        print(f"Erreur lors du formatage de la date {date_str}: {str(e)}")
-        return date_str
-
-
-