site builder ok

2025-06-20 09:04:42 +02:00 · 2024-11-11 00:58:44 +01:00 · 2024-11-11 00:58:44 +01:00 · 83dd9f7472
commit 83dd9f7472
parent 7e9e8f2436
27 changed files with 526 additions and 235 deletions
--- a/build_indexes.py
+++ b/build_indexes.py
@ -26,17 +26,37 @@ regex_orgroam = r"^(\d{14})_([a-zA-Z0-9_-]+)\.gmi$"

 use_article_file_for_name=False
 website_name = args.source
+def extract_body_content(html_content):
+    """Return the inner markup of the first ``<body>`` element, or None.
+
+    The opening tag may carry attributes. ``re.DOTALL`` lets the content
+    span several lines, and the lazy ``(.*?)`` stops at the first
+    ``</body>``. Returns None when no ``<body>...</body>`` pair is found.
+    """
+    # Single definition on purpose: an earlier duplicate used a pattern with
+    # no capture group, so its ``match.group(1)`` could only raise
+    # IndexError, and it was shadowed by this definition anyway.
+    match = re.search(r'<body[^>]*?>(.*?)</body>', html_content, re.DOTALL)
+    return match.group(1) if match else None

-def trouver_nom_article(fichier_org):
+def trouver_nom_article(fichier_org, format="html"):
    print('fichier_org, ',fichier_org)
    with open(fichier_org, 'r') as file:
        lignes = file.readlines()
-    
-    # Expressions régulières pour trouver les titres de niveau 1 et 2
-    titre_niveau_1 = r'^\*+ (.+)$'
-    titre_niveau_2 = r'^\*\*+ (.+)$'
-    
+        
    nom_article = None
+
+    # Expressions régulières pour trouver les titres de niveau 1 et 2
+    if format == 'html':
+        titre_niveau_1 = r'^\<h1 id.*?\>(.+)\<\/h1\>$'
+        titre_niveau_2 = r'^\<h2.*?\>(.+)\<\/h2\>$'
+    else:
+        titre_niveau_1 = r'^\*+ (.+)$'
+        titre_niveau_2 = r'^\*\*+ (.+)$'
+
+
    
    # Itérer sur les lignes du fichier
    for ligne in lignes:
@ -182,9 +202,9 @@ def generer_index(dossier_source, fichier_index, titre_index):
            if use_article_file_for_name:
                article_name = link_html
            else:
-                file_path_org = os.path.join(dossier_parent,"sources",website_name,link_org)
+                file_path_org = os.path.join(dossier_parent,"sources",website_name, link_org)
                print('-------------- trouver_nom_article ',file_path_org)
-                article_name=trouver_nom_article(file_path_org)
+                article_name=trouver_nom_article(file_path_org, 'org')
        
            if not article_name:
                article_name = link_html