add gather data

2025-06-20 09:04:42 +02:00 · 2024-11-20 00:24:09 +01:00 · 2024-11-20 00:24:09 +01:00 · 87d27dc8a2
commit 87d27dc8a2
parent baac2fd2f1
5 changed files with 131 additions and 66 deletions
--- a/utils.py
+++ b/utils.py
@ -173,6 +173,61 @@ def extract_body_content(html_content):
        print('---- extract_body_content : no body found in this html')
        return html_content

+def add_tags_from_content(tags=None, file_content="", words_to_check=None):
+    """
+    Add to the `tags` set every word of `words_to_check` found in the file content (case-insensitive substring match).
+
+    :param tags: Set of tags (set). If None, a new set is created (optional).
+    :param file_content: Content of the file (str).
+    :param words_to_check: List of words to look for (list). If None, an empty list is used (optional).
+    :return: Updated set of tags (set); a set passed in as `tags` is modified in place and returned.
+    """
+    # Initialise the tags set if it is None
+    if tags is None:
+        tags = set()
+
+    # Initialise the words_to_check list if it is None
+    if words_to_check is None:
+        words_to_check = []
+
+    # Lower-case the file content so the search is case-insensitive
+    file_content_lower = file_content.lower()
+
+    # Go through each word to check
+    for word in words_to_check:
+        # Check whether the word occurs anywhere in the file content
+        if word.lower() in file_content_lower:
+            # Add the word, with its original casing, to the tags set
+            tags.add(word)
+
+    return tags
+
+def extract_tags_from_file(file_path, excluded_tags):
+    """
+    Collect the tags found in the file at `file_path`, reading it line by line as UTF-8.
+
+    Each line is checked for three sources of tags: the terms in `global_config['auto_tag_terms']`
+    (only when `global_config['automatic_tagging_enabled']` is truthy), whitespace-separated words
+    wrapped in colons, and comma-separated values following a `#+tags:` line prefix.
+    A message is printed when neither of the last two sources produced a tag.
+
+    :param file_path: Path of the file to read.
+    :param excluded_tags: Tags to skip (tested with `in`); not applied to the automatic terms.
+    :return: Set of collected tags (set).
+    """
+    tags = set()
+    with open(file_path, 'r', encoding='utf-8') as file_content:
+        # NOTE(review): only set by the two explicit syntaxes below, not by automatic tagging,
+        # so the "no tag" message can be printed even when auto-tag terms were added - confirm intent.
+        tag_found = False
+        for line in file_content:
+            if global_config['automatic_tagging_enabled']:
+                tags = add_tags_from_content(tags, line, global_config['auto_tag_terms'])
+            # Check for orgmode tags :tag1:tag2:
+            if ':' in line:
+                for word in line.split():
+                    # NOTE(review): a lone ':' passes this test and yields an empty-string tag
+                    # (unless '' is in excluded_tags), and ':tag1:tag2:' is stored as the single
+                    # tag 'tag1:tag2' - confirm whether splitting on ':' was intended.
+                    if len(word) and word.startswith(':') and word.endswith(':'):
+                        tag = word[1:-1]  # strip the leading and trailing colon
+                        if tag not in excluded_tags:
+                            tags.add(tag)
+                            tag_found = True
+            # Check for #+tags: tag1,tag2 (prefix must start the line; match is case-sensitive)
+            if line.startswith('#+tags:'):
+                for tag in line[len('#+tags:'):].split(','):
+                    tag = tag.strip()  # also removes the trailing newline on the last item
+                    if tag and tag not in excluded_tags:
+                        tags.add(tag)
+                        tag_found = True
+
+    if not tag_found:
+        print('no tag in the article', file_path)
+    return tags

 def remove_properties_section(text):
    pattern = r"<h1 id=\"article\">Article</h1>.+?</ul>"