mirror of
https://forge.chapril.org/tykayn/orgmode-to-gemini-blog
synced 2025-06-20 09:04:42 +02:00
convert to gemini avec md2gemini, conversion de plusieurs langues
This commit is contained in:
parent
255e8fdc04
commit
bba1df0377
10 changed files with 462 additions and 202 deletions
|
@ -42,7 +42,12 @@ def get_basename(file_name):
|
|||
return os.path.splitext(file_name)[0]
|
||||
|
||||
# Chemin du dossier contenant les fichiers orgmode
|
||||
directory = f'sources/{args.blog}/lang_fr'
|
||||
directory_pages = f'sources/{args.blog}/'
|
||||
directory_fr = f'sources/{args.blog}/lang_fr'
|
||||
directory_en = f'sources/{args.blog}/lang_en'
|
||||
|
||||
directories_to_scan = [directory_pages, directory_fr, directory_en]
|
||||
|
||||
destination_json = f'sources/{args.blog}/build'
|
||||
destination_html = f'html-websites/{args.blog}/'
|
||||
destination_gmi = f'gemini-capsules/{args.blog}/'
|
||||
|
@ -64,188 +69,157 @@ else:
|
|||
files_dict = {}
|
||||
|
||||
|
||||
def get_first_picture_url(content):
|
||||
# Utiliser une expression régulière pour trouver la première URL d'image dans le contenu
|
||||
pattern = r'\[\[(.*?)\]\]'
|
||||
match = re.search(pattern, content)
|
||||
if match:
|
||||
return match.group(1)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def org_to_gmi(org_text: str, output_filename_slug: str) -> str:
|
||||
"""
|
||||
Convertit un texte au format Org en un fichier au format GMI (Gemini)
|
||||
en utilisant pypandoc.
|
||||
count_articles = count_files_in_directories(directories_to_scan)
|
||||
|
||||
Args:
|
||||
- org_text (str): Le texte au format Org à convertir.
|
||||
- output_file (str): Chemin du fichier de sortie au format GMI, sans avoir à préciser l'extension.
|
||||
"""
|
||||
output = """
|
||||
# mock land output
|
||||
===========
|
||||
|
||||
blah blah blah
|
||||
|
||||
-----------------
|
||||
Tykayn blog mock content
|
||||
-----------------
|
||||
|
||||
Navigation:
|
||||
|
||||
=> accueil.gmi Accueil
|
||||
=> a-propos.gmi à propos
|
||||
"""
|
||||
# Conversion du texte Org en GMI via Pandoc
|
||||
try:
|
||||
output = pypandoc.convert_text(org_text, 'markdown', format='org')
|
||||
except RuntimeError as e:
|
||||
print(f"Erreur de conversion : {e}")
|
||||
return
|
||||
|
||||
# Sauvegarde du contenu GMI dans un fichier
|
||||
try:
|
||||
with open(destination_gmi+'/'+output_filename_slug+'.gmi', 'w', encoding='utf-8') as f:
|
||||
f.write(output)
|
||||
print(f"Fichier GMI sauvegardé avec succès : {output_filename_slug}")
|
||||
except OSError as e:
|
||||
print(f"Erreur lors de la sauvegarde du fichier : {e}")
|
||||
return output
|
||||
|
||||
count_articles = len(os.listdir(directory))
|
||||
counter=0
|
||||
rebuild_counter = 0
|
||||
pandoc_runs_counter = 0
|
||||
lang_folder = global_config.get('lang_default', 'fr')
|
||||
|
||||
if generate_linkings_json :
|
||||
|
||||
print(f"Génération des liens entre articles pour {count_articles} articles")
|
||||
print(f"run_pandoc: {run_pandoc}")
|
||||
print(f"run_gemini: {run_gemini}")
|
||||
|
||||
article_type = "article"
|
||||
# Parcourir les fichiers du dossier
|
||||
for file_name in os.listdir(directory):
|
||||
if file_name.endswith('.org'):
|
||||
counter+=1
|
||||
if force_html_regen and counter % 10 == 0:
|
||||
print(f"{time.strftime('%H:%M:%S')} : Articles traités : {counter}/{count_articles}")
|
||||
file_path = os.path.join(directory, file_name)
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
date_modified = time.ctime(os.path.getmtime(file_path))
|
||||
|
||||
for index, directory in enumerate(directories_to_scan):
|
||||
# Déterminer le type d'article en fonction du chemin
|
||||
if directory == '/':
|
||||
article_type = "page"
|
||||
else:
|
||||
article_type = "article"
|
||||
# Extraire la langue du dossier si elle commence par "lang_"
|
||||
if directory.split('/')[-1].startswith('lang_'):
|
||||
lang_folder = directory.split('/')[-1][5:] # Prend les caractères après "lang_"
|
||||
for file_name in os.listdir(directory):
|
||||
if file_name.endswith('.org'):
|
||||
counter+=1
|
||||
if force_html_regen and counter % 10 == 0:
|
||||
print(f"{time.strftime('%H:%M:%S')} : Articles traités : {counter}/{count_articles}")
|
||||
file_path = os.path.join(directory, file_name)
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
date_modified = time.ctime(os.path.getmtime(file_path))
|
||||
|
||||
basename = get_basename(file_name)
|
||||
date_str, annee, slug = find_year_and_slug_on_filename(basename)
|
||||
tags = extract_tags_from_file(file_path, global_config['excluded_tags'])
|
||||
|
||||
# Convertir les tags en liste si c'est un set
|
||||
if isinstance(tags, set):
|
||||
tags = list(tags)
|
||||
boom = basename.split('__')
|
||||
# Convertir le contenu Org en HTML
|
||||
title = find_first_level1_title(content)
|
||||
|
||||
# Désactiver les warning d'identifiant dupliqué dans la conversion pandoc
|
||||
content_without_h1 = re.sub(r'^\*.*?$', '', content, count=1, flags=re.MULTILINE)
|
||||
|
||||
gemini_content = ''
|
||||
html_content = ''
|
||||
html_content_without_h1 = ''
|
||||
# Vérifier l'existence du fichier HTML pour déterminer last_html_build
|
||||
html_path = f"html-websites/{args.blog}/{annee}/{slug}/index.html"
|
||||
last_html_build_time = None
|
||||
if os.path.exists(html_path):
|
||||
# Obtenir la date de création du fichier HTML
|
||||
last_html_build_time = os.path.getctime(html_path)
|
||||
|
||||
# print(f"last_html_build: {last_html_build_time} : {html_path}")
|
||||
else:
|
||||
print(f"----------- last_html_build html_path: {html_path} n'existe pas")
|
||||
# Vérifier l'existence du fichier Gemini pour déterminer last_gemini_build
|
||||
gemini_path = f"gemini-capsules/{args.blog}/{slug}.gmi"
|
||||
last_gemini_build = None
|
||||
rebuild_this_article_gemini = False
|
||||
if os.path.exists(gemini_path):
|
||||
last_gemini_build = time.ctime(os.path.getmtime(gemini_path))
|
||||
# Vérifier si l'article doit être reconstruit en comparant les dates de modification
|
||||
if last_gemini_build:
|
||||
file_modified_time = os.path.getmtime(file_path)
|
||||
last_build_time = time.mktime(time.strptime(last_gemini_build))
|
||||
rebuild_this_article_gemini = file_modified_time > last_build_time
|
||||
else:
|
||||
basename = get_basename(file_name)
|
||||
date_str, annee, slug = find_year_and_slug_on_filename(basename)
|
||||
tags = extract_tags_from_file(file_path, global_config['excluded_tags'])
|
||||
|
||||
rebuild_this_article_gemini = True
|
||||
# Convertir les tags en liste si c'est un set
|
||||
if isinstance(tags, set):
|
||||
tags = list(tags)
|
||||
boom = basename.split('__')
|
||||
# Convertir le contenu Org en HTML
|
||||
title = find_first_level1_title(content)
|
||||
|
||||
# Vérifier si l'article doit être reconstruit en comparant les dates de modification
|
||||
rebuild_this_article_html = False
|
||||
if last_html_build_time:
|
||||
file_modified_time = os.path.getmtime(file_path)
|
||||
# print(f"--------- file_modified_time: {file_path} : {file_modified_time}")
|
||||
# Obtenir l'heure de dernière modification du fichier HTML
|
||||
|
||||
rebuild_this_article_html = file_modified_time > last_html_build_time
|
||||
# print(f"--------- article modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}")
|
||||
else:
|
||||
# si il n'y a pas de fichier html, on le construit pour la première fois
|
||||
print('on reconstruit le html de l\'article', file_name)
|
||||
|
||||
rebuild_this_article_html = True
|
||||
|
||||
if rebuild_this_article_html:
|
||||
rebuild_counter += 1
|
||||
# Désactiver les warning d'identifiant dupliqué dans la conversion pandoc
|
||||
content_without_h1 = re.sub(r'^\*.*?$', '', content, count=1, flags=re.MULTILINE)
|
||||
|
||||
gemini_content = ''
|
||||
html_content = ''
|
||||
html_content_without_h1 = ''
|
||||
# Vérifier l'existence du fichier HTML pour déterminer last_html_build
|
||||
html_path = f"html-websites/{args.blog}/{annee}/{slug}/index.html"
|
||||
last_html_build_time = None
|
||||
if os.path.exists(html_path):
|
||||
# Obtenir la date de création du fichier HTML
|
||||
last_html_build_time = os.path.getctime(html_path)
|
||||
|
||||
# Garder le contenu HTML existant si déjà présent
|
||||
if f"{annee}/{slug}" in files_dict and 'html_content' in files_dict[f"{annee}/{slug}"]:
|
||||
print('on reprend le contenu html existant')
|
||||
if len(files_dict[f"{annee}/{slug}"]['html_content']) > 0:
|
||||
html_content = files_dict[f"{annee}/{slug}"]['html_content']
|
||||
if len(files_dict[f"{annee}/{slug}"]['html_content_without_h1']) > 0:
|
||||
html_content_without_h1 = files_dict[f"{annee}/{slug}"]['html_content_without_h1']
|
||||
# print(f"last_html_build: {last_html_build_time} : {html_path}")
|
||||
else:
|
||||
html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
|
||||
print(f"----------- last_html_build html_path: {html_path} n'existe pas")
|
||||
# Vérifier l'existence du fichier Gemini pour déterminer last_gemini_build
|
||||
gemini_path = f"gemini-capsules/{args.blog}/{slug}.gmi"
|
||||
last_gemini_build = None
|
||||
rebuild_this_article_gemini = False
|
||||
if os.path.exists(gemini_path):
|
||||
last_gemini_build = time.ctime(os.path.getmtime(gemini_path))
|
||||
# Vérifier si l'article doit être reconstruit en comparant les dates de modification
|
||||
if last_gemini_build:
|
||||
file_modified_time = os.path.getmtime(file_path)
|
||||
last_build_time = time.mktime(time.strptime(last_gemini_build))
|
||||
rebuild_this_article_gemini = file_modified_time > last_build_time
|
||||
else:
|
||||
|
||||
rebuild_this_article_gemini = True
|
||||
|
||||
if run_pandoc and rebuild_this_article_html or force_html_regen:
|
||||
# convertir le contenu d'article org vers html
|
||||
print(f"\033[91mBRRRRRRRRRRRRR pandoc time {time.strftime('%H:%M:%S')} : Conversion de {file_name} en html\033[0m")
|
||||
# Vérifier si l'article doit être reconstruit en comparant les dates de modification
|
||||
rebuild_this_article_html = False
|
||||
if last_html_build_time:
|
||||
file_modified_time = os.path.getmtime(file_path)
|
||||
# print(f"--------- file_modified_time: {file_path} : {file_modified_time}")
|
||||
# Obtenir l'heure de dernière modification du fichier HTML
|
||||
|
||||
rebuild_this_article_html = file_modified_time > last_html_build_time
|
||||
# print(f"--------- article modifié après le build de son rendu html: {file_path}, {rebuild_this_article_html}")
|
||||
else:
|
||||
# si il n'y a pas de fichier html, on le construit pour la première fois
|
||||
print('on reconstruit le html de l\'article', file_name)
|
||||
|
||||
rebuild_this_article_html = True
|
||||
|
||||
html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
|
||||
pandoc_runs_counter += 1
|
||||
else:
|
||||
html_content = content_without_h1
|
||||
if rebuild_this_article_html:
|
||||
rebuild_counter += 1
|
||||
|
||||
if run_gemini and rebuild_this_article_gemini:
|
||||
os.makedirs(destination_gmi, exist_ok=True)
|
||||
# convertir le contenu d'article org vers gmi pour la capsule gemini
|
||||
gemini_content = org_to_gmi(content_without_h1, slug)
|
||||
|
||||
# Garder le contenu HTML existant si déjà présent
|
||||
if f"{annee}/{slug}" in files_dict and 'html_content' in files_dict[f"{annee}/{slug}"]:
|
||||
print('on reprend le contenu html existant')
|
||||
if len(files_dict[f"{annee}/{slug}"]['html_content']) > 0:
|
||||
html_content = files_dict[f"{annee}/{slug}"]['html_content']
|
||||
if len(files_dict[f"{annee}/{slug}"]['html_content_without_h1']) > 0:
|
||||
html_content_without_h1 = files_dict[f"{annee}/{slug}"]['html_content_without_h1']
|
||||
else:
|
||||
html_content_without_h1 = re.sub(r'<h1>.*?</h1>', '', html_content)
|
||||
|
||||
if run_pandoc and rebuild_this_article_html or force_html_regen:
|
||||
# convertir le contenu d'article org vers html
|
||||
# print(f"\033[91mBRRRRRRRRRRRRR pandoc time {time.strftime('%H:%M:%S')} : Conversion de {file_name} en html\033[0m")
|
||||
print(f"\033[91m.\033[0m", end='', flush=True)
|
||||
|
||||
html_content = pypandoc.convert_text(content_without_h1, 'html', format='org')
|
||||
pandoc_runs_counter += 1
|
||||
else:
|
||||
html_content = content_without_h1
|
||||
|
||||
if run_gemini and rebuild_this_article_gemini:
|
||||
os.makedirs(destination_gmi, exist_ok=True)
|
||||
# convertir le contenu d'article org vers gmi pour la capsule gemini
|
||||
print(f"Conversion de {file_name} en gemini")
|
||||
gemini_content = org_to_gmi(content_without_h1, slug)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
files_dict[f"{annee}/{slug}"] = {
|
||||
'path': file_path,
|
||||
'basename': basename,
|
||||
'roam_id': find_org_roam_id(content),
|
||||
'slug': f"{slug}/",
|
||||
'slug_with_year': f"{annee}/{slug}",
|
||||
'date': boom[0],
|
||||
'date_modified' : date_modified,
|
||||
'first_picture_url' : get_first_picture_url(content),
|
||||
'date_formattee': datetime.strptime(date_str, '%Y%m%d%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 14 else datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 15 else datetime.strptime(date_str, '%Y-%m-%d').strftime('%d %B %Y'),
|
||||
'annee': annee,
|
||||
'tags': tags,
|
||||
'title': title,
|
||||
'next': None,
|
||||
'previous': None,
|
||||
'last_html_build': last_html_build_time,
|
||||
'last_gemini_build': last_gemini_build,
|
||||
'org_content': content, # Contenu Org original
|
||||
'html_content_without_h1': html_content_without_h1, # Contenu HTML converti sans le titre de premier niveau
|
||||
'html_content': html_content # Contenu first_picture_urlHTML converti
|
||||
}
|
||||
files_dict[f"{annee}/{slug}"] = {
|
||||
'path': file_path,
|
||||
'basename': basename,
|
||||
'roam_id': find_org_roam_id(content),
|
||||
'slug': f"{slug}/",
|
||||
'slug_with_year': f"{annee}/{slug}",
|
||||
'date': boom[0],
|
||||
'lang': lang_folder,
|
||||
'article_type': article_type,
|
||||
'date_modified' : date_modified,
|
||||
'first_picture_url' : get_first_picture_url(content),
|
||||
'date_formattee': datetime.strptime(date_str, '%Y%m%d%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 14 else datetime.strptime(date_str, '%Y%m%dT%H%M%S').strftime('%d %B %Y à %H:%M:%S') if len(date_str) == 15 else datetime.strptime(date_str, '%Y-%m-%d').strftime('%d %B %Y'),
|
||||
'annee': annee,
|
||||
'tags': tags,
|
||||
'title': title,
|
||||
'next': None,
|
||||
'previous': None,
|
||||
'last_html_build': last_html_build_time,
|
||||
'last_gemini_build': last_gemini_build,
|
||||
'org_content': content, # Contenu Org original
|
||||
'html_content_without_h1': html_content_without_h1, # Contenu HTML converti sans le titre de premier niveau
|
||||
'html_content': html_content, # Contenu first_picture_urlHTML converti
|
||||
'gemini_content': gemini_content, # Contenu gemini
|
||||
}
|
||||
|
||||
print(f"======= Nombre d'articles reconstruits: {rebuild_counter}")
|
||||
print(f"======= Nombre de runs de pandoc: {pandoc_runs_counter}")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue