add journées mondiales et vacances

2025-10-10 17:45:23 +02:00 · 2025-10-10 17:45:23 +02:00 · 26bfe4ae36
commit 26bfe4ae36
parent d22dbde2e7
4 changed files with 670 additions and 0 deletions
--- a/extractors/world_days_extractor.py
+++ b/extractors/world_days_extractor.py
@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Extracteur des journées mondiales/internationales.
+Source: https://www.journee-mondiale.com/les-journees-mondiales.htm
+
+Fonctionnalités:
+- Cache JSON pour limiter les requêtes
+- Paramètres CLI (base_url, dry-run, ttl cache)
+- Conversion vers format OEDB (what: culture.days)
+- Une journée d'événement, positionnée à la prochaine occurrence dans les 365 jours à venir
+- Rapport succès/échecs et impression du GeoJSON en dry-run
+"""
+
+import argparse
+import datetime as dt
+import re
+import sys
+from typing import Any, Dict, List, Tuple
+
+from bs4 import BeautifulSoup
+from utils_extractor_common import (
+    CacheConfig,
+    load_cache,
+    save_cache,
+    oedb_feature,
+    post_oedb_features,
+    http_get_json,
+)
+
+
+# Default path of the JSON cache file (default value of the --cache option).
+DEFAULT_CACHE = "extractors_cache/world_days_cache.json"
+# Default OEDB API base URL (default value of the --base-url option).
+OEDB_DEFAULT = "https://api.openeventdatabase.org"
+# Listing page downloaded by fetch_sources() and parsed for world days.
+SOURCE_URL = "https://www.journee-mondiale.com/les-journees-mondiales.htm"
+
+
+def build_args() -> argparse.Namespace:
+    """Parse the command-line options of the extractor.
+
+    Returns:
+        The parsed namespace with ``base_url``, ``cache``, ``cache_ttl``,
+        ``limit`` and ``dry_run``. ``dry_run`` is True unless
+        ``--no-dry-run`` is given.
+    """
+    parser = argparse.ArgumentParser(
+        description="Extracteur journées mondiales/internationales -> OEDB",
+    )
+    parser.add_argument("--base-url", default=OEDB_DEFAULT, help="Base URL OEDB")
+    parser.add_argument("--cache", default=DEFAULT_CACHE, help="Fichier de cache JSON")
+    parser.add_argument(
+        "--cache-ttl",
+        type=int,
+        default=24 * 3600,
+        help="Durée de vie du cache (sec)",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Limiter le nombre d'événements à traiter",
+    )
+    # Dry-run is the safe default; --no-dry-run switches to really posting.
+    parser.add_argument(
+        "--no-dry-run",
+        dest="dry_run",
+        action="store_false",
+        default=True,
+        help="Désactive le dry-run (envoie à l'API)",
+    )
+    return parser.parse_args()
+
+
+# French month name (lowercase) -> month number. Unaccented spellings
+# ("fevrier", "aout", "decembre") are listed next to the accented ones.
+MONTHS = {
+    "janvier": 1, "février": 2, "fevrier": 2, "mars": 3, "avril": 4, "mai": 5, "juin": 6,
+    "juillet": 7, "août": 8, "aout": 8, "septembre": 9, "octobre": 10, "novembre": 11, "décembre": 12, "decembre": 12
+}
+
+
+# Matches "<day> <month name> : <label>", where <day> is 1-2 digits or "1er".
+# Compiled once at import time instead of once per link.
+_DAY_LINK_RE = re.compile(r"\b(\d{1,2}|1er)\s+([a-zA-Zéèêëàâîïôöùûüç]+)\s*:\s*(.+)")
+
+
+def parse_days_from_html(html: str) -> List[Tuple[int, int, str, str]]:
+    """Extract world-day entries from the listing page HTML.
+
+    Only anchors found inside ``<article>`` elements of the
+    ``<div class="content">`` nested in ``<div id="texte">`` are examined.
+    An anchor is kept when its text looks like ``"15 janvier : Label"`` (or
+    ``"1er janvier : Label"``), the month name is a key of ``MONTHS`` and the
+    day number is between 1 and 31.
+
+    Args:
+        html: Markup of the listing page.
+
+    Returns:
+        A list of ``(month, day, label, url)`` tuples in document order, where
+        ``url`` is the anchor's ``href`` attribute ('' when absent). The list
+        is empty when the expected containers are missing.
+    """
+    days: List[Tuple[int, int, str, str]] = []
+
+    soup = BeautifulSoup(html, 'html.parser')
+
+    text_section = soup.find('div', id='texte')
+    content_section = text_section.find('div', class_='content') if text_section else None
+    if not content_section:
+        return days
+
+    # Each <article> is scanned for anchors describing one day.
+    for article in content_section.find_all('article'):
+        for link in article.find_all('a'):
+            text = link.get_text(strip=True)
+            if not text:
+                continue
+
+            match = _DAY_LINK_RE.search(text)
+            if not match:
+                continue
+
+            raw_day, month_name, label = match.groups()
+            # The pattern guarantees raw_day is either "1er" or 1-2 digits.
+            day = 1 if raw_day == "1er" else int(raw_day)
+            month = MONTHS.get(month_name.lower())
+            label = label.strip()
+
+            # Drop unknown months, empty labels and impossible day numbers
+            # (the pattern alone would accept "0" or "45").
+            if month is None or not label or not 1 <= day <= 31:
+                continue
+
+            days.append((month, day, label, link.get('href', '')))
+
+    return days
+
+
+def fetch_sources(cache_cfg: CacheConfig) -> Dict[str, Any]:
+    """Return the world-day entries, from the cache when one is available.
+
+    When ``load_cache`` returns a truthy value it is returned as is.
+    Otherwise ``SOURCE_URL`` is downloaded and parsed with
+    ``parse_days_from_html``.
+
+    Args:
+        cache_cfg: Cache configuration handed to ``load_cache`` / ``save_cache``.
+
+    Returns:
+        A dict with a single ``"items"`` key holding the parsed entries.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+    """
+    # NOTE(review): presumably load_cache returns a falsy value when the cache
+    # is missing or older than the TTL — confirm in utils_extractor_common.
+    cache = load_cache(cache_cfg)
+    if cache:
+        return cache
+
+    # Imported here because the module does not import requests at top level.
+    import requests
+    r = requests.get(SOURCE_URL, timeout=30)
+    r.raise_for_status()
+
+    items = parse_days_from_html(r.text)
+    out: Dict[str, Any] = {"items": items}
+
+    # Do not persist an empty result: it most likely means the page layout
+    # changed or the download was incomplete, and caching it would hide the
+    # problem from the next runs instead of retrying.
+    if items:
+        save_cache(cache_cfg, out)
+    return out
+
+
+def create_event_date(month: int, day: int, today: dt.date) -> dt.date:
+    """Return the next occurrence of ``day``/``month`` on or after ``today``.
+
+    29 February is mapped to 28 February in non-leap years, so the event
+    still gets a date every year.
+
+    Args:
+        month: Month number (1-12).
+        day: Day of the month.
+        today: Reference date; the result is never earlier than it.
+
+    Returns:
+        The event date in ``today.year`` if it has not passed yet, otherwise
+        the date in the following year.
+
+    Raises:
+        ValueError: If the month/day pair does not exist in any year
+            (e.g. 31 April, month 13, day 0).
+    """
+    # Validate against a leap year: 29 February passes, while impossible
+    # dates raise here instead of being silently turned into 28 February.
+    dt.date(2000, month, day)
+
+    def occurrence(year: int) -> dt.date:
+        try:
+            return dt.date(year, month, day)
+        except ValueError:
+            # After the validation above, only 29 February in a non-leap
+            # year can fail.
+            return dt.date(year, 2, 28)
+
+    candidate = occurrence(today.year)
+    if candidate < today:
+        # Already passed this year: move to next year's occurrence.
+        candidate = occurrence(today.year + 1)
+    return candidate
+
+
+def convert_to_oedb(data: Dict[str, Any], limit: int | None = None) -> List[Dict[str, Any]]:
+    """Turn the parsed entries into OEDB features.
+
+    Each ``(month, day, label, url)`` entry of ``data["items"]`` becomes one
+    full-day feature built with ``oedb_feature`` and dated with
+    ``create_event_date``. Entries whose date cannot be built are skipped.
+
+    Args:
+        data: Dict holding the entries under the ``"items"`` key.
+        limit: When truthy, stop once this many features have been produced.
+
+    Returns:
+        The list of generated features.
+    """
+    reference_day = dt.date.today()
+    entries = data.get("items", []) or []
+    features: List[Dict[str, Any]] = []
+
+    for month, day, label, url in entries:
+        try:
+            iso_day = create_event_date(month, day, reference_day).isoformat()
+        except Exception:
+            continue
+
+        # Labels mentioning "mondial"/"international" are worldwide events;
+        # everything else is attached to France.
+        lowered = label.lower()
+        is_worldwide = "mondial" in lowered or "international" in lowered
+        zone, where = ("world", "Monde") if is_worldwide else ("france", "France")
+
+        feature = oedb_feature(
+            label=label,
+            what="culture.days",
+            start=f"{iso_day}T00:00:00Z",
+            stop=f"{iso_day}T23:59:59Z",
+            description="Journée mondiale/internationale",
+            where=where,
+            online=True,
+        )
+        # "type" is set because the OEDB API requires it (per the original
+        # author's note); "zone" and "url" are extra properties.
+        feature["properties"]["type"] = "scheduled"
+        feature["properties"]["zone"] = zone
+        if url:
+            feature["properties"]["url"] = url
+
+        features.append(feature)
+        if limit and len(features) >= limit:
+            break
+
+    return features
+
+
+def main() -> int:
+    """Run the extractor: fetch, convert, then post (or simulate) the events.
+
+    In dry-run mode the GeoJSON FeatureCollection is printed first. The
+    features are always handed to ``post_oedb_features`` together with the
+    dry-run flag, and a JSON report of the returned counters is printed.
+
+    Returns:
+        Always 0.
+    """
+    options = build_args()
+    cache_settings = CacheConfig(path=options.cache, ttl_seconds=options.cache_ttl)
+
+    source_data = fetch_sources(cache_settings)
+    features = convert_to_oedb(source_data, options.limit)
+
+    if options.dry_run:
+        import json
+        # Show the exact payload that would be sent.
+        geojson = {"type": "FeatureCollection", "features": features}
+        print(json.dumps(geojson, ensure_ascii=False, indent=2))
+
+    # The "sent" cache path is passed so already-posted events can be skipped.
+    ok, failed, neterr = post_oedb_features(
+        options.base_url,
+        features,
+        dry_run=options.dry_run,
+        sent_cache_path="extractors_cache/world_days_sent.json",
+    )
+    print(json_report(ok, failed, neterr))
+    return 0
+
+
+def json_report(ok: int, failed: int, neterr: int) -> str:
+    """Format the posting counters as an indented JSON string.
+
+    Args:
+        ok: Value reported under ``"success"``.
+        failed: Value reported under ``"failed"``.
+        neterr: Value reported under ``"networkErrors"``.
+
+    Returns:
+        The JSON document, indented by two spaces.
+    """
+    import json
+
+    summary = {
+        "success": ok,
+        "failed": failed,
+        "networkErrors": neterr,
+    }
+    return json.dumps(summary, indent=2)
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
+
+