oedb-backend/extractors/fr_holidays_extractor.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Extracteur des vacances scolaires en France (par zones A/B/C et Corse) + jours fériés.
Sources:
- Vacances scolaires (ICS): https://www.data.gouv.fr/api/1/datasets/r/e5f40fbc-7a84-4c4a-94e4-55ac4299b222
- Jours fériés: https://calendrier.api.gouv.fr/jours-feries.json

Fonctionnalités:
- Cache JSON pour limiter les requêtes
- Paramètres CLI (période, zone optionnelle, dry-run, base_url OEDB, ttl cache)
- Conversion vers format Feature OEDB (un évènement par zone et par période de vacances)
- Pas de coordonnées GPS (point [0,0])
- Rapport succès/échecs à l'issue de l'envoi
"""

import argparse
import datetime as dt
import sys
from typing import Any, Dict, List, Tuple

from utils_extractor_common import (
    CacheConfig,
    load_cache,
    save_cache,
    oedb_feature,
    post_oedb_features,
    http_get_json,
)


# Default path of the JSON cache file for fetched source data (default of --cache).
DEFAULT_CACHE = "extractors_cache/fr_holidays_cache.json"
# Default OEDB API base URL (default of --base-url).
OEDB_DEFAULT = "https://api.openeventdatabase.org"
# data.gouv.fr resource URL of the school-holidays calendar, downloaded as ICS text.
ICS_URL = "https://www.data.gouv.fr/api/1/datasets/r/e5f40fbc-7a84-4c4a-94e4-55ac4299b222"


def build_args() -> argparse.Namespace:
    """Parse the extractor's command-line options.

    Returns:
        The ``argparse.Namespace`` built from the process arguments, with the
        attributes ``start``, ``end``, ``academie``, ``base_url``, ``cache``,
        ``cache_ttl``, ``limit`` and ``dry_run``.
    """
    parser = argparse.ArgumentParser(description="Extracteur vacances scolaires FR -> OEDB")

    # (flag, add_argument keyword arguments), declared in the order shown by --help.
    option_specs = (
        ("--start", {"help": "Date de début YYYY-MM-DD", "default": None}),
        ("--end", {"help": "Date de fin YYYY-MM-DD", "default": None}),
        ("--academie", {"help": "Filtrer par académie (optionnel)", "default": None}),
        ("--base-url", {"help": "Base URL OEDB", "default": OEDB_DEFAULT}),
        ("--cache", {"help": "Fichier de cache JSON", "default": DEFAULT_CACHE}),
        ("--cache-ttl", {"help": "Durée de vie du cache (sec)", "type": int, "default": 24 * 3600}),
        ("--limit", {"help": "Limiter le nombre d'événements à traiter", "type": int, "default": None}),
        ("--dry-run", {"help": "N'envoie pas à l'API OEDB", "action": "store_true"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args()


def date_in_range(d: str, start: str, end: str) -> bool:
    """Tell whether ISO date ``d`` lies within the inclusive ``[start, end]`` window.

    Args:
        d: Date to test, as ``YYYY-MM-DD``.
        start: Lower bound (``YYYY-MM-DD``); a falsy value means "no lower bound".
        end: Upper bound (``YYYY-MM-DD``); a falsy value means "no upper bound".

    Returns:
        ``True`` when no bound is given (``d`` is then not even parsed) or when
        ``d`` respects every bound that is given, ``False`` otherwise.

    Raises:
        ValueError: if a value that has to be parsed is not a valid ISO date.
    """
    if not (start or end):
        return True

    day = dt.date.fromisoformat(d)
    # The lower bound is checked first so that ``end`` is only parsed when needed.
    if start and day < dt.date.fromisoformat(start):
        return False
    return not (end and day > dt.date.fromisoformat(end))


def _zones_from_text(summary: str, location: str) -> List[str]:
    s = f"{summary} {location}".lower()
    zones: List[str] = []
    if "corse" in s:
        zones.append("Corse")
    # Chercher motifs Zones A/B/C, B/C, A/B, Zone A, etc.
    # Simpliste mais robuste pour notre source
    if "zones a/b/c" in s or "zones a / b / c" in s:
        zones.extend(["A", "B", "C"]) 
    else:
        if "zones a/b" in s or "zones a / b" in s:
            zones.extend(["A", "B"]) 
        if "zones b/c" in s or "zones b / c" in s:
            zones.extend(["B", "C"]) 
        if "zones a/c" in s or "zones a / c" in s:
            zones.extend(["A", "C"]) 
        if "zone a" in s:
            zones.append("A")
        if "zone b" in s:
            zones.append("B")
        if "zone c" in s:
            zones.append("C")
    # Dédupliquer en conservant l'ordre
    seen = set()
    out: List[str] = []
    for z in zones:
        if z not in seen:
            seen.add(z)
            out.append(z)
    return out or ["A", "B", "C"]  # fallback si non indiqué


def _parse_ics_events(ics_text: str) -> List[Dict[str, Any]]:
    events: List[Dict[str, Any]] = []
    current: Dict[str, str] = {}
    in_event = False
    for raw in ics_text.splitlines():
        line = raw.strip()
        if line == "BEGIN:VEVENT":
            in_event = True
            current = {}
            continue
        if line == "END:VEVENT":
            if current:
                events.append(current)
            in_event = False
            current = {}
            continue
        if not in_event:
            continue
        if line.startswith("DTSTART"):
            # DTSTART;VALUE=DATE:YYYYMMDD
            val = line.split(":", 1)[-1]
            current["DTSTART"] = val
        elif line.startswith("DTEND"):
            val = line.split(":", 1)[-1]
            current["DTEND"] = val
        elif line.startswith("SUMMARY:"):
            current["SUMMARY"] = line[len("SUMMARY:"):].strip()
        elif line.startswith("LOCATION:"):
            current["LOCATION"] = line[len("LOCATION:"):].strip()
    return events


def _yymmdd_to_iso(d: str) -> str:
    # d: YYYYMMDD
    return f"{d[0:4]}-{d[4:6]}-{d[6:8]}"


def fetch_sources(cache_cfg: CacheConfig) -> Dict[str, Any]:
    """Return the raw source data, served from the cache when it has content.

    When ``load_cache`` returns a truthy value it is returned as-is (presumably
    the cache helper applies the TTL of ``cache_cfg`` — confirm in
    ``utils_extractor_common``). Otherwise both sources are downloaded, the
    result is stored with ``save_cache`` and returned.

    Args:
        cache_cfg: Cache configuration handed to ``load_cache``/``save_cache``.

    Returns:
        A dictionary with two keys:

        - ``"jours_feries"``: the JSON document of public holidays for
          metropolitan France, current year only;
        - ``"vacances_scolaires_ics"``: a list of ``{"label", "start", "stop",
          "zones"}`` dictionaries, one per usable calendar event, where
          ``start``/``stop`` are inclusive ISO dates.

    Raises:
        requests.HTTPError: if the ICS download answers with an error status.
    """
    cache = load_cache(cache_cfg)
    if cache:
        return cache

    out: Dict[str, Any] = {}
    # Public holidays, metropolitan France (current year)
    year = dt.date.today().year
    holidays_url = f"https://calendrier.api.gouv.fr/jours-feries/metropole/{year}.json"
    out["jours_feries"] = http_get_json(holidays_url)

    # School holidays from the data.gouv ICS file
    import requests
    r = requests.get(ICS_URL, timeout=30)
    r.raise_for_status()
    # Without an explicit charset, requests decodes text/* bodies as ISO-8859-1,
    # which garbles accented labels; iCalendar streams default to UTF-8.
    if "charset" not in r.headers.get("Content-Type", "").lower():
        r.encoding = "utf-8"
    ics_text = r.text

    vacances: List[Dict[str, Any]] = []
    for ev in _parse_ics_events(ics_text):
        dtstart = ev.get("DTSTART")
        dtend = ev.get("DTEND")
        summary = ev.get("SUMMARY", "")
        location = ev.get("LOCATION", "")
        if not (dtstart and dtend and summary):
            continue
        try:
            start_date = dt.date.fromisoformat(_yymmdd_to_iso(dtstart))
            end_excl = dt.date.fromisoformat(_yymmdd_to_iso(dtend))
        except ValueError:
            # A single malformed event must not abort the whole extraction.
            continue
        # A date-valued DTEND is exclusive in ICS -> inclusive stop = DTEND - 1 day
        stop_incl = end_excl - dt.timedelta(days=1)
        vacances.append({
            "label": summary,
            "start": start_date.isoformat(),
            "stop": stop_incl.isoformat(),
            "zones": _zones_from_text(summary, location),
        })
    out["vacances_scolaires_ics"] = vacances

    save_cache(cache_cfg, out)
    return out


def convert_to_oedb(data: Dict[str, Any], start: str | None, end: str | None, academie: str | None, limit: int | None = None) -> List[Dict[str, Any]]:
    """Convert the fetched source data into OEDB features.

    Public holidays come first (one feature per date, ``what`` set to
    ``time.daylight.holiday`` and ``type`` to ``scheduled``), followed by school
    holidays (one feature per period and per zone, ``what`` set to
    ``time.holidays`` and ``type`` to ``event``).

    Args:
        data: Dictionary holding ``"jours_feries"`` (ISO date -> label) and
            ``"vacances_scolaires_ics"`` (list of ``label``/``start``/``stop``/
            ``zones`` dictionaries).
        start: Optional inclusive lower bound (``YYYY-MM-DD``) of the window.
        end: Optional inclusive upper bound (``YYYY-MM-DD``) of the window.
        academie: Optional zone filter for school holidays, compared
            case-insensitively with the zone names (``A``, ``B``, ``C``,
            ``Corse``). Public holidays are not affected.
        limit: Optional maximum number of features returned, applied across
            both kinds of events. ``None`` or ``0`` means no limit.

    Returns:
        The list of features built with ``oedb_feature``.

    Raises:
        ValueError: if a window bound, or a date compared against one, is not
            a valid ISO date.
    """
    features: List[Dict[str, Any]] = []

    # Public holidays
    jf: Dict[str, str] = data.get("jours_feries", {}) or {}
    for date_iso, label in jf.items():
        if not date_in_range(date_iso, start, end):
            continue
        # Enrich the label with the spelled-out date when the key is a valid date.
        try:
            day_name = dt.date.fromisoformat(date_iso).strftime("%A %d %B %Y")
            full_label = f"{label} ({day_name})"
        except (TypeError, ValueError):
            full_label = label

        feature = oedb_feature(
            label=full_label,
            what="time.daylight.holiday",
            start=f"{date_iso}T00:00:00Z",
            stop=f"{date_iso}T23:59:59Z",
            description="Jour férié national",
            where="France",
        )
        # Property "type" is required by the OEDB API
        feature["properties"]["type"] = "scheduled"
        features.append(feature)
        if limit and len(features) >= limit:
            return features[:limit]

    # School holidays from the ICS source: one event per listed zone
    wanted_zone = academie.casefold() if academie else None
    vs_ics: List[Dict[str, Any]] = data.get("vacances_scolaires_ics", []) or []
    for item in vs_ics:
        s = item.get("start")
        e = item.get("stop")
        label = item.get("label") or "Vacances scolaires"
        zones: List[str] = item.get("zones") or []
        if not (s and e and zones):
            continue
        # Keep every period overlapping the window: it must end on/after `start`
        # and begin on/before `end`. This also keeps periods that fully contain
        # the requested window.
        if not (date_in_range(e, start, None) and date_in_range(s, None, end)):
            continue
        # The duration is the same for every zone of the period: compute it once.
        try:
            period_duration = (dt.date.fromisoformat(e) - dt.date.fromisoformat(s)).days + 1
        except (TypeError, ValueError):
            period_duration = None
        for z in zones:
            if wanted_zone is not None and str(z).casefold() != wanted_zone:
                continue
            if period_duration is None:
                full_label = f"{label} - Zone {z}"
            else:
                full_label = f"{label} - Zone {z} ({period_duration} jours)"
            feature = oedb_feature(
                label=full_label,
                what="time.holidays",
                start=f"{s}T00:00:00Z",
                stop=f"{e}T23:59:59Z",
                description=f"Vacances scolaires zone {z}",
                where=f"Zone {z}",
            )
            # Property "type" is required by the OEDB API
            feature["properties"]["type"] = "event"
            features.append(feature)
            # The limit covers school holidays too, not only public holidays.
            if limit and len(features) >= limit:
                return features[:limit]

    return features


def main() -> int:
    """Run the extractor: parse options, fetch, convert, post and report.

    Returns:
        Always ``0``; the outcome of the upload is only reported in the JSON
        summary printed on standard output.
    """
    options = build_args()
    cache_settings = CacheConfig(path=options.cache, ttl_seconds=options.cache_ttl)

    source_data = fetch_sources(cache_settings)
    features = convert_to_oedb(
        source_data,
        options.start,
        options.end,
        options.academie,
        options.limit,
    )

    # Path handed to post_oedb_features as its "sent" cache, meant to avoid
    # sending again the events that were already processed.
    ok, failed, neterr = post_oedb_features(
        options.base_url,
        features,
        dry_run=options.dry_run,
        sent_cache_path="extractors_cache/fr_holidays_sent.json",
    )
    print(json_report(ok, failed, neterr))
    return 0


def json_report(ok: int, failed: int, neterr: int) -> str:
    """Format the upload counters as an indented JSON document.

    Args:
        ok: Number reported under the ``success`` key.
        failed: Number reported under the ``failed`` key.
        neterr: Number reported under the ``networkErrors`` key.

    Returns:
        The JSON text, indented with two spaces.
    """
    import json

    summary = {
        "success": ok,
        "failed": failed,
        "networkErrors": neterr,
    }
    return json.dumps(summary, indent=2)


# Script entry point: run the extractor and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())