oedb-backend/extractors/fr_holidays_extractor.py
2025-10-10 17:45:23 +02:00

266 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extracteur des vacances scolaires en France (par zones A/B/C et Corse) + jours fériés.
Sources:
- Vacances scolaires (ICS): https://www.data.gouv.fr/api/1/datasets/r/e5f40fbc-7a84-4c4a-94e4-55ac4299b222
- Jours fériés: https://calendrier.api.gouv.fr/jours-feries.json
Fonctionnalités:
- Cache JSON pour limiter les requêtes
- Paramètres CLI (période, zone optionnelle, dry-run, base_url OEDB, ttl cache)
- Conversion vers format Feature OEDB (un évènement par zone et par période de vacances)
- Pas de coordonnées GPS (point [0,0])
- Rapport succès/échecs à l'issue de l'envoi
"""
import argparse
import datetime as dt
import sys
from typing import Any, Dict, List, Tuple
from utils_extractor_common import (
CacheConfig,
load_cache,
save_cache,
oedb_feature,
post_oedb_features,
http_get_json,
)
# Default path of the JSON cache file (default value of the --cache option).
DEFAULT_CACHE = "extractors_cache/fr_holidays_cache.json"
# Default OEDB API base URL (default value of the --base-url option).
OEDB_DEFAULT = "https://api.openeventdatabase.org"
# data.gouv.fr resource downloaded by fetch_sources() and parsed as an ICS calendar.
ICS_URL = "https://www.data.gouv.fr/api/1/datasets/r/e5f40fbc-7a84-4c4a-94e4-55ac4299b222"
def build_args() -> argparse.Namespace:
    """Parse the extractor's command-line options.

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv`` with the attributes
        ``start``, ``end``, ``academie``, ``base_url``, ``cache``,
        ``cache_ttl``, ``limit`` and ``dry_run``.
    """
    parser = argparse.ArgumentParser(description="Extracteur vacances scolaires FR -> OEDB")

    # Optional free-text filters; all of them default to None (no filtering).
    optional_filters = (
        ("--start", "Date de début YYYY-MM-DD"),
        ("--end", "Date de fin YYYY-MM-DD"),
        ("--academie", "Filtrer par académie (optionnel)"),
    )
    for flag, help_text in optional_filters:
        parser.add_argument(flag, help=help_text, default=None)

    # Destination API and cache settings.
    parser.add_argument("--base-url", default=OEDB_DEFAULT, help="Base URL OEDB")
    parser.add_argument("--cache", default=DEFAULT_CACHE, help="Fichier de cache JSON")
    parser.add_argument(
        "--cache-ttl",
        type=int,
        default=24 * 3600,  # one day, in seconds
        help="Durée de vie du cache (sec)",
    )

    # Run controls.
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limiter le nombre d'événements à traiter",
    )
    parser.add_argument("--dry-run", action="store_true", help="N'envoie pas à l'API OEDB")

    return parser.parse_args()
def date_in_range(d: str, start: str, end: str) -> bool:
    """Tell whether ISO date *d* lies within the inclusive [start, end] window.

    Args:
        d: Date to test, as ``YYYY-MM-DD``.
        start: Lower bound (``YYYY-MM-DD``); a falsy value means "no lower bound".
        end: Upper bound (``YYYY-MM-DD``); a falsy value means "no upper bound".

    Returns:
        True when *d* respects every bound that is set. When neither bound is
        set the answer is True and *d* is not parsed at all.

    Raises:
        ValueError: if a date that has to be parsed is not a valid ISO date.
    """
    # Unbounded window: accept everything without even parsing the date.
    if not (start or end):
        return True

    day = dt.date.fromisoformat(d)
    if start and day < dt.date.fromisoformat(start):
        return False
    # Only the upper bound is left to check (when there is one).
    return not (end and day > dt.date.fromisoformat(end))
def _zones_from_text(summary: str, location: str) -> List[str]:
s = f"{summary} {location}".lower()
zones: List[str] = []
if "corse" in s:
zones.append("Corse")
# Chercher motifs Zones A/B/C, B/C, A/B, Zone A, etc.
# Simpliste mais robuste pour notre source
if "zones a/b/c" in s or "zones a / b / c" in s:
zones.extend(["A", "B", "C"])
else:
if "zones a/b" in s or "zones a / b" in s:
zones.extend(["A", "B"])
if "zones b/c" in s or "zones b / c" in s:
zones.extend(["B", "C"])
if "zones a/c" in s or "zones a / c" in s:
zones.extend(["A", "C"])
if "zone a" in s:
zones.append("A")
if "zone b" in s:
zones.append("B")
if "zone c" in s:
zones.append("C")
# Dédupliquer en conservant l'ordre
seen = set()
out: List[str] = []
for z in zones:
if z not in seen:
seen.add(z)
out.append(z)
return out or ["A", "B", "C"] # fallback si non indiqué
def _parse_ics_events(ics_text: str) -> List[Dict[str, Any]]:
events: List[Dict[str, Any]] = []
current: Dict[str, str] = {}
in_event = False
for raw in ics_text.splitlines():
line = raw.strip()
if line == "BEGIN:VEVENT":
in_event = True
current = {}
continue
if line == "END:VEVENT":
if current:
events.append(current)
in_event = False
current = {}
continue
if not in_event:
continue
if line.startswith("DTSTART"):
# DTSTART;VALUE=DATE:YYYYMMDD
val = line.split(":", 1)[-1]
current["DTSTART"] = val
elif line.startswith("DTEND"):
val = line.split(":", 1)[-1]
current["DTEND"] = val
elif line.startswith("SUMMARY:"):
current["SUMMARY"] = line[len("SUMMARY:"):].strip()
elif line.startswith("LOCATION:"):
current["LOCATION"] = line[len("LOCATION:"):].strip()
return events
def _yymmdd_to_iso(d: str) -> str:
# d: YYYYMMDD
return f"{d[0:4]}-{d[4:6]}-{d[6:8]}"
def fetch_sources(cache_cfg: CacheConfig) -> Dict[str, Any]:
    """Return the raw source data, from the cache when it holds something.

    When ``load_cache`` returns a truthy value it is returned untouched.
    Otherwise the public holidays of metropolitan France for the current year
    and the school-holiday ICS calendar are downloaded, normalised, stored with
    ``save_cache`` and returned.

    Args:
        cache_cfg: Cache configuration handed to ``load_cache`` / ``save_cache``.

    Returns:
        A dict with two keys: ``"jours_feries"`` (the decoded JSON of the
        public-holiday API) and ``"vacances_scolaires_ics"`` (a list of dicts
        with ``label``, ``start``, ``stop`` and ``zones``).

    Raises:
        requests.HTTPError: if the ICS download answers with an error status.
        ValueError: if an event carries a DTEND that is not a valid date.
    """
    cached = load_cache(cache_cfg)
    if cached:
        return cached

    sources: Dict[str, Any] = {}

    # Public holidays, metropolitan France, current year only.
    current_year = dt.date.today().year
    sources["jours_feries"] = http_get_json(
        f"https://calendrier.api.gouv.fr/jours-feries/metropole/{current_year}.json"
    )

    # School holidays from the data.gouv ICS calendar.
    import requests

    response = requests.get(ICS_URL, timeout=30)
    response.raise_for_status()

    one_day = dt.timedelta(days=1)
    periods: List[Dict[str, Any]] = []
    for event in _parse_ics_events(response.text):
        raw_start = event.get("DTSTART")
        raw_end = event.get("DTEND")
        title = event.get("SUMMARY", "")
        place = event.get("LOCATION", "")
        # Events lacking a start, an end or a title cannot be converted.
        if not raw_start or not raw_end or not title:
            continue

        first_day = _yymmdd_to_iso(raw_start)
        # A date-valued DTEND is exclusive in ICS: the last holiday day is the day before.
        exclusive_end = dt.date.fromisoformat(_yymmdd_to_iso(raw_end))
        last_day = (exclusive_end - one_day).isoformat()

        periods.append({
            "label": title,
            "start": first_day,
            "stop": last_day,
            "zones": _zones_from_text(title, place),
        })

    sources["vacances_scolaires_ics"] = periods
    save_cache(cache_cfg, sources)
    return sources
def convert_to_oedb(data: Dict[str, Any], start: str | None, end: str | None, academie: str | None, limit: int | None = None) -> List[Dict[str, Any]]:
    """Convert the fetched source data into OEDB features.

    Public holidays (``data["jours_feries"]``, a ``{iso_date: label}`` mapping)
    yield one ``time.daylight.holiday`` feature each. School holidays
    (``data["vacances_scolaires_ics"]``) yield one ``time.holidays`` feature
    per period and per zone.

    Args:
        data: Source data as returned by ``fetch_sources``.
        start: Optional inclusive lower bound (``YYYY-MM-DD``) of the window.
        end: Optional inclusive upper bound (``YYYY-MM-DD``) of the window.
        academie: Optional zone name ("A", "B", "C", "Corse"); only school
            holidays of that zone are kept. Compared case-insensitively.
        limit: Optional maximum number of features returned, counted across
            public and school holidays. A falsy value means "no limit".

    Returns:
        The list of features, public holidays first.

    Raises:
        ValueError: if ``start``/``end`` or a date that has to be compared
            against them is not a valid ISO date.
    """
    features: List[Dict[str, Any]] = []

    def limit_reached() -> bool:
        # A limit of None or 0 disables the cap.
        return bool(limit) and len(features) >= limit

    # Public holidays
    public_holidays: Dict[str, str] = data.get("jours_feries", {}) or {}
    for date_iso, label in public_holidays.items():
        if not date_in_range(date_iso, start, end):
            continue
        # Enrich the name with the full date; strftime day/month names follow
        # the process locale. Fall back to the bare label on an invalid date.
        try:
            day_name = dt.date.fromisoformat(date_iso).strftime("%A %d %B %Y")
            full_label = f"{label} ({day_name})"
        except (TypeError, ValueError):
            full_label = label
        feature = oedb_feature(
            label=full_label,
            what="time.daylight.holiday",
            start=f"{date_iso}T00:00:00Z",
            stop=f"{date_iso}T23:59:59Z",
            description="Jour férié national",
            where="France",
        )
        # Add the "type" property (required by the OEDB API per the original author).
        feature["properties"]["type"] = "scheduled"
        features.append(feature)
        if limit_reached():
            return features[:limit]

    # School holidays from the ICS data: one event per listed zone.
    wanted_zone = academie.casefold() if academie else None
    school_holidays: List[Dict[str, Any]] = data.get("vacances_scolaires_ics", []) or []
    for item in school_holidays:
        period_start = item.get("start")
        period_stop = item.get("stop")
        label = item.get("label") or "Vacances scolaires"
        zones: List[str] = item.get("zones") or []
        if not (period_start and period_stop and zones):
            continue
        # Keep the period when it overlaps the window: it must begin on or
        # before `end` and finish on or after `start`. Testing only whether
        # one endpoint falls inside the window would drop a period that
        # fully contains the window.
        if not (
            date_in_range(period_start, None, end)
            and date_in_range(period_stop, start, None)
        ):
            continue

        # The duration is the same for every zone: compute it once per period.
        try:
            first_day = dt.date.fromisoformat(period_start)
            last_day = dt.date.fromisoformat(period_stop)
            duration_suffix = f" ({(last_day - first_day).days + 1} jours)"
        except (TypeError, ValueError):
            duration_suffix = ""

        for zone in zones:
            if wanted_zone is not None and zone.casefold() != wanted_zone:
                continue
            feature = oedb_feature(
                label=f"{label} - Zone {zone}{duration_suffix}",
                what="time.holidays",
                start=f"{period_start}T00:00:00Z",
                stop=f"{period_stop}T23:59:59Z",
                description=f"Vacances scolaires zone {zone}",
                where=f"Zone {zone}",
            )
            # Add the "type" property (required by the OEDB API per the original author).
            feature["properties"]["type"] = "event"
            features.append(feature)
            # The cap applies to school holidays too, not only public holidays.
            if limit_reached():
                return features[:limit]
    return features
def main() -> int:
    """Run the extractor end to end and print a JSON report.

    Parses the command line, loads the source data (through the cache),
    converts it to OEDB features, hands them to ``post_oedb_features`` and
    prints the success / failure / network-error counters.

    Returns:
        Always 0.
    """
    options = build_args()
    sources = fetch_sources(
        CacheConfig(path=options.cache, ttl_seconds=options.cache_ttl)
    )
    features = convert_to_oedb(
        sources,
        options.start,
        options.end,
        options.academie,
        options.limit,
    )

    # A dedicated "sent" cache file is passed along so that events already
    # processed are not sent again (per the original author's comment).
    sent_ok, sent_failed, network_errors = post_oedb_features(
        options.base_url,
        features,
        dry_run=options.dry_run,
        sent_cache_path="extractors_cache/fr_holidays_sent.json",
    )
    print(json_report(sent_ok, sent_failed, network_errors))
    return 0
def json_report(ok: int, failed: int, neterr: int) -> str:
    """Format the sending counters as a 2-space-indented JSON object.

    Args:
        ok: Number of successful sends, reported as ``success``.
        failed: Number of failed sends, reported as ``failed``.
        neterr: Number of network errors, reported as ``networkErrors``.

    Returns:
        The JSON text of the report.
    """
    import json

    counters = {
        "success": ok,
        "failed": failed,
        "networkErrors": neterr,
    }
    return json.dumps(counters, indent=2)
# Script entry point: the value returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())