add world days and school holidays
parent
d22dbde2e7
commit
26bfe4ae36
4 changed files with 670 additions and 0 deletions
39 extractors/README_extractor_events.md Normal file
@@ -0,0 +1,39 @@
# Event extractors (school holidays, world days)

CLI scripts that add events to OEDB, with a JSON cache, command-line parameters, and a final report.

## Common options

- Cache: the `extractors_cache/` directory (created automatically)
- Parameters: `--dry-run` to simulate without writing to OEDB
- OEDB API: `--base-url` (default `https://api.openeventdatabase.org`)

## French school holidays

```bash
python3 extractors/fr_holidays_extractor.py \
  --start 2025-01-01 --end 2025-12-31 \
  --academie A \
  --cache extractors_cache/fr_holidays_cache.json \
  --cache-ttl $((24*3600)) \
  --base-url https://api.openeventdatabase.org \
  --dry-run
```

Output: JSON with `success`, `failed`, `networkErrors`.
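
For example (illustrative counts; the keys match what `json_report` emits):

```json
{
  "success": 12,
  "failed": 0,
  "networkErrors": 1
}
```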

## World / international days

```bash
python3 extractors/world_days_extractor.py \
  --cache extractors_cache/world_days_cache.json \
  --cache-ttl $((24*3600)) \
  --base-url https://api.openeventdatabase.org
```

Dry-run is enabled by default for this script; pass `--no-dry-run` to actually send events to the API.

Notes:
- The sources are wired up in a minimal way (as examples); plug in richer sources as needed.
- Conversion to the OEDB format: events are unlocated by default (Point [0,0]), with `online=yes` for world days.
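
For reference, this is roughly the Feature these extractors build via `oedb_feature` (a sketch with illustrative values; the real labels and dates come from the sources, and the scripts add a `type` property afterwards):

```python
from utils_extractor_common import oedb_feature

feature = oedb_feature(
    label="Journée mondiale des zones humides",
    what="culture.days",
    start="2025-02-02T00:00:00Z",
    stop="2025-02-02T23:59:59Z",
    description="Journée mondiale/internationale",
    where="Monde",
    online=True,
)
# -> {"type": "Feature",
#     "properties": {"name": ..., "what": ..., "start": ..., "stop": ...,
#                    "where": ..., "online": "yes", "description": ...},
#     "geometry": {"type": "Point", "coordinates": [0, 0]}}
```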
266 extractors/fr_holidays_extractor.py Normal file
@@ -0,0 +1,266 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Extractor for French school holidays (zones A/B/C and Corsica) plus public holidays.
Sources:
- School holidays (ICS): https://www.data.gouv.fr/api/1/datasets/r/e5f40fbc-7a84-4c4a-94e4-55ac4299b222
- Public holidays: https://calendrier.api.gouv.fr/jours-feries.json

Features:
- JSON cache to limit requests
- CLI parameters (date range, optional zone, dry-run, OEDB base_url, cache TTL)
- Conversion to the OEDB Feature format (one event per zone and per holiday period)
- No GPS coordinates (Point [0,0])
- Success/failure report once events have been sent
"""

import argparse
import datetime as dt
import sys
from typing import Any, Dict, List, Optional

from utils_extractor_common import (
    CacheConfig,
    load_cache,
    save_cache,
    oedb_feature,
    post_oedb_features,
    http_get_json,
)


DEFAULT_CACHE = "extractors_cache/fr_holidays_cache.json"
OEDB_DEFAULT = "https://api.openeventdatabase.org"
ICS_URL = "https://www.data.gouv.fr/api/1/datasets/r/e5f40fbc-7a84-4c4a-94e4-55ac4299b222"


def build_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="French school holidays extractor -> OEDB")
    p.add_argument("--start", help="Start date YYYY-MM-DD", default=None)
    p.add_argument("--end", help="End date YYYY-MM-DD", default=None)
    p.add_argument("--academie", help="Filter by zone (e.g. A, B, C; optional)", default=None)
    p.add_argument("--base-url", help="OEDB base URL", default=OEDB_DEFAULT)
    p.add_argument("--cache", help="JSON cache file", default=DEFAULT_CACHE)
    p.add_argument("--cache-ttl", help="Cache time-to-live (seconds)", type=int, default=24 * 3600)
    p.add_argument("--limit", help="Limit the number of events to process", type=int, default=None)
    p.add_argument("--dry-run", help="Do not send to the OEDB API", action="store_true")
    return p.parse_args()


def date_in_range(d: str, start: Optional[str], end: Optional[str]) -> bool:
    if not start and not end:
        return True
    dd = dt.date.fromisoformat(d)
    if start and dd < dt.date.fromisoformat(start):
        return False
    if end and dd > dt.date.fromisoformat(end):
        return False
    return True


def _zones_from_text(summary: str, location: str) -> List[str]:
    s = f"{summary} {location}".lower()
    zones: List[str] = []
    if "corse" in s:
        zones.append("Corse")
    # Look for patterns such as "Zones A/B/C", "B/C", "A/B", "Zone A", etc.
    # Simplistic, but robust enough for this source.
    if "zones a/b/c" in s or "zones a / b / c" in s:
        zones.extend(["A", "B", "C"])
    else:
        if "zones a/b" in s or "zones a / b" in s:
            zones.extend(["A", "B"])
        if "zones b/c" in s or "zones b / c" in s:
            zones.extend(["B", "C"])
        if "zones a/c" in s or "zones a / c" in s:
            zones.extend(["A", "C"])
        if "zone a" in s:
            zones.append("A")
        if "zone b" in s:
            zones.append("B")
        if "zone c" in s:
            zones.append("C")
    # Deduplicate while preserving order
    seen = set()
    out: List[str] = []
    for z in zones:
        if z not in seen:
            seen.add(z)
            out.append(z)
    return out or ["A", "B", "C"]  # fallback when no zone is given
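# Examples for _zones_from_text (illustrative summaries):
#   _zones_from_text("Vacances d'Été - Zone C", "") -> ["C"]
#   _zones_from_text("Pont de l'Ascension", "")     -> ["A", "B", "C"]  (fallback applies)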


def _parse_ics_events(ics_text: str) -> List[Dict[str, Any]]:
    events: List[Dict[str, Any]] = []
    current: Dict[str, str] = {}
    in_event = False
    for raw in ics_text.splitlines():
        line = raw.strip()
        if line == "BEGIN:VEVENT":
            in_event = True
            current = {}
            continue
        if line == "END:VEVENT":
            if current:
                events.append(current)
            in_event = False
            current = {}
            continue
        if not in_event:
            continue
        if line.startswith("DTSTART"):
            # DTSTART;VALUE=DATE:YYYYMMDD
            val = line.split(":", 1)[-1]
            current["DTSTART"] = val
        elif line.startswith("DTEND"):
            val = line.split(":", 1)[-1]
            current["DTEND"] = val
        elif line.startswith("SUMMARY:"):
            current["SUMMARY"] = line[len("SUMMARY:"):].strip()
        elif line.startswith("LOCATION:"):
            current["LOCATION"] = line[len("LOCATION:"):].strip()
    return events
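# Example for _parse_ics_events (illustrative VEVENT):
#   BEGIN:VEVENT
#   DTSTART;VALUE=DATE:20250705
#   DTEND;VALUE=DATE:20250901
#   SUMMARY:Vacances d'Été - Zones A/B/C
#   END:VEVENT
# -> [{"DTSTART": "20250705", "DTEND": "20250901", "SUMMARY": "Vacances d'Été - Zones A/B/C"}]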


def _yymmdd_to_iso(d: str) -> str:
    # d: YYYYMMDD
    return f"{d[0:4]}-{d[4:6]}-{d[6:8]}"


def fetch_sources(cache_cfg: CacheConfig) -> Dict[str, Any]:
    cache = load_cache(cache_cfg)
    if cache:
        return cache

    out: Dict[str, Any] = {}
    # Public holidays, metropolitan France (current year)
    year = dt.date.today().year
    holidays_url = f"https://calendrier.api.gouv.fr/jours-feries/metropole/{year}.json"
    out["jours_feries"] = http_get_json(holidays_url)

    # School holidays from the data.gouv ICS feed
    import requests
    r = requests.get(ICS_URL, timeout=30)
    r.raise_for_status()
    vevents = _parse_ics_events(r.text)
    vacances: List[Dict[str, Any]] = []
    for ev in vevents:
        dtstart = ev.get("DTSTART")
        dtend = ev.get("DTEND")
        summary = ev.get("SUMMARY", "")
        location = ev.get("LOCATION", "")
        if not (dtstart and dtend and summary):
            continue
        start_iso = _yymmdd_to_iso(dtstart)
        end_excl_iso = _yymmdd_to_iso(dtend)
        # A date-valued DTEND in ICS is exclusive -> inclusive stop = end_excl - 1 day
        end_excl = dt.date.fromisoformat(end_excl_iso)
        stop_incl = (end_excl - dt.timedelta(days=1)).isoformat()
        zones = _zones_from_text(summary, location)
        vacances.append({
            "label": summary,
            "start": start_iso,
            "stop": stop_incl,
            "zones": zones,
        })
    out["vacances_scolaires_ics"] = vacances

    save_cache(cache_cfg, out)
    return out


def convert_to_oedb(data: Dict[str, Any], start: str | None, end: str | None, academie: str | None, limit: int | None = None) -> List[Dict[str, Any]]:
    features: List[Dict[str, Any]] = []

    # Public holidays
    jf: Dict[str, str] = data.get("jours_feries", {}) or {}
    for date_iso, label in jf.items():
        if not date_in_range(date_iso, start, end):
            continue
        # Enrich the name with the date
        try:
            date_obj = dt.date.fromisoformat(date_iso)
            day_name = date_obj.strftime("%A %d %B %Y")
            full_label = f"{label} ({day_name})"
        except ValueError:
            full_label = label

        feature = oedb_feature(
            label=full_label,
            what="time.daylight.holiday",
            start=f"{date_iso}T00:00:00Z",
            stop=f"{date_iso}T23:59:59Z",
            description="Jour férié national",
            where="France",
        )
        # Add the "type" property required by the OEDB API
        feature["properties"]["type"] = "scheduled"
        features.append(feature)
        # Apply the limit if set
        if limit and len(features) >= limit:
            return features[:limit]

    # School holidays from the ICS feed - one event per listed zone
    vs_ics: List[Dict[str, Any]] = data.get("vacances_scolaires_ics", []) or []
    for item in vs_ics:
        s = item.get("start")
        e = item.get("stop")
        label = item.get("label") or "Vacances scolaires"
        zones: List[str] = item.get("zones") or []
        if not (s and e and zones):
            continue
        if not (date_in_range(s, start, end) or date_in_range(e, start, end)):
            continue
        for z in zones:
            if academie and z != academie:
                continue
            # Enrich the name with the period and zone
            try:
                start_date = dt.date.fromisoformat(s)
                end_date = dt.date.fromisoformat(e)
                period_duration = (end_date - start_date).days + 1
                full_label = f"{label} - Zone {z} ({period_duration} jours)"
            except ValueError:
                full_label = f"{label} - Zone {z}"
            feature = oedb_feature(
                label=full_label,
                what="time.holidays",
                start=f"{s}T00:00:00Z",
                stop=f"{e}T23:59:59Z",
                description=f"Vacances scolaires zone {z}",
                where=f"Zone {z}",
            )
            # Add the "type" property required by the OEDB API
            feature["properties"]["type"] = "event"
            features.append(feature)
            # Apply the limit here as well, so the cap holds across both loops
            if limit and len(features) >= limit:
                return features

    return features


def main() -> int:
    args = build_args()
    cache_cfg = CacheConfig(path=args.cache, ttl_seconds=args.cache_ttl)

    src = fetch_sources(cache_cfg)
    feats = convert_to_oedb(src, args.start, args.end, args.academie, args.limit)

    # Use a sent-events cache so already-processed events are not sent again
    sent_cache_path = "extractors_cache/fr_holidays_sent.json"
    ok, failed, neterr = post_oedb_features(args.base_url, feats, dry_run=args.dry_run, sent_cache_path=sent_cache_path)
    print(json_report(ok, failed, neterr))
    return 0


def json_report(ok: int, failed: int, neterr: int) -> str:
    import json
    return json.dumps({"success": ok, "failed": failed, "networkErrors": neterr}, indent=2)


if __name__ == "__main__":
    sys.exit(main())
152 extractors/utils_extractor_common.py Normal file
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import hashlib
import json
import logging
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import requests

logger = logging.getLogger(__name__)


@dataclass
class CacheConfig:
    path: str
    ttl_seconds: int = 24 * 3600


def load_cache(cfg: CacheConfig) -> Dict[str, Any]:
    if not cfg.path or not os.path.exists(cfg.path):
        return {}
    try:
        with open(cfg.path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, dict):
            return {}
        ts = data.get("__fetched_at__")
        if cfg.ttl_seconds > 0 and isinstance(ts, (int, float)):
            if time.time() - ts > cfg.ttl_seconds:
                return {}
        return data
    except Exception:
        return {}


def save_cache(cfg: CacheConfig, data: Dict[str, Any]) -> None:
    if not cfg.path:
        return
    cache_dir = os.path.dirname(cfg.path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    payload = dict(data)
    payload["__fetched_at__"] = int(time.time())
    with open(cfg.path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
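# Usage sketch (illustrative; fetch_everything is a hypothetical fetch step):
#   cfg = CacheConfig(path="extractors_cache/demo.json", ttl_seconds=3600)
#   data = load_cache(cfg)         # {} when missing, malformed, or older than 1 hour
#   if not data:
#       data = fetch_everything()
#       save_cache(cfg, data)      # stamps "__fetched_at__" before writing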


def oedb_feature(label: str, what: str, start: str, stop: Optional[str] = None, description: str = "", where: str = "", online: Optional[bool] = None) -> Dict[str, Any]:
    props: Dict[str, Any] = {
        "name": label,
        "what": what,
        "start": start,
        "description": description,
    }
    if stop:
        props["stop"] = stop
    if where:
        props["where"] = where
    if online is not None:
        props["online"] = "yes" if online else "no"
    logger.info(f"props: {json.dumps(props, ensure_ascii=False, indent=2)}")
    return {
        "type": "Feature",
        "properties": props,
        # Unlocated by default
        "geometry": {"type": "Point", "coordinates": [0, 0]},
    }


def post_oedb_features(base_url: str, features: List[Dict[str, Any]], dry_run: bool = True, timeout: int = 20, sent_cache_path: Optional[str] = None) -> Tuple[int, int, int]:
    ok = 0
    failed = 0
    neterr = 0

    # Load the cache of already-sent events
    sent_events = set()
    if sent_cache_path and os.path.exists(sent_cache_path):
        try:
            with open(sent_cache_path, 'r', encoding='utf-8') as f:
                sent_events = set(json.load(f))
        except Exception:
            pass

    new_sent_events = set()

    for feat in features:
        if dry_run:
            ok += 1
            continue

        # Derive a unique ID for the event from its properties
        event_id = generate_event_id(feat)

        # Skip events that have already been sent
        if event_id in sent_events:
            print(f"Event already sent, skipping: {feat.get('properties', {}).get('name', 'Unnamed')}")
            continue

        try:
            r = requests.post(f"{base_url.rstrip('/')}/event", json=feat, timeout=timeout)
            if 200 <= r.status_code < 300:
                ok += 1
                new_sent_events.add(event_id)
            elif r.status_code == 409:
                # Duplicate - treat as already sent
                print(f"Event already exists (duplicate), skipping: {feat.get('properties', {}).get('name', 'Unnamed')}")
                ok += 1
                new_sent_events.add(event_id)
            else:
                print(f"HTTP error {r.status_code}: {r.text}")
                failed += 1
        except requests.RequestException as e:
            print(f"Network error: {e}")
            neterr += 1

    # Persist the newly sent events
    if sent_cache_path and new_sent_events:
        all_sent_events = sent_events | new_sent_events
        try:
            cache_dir = os.path.dirname(sent_cache_path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            with open(sent_cache_path, 'w', encoding='utf-8') as f:
                json.dump(list(all_sent_events), f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"Error while saving the sent-events cache: {e}")

    return ok, failed, neterr
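# Usage sketch (illustrative):
#   ok, failed, neterr = post_oedb_features(
#       "https://api.openeventdatabase.org",
#       [oedb_feature("Test", "culture.days", "2025-01-01T00:00:00Z")],
#       dry_run=True,
#   )
#   # With dry_run=True every feature counts as a success and no HTTP call is made.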


def generate_event_id(feature: Dict[str, Any]) -> str:
    """Build a unique ID for an event from its key properties."""
    props = feature.get('properties', {})
    # Use the key properties as a stable identity
    key_props = {
        'name': props.get('name', ''),
        'what': props.get('what', ''),
        'start': props.get('start', ''),
        'where': props.get('where', '')
    }
    # Hash the key properties
    content = json.dumps(key_props, sort_keys=True, ensure_ascii=False)
    return hashlib.md5(content.encode('utf-8')).hexdigest()


def http_get_json(url: str, timeout: int = 20, headers: Optional[Dict[str, str]] = None) -> Any:
    r = requests.get(url, timeout=timeout, headers=headers)
    r.raise_for_status()
    ct = r.headers.get("content-type", "")
    if "json" in ct:
        return r.json()
    return json.loads(r.text)
213 extractors/world_days_extractor.py Normal file
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Extractor for world/international days.
Source: https://www.journee-mondiale.com/les-journees-mondiales.htm

Features:
- JSON cache to limit requests
- CLI parameters (base_url, dry-run, cache TTL)
- Conversion to the OEDB format (what: culture.days)
- One-day events, placed at the next occurrence within the coming 365 days
- Success/failure report, plus the GeoJSON printed in dry-run mode
"""

import argparse
import datetime as dt
import re
import sys
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup
from utils_extractor_common import (
    CacheConfig,
    load_cache,
    save_cache,
    oedb_feature,
    post_oedb_features,
)


DEFAULT_CACHE = "extractors_cache/world_days_cache.json"
OEDB_DEFAULT = "https://api.openeventdatabase.org"
SOURCE_URL = "https://www.journee-mondiale.com/les-journees-mondiales.htm"


def build_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="World/international days extractor -> OEDB")
    p.add_argument("--base-url", help="OEDB base URL", default=OEDB_DEFAULT)
    p.add_argument("--cache", help="JSON cache file", default=DEFAULT_CACHE)
    p.add_argument("--cache-ttl", help="Cache time-to-live (seconds)", type=int, default=24 * 3600)
    p.add_argument("--limit", help="Limit the number of events to process", type=int, default=None)
    # dry-run is enabled by default; pass --no-dry-run to actually send
    p.add_argument("--no-dry-run", dest="dry_run", help="Disable dry-run (send to the API)", action="store_false")
    p.set_defaults(dry_run=True)
    return p.parse_args()


MONTHS = {
    "janvier": 1, "février": 2, "fevrier": 2, "mars": 3, "avril": 4, "mai": 5, "juin": 6,
    "juillet": 7, "août": 8, "aout": 8, "septembre": 9, "octobre": 10, "novembre": 11, "décembre": 12, "decembre": 12
}


def parse_days_from_html(html: str) -> List[Tuple[int, int, str, str]]:
    """Parse the days from the HTML, targeting the anchors inside id=texte and class=content."""
    days: List[Tuple[int, int, str, str]] = []

    soup = BeautifulSoup(html, 'html.parser')

    # Target specifically the anchors inside id=texte and class=content
    text_section = soup.find('div', id='texte')
    if not text_section:
        return days

    content_section = text_section.find('div', class_='content')
    if not content_section:
        return days

    # Look for all the articles (one per month) in this section
    articles = content_section.find_all('article')

    for article in articles:
        # Collect all the anchors in each article
        links = article.find_all('a')

        for link in links:
            # Extract the link text and URL
            text = link.get_text(strip=True)
            url = link.get('href', '')
            if not text:
                continue

            # Pattern capturing "1er janvier" or "15 janvier" + "Journée mondiale..."
            pattern = re.compile(r"\b(\d{1,2}|1er)\s+([a-zA-Zéèêëàâîïôöùûüç]+)\s*:\s*(.+)")
            match = pattern.search(text)

            if match:
                day_str = match.group(1).lower()
                day = 1 if day_str == "1er" else int(re.sub(r"\D", "", day_str))
                month_name = match.group(2).lower()
                month = MONTHS.get(month_name)
                label = match.group(3).strip()

                if month is not None and label:
                    days.append((month, day, label, url))

    return days
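# Example (illustrative anchor text):
#   "2 février : Journée mondiale des zones humides"
#   -> (2, 2, "Journée mondiale des zones humides", url)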


def fetch_sources(cache_cfg: CacheConfig) -> Dict[str, Any]:
    cache = load_cache(cache_cfg)
    if cache:
        return cache

    # Fetch the HTML page
    import requests
    r = requests.get(SOURCE_URL, timeout=30)
    r.raise_for_status()
    html = r.text

    # Parse the HTML to extract the days
    items = parse_days_from_html(html)

    out: Dict[str, Any] = {"items": items}
    save_cache(cache_cfg, out)
    return out


def create_event_date(month: int, day: int, today: dt.date) -> dt.date:
    """Return the event date at its next occurrence from today."""
    year = today.year
    try:
        # Try to build the date in the current year
        event_date = dt.date(year, month, day)
        # If the date is in the past, use the next year
        if event_date < today:
            event_date = dt.date(year + 1, month, day)
        return event_date
    except ValueError:
        # Handle cases such as February 29th in a non-leap year:
        # fall back to the next year
        try:
            return dt.date(year + 1, month, day)
        except ValueError:
            # Still impossible -> use February 28th
            return dt.date(year + 1, 2, 28)
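# Examples (illustrative):
#   create_event_date(12, 25, dt.date(2025, 6, 1)) -> dt.date(2025, 12, 25)
#   create_event_date(2, 14, dt.date(2025, 6, 1))  -> dt.date(2026, 2, 14)  (already past)
#   create_event_date(2, 29, dt.date(2025, 6, 1))  -> dt.date(2026, 2, 28)  (2026 is not a leap year)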


def convert_to_oedb(data: Dict[str, Any], limit: int | None = None) -> List[Dict[str, Any]]:
    features: List[Dict[str, Any]] = []
    today = dt.date.today()
    for (month, day, label, url) in data.get("items", []) or []:
        try:
            date_obj = create_event_date(month, day, today)
        except Exception:
            continue
        date_iso = date_obj.isoformat()

        # Derive the zone from the title
        label_lower = label.lower()
        if "mondial" in label_lower or "international" in label_lower:
            zone = "world"
            where = "Monde"
        else:
            zone = "france"
            where = "France"

        # Build the event, with a zone property
        feature = oedb_feature(
            label=label,
            what="culture.days",
            start=f"{date_iso}T00:00:00Z",
            stop=f"{date_iso}T23:59:59Z",
            description="Journée mondiale/internationale",
            where=where,
            online=True,
        )
        # Add the "type" property required by the OEDB API
        feature["properties"]["type"] = "scheduled"
        # Add the zone property
        feature["properties"]["zone"] = zone
        # Add the URL when available
        if url:
            feature["properties"]["url"] = url

        features.append(feature)
        # Apply the limit if set
        if limit and len(features) >= limit:
            break
    return features


def main() -> int:
    args = build_args()
    cache_cfg = CacheConfig(path=args.cache, ttl_seconds=args.cache_ttl)

    src = fetch_sources(cache_cfg)
    feats = convert_to_oedb(src, args.limit)

    if args.dry_run:
        # Print the GeoJSON that would be sent
        collection = {"type": "FeatureCollection", "features": feats}
        import json
        print(json.dumps(collection, ensure_ascii=False, indent=2))

    # Use a sent-events cache so already-processed events are not sent again
    sent_cache_path = "extractors_cache/world_days_sent.json"
    ok, failed, neterr = post_oedb_features(args.base_url, feats, dry_run=args.dry_run, sent_cache_path=sent_cache_path)
    print(json_report(ok, failed, neterr))
    return 0


def json_report(ok: int, failed: int, neterr: int) -> str:
    import json
    return json.dumps({"success": ok, "failed": failed, "networkErrors": neterr}, indent=2)


if __name__ == "__main__":
    sys.exit(main())