# oedb-backend/extractors/mobilizon.py
# Snapshot metadata: 2025-10-07 14:10:08 +02:00 — 582 lines, 24 KiB, Python, no EOL.
#!/usr/bin/env python3
"""
Import d'événements depuis l'API GraphQL de Mobilizon vers OEDB
Usage:
python3 mobilizon.py --limit 25 --page-size 10 --instance-url https://mobilizon.fr \
--api-url https://api.openeventdatabase.org --dry-run --verbose
Notes:
- S'inspire de extractors/agenda_geek.py pour la structure générale (CLI, dry-run,
session HTTP, envoi vers /event) et évite de scraper les pages web en
utilisant l'API GraphQL officielle.
- Ajoute un paramètre --limit pour borner le nombre d'événements à insérer.
"""
import argparse
import json
import logging
import math
import os
import re
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, Iterable, List, Optional, Tuple

import requests
# Logging setup, kept consistent with agenda_geek.py: timestamped messages to stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
@dataclass
class MobilizonEvent:
    """Normalized representation of one Mobilizon event fetched via GraphQL.

    Every field is optional because the GraphQL schema varies between
    Mobilizon versions; missing data is carried as None.
    """
    uuid: Optional[str]            # stable identifier on the instance
    url: Optional[str]             # public event page
    title: Optional[str]
    description: Optional[str]
    begins_on: Optional[str]       # ISO 8601 string as returned by the API
    ends_on: Optional[str]         # ISO 8601 string as returned by the API
    status: Optional[str]
    latitude: Optional[float]
    longitude: Optional[float]
    address_text: Optional[str]    # human-readable address, comma-joined
    tags: Optional[List[str]]
    organizer_name: Optional[str]
    organizer_url: Optional[str]
    category: Optional[str]
    website: Optional[str]
class MobilizonClient:
    """Minimal GraphQL client for the public events listing of a Mobilizon instance."""

    def __init__(self, instance_url: str = "https://mobilizon.fr") -> None:
        self.base = instance_url.rstrip('/')
        # The public GraphQL endpoint of a Mobilizon instance is typically /api.
        self.endpoint = f"{self.base}/api"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'OEDB-Mobilizon-Importer/1.0 (+https://github.com/cquest/oedb)',
            'Content-Type': 'application/json'
        })

    @staticmethod
    def _parse_point(geom) -> Tuple[Optional[float], Optional[float]]:
        """Extract (lat, lon) from a GraphQL ``geom`` value.

        Depending on the Mobilizon version, ``geom`` may be a GeoJSON-like
        dict, a JSON string, or a WKT ``POINT(lon lat)`` string.
        Returns (None, None) when nothing usable is found; never returns a
        half-parsed pair (the original code could leave lon set with lat None).
        """
        if not geom:
            return (None, None)
        g = geom
        if not isinstance(g, (dict, list)):
            # String: try JSON first, then fall back to WKT.
            try:
                g = json.loads(geom)
            except Exception:
                m = re.search(r"POINT\s*\(\s*([+-]?[0-9]*\.?[0-9]+)\s+([+-]?[0-9]*\.?[0-9]+)\s*\)", str(geom))
                if m:
                    try:
                        # WKT order is (lon lat).
                        return (float(m.group(2)), float(m.group(1)))
                    except Exception:
                        return (None, None)
                return (None, None)
        if isinstance(g, dict):
            coords = g.get('coordinates')
            if isinstance(coords, list) and len(coords) >= 2:
                try:
                    # GeoJSON order is [lon, lat].
                    return (float(coords[1]), float(coords[0]))
                except Exception:
                    return (None, None)
        return (None, None)

    @staticmethod
    def _event_from_raw(ev: Dict) -> MobilizonEvent:
        """Map one raw GraphQL event dict onto a MobilizonEvent."""
        # Address text + coordinates.
        addr = ev.get('physicalAddress') or {}
        address_text = None
        if addr:
            parts = [
                addr.get('description'),
                addr.get('street'),
                addr.get('postalCode'),
                addr.get('locality'),
                addr.get('region'),
                addr.get('country'),
            ]
            address_text = ", ".join([p for p in parts if p]) or None
        lat, lon = MobilizonClient._parse_point(addr.get('geom') if isinstance(addr, dict) else None)
        # Tags may be dicts ({title, slug}) or plain strings depending on version.
        tags_field = ev.get('tags')
        tags_list: Optional[List[str]] = None
        if isinstance(tags_field, list):
            tags_list = []
            for t in tags_field:
                if isinstance(t, dict):
                    val = t.get('title') or t.get('slug') or t.get('name')
                    if val:
                        tags_list.append(val)
                elif isinstance(t, str):
                    tags_list.append(t)
            if not tags_list:
                tags_list = None
        # Organizer actor (may be absent).
        organizer = ev.get('organizerActor') or {}
        organizer_name = organizer.get('name') if isinstance(organizer, dict) else None
        organizer_url = organizer.get('url') if isinstance(organizer, dict) else None
        return MobilizonEvent(
            uuid=ev.get('uuid') or ev.get('id'),
            url=ev.get('url') or ev.get('onlineAddress'),
            title=ev.get('title'),
            description=ev.get('description'),
            begins_on=ev.get('beginsOn'),
            ends_on=ev.get('endsOn'),
            status=ev.get('status'),
            latitude=lat,
            longitude=lon,
            address_text=address_text,
            tags=tags_list,
            organizer_name=organizer_name,
            organizer_url=organizer_url,
            category=ev.get('category'),
            website=ev.get('onlineAddress') or ev.get('url'),
        )

    def fetch_events_page(self, page: int, page_size: int) -> Tuple[List[MobilizonEvent], int]:
        """Fetch one page of public events via GraphQL.

        Returns (events, total) where total is the API-side total when
        exposed, otherwise 0. Returns ([], 0) on any HTTP/GraphQL error.
        """
        # Several schemas exist depending on the version; this generic query
        # relies on events returning elements[] + total, which most expose.
        query = """
        query Events($page: Int!, $limit: Int!) {
          events(page: $page, limit: $limit) {
            total
            elements {
              uuid
              url
              title
              description
              beginsOn
              endsOn
              status
              physicalAddress {
                description
                locality
                geom
                street
                postalCode
                region
                country
              }
              onlineAddress
              tags { title slug }
              organizerActor { name url }
              category
            }
          }
        }
        """
        variables = {"page": page, "limit": page_size}
        logger.info(f"Fetching events page {page} with size {page_size}")
        # The full query/variables dump is debug-level detail, not per-page INFO noise.
        logger.debug(f"Query: {query}")
        logger.debug(f"Variables: {variables}")
        logger.debug(f"Endpoint: {self.endpoint}")
        try:
            resp = self.session.post(self.endpoint, json={"query": query, "variables": variables}, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except requests.RequestException as e:
            logger.error(f"Erreur HTTP GraphQL: {e}")
            return ([], 0)
        except ValueError:
            logger.error("Réponse GraphQL non JSON")
            return ([], 0)
        if 'errors' in data:
            logger.error(f"Erreurs GraphQL: {data['errors']}")
            return ([], 0)
        events_node = ((data.get('data') or {}).get('events')) or {}
        events_raw = events_node.get('elements') or []
        total = events_node.get('total') or 0
        return ([self._event_from_raw(ev) for ev in events_raw], total)
class MobilizonImporter:
    """Pushes Mobilizon events to OEDB, with a JSON file cache to avoid duplicates.

    The cache has three maps keyed by event uid:
      - "fetched": uid -> unix timestamp of first fetch
      - "sent":    uid -> unix timestamp of successful send (or 409 duplicate)
      - "events":  uid -> serialized event dict (allows re-runs without refetching)
    """

    def __init__(self, api_url: str, instance_url: str, dry_run: bool = False, geocode_missing: bool = False, cache_file: Optional[str] = None) -> None:
        self.api_url = api_url.rstrip('/')
        self.client = MobilizonClient(instance_url)
        self.dry_run = dry_run
        self.geocode_missing = geocode_missing
        self.cache_file = cache_file
        self.cache = {"fetched": {}, "sent": {}, "events": {}}
        if self.cache_file:
            self._load_cache()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'OEDB-Mobilizon-Importer/1.0 (+https://github.com/cquest/oedb)'
        })

    def _load_cache(self) -> None:
        """Load the cache file if present; a corrupt/missing cache is non-fatal."""
        try:
            if self.cache_file and os.path.exists(self.cache_file):
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if isinstance(data, dict):
                    self.cache["fetched"] = data.get("fetched", {})
                    self.cache["sent"] = data.get("sent", {})
                    self.cache["events"] = data.get("events", {})
                logger.info(f"Cache chargé: fetched={len(self.cache['fetched'])}, sent={len(self.cache['sent'])}, events={len(self.cache['events'])}")
        except Exception as e:
            logger.warning(f"Chargement du cache échoué: {e}")

    def _save_cache(self) -> None:
        """Atomically persist the cache (write to a temp file, then rename)."""
        if not self.cache_file:
            return
        try:
            tmp = self.cache_file + ".tmp"
            with open(tmp, 'w', encoding='utf-8') as f:
                json.dump(self.cache, f, ensure_ascii=False, indent=2)
            os.replace(tmp, self.cache_file)
        except Exception as e:
            logger.warning(f"Écriture du cache échouée: {e}")

    def geocode_address(self, address: str) -> Optional[Tuple[float, float]]:
        """Geocode a free-text address via Nominatim; returns (lat, lon) or None."""
        if not address or address.strip() == '':
            return None
        try:
            geocode_url = "https://nominatim.openstreetmap.org/search"
            params = {
                'q': address,
                'format': 'json',
                'limit': 1,
                'addressdetails': 0,
            }
            # Dedicated session so Nominatim sees its own UA / policy headers.
            s = requests.Session()
            s.headers.update({'User-Agent': 'OEDB-Mobilizon-Importer/1.0 (+https://github.com/cquest/oedb)'})
            r = s.get(geocode_url, params=params, timeout=15)
            r.raise_for_status()
            results = r.json()
            if isinstance(results, list) and results:
                return (float(results[0]['lat']), float(results[0]['lon']))
        except Exception as e:
            logger.warning(f"Géocodage échoué pour '{address}': {e}")
        return None

    @staticmethod
    def _iso_or_none(dt_str: Optional[str]) -> Optional[str]:
        """Normalize an ISO 8601 string to an aware ISO string (UTC if naive), else None."""
        if not dt_str:
            return None
        try:
            # Mobilizon usually returns valid ISO 8601; 'Z' is not accepted by
            # fromisoformat on older Pythons, hence the replace.
            dt = datetime.fromisoformat(dt_str.replace('Z', '+00:00'))
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.isoformat()
        except Exception:
            return None

    @staticmethod
    def _parse_dt(dt_str: Optional[str]) -> Optional[datetime]:
        """Parse an ISO 8601 string into an aware datetime (UTC if naive), else None."""
        if not dt_str:
            return None
        try:
            dt = datetime.fromisoformat(dt_str.replace('Z', '+00:00'))
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt
        except Exception:
            return None

    @staticmethod
    def _base_properties(ev: MobilizonEvent) -> Dict:
        """OEDB properties common to real features and no-geometry pseudo-features."""
        return {
            "label": ev.title or "Événement Mobilizon",
            "type": "scheduled",
            "what": "culture.meetup",
            "start": MobilizonImporter._iso_or_none(ev.begins_on),
            "stop": MobilizonImporter._iso_or_none(ev.ends_on),
            "where": ev.address_text or "",
            "description": ev.description or "",
            "source:name": "Mobilizon",
            "source:url": ev.url or "",
            "source:uid": ev.uuid or "",
            "url": ev.url or "",
        }

    @staticmethod
    def _oedb_feature(ev: MobilizonEvent) -> Optional[Dict]:
        """Build a GeoJSON Feature for OEDB, or None when coordinates are missing.

        Events without coordinates are skipped here (no aggressive geocoding);
        the caller decides whether to geocode. The feature is no longer logged
        here — send_to_oedb already dumps it, so logging twice was redundant.
        """
        if ev.latitude is None or ev.longitude is None:
            return None
        properties = MobilizonImporter._base_properties(ev)
        if ev.tags:
            properties["tags"] = ev.tags
        if ev.organizer_name:
            properties["organizer:name"] = ev.organizer_name
        if ev.organizer_url:
            properties["organizer:url"] = ev.organizer_url
        if ev.category:
            properties["category"] = ev.category
        if ev.website:
            properties["website"] = ev.website
        return {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [ev.longitude, ev.latitude],
            },
            "properties": properties,
        }

    def _mark_sent(self, feature: Dict) -> None:
        """Record a sent (or already-existing) event uid in the cache; best-effort."""
        try:
            uid = feature.get('properties', {}).get('source:uid')
            if uid:
                self.cache['sent'][uid] = int(time.time())
                self._save_cache()
        except Exception:
            pass

    def send_to_oedb(self, feature: Dict) -> bool:
        """POST the feature to the OEDB /event endpoint.

        Returns True on 201 (created), 409 (duplicate, treated as success) or
        in dry-run mode; False on any other status or transport error.
        """
        # Always log the JSON that is (or would be) sent.
        if self.dry_run:
            logger.info("DRY RUN - Événement qui serait envoyé:")
        else:
            logger.info("Envoi de l'événement vers OEDB:")
        logger.info(json.dumps(feature, indent=2, ensure_ascii=False))
        if self.dry_run:
            return True
        try:
            r = self.session.post(f"{self.api_url}/event", json=feature, timeout=30)
        except requests.RequestException as e:
            logger.error(f"Erreur d'appel OEDB: {e}")
            return False
        if r.status_code == 201:
            logger.info("Événement créé avec succès")
            self._mark_sent(feature)
            return True
        if r.status_code == 409:
            logger.info("Événement déjà existant (409)")
            self._mark_sent(feature)
            return True
        logger.error(f"Erreur API OEDB {r.status_code}: {r.text}")
        return False

    @staticmethod
    def _longer_than_week(ev: MobilizonEvent) -> bool:
        """True when both bounds parse and the event spans more than 7 days."""
        start_dt = MobilizonImporter._parse_dt(ev.begins_on)
        end_dt = MobilizonImporter._parse_dt(ev.ends_on)
        if start_dt and end_dt:
            return (end_dt - start_dt).total_seconds() > 7 * 24 * 3600
        return False

    @staticmethod
    def _event_to_cache(ev: MobilizonEvent) -> Dict:
        """Serialize an event for the JSON cache (everything except the uid key)."""
        return {
            'url': ev.url,
            'title': ev.title,
            'description': ev.description,
            'begins_on': ev.begins_on,
            'ends_on': ev.ends_on,
            'status': ev.status,
            'latitude': ev.latitude,
            'longitude': ev.longitude,
            'address_text': ev.address_text,
            'tags': ev.tags,
            'organizer_name': ev.organizer_name,
            'organizer_url': ev.organizer_url,
            'category': ev.category,
            'website': ev.website,
        }

    @staticmethod
    def _event_from_cache(uid: str, ev_data: Dict) -> MobilizonEvent:
        """Rebuild a MobilizonEvent from a cache entry."""
        return MobilizonEvent(
            uuid=uid,
            url=ev_data.get('url'),
            title=ev_data.get('title'),
            description=ev_data.get('description'),
            begins_on=ev_data.get('begins_on'),
            ends_on=ev_data.get('ends_on'),
            status=ev_data.get('status'),
            latitude=ev_data.get('latitude'),
            longitude=ev_data.get('longitude'),
            address_text=ev_data.get('address_text'),
            tags=ev_data.get('tags'),
            organizer_name=ev_data.get('organizer_name'),
            organizer_url=ev_data.get('organizer_url'),
            category=ev_data.get('category'),
            website=ev_data.get('website'),
        )

    def _register_new_events(self, events: List[MobilizonEvent]) -> List[MobilizonEvent]:
        """Record fetched events in the cache and drop those already sent."""
        filtered: List[MobilizonEvent] = []
        for ev in events:
            uid = ev.uuid or ev.url
            if uid:
                if uid in self.cache['sent']:
                    logger.info("Ignoré (déjà envoyé) uid=%s", uid)
                    continue
                if uid not in self.cache['fetched']:
                    self.cache['fetched'][uid] = int(time.time())
                # Keep a serialized copy so dry-run / re-runs work without refetching.
                self.cache['events'][uid] = self._event_to_cache(ev)
            filtered.append(ev)
        self._save_cache()
        return filtered

    def _drain_cache(self, limit: int, inserted: int) -> int:
        """Send cached-but-unsent events (used when the API returns no more events).

        Returns the updated inserted count; stops as soon as `limit` is reached.
        """
        for uid, ev_data in list(self.cache['events'].items()):
            if inserted >= limit:
                break
            if uid in self.cache['sent']:
                continue
            ev = self._event_from_cache(uid, ev_data)
            # Skip events spanning more than a week.
            if self._longer_than_week(ev):
                continue
            feature = self._oedb_feature(ev)
            if feature is None and self.geocode_missing and ev.address_text:
                coords = self.geocode_address(ev.address_text)
                if coords:
                    ev.latitude, ev.longitude = coords
                    # Persist the resolved coordinates back into the cache.
                    ev_data['latitude'], ev_data['longitude'] = coords
                    self.cache['events'][uid] = ev_data
                    self._save_cache()
                    feature = self._oedb_feature(ev)
            if feature is None:
                continue
            if self.send_to_oedb(feature):
                inserted += 1
        return inserted

    def _feature_or_geocode(self, ev: MobilizonEvent) -> Optional[Dict]:
        """Build the OEDB feature, optionally geocoding when coordinates are missing."""
        feature = self._oedb_feature(ev)
        if feature is not None:
            return feature
        # No geometry: log what would have been sent, for visibility.
        pseudo_feature = {"type": "Feature", "geometry": None, "properties": self._base_properties(ev)}
        logger.info("Ignoré (pas de géométrie) - Événement qui aurait été envoyé:")
        logger.info(ev)
        logger.info(json.dumps(pseudo_feature, indent=2, ensure_ascii=False))
        if self.geocode_missing and ev.address_text:
            logger.info("Tentative de géocodage pour compléter les coordonnées...")
            coords = self.geocode_address(ev.address_text)
            if coords:
                ev.latitude, ev.longitude = coords
                return self._oedb_feature(ev)
        return None

    def import_events(self, limit: int, page_size: int, start_page: int = 1, sleep_s: float = 0.5) -> None:
        """Paginate over the instance and push up to `limit` events to OEDB.

        When the API stops returning events, falls back to cached-but-unsent
        events before giving up.
        """
        inserted = 0
        fetched = 0  # raw number of events returned by the API (diagnostic counter)
        page = start_page
        pages_fetched = 0
        # Invariant bound, hoisted out of the loop: never fetch more pages than
        # --limit / --page-size requires (e.g. limit=1, page-size=10 => 1 page).
        max_pages = max(1, math.ceil(limit / page_size))
        while inserted < limit:
            if pages_fetched >= max_pages:
                logger.info("Limite de pages atteinte selon --limit et --page-size, arrêt de la pagination")
                break
            remaining_fetch = max(1, min(page_size, max(1, limit - inserted)))
            events, total = self.client.fetch_events_page(page=page, page_size=remaining_fetch)
            if not events:
                logger.info("Aucun événement supplémentaire retourné par l'API")
                if self.cache.get('events'):
                    logger.info("Utilisation du cache pour traiter les événements non envoyés")
                    inserted = self._drain_cache(limit, inserted)
                break
            filtered = self._register_new_events(events)
            fetched += len(events)
            pages_fetched += 1
            for ev in filtered:
                if inserted >= limit:
                    break
                # Skip events spanning more than a week.
                if self._longer_than_week(ev):
                    logger.info("Ignoré (durée > 7 jours)")
                    continue
                feature = self._feature_or_geocode(ev)
                if feature is None:
                    continue
                if self.send_to_oedb(feature):
                    inserted += 1
                time.sleep(sleep_s)
            page += 1
        logger.info(f"Terminé: {inserted} événement(s) traité(s) (limite demandée: {limit})")
def main() -> None:
    """CLI entry point: parse arguments and run the Mobilizon -> OEDB import."""
    parser = argparse.ArgumentParser(description='Import Mobilizon -> OEDB (via GraphQL)')
    parser.add_argument('--limit', type=int, default=20, help="Nombre maximal d'événements à insérer")
    parser.add_argument('--page-size', type=int, default=10, help='Taille des pages GraphQL')
    parser.add_argument('--start-page', type=int, default=1, help='Page de départ (1-indexée)')
    parser.add_argument('--instance-url', default='https://mobilizon.fr', help="URL de l'instance Mobilizon (ex: https://mobilizon.fr)")
    parser.add_argument('--api-url', default='https://api.openeventdatabase.org', help="URL de l'API OEDB")
    parser.add_argument('--dry-run', action='store_true', help='Mode test sans envoi vers OEDB')
    # Fix: --geocode-missing was store_true with default=True, making the flag a
    # no-op (geocoding could never be disabled). Keep the historical default but
    # add --no-geocode-missing so callers can opt out; existing invocations are
    # unaffected.
    parser.add_argument('--geocode-missing', dest='geocode_missing', action='store_true', default=True,
                        help="Tenter un géocodage si pas de géométrie fournie")
    parser.add_argument('--no-geocode-missing', dest='geocode_missing', action='store_false',
                        help="Désactiver le géocodage des événements sans géométrie")
    parser.add_argument('--cache-file', default='mobilizon_cache.json', help='Fichier JSON de cache pour éviter les doublons')
    parser.add_argument('--verbose', action='store_true', help='Mode verbeux')
    args = parser.parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    importer = MobilizonImporter(
        api_url=args.api_url,
        instance_url=args.instance_url,
        dry_run=args.dry_run,
        geocode_missing=args.geocode_missing,
        cache_file=args.cache_file,
    )
    importer.import_events(limit=args.limit, page_size=args.page_size, start_page=args.start_page)


if __name__ == '__main__':
    main()
# extractors/mobilizon.py