suivi et exclusion de pages fr des réunions

This commit is contained in:
Tykayn 2025-09-01 12:38:43 +02:00 committed by tykayn
parent 471eab4cd0
commit 466f9c773b
5 changed files with 231 additions and 4 deletions

View file

@@ -135,6 +135,11 @@ def extract_pages_from_category(html_content, current_url):
title = link.get_text()
url = WIKI_BASE_URL + link.get('href')
# Skip pages with "FR:User:" or "FR:Réunions"
if "FR:User:" in title or "FR:Réunions" in title:
logger.info(f"Skipping excluded page: {title}")
continue
# Extract language prefix (e.g., "En:", "De:", etc.)
language_prefix = "Other"
match = re.match(r'^([A-Za-z]{2}):', title)