wiki illustrations et team osm fr

This commit is contained in:
Tykayn 2025-08-31 23:15:03 +02:00 committed by tykayn
parent d7a54458dc
commit 77ad76cc7e
13 changed files with 78859 additions and 13414 deletions

View file

@ -27,6 +27,7 @@ import os
import re
import random
import hashlib
import csv
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@ -41,10 +42,33 @@ logger = logging.getLogger(__name__)
# Constants
OUTPUT_FILE = "pages_unavailable_in_french.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
def read_wiki_pages_csv():
    """
    Load the URL → description image URL mapping from the wiki pages CSV.

    Each row of WIKI_PAGES_CSV is expected to carry a 'url' column and a
    'description_img_url' column; rows missing either field, or with an
    empty image value, are skipped.

    Returns:
        dict: Dictionary mapping URLs to description_img_url values.
              Empty on read/parse failure (the error is logged, not raised).
    """
    mapping = {}
    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as csv_file:
            for record in csv.DictReader(csv_file):
                # .get() covers both a missing column and an empty value
                image_url = record.get('description_img_url')
                if image_url and 'url' in record:
                    mapping[record['url']] = image_url
        logger.info(f"Read {len(mapping)} image URLs from {WIKI_PAGES_CSV}")
        return mapping
    except (IOError, csv.Error) as e:
        # Best-effort: a missing or malformed CSV must not abort the scrape
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}
def is_cache_fresh():
"""
Check if the cache file exists and is less than CACHE_DURATION old
@ -273,6 +297,9 @@ def main():
logger.info(f"Use --force to update anyway")
return
# Read image URLs from wiki_pages.csv
url_to_img_map = read_wiki_pages_csv()
# Scrape pages
pages = scrape_all_pages()
@ -280,6 +307,11 @@ def main():
logger.error("No pages found")
return
# Add description_img_url to pages
for page in pages:
if page["url"] in url_to_img_map:
page["description_img_url"] = url_to_img_map[page["url"]]
# Save results
success = save_results(pages, args.dry_run)