wiki illustrations et team osm fr
This commit is contained in:
parent
d7a54458dc
commit
77ad76cc7e
13 changed files with 78859 additions and 13414 deletions
|
@ -27,6 +27,7 @@ import os
|
|||
import re
|
||||
import random
|
||||
import hashlib
|
||||
import csv
|
||||
from datetime import datetime, timedelta
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
@ -41,10 +42,33 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
# Constants
|
||||
OUTPUT_FILE = "pages_unavailable_in_french.json"
|
||||
WIKI_PAGES_CSV = "wiki_pages.csv"
|
||||
BASE_URL = "https://wiki.openstreetmap.org/wiki/Category:Pages_unavailable_in_French"
|
||||
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
|
||||
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour
|
||||
|
||||
def read_wiki_pages_csv():
    """
    Build a lookup of wiki page URL -> illustration URL from WIKI_PAGES_CSV.

    The CSV is read with ``csv.DictReader``; only rows that carry both a
    non-empty ``url`` and a non-empty ``description_img_url`` are kept.

    This is a best-effort helper: any failure to open, decode or parse the
    file is logged and an empty mapping is returned, so the caller can carry
    on without illustrations.

    Returns:
        dict: Dictionary mapping URLs to description_img_url values
              (empty if the file is missing, unreadable or malformed)
    """
    url_to_img_map = {}

    try:
        with open(WIKI_PAGES_CSV, 'r', newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                # .get() copes with a missing column as well as with short
                # rows, which DictReader pads with None.
                page_url = row.get('url')
                img_url = row.get('description_img_url')
                if page_url and img_url:
                    url_to_img_map[page_url] = img_url
    except (OSError, csv.Error, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError: without it a
        # CSV containing invalid UTF-8 would crash the whole run instead of
        # degrading to "no illustrations".
        logger.error(f"Error reading {WIKI_PAGES_CSV}: {e}")
        return {}

    logger.info(f"Read {len(url_to_img_map)} image URLs from {WIKI_PAGES_CSV}")
    return url_to_img_map
|
||||
|
||||
def is_cache_fresh():
|
||||
"""
|
||||
Check if the cache file exists and is less than CACHE_DURATION old
|
||||
|
@ -273,6 +297,9 @@ def main():
|
|||
logger.info(f"Use --force to update anyway")
|
||||
return
|
||||
|
||||
# Read image URLs from wiki_pages.csv
|
||||
url_to_img_map = read_wiki_pages_csv()
|
||||
|
||||
# Scrape pages
|
||||
pages = scrape_all_pages()
|
||||
|
||||
|
@ -280,6 +307,11 @@ def main():
|
|||
logger.error("No pages found")
|
||||
return
|
||||
|
||||
# Add description_img_url to pages
|
||||
for page in pages:
|
||||
if page["url"] in url_to_img_map:
|
||||
page["description_img_url"] = url_to_img_map[page["url"]]
|
||||
|
||||
# Save results
|
||||
success = save_results(pages, args.dry_run)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue