wiki illustrations et team osm fr

This commit is contained in:
Tykayn 2025-08-31 23:15:03 +02:00 committed by tykayn
parent d7a54458dc
commit 77ad76cc7e
13 changed files with 78859 additions and 13414 deletions

View file

@ -211,6 +211,145 @@ def fetch_wiki_page(key, language='en'):
# Get media details (src and alt text)
media_details = []
# Extract description image specifically
# Try multiple selectors to find the description image
description_img = None
# Debug: Log the key we're processing
logger.info(f"Looking for description image for key '{key}' in {language}")
# Function to filter out OSM logo and small icons
def is_relevant_image(img):
    """Return True when *img* looks like a real illustration.

    An image is rejected when its ``src`` contains ``osm_logo`` or when
    its ``width`` or ``height`` attribute is a number below 30 (small
    icons). Missing, empty or non-numeric dimensions (for example
    ``"100%"`` or ``"auto"``) are treated as unknown and do not cause a
    rejection.

    Args:
        img: Any object exposing a dict-like ``get(name, default)`` for
            its HTML attributes.

    Returns:
        bool: False for the OSM logo and small icons, True otherwise.
    """
    src = img.get('src', '')
    # Skip OSM logo
    if 'osm_logo' in src:
        return False
    # Skip small icons (either dimension below 30px)
    for attribute in ('width', 'height'):
        raw_size = img.get(attribute)
        if not raw_size:
            continue
        try:
            size = int(raw_size)
        except (TypeError, ValueError):
            # Attribute values are free-form text; a value that is not a
            # plain integer tells us nothing about the size, so it must
            # not abort the search or reject the image.
            continue
        if size < 30:
            return False
    return True
# Special case for highway key - directly target the image we want
if key == 'highway':
# Try to find the specific image in figure elements
highway_img_elements = content.select('figure.mw-halign-center img')
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images for highway")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
# If not found with highway-specific selector, try the td.d_image selector
if not description_img:
description_img_elements = content.select('td.d_image img')
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
# If still not found, try the specific selector for .description img.mw-file-element
if not description_img:
description_img_elements = content.select('.description img.mw-file-element')
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
# If still not found, try images in figures within the description box
if not description_img:
description_img_elements = content.select('.description figure img')
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
# If still not found, try any image in the description box
if not description_img:
description_img_elements = content.select('.description img')
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
# If still not found, try images in the DescriptionBox table
if not description_img:
description_img_elements = content.select('table.DescriptionBox img')
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
# If still not found, try images in figure elements anywhere in the content
if not description_img:
description_img_elements = content.select('figure img')
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
# Filter for relevant images
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
# If we still don't have an image, fall back to the first image on the page that passes the relevance filter (not the OSM logo, not a small icon)
if not description_img:
all_images = content.select('img')
relevant_images = [img for img in all_images if is_relevant_image(img)]
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
if relevant_images:
description_img = relevant_images[0]
logger.info(f" Using fallback image: {description_img.get('src', '')}")
# Process the found image
description_img_url = None
if description_img:
src = description_img.get('src', '')
if src:
# Make relative URLs absolute
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src
description_img_url = src
# Process all images
for img in media_elements:
src = img.get('src', '')
if src:
@ -251,7 +390,8 @@ def fetch_wiki_page(key, language='en'):
'link_details': link_details,
'media_count': media_count,
'media_details': media_details,
'categories': categories
'categories': categories,
'description_img_url': description_img_url
}
except requests.exceptions.RequestException as e:
@ -692,7 +832,7 @@ def main():
try:
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
# Basic fields for CSV (detailed content will be in JSON only)
fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score']
fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score', 'description_img_url']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()