wiki illustrations et team osm fr
This commit is contained in:
parent
d7a54458dc
commit
77ad76cc7e
13 changed files with 78859 additions and 13414 deletions
|
@ -211,6 +211,145 @@ def fetch_wiki_page(key, language='en'):
|
|||
|
||||
# Get media details (src and alt text)
|
||||
media_details = []
|
||||
|
||||
# Extract description image specifically
|
||||
# Try multiple selectors to find the description image
|
||||
description_img = None
|
||||
|
||||
# Debug: Log the key we're processing
|
||||
logger.info(f"Looking for description image for key '{key}' in {language}")
|
||||
|
||||
# Function to filter out OSM logo and small icons
|
||||
def is_relevant_image(img):
|
||||
src = img.get('src', '')
|
||||
# Skip OSM logo
|
||||
if 'osm_logo' in src:
|
||||
return False
|
||||
# Skip small icons (usually less than 30px)
|
||||
width = img.get('width')
|
||||
if width and int(width) < 30:
|
||||
return False
|
||||
height = img.get('height')
|
||||
if height and int(height) < 30:
|
||||
return False
|
||||
return True
|
||||
|
||||
# Special case for highway key - directly target the image we want
|
||||
if key == 'highway':
|
||||
# Try to find the specific image in figure elements
|
||||
highway_img_elements = content.select('figure.mw-halign-center img')
|
||||
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images for highway")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
|
||||
|
||||
# If not found with highway-specific selector, try the td.d_image selector
|
||||
if not description_img:
|
||||
description_img_elements = content.select('td.d_image img')
|
||||
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
|
||||
|
||||
# If still not found, try the specific selector for .description img.mw-file-element
|
||||
if not description_img:
|
||||
description_img_elements = content.select('.description img.mw-file-element')
|
||||
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in .description")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
|
||||
|
||||
# If still not found, try images in figures within the description box
|
||||
if not description_img:
|
||||
description_img_elements = content.select('.description figure img')
|
||||
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
|
||||
|
||||
# If still not found, try any image in the description box
|
||||
if not description_img:
|
||||
description_img_elements = content.select('.description img')
|
||||
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
|
||||
|
||||
# If still not found, try images in the DescriptionBox table
|
||||
if not description_img:
|
||||
description_img_elements = content.select('table.DescriptionBox img')
|
||||
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
|
||||
|
||||
# If still not found, try images in figure elements anywhere in the content
|
||||
if not description_img:
|
||||
description_img_elements = content.select('figure img')
|
||||
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
|
||||
|
||||
# Filter for relevant images
|
||||
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
|
||||
|
||||
# If we still don't have an image, use any image that's not the OSM logo
|
||||
if not description_img:
|
||||
all_images = content.select('img')
|
||||
relevant_images = [img for img in all_images if is_relevant_image(img)]
|
||||
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
|
||||
|
||||
if relevant_images:
|
||||
description_img = relevant_images[0]
|
||||
logger.info(f" Using fallback image: {description_img.get('src', '')}")
|
||||
|
||||
# Process the found image
|
||||
description_img_url = None
|
||||
if description_img:
|
||||
src = description_img.get('src', '')
|
||||
if src:
|
||||
# Make relative URLs absolute
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = 'https://wiki.openstreetmap.org' + src
|
||||
|
||||
description_img_url = src
|
||||
|
||||
# Process all images
|
||||
for img in media_elements:
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
|
@ -251,7 +390,8 @@ def fetch_wiki_page(key, language='en'):
|
|||
'link_details': link_details,
|
||||
'media_count': media_count,
|
||||
'media_details': media_details,
|
||||
'categories': categories
|
||||
'categories': categories,
|
||||
'description_img_url': description_img_url
|
||||
}
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
|
@ -692,7 +832,7 @@ def main():
|
|||
try:
|
||||
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
|
||||
# Basic fields for CSV (detailed content will be in JSON only)
|
||||
fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score']
|
||||
fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score', 'description_img_url']
|
||||
writer = csv.DictWriter(f, fieldnames=fieldnames)
|
||||
|
||||
writer.writeheader()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue