add set of pages to watch

This commit is contained in:
Tykayn 2025-09-01 00:14:00 +02:00 committed by tykayn
parent 77ad76cc7e
commit 7a7704bc01
22 changed files with 216839 additions and 6049 deletions

View file

@ -41,12 +41,21 @@ logger = logging.getLogger(__name__)
# Taginfo API endpoint (the "keys/all" listing).
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
# URL prefixes for "Key:" wiki pages; fetch_wiki_page appends the OSM key
# to the English or French prefix depending on the requested language.
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
# Generic wiki prefix, used to build URLs for specific (non-key) pages.
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
# File names used by the script; presumably read/written by code that is
# not visible in this excerpt -- TODO confirm.
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine; also the default `limit` of fetch_top_keys.
# NOTE(review): two consecutive assignments appear here (this is a diff view
# showing the old and the new value); the later one, 100, is the value in effect.
NUM_WIKI_PAGES = 5
NUM_WIKI_PAGES = 100
# Specific wiki pages to compare in addition to the top keys.
# Entries come in the three forms that fetch_wiki_page distinguishes when
# called with is_specific_page=True: a full URL (starts with "http"),
# a title carrying the "FR:" prefix, or a plain page title.
# NOTE(review): some titles are percent-encoded ("%3D"); they are inserted
# into the URL as-is.
SPECIFIC_PAGES = [
"Anatomie_des_étiquettes_osm",
"https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois",
"FR:Tag:leisure%3Dchildren_club",
"FR:Tag:harassment_prevention%3Dask_angela"
]
def fetch_top_keys(limit=NUM_WIKI_PAGES):
"""
@ -97,28 +106,56 @@ def save_to_json(data, filename):
except IOError as e:
logger.error(f"Error saving data to {filename}: {e}")
def fetch_wiki_page(key, language='en'):
def fetch_wiki_page(key, language='en', is_specific_page=False):
"""
Fetch wiki page for a given key
Fetch wiki page for a given key or specific page
Args:
key (str): OSM key
key (str): OSM key or specific page title/URL
language (str): Language code ('en' or 'fr')
is_specific_page (bool): Whether this is a specific page rather than a key
Returns:
dict: Dictionary with page information or None if page doesn't exist
"""
base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
url = f"{base_url}{key}"
# Handle different URL formats
if is_specific_page:
# Case 1: Full URL
if key.startswith('http'):
url = key
# Extract the page title from the URL
page_title = key.split('/')[-1]
# Determine language from URL
if 'FR:' in key or '/FR:' in key:
language = 'fr'
else:
language = 'en'
# Case 2: Page with FR: prefix
elif key.startswith('FR:'):
url = f"{WIKI_BASE_URL}{key}"
page_title = key[3:] # Remove FR: prefix for title
language = 'fr'
# Case 3: Regular page title
else:
if language == 'fr':
url = f"{WIKI_BASE_URL}FR:{key}"
else:
url = f"{WIKI_BASE_URL}{key}"
page_title = key
else:
# Regular key page
base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
url = f"{base_url}{key}"
page_title = key
logger.info(f"Fetching {language} wiki page for key '{key}': {url}")
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
try:
response = requests.get(url)
# Check if page exists
if response.status_code == 404:
logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
return None
response.raise_for_status()
@ -380,6 +417,7 @@ def fetch_wiki_page(key, language='en'):
return {
'key': key,
'page_title': page_title,
'language': language,
'url': url,
'last_modified': last_modified,
@ -391,11 +429,12 @@ def fetch_wiki_page(key, language='en'):
'media_count': media_count,
'media_details': media_details,
'categories': categories,
'description_img_url': description_img_url
'description_img_url': description_img_url,
'is_specific_page': is_specific_page
}
except requests.exceptions.RequestException as e:
logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
return None
def generate_staleness_histogram(wiki_pages):
@ -562,27 +601,52 @@ def analyze_wiki_pages(pages):
'common': []
}
# Extract section titles for comparison
en_sections = {section['title'].lower(): section for section in en_page.get('section_titles', [])}
fr_sections = {section['title'].lower(): section for section in fr_page.get('section_titles', [])}
# Group sections by their level for hierarchical comparison
en_sections_by_level = {}
fr_sections_by_level = {}
# Find sections only in English
for title, section in en_sections.items():
if title not in fr_sections:
section_comparison['en_only'].append(section)
# Organize English sections by level
for section in en_page.get('section_titles', []):
level = section['level']
if level not in en_sections_by_level:
en_sections_by_level[level] = []
en_sections_by_level[level].append(section)
# Organize French sections by level
for section in fr_page.get('section_titles', []):
level = section['level']
if level not in fr_sections_by_level:
fr_sections_by_level[level] = []
fr_sections_by_level[level].append(section)
# Find sections only in French
for title, section in fr_sections.items():
if title not in en_sections:
section_comparison['fr_only'].append(section)
# Process each level to find matching sections
all_levels = set(list(en_sections_by_level.keys()) + list(fr_sections_by_level.keys()))
# Find common sections
for title in en_sections.keys():
if title in fr_sections:
section_comparison['common'].append({
'en': en_sections[title],
'fr': fr_sections[title]
})
for level in all_levels:
en_level_sections = en_sections_by_level.get(level, [])
fr_level_sections = fr_sections_by_level.get(level, [])
# Create dictionaries for easier lookup, using lowercase titles
en_dict = {section['title'].lower(): section for section in en_level_sections}
fr_dict = {section['title'].lower(): section for section in fr_level_sections}
# Find sections at this level only in English
for title, section in en_dict.items():
if title not in fr_dict:
section_comparison['en_only'].append(section)
# Find sections at this level only in French
for title, section in fr_dict.items():
if title not in en_dict:
section_comparison['fr_only'].append(section)
# Find common sections at this level
for title in en_dict.keys():
if title in fr_dict:
section_comparison['common'].append({
'en': en_dict[title],
'fr': fr_dict[title]
})
# Compare links between English and French pages
link_comparison = {
@ -735,6 +799,8 @@ def main():
# Fetch wiki pages for each key
wiki_pages = []
# Process top keys
logger.info("Processing top keys...")
for key_info in top_keys:
key = key_info['key']
@ -748,6 +814,61 @@ def main():
if fr_page:
wiki_pages.append(fr_page)
# Process specific pages
logger.info("Processing specific pages...")
for page in SPECIFIC_PAGES:
# For specific pages, we need to handle different formats
# Case 1: Full URL
if page.startswith('http'):
# For full URLs, we directly fetch the page
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
if page_info:
wiki_pages.append(page_info)
# If it's a French page, try to find the English equivalent
if page_info['language'] == 'fr':
# Try to get the English version by removing FR: prefix
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
en_url = f"{WIKI_BASE_URL}{en_title}"
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# If it's an English page, try to find the French equivalent
else:
# Try to get the French version by adding FR: prefix
fr_title = f"FR:{page_info['page_title']}"
fr_url = f"{WIKI_BASE_URL}{fr_title}"
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
# Case 2: Page with FR: prefix
elif page.startswith('FR:'):
# Fetch the French page
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
# Try to get the English version by removing FR: prefix
en_title = page[3:] # Remove FR: prefix
en_url = f"{WIKI_BASE_URL}{en_title}"
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# Case 3: Regular page title
else:
# Fetch the English page
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# Fetch the French page
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
# Process wiki pages to add staleness score
processed_wiki_pages = []
pages_by_key = {}