add wiki compare

Tykayn 2025-08-21 16:50:17 +02:00 committed by tykayn
parent 692e609a46
commit 38fbc451f5
9 changed files with 81151 additions and 126 deletions


@@ -4,7 +4,7 @@
"""
wiki_compare.py
This script fetches the 10 most used OpenStreetMap keys from TagInfo,
This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.
@@ -12,10 +12,10 @@ Usage:
python wiki_compare.py
Output:
- top_keys.json: JSON file containing the 10 most used OSM keys
- top_keys.json: JSON file containing the most used OSM keys
- wiki_pages.csv: CSV file with information about each wiki page
- outdated_pages.json: JSON file containing pages that need updating
- A console output listing the 10 wiki pages that need updating
- A console output listing the wiki pages that need updating
"""
import json
@@ -42,8 +42,10 @@ WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 20
def fetch_top_keys(limit=50):
def fetch_top_keys(limit=NUM_WIKI_PAGES):
"""
Fetch the most used OSM keys from TagInfo API
@@ -135,8 +137,28 @@ def fetch_wiki_page(key, language='en'):
except ValueError:
logger.warning(f"Could not parse date: {date_str}")
# Count sections (h2, h3, h4)
sections = len(soup.select('h2, h3, h4'))
# Extract sections (h2, h3, h4)
section_elements = soup.select('h2, h3, h4')
sections = len(section_elements)
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents or navigation
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
continue
# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
section_title = section_elem.get_text(strip=True)
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
section_titles.append({
'title': section_title,
'level': section_level
})
# Count words in the content
content = soup.select_one('#mw-content-text')
@@ -149,12 +171,55 @@ def fetch_wiki_page(key, language='en'):
text = content.get_text(separator=' ', strip=True)
word_count = len(text.split())
# Count links
# Extract links
links = content.select('a')
link_count = len(links)
# Get link details (text and href)
link_details = []
for link in links:
href = link.get('href', '')
# Skip edit section links and other non-content links
if 'action=edit' in href or 'redlink=1' in href or not href:
continue
# Make relative URLs absolute
if href.startswith('/'):
href = 'https://wiki.openstreetmap.org' + href
link_text = link.get_text(strip=True)
if link_text: # Only include links with text
link_details.append({
'text': link_text,
'href': href
})
# Extract media (images)
media_elements = content.select('img')
media_count = len(media_elements)
# Get media details (src and alt text)
media_details = []
for img in media_elements:
src = img.get('src', '')
if src:
# Make relative URLs absolute
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = 'https://wiki.openstreetmap.org' + src
alt_text = img.get('alt', '')
media_details.append({
'src': src,
'alt': alt_text
})
else:
word_count = 0
link_count = 0
link_details = []
media_count = 0
media_details = []
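# A hypothetical return value (abbreviated, sample data only) could look like:
# {'key': 'highway', 'language': 'en',
#  'url': 'https://wiki.openstreetmap.org/wiki/Key:highway',
#  'last_modified': '2025-06-01', 'sections': 12,
#  'section_titles': [{'title': 'Values', 'level': 2}, ...],
#  'word_count': 3400, 'link_count': 250,
#  'link_details': [{'text': 'Map features',
#                    'href': 'https://wiki.openstreetmap.org/wiki/Map_features'}, ...],
#  'media_count': 18,
#  'media_details': [{'src': 'https://wiki.openstreetmap.org/...', 'alt': 'Example'}, ...]}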
return {
'key': key,
@@ -162,8 +227,12 @@ def fetch_wiki_page(key, language='en'):
'url': url,
'last_modified': last_modified,
'sections': sections,
'section_titles': section_titles,
'word_count': word_count,
'link_count': link_count
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,
'media_details': media_details
}
except requests.exceptions.RequestException as e:
@@ -202,6 +271,21 @@ def analyze_wiki_pages(pages):
if 'en' not in lang_pages or 'fr' not in lang_pages:
if 'en' in lang_pages:
# French page is missing
# For missing French pages, calculate a high staleness score
# Use word count as the main factor (50% weight)
missing_staleness_score = (
30 * 0.2 + # Assume 30 days outdated (20%)
lang_pages['en']['word_count'] / 100 * 0.5 + # Word count (50%)
lang_pages['en']['sections'] * 0.15 + # Sections (15%)
lang_pages['en']['link_count'] / 10 * 0.15 # Links (15%)
)
# Round to 2 decimal places and clamp to a minimum of 100 so missing pages rank first
missing_staleness_score = max(100, round(missing_staleness_score, 2))
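# Worked example with hypothetical numbers: an English page with 1200 words,
# 8 sections and 90 links gives
#   30*0.2 + 1200/100*0.5 + 8*0.15 + 90/10*0.15 = 6 + 6 + 1.2 + 1.35 = 14.55
# which the max() call above lifts to 100.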
# Get media count or default to 0
media_count = lang_pages['en'].get('media_count', 0)
needs_update.append({
'key': key,
'reason': 'French page missing',
@@ -211,7 +295,12 @@ def analyze_wiki_pages(pages):
'word_diff': lang_pages['en']['word_count'],
'section_diff': lang_pages['en']['sections'],
'link_diff': lang_pages['en']['link_count'],
'priority': 100 # High priority for missing pages
'media_diff': media_count,
'staleness_score': missing_staleness_score,
'priority': missing_staleness_score, # Use staleness score as priority
'section_comparison': None, # No comparison possible
'link_comparison': None, # No comparison possible
'media_comparison': None # No comparison possible
})
continue
@@ -231,28 +320,130 @@ def analyze_wiki_pages(pages):
word_diff = en_page['word_count'] - fr_page['word_count']
section_diff = en_page['sections'] - fr_page['sections']
link_diff = en_page['link_count'] - fr_page['link_count']
media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)
# Calculate priority score (higher means needs more urgent update)
# Weight factors can be adjusted
priority = (
abs(date_diff) * 0.4 + # Date difference
abs(word_diff) / 100 * 0.25 + # Word count difference (normalized)
abs(section_diff) * 0.2 + # Section difference
abs(link_diff) / 10 * 0.15 # Link count difference (normalized)
# Calculate staleness score (higher means more outdated/stale)
# Weight factors adjusted to emphasize word count differences
staleness_score = (
abs(date_diff) * 0.2 + # Date difference (20%)
abs(word_diff) / 100 * 0.5 + # Word count difference (normalized) (50%)
abs(section_diff) * 0.15 + # Section difference (15%)
abs(link_diff) / 10 * 0.15 # Link count difference (normalized) (15%)
)
# Round to 2 decimal places for display
staleness_score = round(staleness_score, 2)
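# Worked example with hypothetical numbers: date_diff=60, word_diff=500,
# section_diff=3 and link_diff=40 give
#   60*0.2 + 500/100*0.5 + 3*0.15 + 40/10*0.15 = 12 + 2.5 + 0.45 + 0.6 = 15.55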
# Compare sections between English and French pages
section_comparison = {
'en_only': [],
'fr_only': [],
'common': []
}
# Extract section titles for comparison
en_sections = {section['title'].lower(): section for section in en_page.get('section_titles', [])}
fr_sections = {section['title'].lower(): section for section in fr_page.get('section_titles', [])}
# Find sections only in English
for title, section in en_sections.items():
if title not in fr_sections:
section_comparison['en_only'].append(section)
# Find sections only in French
for title, section in fr_sections.items():
if title not in en_sections:
section_comparison['fr_only'].append(section)
# Find common sections
for title in en_sections.keys():
if title in fr_sections:
section_comparison['common'].append({
'en': en_sections[title],
'fr': fr_sections[title]
})
# Compare links between English and French pages
link_comparison = {
'en_only': [],
'fr_only': [],
'common': []
}
# Extract link texts for comparison (case insensitive)
en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}
# Find links only in English
for text, link in en_links.items():
if text not in fr_links:
link_comparison['en_only'].append(link)
# Find links only in French
for text, link in fr_links.items():
if text not in en_links:
link_comparison['fr_only'].append(link)
# Find common links
for text in en_links.keys():
if text in fr_links:
link_comparison['common'].append({
'en': en_links[text],
'fr': fr_links[text]
})
# Compare media between English and French pages
media_comparison = {
'en_only': [],
'fr_only': [],
'common': []
}
# Extract media alt texts for comparison (case insensitive)
en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}
# Find media only in English
for alt, media in en_media.items():
if alt not in fr_media:
media_comparison['en_only'].append(media)
# Find media only in French
for alt, media in fr_media.items():
if alt not in en_media:
media_comparison['fr_only'].append(media)
# Find common media
for alt in en_media.keys():
if alt in fr_media:
media_comparison['common'].append({
'en': en_media[alt],
'fr': fr_media[alt]
})
# Add media without alt text to their respective language-only lists
for media in en_page.get('media_details', []):
if not media['alt'] or media['alt'].lower() not in en_media:
media_comparison['en_only'].append(media)
for media in fr_page.get('media_details', []):
if not media['alt'] or media['alt'].lower() not in fr_media:
media_comparison['fr_only'].append(media)
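# All three comparison dicts share the same top-level shape; e.g. a
# hypothetical section_comparison could be:
# {'en_only': [{'title': 'Lanes', 'level': 3}],
#  'fr_only': [],
#  'common': [{'en': {'title': 'Values', 'level': 2},
#              'fr': {'title': 'Values', 'level': 2}}]}
# with link items holding 'text'/'href' and media items 'src'/'alt' instead.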
if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
reason = []
if date_diff > 30:
reason.append(f"French page outdated by {date_diff} days")
reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
reason.append(f"English page has {word_diff} more words")
reason.append(f"La version Anglaise a {word_diff} plus de mots")
if section_diff > 2:
reason.append(f"English page has {section_diff} more sections")
reason.append(f"La version Anglaise a {section_diff} plus de sections")
if link_diff > 20:
reason.append(f"English page has {link_diff} more links")
reason.append(f"La version Anglaise a {link_diff} plus de liens")
if media_diff > 5:
reason.append(f"La version Anglaise a {media_diff} plus d'images")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
reason.append(f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} of English content")
reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")
needs_update.append({
'key': key,
@@ -263,7 +454,12 @@ def analyze_wiki_pages(pages):
'word_diff': word_diff,
'section_diff': section_diff,
'link_diff': link_diff,
'priority': priority
'media_diff': media_diff,
'staleness_score': staleness_score,
'priority': staleness_score, # Use staleness score as priority
'section_comparison': section_comparison,
'link_comparison': link_comparison,
'media_comparison': media_comparison
})
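# A resulting needs_update entry might look like (trimmed, hypothetical values):
# {'key': 'highway', 'date_diff': 60, 'word_diff': 500, 'section_diff': 3,
#  'link_diff': 40, 'media_diff': 2, 'staleness_score': 15.55, 'priority': 15.55,
#  'section_comparison': {...}, 'link_comparison': {...}, 'media_comparison': {...}}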
# Sort by priority (descending)
@@ -279,7 +475,7 @@ def main():
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
# Fetch top keys
top_keys = fetch_top_keys(10)
top_keys = fetch_top_keys(NUM_WIKI_PAGES)
if not top_keys:
logger.error("Failed to fetch top keys. Exiting.")
@@ -304,16 +500,96 @@ def main():
if fr_page:
wiki_pages.append(fr_page)
# Save wiki pages to CSV
# Process wiki pages to add staleness score
processed_wiki_pages = []
pages_by_key = {}
# Group pages by key
for page in wiki_pages:
if page is None:
continue
key = page['key']
if key not in pages_by_key:
pages_by_key[key] = {}
pages_by_key[key][page['language']] = page
# Calculate staleness score for each pair of pages
for key, lang_pages in pages_by_key.items():
# Add English page with staleness score
if 'en' in lang_pages:
en_page = lang_pages['en'].copy()
# If French page exists, calculate staleness score
if 'fr' in lang_pages:
fr_page = lang_pages['fr']
# Skip if dates are missing
if en_page['last_modified'] and fr_page['last_modified']:
# Calculate date difference in days
en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
date_diff = (en_date - fr_date).days
# Calculate content differences
word_diff = en_page['word_count'] - fr_page['word_count']
section_diff = en_page['sections'] - fr_page['sections']
link_diff = en_page['link_count'] - fr_page['link_count']
# Calculate staleness score
staleness_score = (
abs(date_diff) * 0.2 +
abs(word_diff) / 100 * 0.5 +
abs(section_diff) * 0.15 +
abs(link_diff) / 10 * 0.15
)
# Round to 2 decimal places
staleness_score = round(staleness_score, 2)
en_page['staleness_score'] = staleness_score
fr_page['staleness_score'] = staleness_score
else:
en_page['staleness_score'] = 0
fr_page['staleness_score'] = 0
processed_wiki_pages.append(en_page)
processed_wiki_pages.append(fr_page)
else:
# French page is missing, calculate a high staleness score
missing_staleness_score = (
30 * 0.2 +
en_page['word_count'] / 100 * 0.5 +
en_page['sections'] * 0.15 +
en_page['link_count'] / 10 * 0.15
)
# Round to 2 decimal places and clamp to a minimum of 100 so missing pages rank first
missing_staleness_score = max(100, round(missing_staleness_score, 2))
en_page['staleness_score'] = missing_staleness_score
processed_wiki_pages.append(en_page)
# Add French page without English counterpart (rare case)
elif 'fr' in lang_pages:
fr_page = lang_pages['fr'].copy()
fr_page['staleness_score'] = 0
processed_wiki_pages.append(fr_page)
# Save processed wiki pages to CSV
try:
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
# Basic fields for CSV (detailed content will be in JSON only)
fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for page in wiki_pages:
for page in processed_wiki_pages:
if page: # Skip None values
writer.writerow(page)
# Create a copy with only the CSV fields
csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
writer.writerow(csv_page)
logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")
@@ -327,10 +603,10 @@ def main():
# Save pages that need updating to JSON
save_to_json(pages_to_update, OUTDATED_PAGES_FILE)
# Print the top 10 pages needing updates
print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")
# Print the top pages needing updates
print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")
for i, page in enumerate(pages_to_update[:10], 1):
for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
key = page['key']
reason = page['reason']
en_url = page['en_page']['url'] if page['en_page'] else "N/A"