add wiki compare

parent 692e609a46
commit 38fbc451f5

9 changed files with 81151 additions and 126 deletions
wiki_compare.py

@@ -4,7 +4,7 @@
"""
wiki_compare.py

- This script fetches the 10 most used OpenStreetMap keys from TagInfo,
+ This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.
@@ -12,10 +12,10 @@ Usage:
python wiki_compare.py

Output:
- - top_keys.json: JSON file containing the 10 most used OSM keys
+ - top_keys.json: JSON file containing the most used OSM keys
- wiki_pages.csv: CSV file with information about each wiki page
- outdated_pages.json: JSON file containing pages that need updating
- - A console output listing the 10 wiki pages that need updating
+ - A console output listing the wiki pages that need updating
"""

import json
@@ -42,8 +42,10 @@ WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
+ # Number of wiki pages to examine
+ NUM_WIKI_PAGES = 20

- def fetch_top_keys(limit=50):
+ def fetch_top_keys(limit=NUM_WIKI_PAGES):
"""
Fetch the most used OSM keys from TagInfo API
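Note: the body of fetch_top_keys() is outside this diff. A rough sketch of the TagInfo call presumably behind it is given below; the endpoint, parameter names and response fields are assumptions about the public TagInfo API, not code from this commit.

import requests

TAGINFO_KEYS_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"  # assumed endpoint

def fetch_top_keys_sketch(limit=20):
    """Return the `limit` most used OSM keys, most used first (illustrative only)."""
    params = {
        "page": 1,
        "rp": limit,              # results per page (assumed parameter name)
        "sortname": "count_all",  # sort by total number of objects using the key
        "sortorder": "desc",
    }
    response = requests.get(TAGINFO_KEYS_URL, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    return [{"key": item["key"], "count": item["count_all"]} for item in data.get("data", [])]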
@@ -135,8 +137,28 @@ def fetch_wiki_page(key, language='en'):
except ValueError:
logger.warning(f"Could not parse date: {date_str}")

- # Count sections (h2, h3, h4)
- sections = len(soup.select('h2, h3, h4'))
+ # Extract sections (h2, h3, h4)
+ section_elements = soup.select('h2, h3, h4')
+ sections = len(section_elements)
+
+ # Extract section titles
+ section_titles = []
+ for section_elem in section_elements:
+ # Skip sections that are part of the table of contents or navigation
+ if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
+ continue
+
+ # Get the text of the section title, removing any edit links
+ for edit_link in section_elem.select('.mw-editsection'):
+ edit_link.extract()
+
+ section_title = section_elem.get_text(strip=True)
+ section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
+
+ section_titles.append({
+ 'title': section_title,
+ 'level': section_level
+ })

# Count words in the content
content = soup.select_one('#mw-content-text')
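For reference, the new section-title extraction can be exercised on a tiny, made-up HTML snippet; this is only an illustration using the BeautifulSoup calls the script already relies on.

from bs4 import BeautifulSoup

html = '<h2>Values<span class="mw-editsection">[edit]</span></h2><h3>Notes</h3>'
soup = BeautifulSoup(html, 'html.parser')
titles = []
for elem in soup.select('h2, h3, h4'):
    for edit_link in elem.select('.mw-editsection'):
        edit_link.extract()                       # drop the "[edit]" widget text
    titles.append({'title': elem.get_text(strip=True),
                   'level': int(elem.name[1])})   # h2 -> 2, h3 -> 3
print(titles)  # [{'title': 'Values', 'level': 2}, {'title': 'Notes', 'level': 3}]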
@@ -149,12 +171,55 @@ def fetch_wiki_page(key, language='en'):
text = content.get_text(separator=' ', strip=True)
word_count = len(text.split())

- # Count links
+ # Extract links
links = content.select('a')
link_count = len(links)

+ # Get link details (text and href)
+ link_details = []
+ for link in links:
+ href = link.get('href', '')
+ # Skip edit section links and other non-content links
+ if 'action=edit' in href or 'redlink=1' in href or not href:
+ continue
+
+ # Make relative URLs absolute
+ if href.startswith('/'):
+ href = 'https://wiki.openstreetmap.org' + href
+
+ link_text = link.get_text(strip=True)
+ if link_text: # Only include links with text
+ link_details.append({
+ 'text': link_text,
+ 'href': href
+ })
+
+ # Extract media (images)
+ media_elements = content.select('img')
+ media_count = len(media_elements)
+
+ # Get media details (src and alt text)
+ media_details = []
+ for img in media_elements:
+ src = img.get('src', '')
+ if src:
+ # Make relative URLs absolute
+ if src.startswith('//'):
+ src = 'https:' + src
+ elif src.startswith('/'):
+ src = 'https://wiki.openstreetmap.org' + src
+
+ alt_text = img.get('alt', '')
+ media_details.append({
+ 'src': src,
+ 'alt': alt_text
+ })
else:
word_count = 0
link_count = 0
+ link_details = []
+ media_count = 0
+ media_details = []

return {
'key': key,
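A quick check of the URL normalisation rules applied to links and images above, using made-up values:

src = '//upload.wikimedia.org/osm/example.png'   # hypothetical protocol-relative src
href = '/wiki/FR:Key:building'                   # hypothetical wiki-relative href

if src.startswith('//'):
    src = 'https:' + src
elif src.startswith('/'):
    src = 'https://wiki.openstreetmap.org' + src

if href.startswith('/'):
    href = 'https://wiki.openstreetmap.org' + href

print(src)   # https://upload.wikimedia.org/osm/example.png
print(href)  # https://wiki.openstreetmap.org/wiki/FR:Key:building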
@@ -162,8 +227,12 @@ def fetch_wiki_page(key, language='en'):
'url': url,
'last_modified': last_modified,
'sections': sections,
+ 'section_titles': section_titles,
'word_count': word_count,
- 'link_count': link_count
+ 'link_count': link_count,
+ 'link_details': link_details,
+ 'media_count': media_count,
+ 'media_details': media_details
}

except requests.exceptions.RequestException as e:
@@ -202,6 +271,21 @@ def analyze_wiki_pages(pages):
if 'en' not in lang_pages or 'fr' not in lang_pages:
if 'en' in lang_pages:
# French page is missing
+ # For missing French pages, calculate a high staleness score
+ # Use word count as the main factor (50% weight)
+ missing_staleness_score = (
+ 30 * 0.2 + # Assume 30 days outdated (20%)
+ lang_pages['en']['word_count'] / 100 * 0.5 + # Word count (50%)
+ lang_pages['en']['sections'] * 0.15 + # Sections (15%)
+ lang_pages['en']['link_count'] / 10 * 0.15 # Links (15%)
+ )
+
+ # Round to 2 decimal places and ensure it's high
+ missing_staleness_score = max(100, round(missing_staleness_score, 2))
+
+ # Get media count or default to 0
+ media_count = lang_pages['en'].get('media_count', 0)
+
needs_update.append({
'key': key,
'reason': 'French page missing',
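To make the weighting concrete, here is the missing-page score worked through with made-up numbers, assuming an English page of 2000 words, 12 sections and 150 links:

score = 30 * 0.2 + 2000 / 100 * 0.5 + 12 * 0.15 + 150 / 10 * 0.15
print(round(score, 2))             # 20.05 = 6 + 10 + 1.8 + 2.25
print(max(100, round(score, 2)))   # 100 -- the max() clamp keeps missing pages at the top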
@@ -211,7 +295,12 @@ def analyze_wiki_pages(pages):
'word_diff': lang_pages['en']['word_count'],
'section_diff': lang_pages['en']['sections'],
'link_diff': lang_pages['en']['link_count'],
- 'priority': 100 # High priority for missing pages
+ 'media_diff': media_count,
+ 'staleness_score': missing_staleness_score,
+ 'priority': missing_staleness_score, # Use staleness score as priority
+ 'section_comparison': None, # No comparison possible
+ 'link_comparison': None, # No comparison possible
+ 'media_comparison': None # No comparison possible
})
continue
@@ -231,28 +320,130 @@ def analyze_wiki_pages(pages):
word_diff = en_page['word_count'] - fr_page['word_count']
section_diff = en_page['sections'] - fr_page['sections']
link_diff = en_page['link_count'] - fr_page['link_count']
+ media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)

- # Calculate priority score (higher means needs more urgent update)
- # Weight factors can be adjusted
- priority = (
- abs(date_diff) * 0.4 + # Date difference
- abs(word_diff) / 100 * 0.25 + # Word count difference (normalized)
- abs(section_diff) * 0.2 + # Section difference
- abs(link_diff) / 10 * 0.15 # Link count difference (normalized)
+ # Calculate staleness score (higher means more outdated/stale)
+ # Weight factors adjusted to emphasize word count differences
+ staleness_score = (
+ abs(date_diff) * 0.2 + # Date difference (20%)
+ abs(word_diff) / 100 * 0.5 + # Word count difference (normalized) (50%)
+ abs(section_diff) * 0.15 + # Section difference (15%)
+ abs(link_diff) / 10 * 0.15 # Link count difference (normalized) (15%)
)

+ # Round to 2 decimal places for display
+ staleness_score = round(staleness_score, 2)
+
+ # Compare sections between English and French pages
+ section_comparison = {
+ 'en_only': [],
+ 'fr_only': [],
+ 'common': []
+ }
+
+ # Extract section titles for comparison
+ en_sections = {section['title'].lower(): section for section in en_page.get('section_titles', [])}
+ fr_sections = {section['title'].lower(): section for section in fr_page.get('section_titles', [])}
+
+ # Find sections only in English
+ for title, section in en_sections.items():
+ if title not in fr_sections:
+ section_comparison['en_only'].append(section)
+
+ # Find sections only in French
+ for title, section in fr_sections.items():
+ if title not in en_sections:
+ section_comparison['fr_only'].append(section)
+
+ # Find common sections
+ for title in en_sections.keys():
+ if title in fr_sections:
+ section_comparison['common'].append({
+ 'en': en_sections[title],
+ 'fr': fr_sections[title]
+ })
+
+ # Compare links between English and French pages
+ link_comparison = {
+ 'en_only': [],
+ 'fr_only': [],
+ 'common': []
+ }
+
+ # Extract link texts for comparison (case insensitive)
+ en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
+ fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}
+
+ # Find links only in English
+ for text, link in en_links.items():
+ if text not in fr_links:
+ link_comparison['en_only'].append(link)
+
+ # Find links only in French
+ for text, link in fr_links.items():
+ if text not in en_links:
+ link_comparison['fr_only'].append(link)
+
+ # Find common links
+ for text in en_links.keys():
+ if text in fr_links:
+ link_comparison['common'].append({
+ 'en': en_links[text],
+ 'fr': fr_links[text]
+ })
+
+ # Compare media between English and French pages
+ media_comparison = {
+ 'en_only': [],
+ 'fr_only': [],
+ 'common': []
+ }
+
+ # Extract media alt texts for comparison (case insensitive)
+ en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
+ fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}
+
+ # Find media only in English
+ for alt, media in en_media.items():
+ if alt not in fr_media:
+ media_comparison['en_only'].append(media)
+
+ # Find media only in French
+ for alt, media in fr_media.items():
+ if alt not in en_media:
+ media_comparison['fr_only'].append(media)
+
+ # Find common media
+ for alt in en_media.keys():
+ if alt in fr_media:
+ media_comparison['common'].append({
+ 'en': en_media[alt],
+ 'fr': fr_media[alt]
+ })
+
+ # Add media without alt text to their respective language-only lists
+ for media in en_page.get('media_details', []):
+ if not media['alt'] or media['alt'].lower() not in en_media:
+ media_comparison['en_only'].append(media)
+
+ for media in fr_page.get('media_details', []):
+ if not media['alt'] or media['alt'].lower() not in fr_media:
+ media_comparison['fr_only'].append(media)
+
if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
reason = []
if date_diff > 30:
- reason.append(f"French page outdated by {date_diff} days")
+ reason.append(f"La version Française est datée de {date_diff} jours")
if word_diff > 200:
- reason.append(f"English page has {word_diff} more words")
+ reason.append(f"La version Anglaise a {word_diff} plus de mots")
if section_diff > 2:
- reason.append(f"English page has {section_diff} more sections")
+ reason.append(f"La version Anglaise a {section_diff} plus de sections")
if link_diff > 20:
- reason.append(f"English page has {link_diff} more links")
+ reason.append(f"La version Anglaise a {link_diff} plus de liens")
+ if media_diff > 5:
+ reason.append(f"La version Anglaise a {media_diff} plus d'images")
if fr_page['word_count'] < en_page['word_count'] * 0.7:
- reason.append(f"French page is only {fr_page['word_count'] / en_page['word_count']:.0%} of English content")
+ reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} % du contenu en Anglais.")

needs_update.append({
'key': key,
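For a sense of the revised weighting, the same staleness formula evaluated with made-up numbers:

# Made-up example: French page 60 days older, 800 words shorter,
# 3 sections and 40 links behind the English page.
staleness_score = (
    abs(60) * 0.2 +         # 12.0  date difference (20%)
    abs(800) / 100 * 0.5 +  #  4.0  word difference (50%)
    abs(3) * 0.15 +         #  0.45 section difference (15%)
    abs(40) / 10 * 0.15     #  0.6  link difference (15%)
)
print(round(staleness_score, 2))  # 17.05
# The old 0.4/0.25/0.2/0.15 weights would give 24 + 2 + 0.6 + 0.6 = 27.2 for the
# same inputs, dominated by the date gap.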
@@ -263,7 +454,12 @@ def analyze_wiki_pages(pages):
'word_diff': word_diff,
'section_diff': section_diff,
'link_diff': link_diff,
- 'priority': priority
+ 'media_diff': media_diff,
+ 'staleness_score': staleness_score,
+ 'priority': staleness_score, # Use staleness score as priority
+ 'section_comparison': section_comparison,
+ 'link_comparison': link_comparison,
+ 'media_comparison': media_comparison
})

# Sort by priority (descending)
@@ -279,7 +475,7 @@ def main():
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)

# Fetch top keys
- top_keys = fetch_top_keys(10)
+ top_keys = fetch_top_keys(NUM_WIKI_PAGES)

if not top_keys:
logger.error("Failed to fetch top keys. Exiting.")
@@ -304,16 +500,96 @@ def main():
if fr_page:
wiki_pages.append(fr_page)

- # Save wiki pages to CSV
+ # Process wiki pages to add staleness score
+ processed_wiki_pages = []
+ pages_by_key = {}
+
+ # Group pages by key
+ for page in wiki_pages:
+ if page is None:
+ continue
+
+ key = page['key']
+ if key not in pages_by_key:
+ pages_by_key[key] = {}
+
+ pages_by_key[key][page['language']] = page
+
+ # Calculate staleness score for each pair of pages
+ for key, lang_pages in pages_by_key.items():
+ # Add English page with staleness score
+ if 'en' in lang_pages:
+ en_page = lang_pages['en'].copy()
+
+ # If French page exists, calculate staleness score
+ if 'fr' in lang_pages:
+ fr_page = lang_pages['fr']
+
+ # Skip if dates are missing
+ if en_page['last_modified'] and fr_page['last_modified']:
+ # Calculate date difference in days
+ en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
+ fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
+ date_diff = (en_date - fr_date).days
+
+ # Calculate content differences
+ word_diff = en_page['word_count'] - fr_page['word_count']
+ section_diff = en_page['sections'] - fr_page['sections']
+ link_diff = en_page['link_count'] - fr_page['link_count']
+
+ # Calculate staleness score
+ staleness_score = (
+ abs(date_diff) * 0.2 +
+ abs(word_diff) / 100 * 0.5 +
+ abs(section_diff) * 0.15 +
+ abs(link_diff) / 10 * 0.15
+ )
+
+ # Round to 2 decimal places
+ staleness_score = round(staleness_score, 2)
+
+ en_page['staleness_score'] = staleness_score
+ fr_page['staleness_score'] = staleness_score
+ else:
+ en_page['staleness_score'] = 0
+ fr_page['staleness_score'] = 0
+
+ processed_wiki_pages.append(en_page)
+ processed_wiki_pages.append(fr_page)
+ else:
+ # French page is missing, calculate a high staleness score
+ missing_staleness_score = (
+ 30 * 0.2 +
+ en_page['word_count'] / 100 * 0.5 +
+ en_page['sections'] * 0.15 +
+ en_page['link_count'] / 10 * 0.15
+ )
+
+ # Round to 2 decimal places and ensure it's high
+ missing_staleness_score = max(100, round(missing_staleness_score, 2))
+
+ en_page['staleness_score'] = missing_staleness_score
+ processed_wiki_pages.append(en_page)
+
+ # Add French page without English counterpart (rare case)
+ elif 'fr' in lang_pages:
+ fr_page = lang_pages['fr'].copy()
+ fr_page['staleness_score'] = 0
+ processed_wiki_pages.append(fr_page)
+
+ # Save processed wiki pages to CSV
try:
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
- fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count']
+ # Basic fields for CSV (detailed content will be in JSON only)
+ fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score']
writer = csv.DictWriter(f, fieldnames=fieldnames)

writer.writeheader()
- for page in wiki_pages:
+ for page in processed_wiki_pages:
if page: # Skip None values
- writer.writerow(page)
+ # Create a copy with only the CSV fields
+ csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
+ writer.writerow(csv_page)

logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")
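As a small illustration of the csv_page filtering (all values below are made up): nested details such as section_titles or link_details stay out of the CSV, which keeps only the flat fields listed in fieldnames.

fieldnames = ['key', 'language', 'url', 'last_modified', 'sections',
              'word_count', 'link_count', 'media_count', 'staleness_score']
page = {
    'key': 'building', 'language': 'fr',
    'url': 'https://wiki.openstreetmap.org/wiki/FR:Key:building',
    'last_modified': '2024-01-15', 'sections': 8, 'word_count': 1200,
    'link_count': 45, 'media_count': 3, 'staleness_score': 17.05,
    'section_titles': [{'title': 'Valeurs', 'level': 2}],  # JSON-only detail, dropped below
}
csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
print(sorted(csv_page))  # only the nine flat CSV columns remain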
@@ -327,10 +603,10 @@ def main():
# Save pages that need updating to JSON
save_to_json(pages_to_update, OUTDATED_PAGES_FILE)

- # Print the top 10 pages needing updates
- print("\n===== TOP 10 WIKI PAGES NEEDING UPDATES =====")
+ # Print the top pages needing updates
+ print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

- for i, page in enumerate(pages_to_update[:10], 1):
+ for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
key = page['key']
reason = page['reason']
en_url = page['en_page']['url'] if page['en_page'] else "N/A"