up wiki compare

This commit is contained in:
Tykayn 2025-08-22 17:58:04 +02:00 committed by tykayn
parent ce508974c9
commit 2f49ef6479
23 changed files with 567403 additions and 5132 deletions

View file

@@ -43,7 +43,7 @@ TOP_KEYS_FILE = "top_keys.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 20
NUM_WIKI_PAGES = 50
def fetch_top_keys(limit=NUM_WIKI_PAGES):
"""
@@ -144,10 +144,14 @@ def fetch_wiki_page(key, language='en'):
# Extract section titles
section_titles = []
for section_elem in section_elements:
# Skip sections that are part of the table of contents or navigation
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
continue
# Skip sections that are inside a table with class DescriptionBox
if section_elem.find_parent('table', class_='DescriptionBox'):
continue
# Get the text of the section title, removing any edit links
for edit_link in section_elem.select('.mw-editsection'):
edit_link.extract()
@@ -167,6 +171,10 @@ def fetch_wiki_page(key, language='en'):
for script in content.select('script, style'):
script.extract()
# Remove .languages elements
for languages_elem in content.select('.languages'):
languages_elem.extract()
# Get text and count words
text = content.get_text(separator=' ', strip=True)
word_count = len(text.split())
@@ -214,12 +222,19 @@ def fetch_wiki_page(key, language='en'):
'src': src,
'alt': alt_text
})
# Extract categories
categories = []
category_links = soup.select('#mw-normal-catlinks li a')
for cat_link in category_links:
categories.append(cat_link.get_text(strip=True))
else:
word_count = 0
link_count = 0
link_details = []
media_count = 0
media_details = []
categories = []
return {
'key': key,
@@ -232,7 +247,8 @@ def fetch_wiki_page(key, language='en'):
'link_count': link_count,
'link_details': link_details,
'media_count': media_count,
'media_details': media_details
'media_details': media_details,
'categories': categories
}
except requests.exceptions.RequestException as e:
@@ -300,7 +316,8 @@ def analyze_wiki_pages(pages):
'priority': missing_staleness_score, # Use staleness score as priority
'section_comparison': None, # No comparison possible
'link_comparison': None, # No comparison possible
'media_comparison': None # No comparison possible
'media_comparison': None, # No comparison possible
'category_comparison': None # No comparison possible
})
continue
@@ -430,6 +447,32 @@ def analyze_wiki_pages(pages):
if not media['alt'] or media['alt'].lower() not in fr_media:
media_comparison['fr_only'].append(media)
# Compare categories between English and French pages
category_comparison = {
'en_only': [],
'fr_only': [],
'common': []
}
# Extract categories for comparison (case insensitive)
en_categories = [cat.lower() for cat in en_page.get('categories', [])]
fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]
# Find categories only in English
for cat in en_page.get('categories', []):
if cat.lower() not in fr_categories:
category_comparison['en_only'].append(cat)
# Find categories only in French
for cat in fr_page.get('categories', []):
if cat.lower() not in en_categories:
category_comparison['fr_only'].append(cat)
# Find common categories
for cat in en_page.get('categories', []):
if cat.lower() in fr_categories:
category_comparison['common'].append(cat)
if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
reason = []
if date_diff > 30:
@@ -459,7 +502,8 @@ def analyze_wiki_pages(pages):
'priority': staleness_score, # Use staleness score as priority
'section_comparison': section_comparison,
'link_comparison': link_comparison,
'media_comparison': media_comparison
'media_comparison': media_comparison,
'category_comparison': category_comparison
})
# Sort by priority (descending)