mirror of
https://forge.chapril.org/tykayn/osm-commerces
synced 2025-10-04 17:04:53 +02:00
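Update wiki comparison: add category comparison, skip DescriptionBox sections and .languages elements, examine 50 pages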
This commit is contained in:
parent
ce508974c9
commit
2f49ef6479
23 changed files with 567403 additions and 5132 deletions
|
@ -43,7 +43,7 @@ TOP_KEYS_FILE = "top_keys.json"
|
|||
WIKI_PAGES_CSV = "wiki_pages.csv"
|
||||
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
||||
# Number of wiki pages to examine
|
||||
NUM_WIKI_PAGES = 20
|
||||
NUM_WIKI_PAGES = 50
|
||||
|
||||
def fetch_top_keys(limit=NUM_WIKI_PAGES):
|
||||
"""
|
||||
|
@ -144,10 +144,14 @@ def fetch_wiki_page(key, language='en'):
|
|||
# Extract section titles
|
||||
section_titles = []
|
||||
for section_elem in section_elements:
|
||||
# Skip sections that are part of the table of contents or navigation
|
||||
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
|
||||
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
|
||||
continue
|
||||
|
||||
# Skip sections that are inside a table with class DescriptionBox
|
||||
if section_elem.find_parent('table', class_='DescriptionBox'):
|
||||
continue
|
||||
|
||||
# Get the text of the section title, removing any edit links
|
||||
for edit_link in section_elem.select('.mw-editsection'):
|
||||
edit_link.extract()
|
||||
|
@ -167,6 +171,10 @@ def fetch_wiki_page(key, language='en'):
|
|||
for script in content.select('script, style'):
|
||||
script.extract()
|
||||
|
||||
# Remove .languages elements
|
||||
for languages_elem in content.select('.languages'):
|
||||
languages_elem.extract()
|
||||
|
||||
# Get text and count words
|
||||
text = content.get_text(separator=' ', strip=True)
|
||||
word_count = len(text.split())
|
||||
|
@ -214,12 +222,19 @@ def fetch_wiki_page(key, language='en'):
|
|||
'src': src,
|
||||
'alt': alt_text
|
||||
})
|
||||
|
||||
# Extract categories
|
||||
categories = []
|
||||
category_links = soup.select('#mw-normal-catlinks li a')
|
||||
for cat_link in category_links:
|
||||
categories.append(cat_link.get_text(strip=True))
|
||||
else:
|
||||
word_count = 0
|
||||
link_count = 0
|
||||
link_details = []
|
||||
media_count = 0
|
||||
media_details = []
|
||||
categories = []
|
||||
|
||||
return {
|
||||
'key': key,
|
||||
|
@ -232,7 +247,8 @@ def fetch_wiki_page(key, language='en'):
|
|||
'link_count': link_count,
|
||||
'link_details': link_details,
|
||||
'media_count': media_count,
|
||||
'media_details': media_details
|
||||
'media_details': media_details,
|
||||
'categories': categories
|
||||
}
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
|
@ -300,7 +316,8 @@ def analyze_wiki_pages(pages):
|
|||
'priority': missing_staleness_score, # Use staleness score as priority
|
||||
'section_comparison': None, # No comparison possible
|
||||
'link_comparison': None, # No comparison possible
|
||||
'media_comparison': None # No comparison possible
|
||||
'media_comparison': None, # No comparison possible
|
||||
'category_comparison': None # No comparison possible
|
||||
})
|
||||
continue
|
||||
|
||||
|
@ -430,6 +447,32 @@ def analyze_wiki_pages(pages):
|
|||
if not media['alt'] or media['alt'].lower() not in fr_media:
|
||||
media_comparison['fr_only'].append(media)
|
||||
|
||||
# Compare categories between English and French pages
|
||||
category_comparison = {
|
||||
'en_only': [],
|
||||
'fr_only': [],
|
||||
'common': []
|
||||
}
|
||||
|
||||
# Extract categories for comparison (case insensitive)
|
||||
en_categories = [cat.lower() for cat in en_page.get('categories', [])]
|
||||
fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]
|
||||
|
||||
# Find categories only in English
|
||||
for cat in en_page.get('categories', []):
|
||||
if cat.lower() not in fr_categories:
|
||||
category_comparison['en_only'].append(cat)
|
||||
|
||||
# Find categories only in French
|
||||
for cat in fr_page.get('categories', []):
|
||||
if cat.lower() not in en_categories:
|
||||
category_comparison['fr_only'].append(cat)
|
||||
|
||||
# Find common categories
|
||||
for cat in en_page.get('categories', []):
|
||||
if cat.lower() in fr_categories:
|
||||
category_comparison['common'].append(cat)
|
||||
|
||||
if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
|
||||
reason = []
|
||||
if date_diff > 30:
|
||||
|
@ -459,7 +502,8 @@ def analyze_wiki_pages(pages):
|
|||
'priority': staleness_score, # Use staleness score as priority
|
||||
'section_comparison': section_comparison,
|
||||
'link_comparison': link_comparison,
|
||||
'media_comparison': media_comparison
|
||||
'media_comparison': media_comparison,
|
||||
'category_comparison': category_comparison
|
||||
})
|
||||
|
||||
# Sort by priority (descending)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue