up wiki compare

2025-10-04 17:04:53 +02:00 · 2025-08-22 17:58:04 +02:00 · 2025-08-22 17:58:04 +02:00 · 2f49ef6479
commit 2f49ef6479
parent ce508974c9
23 changed files with 567403 additions and 5132 deletions
--- a/wiki_compare/wiki_compare.py
+++ b/wiki_compare/wiki_compare.py
@@ -43,7 +43,7 @@ TOP_KEYS_FILE = "top_keys.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 # Number of wiki pages to examine
-NUM_WIKI_PAGES = 20
+NUM_WIKI_PAGES = 50

 def fetch_top_keys(limit=NUM_WIKI_PAGES):
    """
@@ -144,10 +144,14 @@ def fetch_wiki_page(key, language='en'):
        # Extract section titles
        section_titles = []
        for section_elem in section_elements:
-            # Skip sections that are part of the table of contents or navigation
+            # Skip sections that are part of the table of contents, navigation, or DescriptionBox
            if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
                continue
                
+            # Skip sections that are inside a table with class DescriptionBox
+            if section_elem.find_parent('table', class_='DescriptionBox'):
+                continue
+                
            # Get the text of the section title, removing any edit links
            for edit_link in section_elem.select('.mw-editsection'):
                edit_link.extract()
@@ -167,6 +171,10 @@ def fetch_wiki_page(key, language='en'):
            for script in content.select('script, style'):
                script.extract()
            
+            # Remove .languages elements
+            for languages_elem in content.select('.languages'):
+                languages_elem.extract()
+            
            # Get text and count words
            text = content.get_text(separator=' ', strip=True)
            word_count = len(text.split())
@@ -214,12 +222,19 @@ def fetch_wiki_page(key, language='en'):
                        'src': src,
                        'alt': alt_text
                    })
+            
+            # Extract categories
+            categories = []
+            category_links = soup.select('#mw-normal-catlinks li a')
+            for cat_link in category_links:
+                categories.append(cat_link.get_text(strip=True))
        else:
            word_count = 0
            link_count = 0
            link_details = []
            media_count = 0
            media_details = []
+            categories = []
        
        return {
            'key': key,
@@ -232,7 +247,8 @@ def fetch_wiki_page(key, language='en'):
            'link_count': link_count,
            'link_details': link_details,
            'media_count': media_count,
-            'media_details': media_details
+            'media_details': media_details,
+            'categories': categories
        }
    
    except requests.exceptions.RequestException as e:
@@ -300,7 +316,8 @@ def analyze_wiki_pages(pages):
                    'priority': missing_staleness_score,  # Use staleness score as priority
                    'section_comparison': None,  # No comparison possible
                    'link_comparison': None,     # No comparison possible
-                    'media_comparison': None     # No comparison possible
+                    'media_comparison': None,    # No comparison possible
+                    'category_comparison': None  # No comparison possible
                })
            continue
        
@@ -430,6 +447,32 @@ def analyze_wiki_pages(pages):
            if not media['alt'] or media['alt'].lower() not in fr_media:
                media_comparison['fr_only'].append(media)
        
+        # Compare categories between English and French pages
+        category_comparison = {
+            'en_only': [],
+            'fr_only': [],
+            'common': []
+        }
+        
+        # Extract categories for comparison (case insensitive)
+        en_categories = [cat.lower() for cat in en_page.get('categories', [])]
+        fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]
+        
+        # Find categories only in English
+        for cat in en_page.get('categories', []):
+            if cat.lower() not in fr_categories:
+                category_comparison['en_only'].append(cat)
+        
+        # Find categories only in French
+        for cat in fr_page.get('categories', []):
+            if cat.lower() not in en_categories:
+                category_comparison['fr_only'].append(cat)
+        
+        # Find common categories
+        for cat in en_page.get('categories', []):
+            if cat.lower() in fr_categories:
+                category_comparison['common'].append(cat)
+        
        if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
            reason = []
            if date_diff > 30:
@@ -459,7 +502,8 @@ def analyze_wiki_pages(pages):
                'priority': staleness_score,  # Use staleness score as priority
                'section_comparison': section_comparison,
                'link_comparison': link_comparison,
-                'media_comparison': media_comparison
+                'media_comparison': media_comparison,
+                'category_comparison': category_comparison
            })
    
    # Sort by priority (descending)