up compare

2025-08-22 23:30:36 +02:00 · 2025-08-22 23:30:36 +02:00 · 2665adc897
commit 2665adc897
parent e533c273b2
7 changed files with 753 additions and 558 deletions
--- a/wiki_compare/fetch_recent_changes.py
+++ b/wiki_compare/fetch_recent_changes.py
@@ -24,6 +24,7 @@ import json
 import argparse
 import logging
 import os
+import re
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -96,38 +97,93 @@ def extract_recent_changes(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []
    
-    # Find the changes list
+    # Try different selectors for the changes list
+    # First try the old selector
    changes_list = soup.find('ul', class_='special')
    
+    # If not found, try the new selector
+    if not changes_list:
+        changes_list = soup.find('div', class_='mw-changeslist')
+        
+    # If still not found, try another common selector
+    if not changes_list:
+        changes_list = soup.find('ul', class_='mw-changeslist')
+    
+    # If still not found, look for any list inside the content area
+    if not changes_list:
+        content_div = soup.find('div', id='mw-content-text')
+        if content_div:
+            changes_list = content_div.find('ul')
+    
    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []
    
    # Process each list item (each change)
-    for li in changes_list.find_all('li'):
-        # Extract the page link
-        page_link = li.find('a', class_='mw-changeslist-title')
+    # Try both li elements and div elements with appropriate classes
+    change_items = changes_list.find_all('li')
+    if not change_items:
+        change_items = changes_list.find_all('div', class_='mw-changeslist-line')
+    
+    for item in change_items:
+        # Extract the page link - try different selectors
+        page_link = item.find('a', class_='mw-changeslist-title')
+        if not page_link:
+            page_link = item.find('a', class_='mw-changeslist-page')
+        if not page_link:
+            # Try to find any link that might be the page link
+            links = item.find_all('a')
+            for link in links:
+                if '/wiki/' in link.get('href', ''):
+                    page_link = link
+                    break
+        
        if not page_link:
            continue
        
        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')
        
-        # Extract the timestamp
-        timestamp_span = li.find('span', class_='mw-changeslist-date')
+        # Extract the timestamp - try different selectors
+        timestamp_span = item.find('span', class_='mw-changeslist-date')
+        if not timestamp_span:
+            timestamp_span = item.find('span', class_='mw-changeslist-time')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"
        
-        # Extract the user
-        user_link = li.find('a', class_='mw-userlink')
+        # Extract the user - try different selectors
+        user_link = item.find('a', class_='mw-userlink')
+        if not user_link:
+            user_link = item.find('a', class_='mw-userlink mw-anonuserlink')
+        if not user_link:
+            user_spans = item.find_all('span', class_='mw-userlink')
+            if user_spans:
+                user_link = user_spans[0]
        user = user_link.get_text().strip() if user_link else "Unknown"
        
-        # Extract the comment
-        comment_span = li.find('span', class_='comment')
+        # Extract the comment - try different selectors
+        comment_span = item.find('span', class_='comment')
+        if not comment_span:
+            comment_span = item.find('span', class_='changeslist-comment')
        comment = comment_span.get_text().strip() if comment_span else ""
        
-        # Extract the change size
-        change_size_span = li.find('span', class_='mw-changeslist-separator').next_sibling
-        change_size = change_size_span.get_text().strip() if change_size_span else "0"
+        # Extract the change size - try different approaches
+        change_size = "0"
+        # Try to find spans with specific classes
+        size_spans = item.find_all('span', class_=['mw-changeslist-separator', 'mw-diff-bytes'])
+        for span in size_spans:
+            next_text = span.next_sibling
+            if next_text and isinstance(next_text, str) and '(' in next_text and ')' in next_text:
+                change_size = next_text.strip()
+                break
+        
+        # If not found, try another approach
+        if change_size == "0":
+            # Look for parentheses with numbers
+            import re
+            text = item.get_text()
+            size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
+            if size_matches:
+                change_size = size_matches[0]
        
        recent_changes.append({
            "page_name": page_name,