up compare

This commit is contained in:
Tykayn 2025-08-22 23:30:36 +02:00 committed by tykayn
parent e533c273b2
commit 2665adc897
7 changed files with 753 additions and 558 deletions

View file

@@ -24,6 +24,7 @@ import json
import argparse
import logging
import os
import re
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@@ -96,38 +97,93 @@ def extract_recent_changes(html_content):
soup = BeautifulSoup(html_content, 'html.parser')
recent_changes = []
# Find the changes list
# Try different selectors for the changes list
# First try the old selector
changes_list = soup.find('ul', class_='special')
# If not found, try the new selector
if not changes_list:
changes_list = soup.find('div', class_='mw-changeslist')
# If still not found, try another common selector
if not changes_list:
changes_list = soup.find('ul', class_='mw-changeslist')
# If still not found, look for any list inside the content area
if not changes_list:
content_div = soup.find('div', id='mw-content-text')
if content_div:
changes_list = content_div.find('ul')
if not changes_list:
logger.warning("Could not find recent changes list")
return []
# Process each list item (each change)
for li in changes_list.find_all('li'):
# Extract the page link
page_link = li.find('a', class_='mw-changeslist-title')
# Try both li elements and div elements with appropriate classes
change_items = changes_list.find_all('li')
if not change_items:
change_items = changes_list.find_all('div', class_='mw-changeslist-line')
for item in change_items:
# Extract the page link - try different selectors
page_link = item.find('a', class_='mw-changeslist-title')
if not page_link:
page_link = item.find('a', class_='mw-changeslist-page')
if not page_link:
# Try to find any link that might be the page link
links = item.find_all('a')
for link in links:
if '/wiki/' in link.get('href', ''):
page_link = link
break
if not page_link:
continue
page_name = page_link.get_text().strip()
page_url = WIKI_BASE_URL + page_link.get('href')
# Extract the timestamp
timestamp_span = li.find('span', class_='mw-changeslist-date')
# Extract the timestamp - try different selectors
timestamp_span = item.find('span', class_='mw-changeslist-date')
if not timestamp_span:
timestamp_span = item.find('span', class_='mw-changeslist-time')
timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"
# Extract the user
user_link = li.find('a', class_='mw-userlink')
# Extract the user - try different selectors
user_link = item.find('a', class_='mw-userlink')
if not user_link:
user_link = item.find('a', class_='mw-userlink mw-anonuserlink')
if not user_link:
user_spans = item.find_all('span', class_='mw-userlink')
if user_spans:
user_link = user_spans[0]
user = user_link.get_text().strip() if user_link else "Unknown"
# Extract the comment
comment_span = li.find('span', class_='comment')
# Extract the comment - try different selectors
comment_span = item.find('span', class_='comment')
if not comment_span:
comment_span = item.find('span', class_='changeslist-comment')
comment = comment_span.get_text().strip() if comment_span else ""
# Extract the change size
change_size_span = li.find('span', class_='mw-changeslist-separator').next_sibling
change_size = change_size_span.get_text().strip() if change_size_span else "0"
# Extract the change size - try different approaches
change_size = "0"
# Try to find spans with specific classes
size_spans = item.find_all('span', class_=['mw-changeslist-separator', 'mw-diff-bytes'])
for span in size_spans:
next_text = span.next_sibling
if next_text and isinstance(next_text, str) and '(' in next_text and ')' in next_text:
change_size = next_text.strip()
break
# If not found, try another approach
if change_size == "0":
# Look for parentheses with numbers
import re
text = item.get_text()
size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
if size_matches:
change_size = size_matches[0]
recent_changes.append({
"page_name": page_name,