up liste changements récents

This commit is contained in:
Tykayn 2025-08-31 23:12:38 +02:00 committed by tykayn
parent a59113400c
commit 58848a78ab
6 changed files with 1339 additions and 234 deletions

View file

@@ -6,7 +6,7 @@ fetch_recent_changes.py
This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
-https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2
+https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=10000&days=365&enhanced=1&title=Special:RecentChanges&urlversion=2
Usage:
python fetch_recent_changes.py [--dry-run] [--force]
@@ -170,6 +170,21 @@ def extract_recent_changes(html_content):
user_link = item.find('a', class_='mw-userlink')
user = user_link.get_text().strip() if user_link else "Unknown"
# Extract the user profile URL
user_url = ""
if user_link and user_link.get('href'):
user_url = user_link.get('href')
if not user_url.startswith('http'):
user_url = WIKI_BASE_URL + user_url
# Extract the diff link
diff_url = ""
diff_link = item.find('a', class_='mw-changeslist-diff') or item.find('a', string='diff')
if diff_link and diff_link.get('href'):
diff_url = diff_link.get('href')
if not diff_url.startswith('http'):
diff_url = WIKI_BASE_URL + diff_url
# Extract the comment from the comment class
comment_span = item.find('span', class_='comment')
comment = comment_span.get_text().strip() if comment_span else ""
@@ -186,13 +201,39 @@ def extract_recent_changes(html_content):
if size_matches:
change_size = size_matches[0]
# Extract text differences if diff_url is available
added_text = ""
removed_text = ""
if diff_url:
try:
# Fetch the diff page
diff_html = get_page_content(diff_url)
if diff_html:
diff_soup = BeautifulSoup(diff_html, 'html.parser')
# Find added text (ins elements)
added_elements = diff_soup.find_all('ins', class_='diffchange')
if added_elements:
added_text = ' '.join([el.get_text().strip() for el in added_elements])
# Find removed text (del elements)
removed_elements = diff_soup.find_all('del', class_='diffchange')
if removed_elements:
removed_text = ' '.join([el.get_text().strip() for el in removed_elements])
except Exception as e:
logger.error(f"Error fetching diff page {diff_url}: {e}")
recent_changes.append({
"page_name": page_name,
"page_url": page_url,
"timestamp": timestamp,
"user": user,
"user_url": user_url,
"comment": comment,
"change_size": change_size
"change_size": change_size,
"diff_url": diff_url,
"added_text": added_text,
"removed_text": removed_text
})
logger.debug(f"Extracted change: {page_name} by {user}")