up liste changements récents
This commit is contained in:
parent
a59113400c
commit
58848a78ab
6 changed files with 1339 additions and 234 deletions
|
@ -6,7 +6,7 @@ fetch_recent_changes.py
|
|||
|
||||
This script fetches recent changes from the OpenStreetMap wiki for the French namespace
|
||||
and stores the URLs of these pages. It specifically targets the recent changes page:
|
||||
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2
|
||||
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=10000&days=365&enhanced=1&title=Special:RecentChanges&urlversion=2
|
||||
|
||||
Usage:
|
||||
python fetch_recent_changes.py [--dry-run] [--force]
|
||||
|
@ -170,6 +170,21 @@ def extract_recent_changes(html_content):
|
|||
user_link = item.find('a', class_='mw-userlink')
|
||||
user = user_link.get_text().strip() if user_link else "Unknown"
|
||||
|
||||
# Extract the user profile URL
|
||||
user_url = ""
|
||||
if user_link and user_link.get('href'):
|
||||
user_url = user_link.get('href')
|
||||
if not user_url.startswith('http'):
|
||||
user_url = WIKI_BASE_URL + user_url
|
||||
|
||||
# Extract the diff link
|
||||
diff_url = ""
|
||||
diff_link = item.find('a', class_='mw-changeslist-diff') or item.find('a', string='diff')
|
||||
if diff_link and diff_link.get('href'):
|
||||
diff_url = diff_link.get('href')
|
||||
if not diff_url.startswith('http'):
|
||||
diff_url = WIKI_BASE_URL + diff_url
|
||||
|
||||
# Extract the change comment from the <span class="comment"> element
|
||||
comment_span = item.find('span', class_='comment')
|
||||
comment = comment_span.get_text().strip() if comment_span else ""
|
||||
|
@ -186,13 +201,39 @@ def extract_recent_changes(html_content):
|
|||
if size_matches:
|
||||
change_size = size_matches[0]
|
||||
|
||||
# Extract text differences if diff_url is available
|
||||
added_text = ""
|
||||
removed_text = ""
|
||||
if diff_url:
|
||||
try:
|
||||
# Fetch the diff page
|
||||
diff_html = get_page_content(diff_url)
|
||||
if diff_html:
|
||||
diff_soup = BeautifulSoup(diff_html, 'html.parser')
|
||||
|
||||
# Find added text (ins elements)
|
||||
added_elements = diff_soup.find_all('ins', class_='diffchange')
|
||||
if added_elements:
|
||||
added_text = ' '.join([el.get_text().strip() for el in added_elements])
|
||||
|
||||
# Find removed text (del elements)
|
||||
removed_elements = diff_soup.find_all('del', class_='diffchange')
|
||||
if removed_elements:
|
||||
removed_text = ' '.join([el.get_text().strip() for el in removed_elements])
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching diff page {diff_url}: {e}")
|
||||
|
||||
recent_changes.append({
|
||||
"page_name": page_name,
|
||||
"page_url": page_url,
|
||||
"timestamp": timestamp,
|
||||
"user": user,
|
||||
"user_url": user_url,
|
||||
"comment": comment,
|
||||
"change_size": change_size
|
||||
"change_size": change_size,
|
||||
"diff_url": diff_url,
|
||||
"added_text": added_text,
|
||||
"removed_text": removed_text
|
||||
})
|
||||
|
||||
logger.debug(f"Extracted change: {page_name} by {user}")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue