#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_recent_changes.py

This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2

Usage:
    python fetch_recent_changes.py [--dry-run] [--force]

Options:
    --dry-run    Run the script without saving the results to a file
    --force      Force update even if the cache is still fresh (less than 1 hour old)

Output:
    - recent_changes.json: JSON file with information about recent changes in the French namespace
    - Log messages about the scraping process and results
"""

import argparse
import json
import logging
import os
import re
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Constants
OUTPUT_FILE = "recent_changes.json"
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
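# Illustrative shape of the JSON written to OUTPUT_FILE. The values below are
# placeholders, not real wiki data; the keys mirror what save_results() and
# extract_recent_changes() produce:
#
# {
#   "last_updated": "2024-01-01T12:00:00",
#   "recent_changes": [
#     {
#       "page_name": "FR:Example",
#       "page_url": "https://wiki.openstreetmap.org/wiki/FR:Example",
#       "timestamp": "12:00",
#       "user": "ExampleUser",
#       "comment": "example edit summary",
#       "change_size": "+42"
#     }
#   ]
# }
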
def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False

    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False

def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # A timeout prevents the script from hanging indefinitely on a stalled
        # connection; 30 seconds is an arbitrary but reasonable default.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None

def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Try different selectors for the changes list
    # First try the old selector
    changes_list = soup.find('ul', class_='special')

    # If not found, try the new selector
    if not changes_list:
        changes_list = soup.find('div', class_='mw-changeslist')

    # If still not found, try another common selector
    if not changes_list:
        changes_list = soup.find('ul', class_='mw-changeslist')

    # If still not found, look for any list inside the content area
    if not changes_list:
        content_div = soup.find('div', id='mw-content-text')
        if content_div:
            changes_list = content_div.find('ul')

    if not changes_list:
        logger.warning("Could not find recent changes list")
        return []

    # Process each list item (each change)
    # Try both li elements and div elements with appropriate classes
    change_items = changes_list.find_all('li')
    if not change_items:
        change_items = changes_list.find_all('div', class_='mw-changeslist-line')

    for item in change_items:
        # Extract the page link - try different selectors
        page_link = item.find('a', class_='mw-changeslist-title')
        if not page_link:
            page_link = item.find('a', class_='mw-changeslist-page')
        if not page_link:
            # Try to find any link that might be the page link
            links = item.find_all('a')
            for link in links:
                if '/wiki/' in link.get('href', ''):
                    page_link = link
                    break

        if not page_link:
            continue

        page_name = page_link.get_text().strip()
        page_url = WIKI_BASE_URL + page_link.get('href')

        # Extract the timestamp - try different selectors
        timestamp_span = item.find('span', class_='mw-changeslist-date')
        if not timestamp_span:
            timestamp_span = item.find('span', class_='mw-changeslist-time')
        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"

        # Extract the user - try different selectors
        user_link = item.find('a', class_='mw-userlink')
        if not user_link:
            user_link = item.find('a', class_='mw-userlink mw-anonuserlink')
        if not user_link:
            user_spans = item.find_all('span', class_='mw-userlink')
            if user_spans:
                user_link = user_spans[0]
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the comment - try different selectors
        comment_span = item.find('span', class_='comment')
        if not comment_span:
            comment_span = item.find('span', class_='changeslist-comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size - try different approaches
        change_size = "0"
        # Try to find spans with specific classes
        size_spans = item.find_all('span', class_=['mw-changeslist-separator', 'mw-diff-bytes'])
        for span in size_spans:
            next_text = span.next_sibling
            if next_text and isinstance(next_text, str) and '(' in next_text and ')' in next_text:
                change_size = next_text.strip()
                break

        # If not found, try another approach:
        # look for parentheses with numbers (re is imported at module level)
        if change_size == "0":
            text = item.get_text()
            size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
            if size_matches:
                change_size = size_matches[0]

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "comment": comment,
            "change_size": change_size
        })

    logger.info(f"Found {len(recent_changes)} recent changes")
    return recent_changes

def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False

def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds() / 3600:.0f} hour(s) old)")
        logger.info("Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)

    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)

    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()
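
# Illustrative sketch (not part of the script's behavior): a downstream
# consumer could read recent_changes.json like this. The file name and keys
# match what save_results() writes above.
#
#   import json
#   with open("recent_changes.json", encoding="utf-8") as f:
#       data = json.load(f)
#   for change in data["recent_changes"]:
#       print(change["timestamp"], change["page_name"], change["page_url"])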