#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ fetch_recent_changes.py This script fetches recent changes from the OpenStreetMap wiki for the French namespace and stores the URLs of these pages. It specifically targets the recent changes page: https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=10000&days=365&enhanced=1&title=Special:RecentChanges&urlversion=2 Usage: python fetch_recent_changes.py [--dry-run] [--force] Options: --dry-run Run the script without saving the results to a file --force Force update even if the cache is still fresh (less than 1 hour old) Output: - recent_changes.json: JSON file with information about recent changes in the French namespace - Log messages about the scraping process and results """ import json import argparse import logging import os import re from datetime import datetime, timedelta import requests from bs4 import BeautifulSoup # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logger = logging.getLogger(__name__) # Constants # Use the directory of this script to determine the output file path SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json") RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2" WIKI_BASE_URL = "https://wiki.openstreetmap.org" CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour def is_cache_fresh(): """ Check if the cache file exists and is less than CACHE_DURATION old Returns: bool: True if cache is fresh, False otherwise """ if not os.path.exists(OUTPUT_FILE): return False try: with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: data = json.load(f) last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00')) now = datetime.now() return (now - last_updated) < CACHE_DURATION except (IOError, json.JSONDecodeError, ValueError) as e: logger.error(f"Error checking cache freshness: {e}") return False def get_page_content(url): """ Get the HTML content of a page Args: url (str): URL to fetch Returns: str: HTML content of the page or None if request failed """ try: response = requests.get(url) response.raise_for_status() return response.text except requests.exceptions.RequestException as e: logger.error(f"Error fetching {url}: {e}") return None def extract_recent_changes(html_content): """ Extract recent changes from the wiki page HTML Args: html_content (str): HTML content of the recent changes page Returns: list: List of recent change dictionaries """ if not html_content: return [] soup = BeautifulSoup(html_content, 'html.parser') recent_changes = [] # Find the main changeslist container # According to the issue description, we should look for .mw-changeslist changes_list = soup.find('div', class_='mw-changeslist') if not changes_list: # If still not found, look for the content area content_div = soup.find('div', id='mw-content-text') if content_div: # Try to find the changeslist div changes_list = content_div.find('div', class_='mw-changeslist') if not changes_list: # Log the HTML structure to help debug logger.warning("Could not find recent changes list. HTML structure:") body = soup.find('body') if body: content_area = body.find('div', id='content') if content_area: logger.warning(f"Content area classes: {content_area.get('class', [])}") main_content = content_area.find('div', id='mw-content-text') if main_content: logger.warning(f"Main content first child: {main_content.find().name if main_content.find() else 'None'}") return [] logger.info(f"Found changes list with tag: {changes_list.name}, classes: {changes_list.get('class', [])}") # Process each change item - based on the actual HTML structure # According to the debug output, the changes are in tr elements change_items = changes_list.find_all('tr') # If no tr elements found directly, look for tables with class mw-changeslist-line if not change_items: tables = changes_list.find_all('table', class_='mw-changeslist-line') for table in tables: trs = table.find_all('tr') change_items.extend(trs) logger.info(f"Found {len(change_items)} change items") for item in change_items: # Extract the page link from the mw-changeslist-title class page_link = item.find('a', class_='mw-changeslist-title') if not page_link: # If not found with the specific class, try to find any link that might be the page link inner_td = item.find('td', class_='mw-changeslist-line-inner') if inner_td: links = inner_td.find_all('a') for link in links: href = link.get('href', '') if '/wiki/' in href and 'action=history' not in href and 'diff=' not in href: page_link = link break if not page_link: # Skip items without a page link (might be headers or other elements) continue page_name = page_link.get_text().strip() page_url = page_link.get('href') if not page_url.startswith('http'): page_url = WIKI_BASE_URL + page_url # Extract the timestamp from the mw-enhanced-rc class timestamp_td = item.find('td', class_='mw-enhanced-rc') timestamp = timestamp_td.get_text().strip() if timestamp_td else "Unknown" # Extract the user from the mw-userlink class user_link = item.find('a', class_='mw-userlink') user = user_link.get_text().strip() if user_link else "Unknown" # Extract the user profile URL user_url = "" if user_link and user_link.get('href'): user_url = user_link.get('href') if not user_url.startswith('http'): user_url = WIKI_BASE_URL + user_url # Extract the diff link diff_url = "" diff_link = item.find('a', class_='mw-changeslist-diff') or item.find('a', string='diff') if diff_link and diff_link.get('href'): diff_url = diff_link.get('href') if not diff_url.startswith('http'): diff_url = WIKI_BASE_URL + diff_url # Extract the comment from the comment class comment_span = item.find('span', class_='comment') comment = comment_span.get_text().strip() if comment_span else "" # Extract the change size from the mw-diff-bytes class size_span = item.find('span', class_='mw-diff-bytes') if size_span: change_size = size_span.get_text().strip() else: # If not found, try to extract from the text change_size = "0" text = item.get_text() size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text) if size_matches: change_size = size_matches[0] # Extract text differences if diff_url is available added_text = "" removed_text = "" if diff_url: try: # Fetch the diff page diff_html = get_page_content(diff_url) if diff_html: diff_soup = BeautifulSoup(diff_html, 'html.parser') # Find added text (ins elements) added_elements = diff_soup.find_all('ins', class_='diffchange') if added_elements: added_text = ' '.join([el.get_text().strip() for el in added_elements]) # Find removed text (del elements) removed_elements = diff_soup.find_all('del', class_='diffchange') if removed_elements: removed_text = ' '.join([el.get_text().strip() for el in removed_elements]) except Exception as e: logger.error(f"Error fetching diff page {diff_url}: {e}") recent_changes.append({ "page_name": page_name, "page_url": page_url, "timestamp": timestamp, "user": user, "user_url": user_url, "comment": comment, "change_size": change_size, "diff_url": diff_url, "added_text": added_text, "removed_text": removed_text }) logger.debug(f"Extracted change: {page_name} by {user}") logger.info(f"Extracted {len(recent_changes)} recent changes") return recent_changes def save_results(recent_changes, dry_run=False): """ Save the results to a JSON file Args: recent_changes (list): List of recent change dictionaries dry_run (bool): If True, don't actually save to file Returns: bool: True if saving was successful or dry run, False otherwise """ if dry_run: logger.info("DRY RUN: Would have saved results to file") logger.info(f"Recent changes: {len(recent_changes)}") for change in recent_changes[:5]: # Show only first 5 for brevity logger.info(f" - {change['page_name']}: {change['page_url']} ({change['timestamp']})") if len(recent_changes) > 5: logger.info(f" ... and {len(recent_changes) - 5} more") return True # Log some details about the recent changes logger.info(f"Preparing to save {len(recent_changes)} recent changes") if recent_changes: logger.info(f"First change: {recent_changes[0]['page_name']} by {recent_changes[0]['user']}") # Prepare the data structure data = { "last_updated": datetime.now().isoformat(), "recent_changes": recent_changes } # Get the file's last modified time before saving before_mtime = None if os.path.exists(OUTPUT_FILE): before_mtime = os.path.getmtime(OUTPUT_FILE) logger.info(f"File {OUTPUT_FILE} exists, last modified at {datetime.fromtimestamp(before_mtime)}") try: # Print the JSON data that we're trying to save json_data = json.dumps(data, indent=2, ensure_ascii=False) logger.info(f"JSON data to save (first 500 chars): {json_data[:500]}...") # Save the data to a temporary file first temp_file = OUTPUT_FILE + ".tmp" logger.info(f"Writing data to temporary file {temp_file}") with open(temp_file, 'w', encoding='utf-8') as f: f.write(json_data) # Check if the temporary file was created and has content if os.path.exists(temp_file): temp_size = os.path.getsize(temp_file) logger.info(f"Temporary file {temp_file} created, size: {temp_size} bytes") # Read the content of the temporary file to verify with open(temp_file, 'r', encoding='utf-8') as f: temp_content = f.read(500) # Read first 500 chars logger.info(f"Temporary file content (first 500 chars): {temp_content}...") # Move the temporary file to the final location logger.info(f"Moving temporary file to {OUTPUT_FILE}") import shutil shutil.move(temp_file, OUTPUT_FILE) else: logger.error(f"Failed to create temporary file {temp_file}") # Check if the file was actually updated if os.path.exists(OUTPUT_FILE): after_mtime = os.path.getmtime(OUTPUT_FILE) file_size = os.path.getsize(OUTPUT_FILE) logger.info(f"File {OUTPUT_FILE} exists, size: {file_size} bytes, mtime: {datetime.fromtimestamp(after_mtime)}") # Read the content of the file to verify with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: file_content = f.read(500) # Read first 500 chars logger.info(f"File content (first 500 chars): {file_content}...") if before_mtime and after_mtime <= before_mtime: logger.warning(f"File {OUTPUT_FILE} was not updated (mtime did not change)") else: logger.error(f"File {OUTPUT_FILE} does not exist after saving") # Copy the file to the public directory public_file = os.path.join(os.path.dirname(os.path.dirname(OUTPUT_FILE)), 'public', os.path.basename(OUTPUT_FILE)) logger.info(f"Copying {OUTPUT_FILE} to {public_file}") shutil.copy2(OUTPUT_FILE, public_file) # Check if the public file was created if os.path.exists(public_file): public_size = os.path.getsize(public_file) logger.info(f"Public file {public_file} created, size: {public_size} bytes") else: logger.error(f"Failed to create public file {public_file}") logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}") return True except IOError as e: logger.error(f"Error saving results to {OUTPUT_FILE}: {e}") return False def main(): """Main function to execute the script""" parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace") parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file") parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh") parser.add_argument("--debug", action="store_true", help="Save HTML content to a file for debugging") args = parser.parse_args() logger.info("Starting fetch_recent_changes.py") # Check if cache is fresh if is_cache_fresh() and not args.force: logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)") logger.info(f"Use --force to update anyway") return # Get the recent changes page content html_content = get_page_content(RECENT_CHANGES_URL) if not html_content: logger.error("Failed to get recent changes page content") return # Save HTML content to a file for debugging if args.debug: debug_file = "recent_changes_debug.html" try: with open(debug_file, 'w', encoding='utf-8') as f: f.write(html_content) logger.info(f"Saved HTML content to {debug_file} for debugging") except IOError as e: logger.error(f"Error saving HTML content to {debug_file}: {e}") # Parse the HTML to find the structure soup = BeautifulSoup(html_content, 'html.parser') # Find the main content area content_div = soup.find('div', id='mw-content-text') if content_div: logger.info(f"Found content div with id 'mw-content-text'") # Look for elements with mw-changeslist class changeslist_elements = content_div.find_all(class_='mw-changeslist') logger.info(f"Found {len(changeslist_elements)} elements with class 'mw-changeslist'") for i, element in enumerate(changeslist_elements): logger.info(f"Element {i+1} tag: {element.name}, classes: {element.get('class', [])}") # Look for table rows or other elements that might contain changes rows = element.find_all('tr') divs = element.find_all('div', class_='mw-changeslist-line') lis = element.find_all('li') logger.info(f" - Contains {len(rows)} tr elements") logger.info(f" - Contains {len(divs)} div.mw-changeslist-line elements") logger.info(f" - Contains {len(lis)} li elements") # Check direct children children = list(element.children) logger.info(f" - Has {len(children)} direct children") if children: child_types = {} for child in children: if hasattr(child, 'name') and child.name: child_type = child.name child_types[child_type] = child_types.get(child_type, 0) + 1 logger.info(f" - Direct children types: {child_types}") # Extract recent changes recent_changes = extract_recent_changes(html_content) if not recent_changes: logger.warning("No recent changes found") # Save results success = save_results(recent_changes, args.dry_run) if success: logger.info("Script completed successfully") else: logger.error("Script completed with errors") if __name__ == "__main__": main()