#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ fetch_recent_changes.py This script fetches recent changes from the OpenStreetMap wiki for the French namespace and stores the URLs of these pages. It specifically targets the recent changes page: https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=10000&days=365&enhanced=1&title=Special:RecentChanges&urlversion=2 Usage: python fetch_recent_changes.py [--dry-run] [--force] Options: --dry-run Run the script without saving the results to a file --force Force update even if the cache is still fresh (less than 1 hour old) Output: - recent_changes.json: JSON file with information about recent changes in the French namespace - Log messages about the scraping process and results """ import json import argparse import logging import os import re import shutil from datetime import datetime, timedelta import requests from bs4 import BeautifulSoup # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logger = logging.getLogger(__name__) # Constants # Use the directory of this script to determine the output file path SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json") UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json") CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json") RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2" WIKI_BASE_URL = "https://wiki.openstreetmap.org" CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour def is_cache_fresh(): """ Check if the cache file exists and is less than CACHE_DURATION old Returns: bool: True if cache is fresh, False otherwise """ if not os.path.exists(OUTPUT_FILE): return False try: with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: data = json.load(f) last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00')) now = datetime.now() return (now - last_updated) < CACHE_DURATION except (IOError, json.JSONDecodeError, ValueError) as e: logger.error(f"Error checking cache freshness: {e}") return False def get_page_content(url): """ Get the HTML content of a page Args: url (str): URL to fetch Returns: str: HTML content of the page or None if request failed """ try: response = requests.get(url) response.raise_for_status() return response.text except requests.exceptions.RequestException as e: logger.error(f"Error fetching {url}: {e}") return None def extract_recent_changes(html_content): """ Extract recent changes from the wiki page HTML Args: html_content (str): HTML content of the recent changes page Returns: list: List of recent change dictionaries """ if not html_content: return [] soup = BeautifulSoup(html_content, 'html.parser') recent_changes = [] # Find the main changeslist container # According to the issue description, we should look for .mw-changeslist changes_list = soup.find('div', class_='mw-changeslist') if not changes_list: # If still not found, look for the content area content_div = soup.find('div', id='mw-content-text') if content_div: # Try to find the changeslist div changes_list = content_div.find('div', class_='mw-changeslist') if not changes_list: # Log the HTML structure to help debug logger.warning("Could not find recent changes list. HTML structure:") body = soup.find('body') if body: content_area = body.find('div', id='content') if content_area: logger.warning(f"Content area classes: {content_area.get('class', [])}") main_content = content_area.find('div', id='mw-content-text') if main_content: logger.warning(f"Main content first child: {main_content.find().name if main_content.find() else 'None'}") return [] logger.info(f"Found changes list with tag: {changes_list.name}, classes: {changes_list.get('class', [])}") # Process each change item - based on the actual HTML structure # According to the debug output, the changes are in tr elements change_items = changes_list.find_all('tr') # If no tr elements found directly, look for tables with class mw-changeslist-line if not change_items: tables = changes_list.find_all('table', class_='mw-changeslist-line') for table in tables: trs = table.find_all('tr') change_items.extend(trs) logger.info(f"Found {len(change_items)} change items") for item in change_items: # Extract the page link from the mw-changeslist-title class page_link = item.find('a', class_='mw-changeslist-title') if not page_link: # If not found with the specific class, try to find any link that might be the page link inner_td = item.find('td', class_='mw-changeslist-line-inner') if inner_td: links = inner_td.find_all('a') for link in links: href = link.get('href', '') if '/wiki/' in href and 'action=history' not in href and 'diff=' not in href: page_link = link break if not page_link: # Skip items without a page link (might be headers or other elements) continue page_name = page_link.get_text().strip() page_url = page_link.get('href') if not page_url.startswith('http'): page_url = WIKI_BASE_URL + page_url # Extract the timestamp from the mw-enhanced-rc class timestamp_td = item.find('td', class_='mw-enhanced-rc') timestamp = timestamp_td.get_text().strip() if timestamp_td else "Unknown" # Extract the user from the mw-userlink class user_link = item.find('a', class_='mw-userlink') user = user_link.get_text().strip() if user_link else "Unknown" # Extract the user profile URL user_url = "" if user_link and user_link.get('href'): user_url = user_link.get('href') if not user_url.startswith('http'): user_url = WIKI_BASE_URL + user_url # Extract the diff link diff_url = "" diff_link = item.find('a', class_='mw-changeslist-diff') or item.find('a', string='diff') if diff_link and diff_link.get('href'): diff_url = diff_link.get('href') if not diff_url.startswith('http'): diff_url = WIKI_BASE_URL + diff_url # Extract the comment from the comment class comment_span = item.find('span', class_='comment') comment = comment_span.get_text().strip() if comment_span else "" # Extract the change size from the mw-diff-bytes class size_span = item.find('span', class_='mw-diff-bytes') if size_span: change_size = size_span.get_text().strip() else: # If not found, try to extract from the text change_size = "0" text = item.get_text() size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text) if size_matches: change_size = size_matches[0] # Extract text differences if diff_url is available added_text = "" removed_text = "" if diff_url: try: # Fetch the diff page diff_html = get_page_content(diff_url) if diff_html: diff_soup = BeautifulSoup(diff_html, 'html.parser') # Find added text (ins elements) added_elements = diff_soup.find_all('ins', class_='diffchange') if added_elements: added_text = ' '.join([el.get_text().strip() for el in added_elements]) # Find removed text (del elements) removed_elements = diff_soup.find_all('del', class_='diffchange') if removed_elements: removed_text = ' '.join([el.get_text().strip() for el in removed_elements]) except Exception as e: logger.error(f"Error fetching diff page {diff_url}: {e}") recent_changes.append({ "page_name": page_name, "page_url": page_url, "timestamp": timestamp, "user": user, "user_url": user_url, "comment": comment, "change_size": change_size, "diff_url": diff_url, "added_text": added_text, "removed_text": removed_text }) logger.debug(f"Extracted change: {page_name} by {user}") logger.info(f"Extracted {len(recent_changes)} recent changes") return recent_changes def save_results(recent_changes, dry_run=False): """ Save the results to a JSON file Args: recent_changes (list): List of recent change dictionaries dry_run (bool): If True, don't actually save to file Returns: bool: True if saving was successful or dry run, False otherwise """ if dry_run: logger.info("DRY RUN: Would have saved results to file") logger.info(f"Recent changes: {len(recent_changes)}") for change in recent_changes[:5]: # Show only first 5 for brevity logger.info(f" - {change['page_name']}: {change['page_url']} ({change['timestamp']})") if len(recent_changes) > 5: logger.info(f" ... and {len(recent_changes) - 5} more") return True # Log some details about the recent changes logger.info(f"Preparing to save {len(recent_changes)} recent changes") if recent_changes: logger.info(f"First change: {recent_changes[0]['page_name']} by {recent_changes[0]['user']}") # Prepare the data structure data = { "last_updated": datetime.now().isoformat(), "recent_changes": recent_changes } # Get the file's last modified time before saving before_mtime = None if os.path.exists(OUTPUT_FILE): before_mtime = os.path.getmtime(OUTPUT_FILE) logger.info(f"File {OUTPUT_FILE} exists, last modified at {datetime.fromtimestamp(before_mtime)}") try: # Print the JSON data that we're trying to save json_data = json.dumps(data, indent=2, ensure_ascii=False) logger.info(f"JSON data to save (first 500 chars): {json_data[:500]}...") # Save the data to a temporary file first temp_file = OUTPUT_FILE + ".tmp" logger.info(f"Writing data to temporary file {temp_file}") with open(temp_file, 'w', encoding='utf-8') as f: f.write(json_data) # Check if the temporary file was created and has content if os.path.exists(temp_file): temp_size = os.path.getsize(temp_file) logger.info(f"Temporary file {temp_file} created, size: {temp_size} bytes") # Read the content of the temporary file to verify with open(temp_file, 'r', encoding='utf-8') as f: temp_content = f.read(500) # Read first 500 chars logger.info(f"Temporary file content (first 500 chars): {temp_content}...") # Move the temporary file to the final location logger.info(f"Moving temporary file to {OUTPUT_FILE}") import shutil shutil.move(temp_file, OUTPUT_FILE) else: logger.error(f"Failed to create temporary file {temp_file}") # Check if the file was actually updated if os.path.exists(OUTPUT_FILE): after_mtime = os.path.getmtime(OUTPUT_FILE) file_size = os.path.getsize(OUTPUT_FILE) logger.info(f"File {OUTPUT_FILE} exists, size: {file_size} bytes, mtime: {datetime.fromtimestamp(after_mtime)}") # Read the content of the file to verify with open(OUTPUT_FILE, 'r', encoding='utf-8') as f: file_content = f.read(500) # Read first 500 chars logger.info(f"File content (first 500 chars): {file_content}...") if before_mtime and after_mtime <= before_mtime: logger.warning(f"File {OUTPUT_FILE} was not updated (mtime did not change)") else: logger.error(f"File {OUTPUT_FILE} does not exist after saving") # Copy the file to the public directory public_file = os.path.join(os.path.dirname(os.path.dirname(OUTPUT_FILE)), 'public', os.path.basename(OUTPUT_FILE)) logger.info(f"Copying {OUTPUT_FILE} to {public_file}") shutil.copy2(OUTPUT_FILE, public_file) # Check if the public file was created if os.path.exists(public_file): public_size = os.path.getsize(public_file) logger.info(f"Public file {public_file} created, size: {public_size} bytes") else: logger.error(f"Failed to create public file {public_file}") logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}") return True except IOError as e: logger.error(f"Error saving results to {OUTPUT_FILE}: {e}") return False def load_unavailable_pages(): """ Load the list of pages unavailable in French Returns: tuple: (all_pages, grouped_pages, last_updated) """ if not os.path.exists(UNAVAILABLE_PAGES_FILE): logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist") return [], {}, None try: with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f: data = json.load(f) all_pages = data.get('all_pages', []) grouped_pages = data.get('grouped_pages', {}) last_updated = data.get('last_updated') return all_pages, grouped_pages, last_updated except (IOError, json.JSONDecodeError) as e: logger.error(f"Error loading unavailable pages file: {e}") return [], {}, None def load_created_pages(): """ Load the list of newly created French pages Returns: tuple: (created_pages, last_updated) """ if not os.path.exists(CREATED_PAGES_FILE): logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it") return [], None try: with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f: data = json.load(f) created_pages = data.get('created_pages', []) last_updated = data.get('last_updated') return created_pages, last_updated except (IOError, json.JSONDecodeError) as e: logger.error(f"Error loading created pages file: {e}") return [], None def save_created_pages(created_pages, dry_run=False): """ Save the list of newly created French pages Args: created_pages (list): List of newly created French pages dry_run (bool): If True, don't actually save to file Returns: bool: True if saving was successful or dry run, False otherwise """ if dry_run: logger.info("DRY RUN: Would have saved created pages to file") return True data = { "last_updated": datetime.now().isoformat(), "created_pages": created_pages } try: with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}") # Copy the file to the public directory public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE)) logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}") shutil.copy2(CREATED_PAGES_FILE, public_file) return True except IOError as e: logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}") return False def save_unavailable_pages(all_pages, grouped_pages, dry_run=False): """ Save the updated list of pages unavailable in French Args: all_pages (list): List of all unavailable pages grouped_pages (dict): Dictionary of pages grouped by language prefix dry_run (bool): If True, don't actually save to file Returns: bool: True if saving was successful or dry run, False otherwise """ if dry_run: logger.info("DRY RUN: Would have saved updated unavailable pages to file") return True data = { "last_updated": datetime.now().isoformat(), "all_pages": all_pages, "grouped_pages": grouped_pages } try: with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}") # Copy the file to the public directory public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE)) logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}") shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file) return True except IOError as e: logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}") return False def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages): """ Check if any of the recent changes are newly created French pages that were previously in the list of pages unavailable in French Args: recent_changes (list): List of recent change dictionaries all_pages (list): List of all unavailable pages grouped_pages (dict): Dictionary of pages grouped by language prefix Returns: tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages) """ newly_created_pages = [] updated_all_pages = all_pages.copy() updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()} # Check each recent change for change in recent_changes: page_name = change['page_name'] page_url = change['page_url'] comment = change['comment'].lower() # Check if this is a new page creation is_new_page = "page created" in comment or "nouvelle page" in comment if is_new_page and page_name.startswith("FR:"): logger.info(f"Found newly created French page: {page_name}") # Check if this page was previously in the list of unavailable pages # We need to check if the English version of this page is in the list en_page_name = page_name.replace("FR:", "") # Find the English page in the list of unavailable pages found_en_page = None for page in all_pages: if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name): found_en_page = page break if found_en_page: logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}") # Remove the English page from the list of unavailable pages updated_all_pages.remove(found_en_page) # Remove the English page from the grouped pages lang_prefix = found_en_page['language_prefix'] if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]: updated_grouped_pages[lang_prefix].remove(found_en_page) # If the group is now empty, remove it if not updated_grouped_pages[lang_prefix]: del updated_grouped_pages[lang_prefix] # Add the newly created page to the list newly_created_pages.append({ "title": page_name, "url": page_url, "en_title": found_en_page['title'], "en_url": found_en_page['url'], "created_at": change['timestamp'], "created_by": change['user'], "comment": change['comment'] }) return updated_all_pages, updated_grouped_pages, newly_created_pages def main(): """Main function to execute the script""" parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace") parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file") parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh") parser.add_argument("--debug", action="store_true", help="Save HTML content to a file for debugging") args = parser.parse_args() logger.info("Starting fetch_recent_changes.py") # Check if cache is fresh if is_cache_fresh() and not args.force: logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)") logger.info(f"Use --force to update anyway") return # Get the recent changes page content html_content = get_page_content(RECENT_CHANGES_URL) if not html_content: logger.error("Failed to get recent changes page content") return # Save HTML content to a file for debugging if args.debug: debug_file = "recent_changes_debug.html" try: with open(debug_file, 'w', encoding='utf-8') as f: f.write(html_content) logger.info(f"Saved HTML content to {debug_file} for debugging") except IOError as e: logger.error(f"Error saving HTML content to {debug_file}: {e}") # Parse the HTML to find the structure soup = BeautifulSoup(html_content, 'html.parser') # Find the main content area content_div = soup.find('div', id='mw-content-text') if content_div: logger.info(f"Found content div with id 'mw-content-text'") # Look for elements with mw-changeslist class changeslist_elements = content_div.find_all(class_='mw-changeslist') logger.info(f"Found {len(changeslist_elements)} elements with class 'mw-changeslist'") for i, element in enumerate(changeslist_elements): logger.info(f"Element {i+1} tag: {element.name}, classes: {element.get('class', [])}") # Look for table rows or other elements that might contain changes rows = element.find_all('tr') divs = element.find_all('div', class_='mw-changeslist-line') lis = element.find_all('li') logger.info(f" - Contains {len(rows)} tr elements") logger.info(f" - Contains {len(divs)} div.mw-changeslist-line elements") logger.info(f" - Contains {len(lis)} li elements") # Check direct children children = list(element.children) logger.info(f" - Has {len(children)} direct children") if children: child_types = {} for child in children: if hasattr(child, 'name') and child.name: child_type = child.name child_types[child_type] = child_types.get(child_type, 0) + 1 logger.info(f" - Direct children types: {child_types}") # Extract recent changes recent_changes = extract_recent_changes(html_content) if not recent_changes: logger.warning("No recent changes found") # Save results success = save_results(recent_changes, args.dry_run) # Check for newly created French pages logger.info("Checking for newly created French pages...") all_pages, grouped_pages, last_updated = load_unavailable_pages() created_pages, created_last_updated = load_created_pages() if all_pages and grouped_pages: # Check for newly created pages updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages) # If we found newly created pages, update both files if newly_created: logger.info(f"Found {len(newly_created)} newly created French pages") # Add the newly created pages to the existing list created_pages.extend(newly_created) # Save the updated files save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run) save_created_pages(created_pages, args.dry_run) else: logger.info("No newly created French pages found") else: logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty") if success: logger.info("Script completed successfully") else: logger.error("Script completed with errors") if __name__ == "__main__": main()