osm-labo/wiki_compare/fetch_recent_changes.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
fetch_recent_changes.py
This script fetches recent changes from the OpenStreetMap wiki for the French namespace
and stores the URLs of these pages. It specifically targets the recent changes page:
https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=10000&days=365&enhanced=1&title=Special:RecentChanges&urlversion=2
Usage:
python fetch_recent_changes.py [--dry-run] [--force]
Options:
--dry-run Run the script without saving the results to a file
--force Force update even if the cache is still fresh (less than 1 hour old)
Output:
- recent_changes.json: JSON file with information about recent changes in the French namespace
- Log messages about the scraping process and results
"""
import json
import argparse
import logging
import os
import re
import shutil
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
# Constants
# Use the directory of this script to determine the output file path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
UNAVAILABLE_PAGES_FILE = os.path.join(SCRIPT_DIR, "pages_unavailable_in_french.json")
CREATED_PAGES_FILE = os.path.join(SCRIPT_DIR, "newly_created_french_pages.json")
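# Special:RecentChanges filtered to the FR: namespace (202), with bots, previous
# revisions, categorization, Wikibase and log entries hidden; up to 500 entries
# over the last 30 days, in the "enhanced" (grouped) layout.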
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
WIKI_BASE_URL = "https://wiki.openstreetmap.org"
CACHE_DURATION = timedelta(hours=1) # Cache duration of 1 hour

def is_cache_fresh():
    """
    Check if the cache file exists and is less than CACHE_DURATION old

    Returns:
        bool: True if cache is fresh, False otherwise
    """
    if not os.path.exists(OUTPUT_FILE):
        return False
    try:
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        last_updated = datetime.fromisoformat(data.get('last_updated', '2000-01-01T00:00:00'))
        now = datetime.now()
        return (now - last_updated) < CACHE_DURATION
    except (IOError, json.JSONDecodeError, ValueError) as e:
        logger.error(f"Error checking cache freshness: {e}")
        return False


def get_page_content(url):
    """
    Get the HTML content of a page

    Args:
        url (str): URL to fetch

    Returns:
        str: HTML content of the page or None if request failed
    """
    try:
        # Use a timeout so a stalled connection does not hang the script
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None


def extract_recent_changes(html_content):
    """
    Extract recent changes from the wiki page HTML

    Args:
        html_content (str): HTML content of the recent changes page

    Returns:
        list: List of recent change dictionaries
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    recent_changes = []

    # Find the main changeslist container
    # According to the issue description, we should look for .mw-changeslist
    changes_list = soup.find('div', class_='mw-changeslist')
    if not changes_list:
        # If not found directly, look inside the content area
        content_div = soup.find('div', id='mw-content-text')
        if content_div:
            # Try to find the changeslist div
            changes_list = content_div.find('div', class_='mw-changeslist')
    if not changes_list:
        # Log the HTML structure to help debug
        logger.warning("Could not find recent changes list. HTML structure:")
        body = soup.find('body')
        if body:
            content_area = body.find('div', id='content')
            if content_area:
                logger.warning(f"Content area classes: {content_area.get('class', [])}")
                main_content = content_area.find('div', id='mw-content-text')
                if main_content:
                    logger.warning(f"Main content first child: {main_content.find().name if main_content.find() else 'None'}")
        return []

    logger.info(f"Found changes list with tag: {changes_list.name}, classes: {changes_list.get('class', [])}")

    # Process each change item - based on the actual HTML structure
    # According to the debug output, the changes are in tr elements
    change_items = changes_list.find_all('tr')
    # If no tr elements found directly, look for tables with class mw-changeslist-line
    if not change_items:
        tables = changes_list.find_all('table', class_='mw-changeslist-line')
        for table in tables:
            trs = table.find_all('tr')
            change_items.extend(trs)
    logger.info(f"Found {len(change_items)} change items")

    for item in change_items:
        # Extract the page link from the mw-changeslist-title class
        page_link = item.find('a', class_='mw-changeslist-title')
        if not page_link:
            # If not found with the specific class, try to find any link that might be the page link
            inner_td = item.find('td', class_='mw-changeslist-line-inner')
            if inner_td:
                links = inner_td.find_all('a')
                for link in links:
                    href = link.get('href', '')
                    if '/wiki/' in href and 'action=history' not in href and 'diff=' not in href:
                        page_link = link
                        break
        if not page_link:
            # Skip items without a page link (might be headers or other elements)
            continue

        page_name = page_link.get_text().strip()
        page_url = page_link.get('href')
        if not page_url.startswith('http'):
            page_url = WIKI_BASE_URL + page_url

        # Extract the timestamp from the mw-enhanced-rc class
        timestamp_td = item.find('td', class_='mw-enhanced-rc')
        timestamp = timestamp_td.get_text().strip() if timestamp_td else "Unknown"

        # Extract the user from the mw-userlink class
        user_link = item.find('a', class_='mw-userlink')
        user = user_link.get_text().strip() if user_link else "Unknown"

        # Extract the user profile URL
        user_url = ""
        if user_link and user_link.get('href'):
            user_url = user_link.get('href')
            if not user_url.startswith('http'):
                user_url = WIKI_BASE_URL + user_url

        # Extract the diff link
        diff_url = ""
        diff_link = item.find('a', class_='mw-changeslist-diff') or item.find('a', string='diff')
        if diff_link and diff_link.get('href'):
            diff_url = diff_link.get('href')
            if not diff_url.startswith('http'):
                diff_url = WIKI_BASE_URL + diff_url

        # Extract the comment from the comment class
        comment_span = item.find('span', class_='comment')
        comment = comment_span.get_text().strip() if comment_span else ""

        # Extract the change size from the mw-diff-bytes class
        size_span = item.find('span', class_='mw-diff-bytes')
        if size_span:
            change_size = size_span.get_text().strip()
        else:
            # If not found, try to extract it from the text
            change_size = "0"
            text = item.get_text()
            size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
            if size_matches:
                change_size = size_matches[0]

        # Extract text differences if diff_url is available
        added_text = ""
        removed_text = ""
        if diff_url:
            try:
                # Fetch the diff page
                diff_html = get_page_content(diff_url)
                if diff_html:
                    diff_soup = BeautifulSoup(diff_html, 'html.parser')
                    # Find added text (ins elements)
                    added_elements = diff_soup.find_all('ins', class_='diffchange')
                    if added_elements:
                        added_text = ' '.join([el.get_text().strip() for el in added_elements])
                    # Find removed text (del elements)
                    removed_elements = diff_soup.find_all('del', class_='diffchange')
                    if removed_elements:
                        removed_text = ' '.join([el.get_text().strip() for el in removed_elements])
            except Exception as e:
                logger.error(f"Error fetching diff page {diff_url}: {e}")

        recent_changes.append({
            "page_name": page_name,
            "page_url": page_url,
            "timestamp": timestamp,
            "user": user,
            "user_url": user_url,
            "comment": comment,
            "change_size": change_size,
            "diff_url": diff_url,
            "added_text": added_text,
            "removed_text": removed_text
        })
        logger.debug(f"Extracted change: {page_name} by {user}")

    logger.info(f"Extracted {len(recent_changes)} recent changes")
    return recent_changes


def save_results(recent_changes, dry_run=False):
    """
    Save the results to a JSON file

    Args:
        recent_changes (list): List of recent change dictionaries
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved results to file")
        logger.info(f"Recent changes: {len(recent_changes)}")
        for change in recent_changes[:5]:  # Show only first 5 for brevity
            logger.info(f"  - {change['page_name']}: {change['page_url']} ({change['timestamp']})")
        if len(recent_changes) > 5:
            logger.info(f"  ... and {len(recent_changes) - 5} more")
        return True

    # Log some details about the recent changes
    logger.info(f"Preparing to save {len(recent_changes)} recent changes")
    if recent_changes:
        logger.info(f"First change: {recent_changes[0]['page_name']} by {recent_changes[0]['user']}")

    # Prepare the data structure
    data = {
        "last_updated": datetime.now().isoformat(),
        "recent_changes": recent_changes
    }

    # Get the file's last modified time before saving
    before_mtime = None
    if os.path.exists(OUTPUT_FILE):
        before_mtime = os.path.getmtime(OUTPUT_FILE)
        logger.info(f"File {OUTPUT_FILE} exists, last modified at {datetime.fromtimestamp(before_mtime)}")

    try:
        # Log the JSON data that we're trying to save
        json_data = json.dumps(data, indent=2, ensure_ascii=False)
        logger.info(f"JSON data to save (first 500 chars): {json_data[:500]}...")

        # Save the data to a temporary file first
        temp_file = OUTPUT_FILE + ".tmp"
        logger.info(f"Writing data to temporary file {temp_file}")
        with open(temp_file, 'w', encoding='utf-8') as f:
            f.write(json_data)

        # Check if the temporary file was created and has content
        if os.path.exists(temp_file):
            temp_size = os.path.getsize(temp_file)
            logger.info(f"Temporary file {temp_file} created, size: {temp_size} bytes")
            # Read the content of the temporary file to verify
            with open(temp_file, 'r', encoding='utf-8') as f:
                temp_content = f.read(500)  # Read first 500 chars
            logger.info(f"Temporary file content (first 500 chars): {temp_content}...")
            # Move the temporary file to the final location
            logger.info(f"Moving temporary file to {OUTPUT_FILE}")
            shutil.move(temp_file, OUTPUT_FILE)
        else:
            logger.error(f"Failed to create temporary file {temp_file}")

        # Check if the file was actually updated
        if os.path.exists(OUTPUT_FILE):
            after_mtime = os.path.getmtime(OUTPUT_FILE)
            file_size = os.path.getsize(OUTPUT_FILE)
            logger.info(f"File {OUTPUT_FILE} exists, size: {file_size} bytes, mtime: {datetime.fromtimestamp(after_mtime)}")
            # Read the content of the file to verify
            with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
                file_content = f.read(500)  # Read first 500 chars
            logger.info(f"File content (first 500 chars): {file_content}...")
            if before_mtime and after_mtime <= before_mtime:
                logger.warning(f"File {OUTPUT_FILE} was not updated (mtime did not change)")
        else:
            logger.error(f"File {OUTPUT_FILE} does not exist after saving")

        # Copy the file to the public directory
        public_file = os.path.join(os.path.dirname(os.path.dirname(OUTPUT_FILE)), 'public', os.path.basename(OUTPUT_FILE))
        logger.info(f"Copying {OUTPUT_FILE} to {public_file}")
        shutil.copy2(OUTPUT_FILE, public_file)

        # Check if the public file was created
        if os.path.exists(public_file):
            public_size = os.path.getsize(public_file)
            logger.info(f"Public file {public_file} created, size: {public_size} bytes")
        else:
            logger.error(f"Failed to create public file {public_file}")

        logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
        return True
    except IOError as e:
        logger.error(f"Error saving results to {OUTPUT_FILE}: {e}")
        return False


def load_unavailable_pages():
    """
    Load the list of pages unavailable in French

    Returns:
        tuple: (all_pages, grouped_pages, last_updated)
    """
    if not os.path.exists(UNAVAILABLE_PAGES_FILE):
        logger.warning(f"Unavailable pages file {UNAVAILABLE_PAGES_FILE} does not exist")
        return [], {}, None
    try:
        with open(UNAVAILABLE_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        all_pages = data.get('all_pages', [])
        grouped_pages = data.get('grouped_pages', {})
        last_updated = data.get('last_updated')
        return all_pages, grouped_pages, last_updated
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading unavailable pages file: {e}")
        return [], {}, None


def load_created_pages():
    """
    Load the list of newly created French pages

    Returns:
        tuple: (created_pages, last_updated)
    """
    if not os.path.exists(CREATED_PAGES_FILE):
        logger.info(f"Created pages file {CREATED_PAGES_FILE} does not exist, will create it")
        return [], None
    try:
        with open(CREATED_PAGES_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        created_pages = data.get('created_pages', [])
        last_updated = data.get('last_updated')
        return created_pages, last_updated
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading created pages file: {e}")
        return [], None


def save_created_pages(created_pages, dry_run=False):
    """
    Save the list of newly created French pages

    Args:
        created_pages (list): List of newly created French pages
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved created pages to file")
        return True

    data = {
        "last_updated": datetime.now().isoformat(),
        "created_pages": created_pages
    }
    try:
        with open(CREATED_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(created_pages)} created pages to {CREATED_PAGES_FILE}")

        # Copy the file to the public directory
        public_file = os.path.join(os.path.dirname(os.path.dirname(CREATED_PAGES_FILE)), 'public', os.path.basename(CREATED_PAGES_FILE))
        logger.info(f"Copying {CREATED_PAGES_FILE} to {public_file}")
        shutil.copy2(CREATED_PAGES_FILE, public_file)
        return True
    except IOError as e:
        logger.error(f"Error saving created pages to {CREATED_PAGES_FILE}: {e}")
        return False


def save_unavailable_pages(all_pages, grouped_pages, dry_run=False):
    """
    Save the updated list of pages unavailable in French

    Args:
        all_pages (list): List of all unavailable pages
        grouped_pages (dict): Dictionary of pages grouped by language prefix
        dry_run (bool): If True, don't actually save to file

    Returns:
        bool: True if saving was successful or dry run, False otherwise
    """
    if dry_run:
        logger.info("DRY RUN: Would have saved updated unavailable pages to file")
        return True

    data = {
        "last_updated": datetime.now().isoformat(),
        "all_pages": all_pages,
        "grouped_pages": grouped_pages
    }
    try:
        with open(UNAVAILABLE_PAGES_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully saved {len(all_pages)} unavailable pages to {UNAVAILABLE_PAGES_FILE}")

        # Copy the file to the public directory
        public_file = os.path.join(os.path.dirname(os.path.dirname(UNAVAILABLE_PAGES_FILE)), 'public', os.path.basename(UNAVAILABLE_PAGES_FILE))
        logger.info(f"Copying {UNAVAILABLE_PAGES_FILE} to {public_file}")
        shutil.copy2(UNAVAILABLE_PAGES_FILE, public_file)
        return True
    except IOError as e:
        logger.error(f"Error saving unavailable pages to {UNAVAILABLE_PAGES_FILE}: {e}")
        return False


def check_for_newly_created_pages(recent_changes, all_pages, grouped_pages):
    """
    Check if any of the recent changes are newly created French pages that were
    previously in the list of pages unavailable in French

    Args:
        recent_changes (list): List of recent change dictionaries
        all_pages (list): List of all unavailable pages
        grouped_pages (dict): Dictionary of pages grouped by language prefix

    Returns:
        tuple: (updated_all_pages, updated_grouped_pages, newly_created_pages)
    """
    newly_created_pages = []
    updated_all_pages = all_pages.copy()
    updated_grouped_pages = {k: v.copy() for k, v in grouped_pages.items()}

    # Check each recent change
    for change in recent_changes:
        page_name = change['page_name']
        page_url = change['page_url']
        comment = change['comment'].lower()

        # Check if this is a new page creation
        is_new_page = "page created" in comment or "nouvelle page" in comment
        if is_new_page and page_name.startswith("FR:"):
            logger.info(f"Found newly created French page: {page_name}")

            # Check if this page was previously in the list of unavailable pages.
            # We need to check if the English version of this page is in the list.
            en_page_name = page_name.replace("FR:", "")

            # Find the English page in the list of unavailable pages
            found_en_page = None
            for page in all_pages:
                if page['title'] == en_page_name or (page['title'].startswith("En:") and page['title'][3:] == en_page_name):
                    found_en_page = page
                    break

            if found_en_page:
                logger.info(f"Found corresponding English page in unavailable pages list: {found_en_page['title']}")

                # Remove the English page from the list of unavailable pages
                updated_all_pages.remove(found_en_page)

                # Remove the English page from the grouped pages
                lang_prefix = found_en_page['language_prefix']
                if lang_prefix in updated_grouped_pages and found_en_page in updated_grouped_pages[lang_prefix]:
                    updated_grouped_pages[lang_prefix].remove(found_en_page)
                    # If the group is now empty, remove it
                    if not updated_grouped_pages[lang_prefix]:
                        del updated_grouped_pages[lang_prefix]

                # Add the newly created page to the list
                newly_created_pages.append({
                    "title": page_name,
                    "url": page_url,
                    "en_title": found_en_page['title'],
                    "en_url": found_en_page['url'],
                    "created_at": change['timestamp'],
                    "created_by": change['user'],
                    "comment": change['comment']
                })

    return updated_all_pages, updated_grouped_pages, newly_created_pages


def main():
    """Main function to execute the script"""
    parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
    parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
    parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
    parser.add_argument("--debug", action="store_true", help="Save HTML content to a file for debugging")
    args = parser.parse_args()

    logger.info("Starting fetch_recent_changes.py")

    # Check if cache is fresh
    if is_cache_fresh() and not args.force:
        logger.info(f"Cache is still fresh (less than {CACHE_DURATION.total_seconds()/3600} hours old)")
        logger.info("Use --force to update anyway")
        return

    # Get the recent changes page content
    html_content = get_page_content(RECENT_CHANGES_URL)
    if not html_content:
        logger.error("Failed to get recent changes page content")
        return

    # Save HTML content to a file for debugging
    if args.debug:
        debug_file = "recent_changes_debug.html"
        try:
            with open(debug_file, 'w', encoding='utf-8') as f:
                f.write(html_content)
            logger.info(f"Saved HTML content to {debug_file} for debugging")
        except IOError as e:
            logger.error(f"Error saving HTML content to {debug_file}: {e}")

    # Parse the HTML to inspect the structure
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content area
    content_div = soup.find('div', id='mw-content-text')
    if content_div:
        logger.info("Found content div with id 'mw-content-text'")

        # Look for elements with the mw-changeslist class
        changeslist_elements = content_div.find_all(class_='mw-changeslist')
        logger.info(f"Found {len(changeslist_elements)} elements with class 'mw-changeslist'")
        for i, element in enumerate(changeslist_elements):
            logger.info(f"Element {i+1} tag: {element.name}, classes: {element.get('class', [])}")

            # Look for table rows or other elements that might contain changes
            rows = element.find_all('tr')
            divs = element.find_all('div', class_='mw-changeslist-line')
            lis = element.find_all('li')
            logger.info(f"  - Contains {len(rows)} tr elements")
            logger.info(f"  - Contains {len(divs)} div.mw-changeslist-line elements")
            logger.info(f"  - Contains {len(lis)} li elements")

            # Check direct children
            children = list(element.children)
            logger.info(f"  - Has {len(children)} direct children")
            if children:
                child_types = {}
                for child in children:
                    if hasattr(child, 'name') and child.name:
                        child_type = child.name
                        child_types[child_type] = child_types.get(child_type, 0) + 1
                logger.info(f"  - Direct children types: {child_types}")

    # Extract recent changes
    recent_changes = extract_recent_changes(html_content)
    if not recent_changes:
        logger.warning("No recent changes found")

    # Save results
    success = save_results(recent_changes, args.dry_run)

    # Check for newly created French pages
    logger.info("Checking for newly created French pages...")
    all_pages, grouped_pages, last_updated = load_unavailable_pages()
    created_pages, created_last_updated = load_created_pages()

    if all_pages and grouped_pages:
        # Check for newly created pages
        updated_all_pages, updated_grouped_pages, newly_created = check_for_newly_created_pages(recent_changes, all_pages, grouped_pages)

        # If we found newly created pages, update both files
        if newly_created:
            logger.info(f"Found {len(newly_created)} newly created French pages")
            # Add the newly created pages to the existing list
            created_pages.extend(newly_created)
            # Save the updated files
            save_unavailable_pages(updated_all_pages, updated_grouped_pages, args.dry_run)
            save_created_pages(created_pages, args.dry_run)
        else:
            logger.info("No newly created French pages found")
    else:
        logger.warning("Could not check for newly created French pages: unavailable pages file not found or empty")

    if success:
        logger.info("Script completed successfully")
    else:
        logger.error("Script completed with errors")


if __name__ == "__main__":
    main()