up wiki compare

This commit is contained in: parent e715d66020, commit a59113400c
18 changed files with 4384 additions and 223314 deletions
@@ -38,7 +38,9 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 # Constants
-OUTPUT_FILE = "recent_changes.json"
+# Use the directory of this script to determine the output file path
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_FILE = os.path.join(SCRIPT_DIR, "recent_changes.json")
 RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidepreviousrevisions=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=500&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org"
 CACHE_DURATION = timedelta(hours=1)  # Cache duration of 1 hour
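Note: anchoring OUTPUT_FILE to SCRIPT_DIR makes the output location independent of the caller's working directory (relevant when the script runs from cron or a service). A standalone sketch of the difference, not part of the commit:

import os

# Relative path: resolved against the process CWD, which varies by caller.
rel = "recent_changes.json"

# Script-anchored path: always next to this file, regardless of CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
abs_path = os.path.join(SCRIPT_DIR, "recent_changes.json")

print(os.path.abspath(rel))  # depends on where the script was launched
print(abs_path)              # stable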
@@ -97,89 +99,88 @@ def extract_recent_changes(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     recent_changes = []
 
-    # Try different selectors for the changes list
-    # First try the old selector
-    changes_list = soup.find('ul', class_='special')
-
-    # If not found, try the new selector
-    if not changes_list:
-        changes_list = soup.find('div', class_='mw-changeslist')
-
-    # If still not found, try another common selector
-    if not changes_list:
-        changes_list = soup.find('ul', class_='mw-changeslist')
+    # Find the main changeslist container
+    # According to the issue description, we should look for .mw-changeslist
+    changes_list = soup.find('div', class_='mw-changeslist')
 
-    # If still not found, look for any list inside the content area
     if not changes_list:
+        # If still not found, look for the content area
         content_div = soup.find('div', id='mw-content-text')
         if content_div:
-            changes_list = content_div.find('ul')
+            # Try to find the changeslist div
+            changes_list = content_div.find('div', class_='mw-changeslist')
 
     if not changes_list:
-        logger.warning("Could not find recent changes list")
+        # Log the HTML structure to help debug
+        logger.warning("Could not find recent changes list. HTML structure:")
+        body = soup.find('body')
+        if body:
+            content_area = body.find('div', id='content')
+            if content_area:
+                logger.warning(f"Content area classes: {content_area.get('class', [])}")
+                main_content = content_area.find('div', id='mw-content-text')
+                if main_content:
+                    logger.warning(f"Main content first child: {main_content.find().name if main_content.find() else 'None'}")
         return []
 
-    # Process each list item (each change)
-    # Try both li elements and div elements with appropriate classes
-    change_items = changes_list.find_all('li')
+    logger.info(f"Found changes list with tag: {changes_list.name}, classes: {changes_list.get('class', [])}")
 
+    # Process each change item - based on the actual HTML structure
+    # According to the debug output, the changes are in tr elements
+    change_items = changes_list.find_all('tr')
+
+    # If no tr elements found directly, look for tables with class mw-changeslist-line
     if not change_items:
-        change_items = changes_list.find_all('div', class_='mw-changeslist-line')
+        tables = changes_list.find_all('table', class_='mw-changeslist-line')
+        for table in tables:
+            trs = table.find_all('tr')
+            change_items.extend(trs)
 
     logger.info(f"Found {len(change_items)} change items")
 
     for item in change_items:
-        # Extract the page link - try different selectors
+        # Extract the page link from the mw-changeslist-title class
         page_link = item.find('a', class_='mw-changeslist-title')
-        if not page_link:
-            page_link = item.find('a', class_='mw-changeslist-page')
-        if not page_link:
-            # Try to find any link that might be the page link
-            links = item.find_all('a')
-            for link in links:
-                if '/wiki/' in link.get('href', ''):
-                    page_link = link
-                    break
 
+        if not page_link:
+            # If not found with the specific class, try to find any link that might be the page link
+            inner_td = item.find('td', class_='mw-changeslist-line-inner')
+            if inner_td:
+                links = inner_td.find_all('a')
+                for link in links:
+                    href = link.get('href', '')
+                    if '/wiki/' in href and 'action=history' not in href and 'diff=' not in href:
+                        page_link = link
+                        break
 
         if not page_link:
            # Skip items without a page link (might be headers or other elements)
            continue
 
         page_name = page_link.get_text().strip()
-        page_url = WIKI_BASE_URL + page_link.get('href')
+        page_url = page_link.get('href')
+        if not page_url.startswith('http'):
+            page_url = WIKI_BASE_URL + page_url
 
-        # Extract the timestamp - try different selectors
-        timestamp_span = item.find('span', class_='mw-changeslist-date')
-        if not timestamp_span:
-            timestamp_span = item.find('span', class_='mw-changeslist-time')
-        timestamp = timestamp_span.get_text().strip() if timestamp_span else "Unknown"
+        # Extract the timestamp from the mw-enhanced-rc class
+        timestamp_td = item.find('td', class_='mw-enhanced-rc')
+        timestamp = timestamp_td.get_text().strip() if timestamp_td else "Unknown"
 
-        # Extract the user - try different selectors
+        # Extract the user from the mw-userlink class
         user_link = item.find('a', class_='mw-userlink')
-        if not user_link:
-            user_link = item.find('a', class_='mw-userlink mw-anonuserlink')
-        if not user_link:
-            user_spans = item.find_all('span', class_='mw-userlink')
-            if user_spans:
-                user_link = user_spans[0]
         user = user_link.get_text().strip() if user_link else "Unknown"
 
-        # Extract the comment - try different selectors
+        # Extract the comment from the comment class
         comment_span = item.find('span', class_='comment')
-        if not comment_span:
-            comment_span = item.find('span', class_='changeslist-comment')
         comment = comment_span.get_text().strip() if comment_span else ""
 
-        # Extract the change size - try different approaches
-        change_size = "0"
-        # Try to find spans with specific classes
-        size_spans = item.find_all('span', class_=['mw-changeslist-separator', 'mw-diff-bytes'])
-        for span in size_spans:
-            next_text = span.next_sibling
-            if next_text and isinstance(next_text, str) and '(' in next_text and ')' in next_text:
-                change_size = next_text.strip()
-                break
-
-        # If not found, try another approach
-        if change_size == "0":
-            # Look for parentheses with numbers
         import re
+        # Extract the change size from the mw-diff-bytes class
+        size_span = item.find('span', class_='mw-diff-bytes')
+        if size_span:
+            change_size = size_span.get_text().strip()
+        else:
+            # If not found, try to extract from the text
+            change_size = "0"
         text = item.get_text()
         size_matches = re.findall(r'\(\s*([+-]?\d+)\s*\)', text)
         if size_matches:
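Note: the fallback path above scrapes the byte delta out of the row text. A quick standalone check of that regex (the sample string is invented, not taken from the wiki):

import re

sample = "14:02 . . (+42) . . UserX (fix typo)"
# Captures a signed integer inside parentheses; values with thousands
# separators such as "(+1,234)" are not matched by \d+ alone.
print(re.findall(r'\(\s*([+-]?\d+)\s*\)', sample))  # ['+42']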
@@ -193,8 +194,10 @@ def extract_recent_changes(html_content):
             "comment": comment,
             "change_size": change_size
         })
 
+        logger.debug(f"Extracted change: {page_name} by {user}")
+
-    logger.info(f"Found {len(recent_changes)} recent changes")
+    logger.info(f"Extracted {len(recent_changes)} recent changes")
     return recent_changes
 
 def save_results(recent_changes, dry_run=False):
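Note: the new per-item line uses logger.debug, so it only appears when the configured level is DEBUG; the per-run summary stays at INFO. A minimal standalone illustration (assuming the usual logging.basicConfig setup near the top of the script):

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
log.debug("per-item detail")  # suppressed at INFO level
log.info("summary line")      # printed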
@@ -217,15 +220,79 @@ def save_results(recent_changes, dry_run=False):
             logger.info(f" ... and {len(recent_changes) - 5} more")
         return True
 
+    # Log some details about the recent changes
+    logger.info(f"Preparing to save {len(recent_changes)} recent changes")
+    if recent_changes:
+        logger.info(f"First change: {recent_changes[0]['page_name']} by {recent_changes[0]['user']}")
+
     # Prepare the data structure
     data = {
         "last_updated": datetime.now().isoformat(),
         "recent_changes": recent_changes
     }
 
+    # Get the file's last modified time before saving
+    before_mtime = None
+    if os.path.exists(OUTPUT_FILE):
+        before_mtime = os.path.getmtime(OUTPUT_FILE)
+        logger.info(f"File {OUTPUT_FILE} exists, last modified at {datetime.fromtimestamp(before_mtime)}")
+
     try:
-        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
-            json.dump(data, f, indent=2, ensure_ascii=False)
+        # Print the JSON data that we're trying to save
+        json_data = json.dumps(data, indent=2, ensure_ascii=False)
+        logger.info(f"JSON data to save (first 500 chars): {json_data[:500]}...")
+
+        # Save the data to a temporary file first
+        temp_file = OUTPUT_FILE + ".tmp"
+        logger.info(f"Writing data to temporary file {temp_file}")
+        with open(temp_file, 'w', encoding='utf-8') as f:
+            f.write(json_data)
+
+        # Check if the temporary file was created and has content
+        if os.path.exists(temp_file):
+            temp_size = os.path.getsize(temp_file)
+            logger.info(f"Temporary file {temp_file} created, size: {temp_size} bytes")
+
+            # Read the content of the temporary file to verify
+            with open(temp_file, 'r', encoding='utf-8') as f:
+                temp_content = f.read(500)  # Read first 500 chars
+                logger.info(f"Temporary file content (first 500 chars): {temp_content}...")
+
+            # Move the temporary file to the final location
+            logger.info(f"Moving temporary file to {OUTPUT_FILE}")
+            import shutil
+            shutil.move(temp_file, OUTPUT_FILE)
+        else:
+            logger.error(f"Failed to create temporary file {temp_file}")
+
+        # Check if the file was actually updated
+        if os.path.exists(OUTPUT_FILE):
+            after_mtime = os.path.getmtime(OUTPUT_FILE)
+            file_size = os.path.getsize(OUTPUT_FILE)
+            logger.info(f"File {OUTPUT_FILE} exists, size: {file_size} bytes, mtime: {datetime.fromtimestamp(after_mtime)}")
+
+            # Read the content of the file to verify
+            with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
+                file_content = f.read(500)  # Read first 500 chars
+                logger.info(f"File content (first 500 chars): {file_content}...")
+
+            if before_mtime and after_mtime <= before_mtime:
+                logger.warning(f"File {OUTPUT_FILE} was not updated (mtime did not change)")
+        else:
+            logger.error(f"File {OUTPUT_FILE} does not exist after saving")
+
+        # Copy the file to the public directory
+        public_file = os.path.join(os.path.dirname(os.path.dirname(OUTPUT_FILE)), 'public', os.path.basename(OUTPUT_FILE))
+        logger.info(f"Copying {OUTPUT_FILE} to {public_file}")
+        shutil.copy2(OUTPUT_FILE, public_file)
+
+        # Check if the public file was created
+        if os.path.exists(public_file):
+            public_size = os.path.getsize(public_file)
+            logger.info(f"Public file {public_file} created, size: {public_size} bytes")
+        else:
+            logger.error(f"Failed to create public file {public_file}")
+
         logger.info(f"Successfully saved {len(recent_changes)} recent changes to {OUTPUT_FILE}")
         return True
     except IOError as e:
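Note: the commit writes to a .tmp file, inspects it by hand, then shutil.move()s it into place. The same write-then-rename idea in a compact standalone sketch (the function name and paths are illustrative, not from the commit), using os.replace for the final step:

import json
import os
import tempfile

def atomic_write_json(path, data):
    # Write to a temp file in the target directory, then rename over the
    # destination; on POSIX, os.replace is atomic within one filesystem,
    # so readers never observe a partially written file.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".", suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, path)
    except BaseException:
        os.unlink(tmp_path)  # clean up the temp file on any failure
        raise

atomic_write_json("recent_changes.json", {"recent_changes": []})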
@@ -237,6 +304,7 @@ def main():
     parser = argparse.ArgumentParser(description="Fetch recent changes from the OSM wiki French namespace")
     parser.add_argument("--dry-run", action="store_true", help="Run without saving results to file")
     parser.add_argument("--force", action="store_true", help="Force update even if cache is fresh")
+    parser.add_argument("--debug", action="store_true", help="Save HTML content to a file for debugging")
     args = parser.parse_args()
 
     logger.info("Starting fetch_recent_changes.py")
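Note: with the new flag, a debugging run might look like this (invocation is illustrative):

python fetch_recent_changes.py --debug --dry-run

--debug dumps the fetched HTML to recent_changes_debug.html for inspection, and --dry-run skips the save step, so nothing on disk is overwritten while iterating on the selectors.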
@@ -254,6 +322,51 @@ def main():
         logger.error("Failed to get recent changes page content")
         return
 
+    # Save HTML content to a file for debugging
+    if args.debug:
+        debug_file = "recent_changes_debug.html"
+        try:
+            with open(debug_file, 'w', encoding='utf-8') as f:
+                f.write(html_content)
+            logger.info(f"Saved HTML content to {debug_file} for debugging")
+        except IOError as e:
+            logger.error(f"Error saving HTML content to {debug_file}: {e}")
+
+    # Parse the HTML to find the structure
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Find the main content area
+    content_div = soup.find('div', id='mw-content-text')
+    if content_div:
+        logger.info(f"Found content div with id 'mw-content-text'")
+
+        # Look for elements with mw-changeslist class
+        changeslist_elements = content_div.find_all(class_='mw-changeslist')
+        logger.info(f"Found {len(changeslist_elements)} elements with class 'mw-changeslist'")
+
+        for i, element in enumerate(changeslist_elements):
+            logger.info(f"Element {i+1} tag: {element.name}, classes: {element.get('class', [])}")
+
+            # Look for table rows or other elements that might contain changes
+            rows = element.find_all('tr')
+            divs = element.find_all('div', class_='mw-changeslist-line')
+            lis = element.find_all('li')
+
+            logger.info(f"  - Contains {len(rows)} tr elements")
+            logger.info(f"  - Contains {len(divs)} div.mw-changeslist-line elements")
+            logger.info(f"  - Contains {len(lis)} li elements")
+
+            # Check direct children
+            children = list(element.children)
+            logger.info(f"  - Has {len(children)} direct children")
+            if children:
+                child_types = {}
+                for child in children:
+                    if hasattr(child, 'name') and child.name:
+                        child_type = child.name
+                        child_types[child_type] = child_types.get(child_type, 0) + 1
+                logger.info(f"  - Direct children types: {child_types}")
+
     # Extract recent changes
     recent_changes = extract_recent_changes(html_content)
 
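Note: the inspection pass above is easiest to read against a sample of the enhanced RecentChanges markup the extractor targets. A simplified standalone sketch (the sample HTML is invented from the selectors in this commit; the real page carries many more attributes):

from bs4 import BeautifulSoup

html = """
<div class="mw-changeslist">
  <table class="mw-changeslist-line">
    <tr>
      <td class="mw-enhanced-rc">14:02</td>
      <td class="mw-changeslist-line-inner">
        <a class="mw-changeslist-title" href="/wiki/FR:Page">FR:Page</a>
        <span class="mw-diff-bytes">+42</span>
        <a class="mw-userlink" href="/wiki/User:Example">Example</a>
        <span class="comment">(fix typo)</span>
      </td>
    </tr>
  </table>
</div>
"""
row = BeautifulSoup(html, "html.parser").find("tr")
print(row.find("a", class_="mw-changeslist-title").get_text())  # FR:Page
print(row.find("td", class_="mw-enhanced-rc").get_text())       # 14:02
print(row.find("span", class_="mw-diff-bytes").get_text())      # +42
print(row.find("a", class_="mw-userlink").get_text())           # Example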