recup sources
parent 86622a19ea
commit 65fe2a35f9
155 changed files with 50969 additions and 0 deletions
252
wiki_compare/detect_suspicious_deletions.py
Executable file
@@ -0,0 +1,252 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import json
import logging
import argparse
import os
import re
from datetime import datetime
from urllib.parse import urlparse, parse_qs, urlencode

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# URL for recent changes in OSM Wiki (namespace 202 is for Tag pages)
RECENT_CHANGES_URL = "https://wiki.openstreetmap.org/w/index.php?hidebots=1&hidenewpages=1&hidecategorization=1&hideWikibase=1&hidelog=1&hidenewuserlog=1&namespace=202&limit=250&days=30&enhanced=1&title=Special:RecentChanges&urlversion=2"

# Threshold for suspicious deletions (percentage of total content)
DELETION_THRESHOLD_PERCENT = 5.0

# Base URL for OSM Wiki
WIKI_BASE_URL = "https://wiki.openstreetmap.org"

def fetch_recent_changes():
    """
    Fetch the recent changes page from OSM Wiki
    """
    logger.info(f"Fetching recent changes from {RECENT_CHANGES_URL}")
    try:
        response = requests.get(RECENT_CHANGES_URL)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching recent changes: {e}")
        return None

def fetch_page_content(page_title):
    """
    Fetch the content of a wiki page to count characters
    """
    url = f"{WIKI_BASE_URL}/wiki/{page_title}"
    logger.info(f"Fetching page content from {url}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching page content: {e}")
        return None

def count_page_characters(html_content):
    """
    Count the total number of characters in the wiki page content
    """
    if not html_content:
        return 0

    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the main content div
    content_div = soup.select_one('#mw-content-text')
    if not content_div:
        return 0

    # Get all text content
    text_content = content_div.get_text(strip=True)

    # Count characters
    char_count = len(text_content)
    logger.info(f"Page has {char_count} characters")

    return char_count

def generate_diff_url(page_title, oldid):
    """
    Generate URL to view the diff of a specific revision
    """
    return f"{WIKI_BASE_URL}/w/index.php?title={page_title}&diff=prev&oldid={oldid}"

def generate_history_url(page_title):
    """
    Generate URL to view the history of a page
    """
    return f"{WIKI_BASE_URL}/w/index.php?title={page_title}&action=history"

def load_existing_deletions():
    """
    Load existing suspicious deletions from the JSON file
    """
    output_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'suspicious_deletions.json')
    existing_pages = set()

    try:
        if os.path.exists(output_file):
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if 'deletions' in data:
                    for deletion in data['deletions']:
                        if 'page_title' in deletion:
                            existing_pages.add(deletion['page_title'])
            logger.info(f"Loaded {len(existing_pages)} existing pages from {output_file}")
        else:
            logger.info(f"No existing file found at {output_file}")
    except Exception as e:
        logger.error(f"Error loading existing deletions: {e}")

    return existing_pages

def parse_suspicious_deletions(html_content):
    """
    Parse the HTML content to find suspicious deletions
    """
    if not html_content:
        return []

    # Load existing pages from the JSON file
    existing_pages = load_existing_deletions()

    soup = BeautifulSoup(html_content, 'html.parser')
    suspicious_deletions = []

    # Find all change list lines
    change_lines = soup.select('.mw-changeslist .mw-changeslist-line')
    logger.info(f"Found {len(change_lines)} change lines to analyze")

    for line in change_lines:
        # Look for deletion indicators
        deletion_indicator = line.select_one('.mw-plusminus-neg')
        if deletion_indicator:
            # Extract the deletion size
            deletion_text = deletion_indicator.text.strip()
            try:
                # Remove any non-numeric characters except the minus sign;
                # normalize MediaWiki's Unicode minus (U+2212) first, since int() rejects it
                normalized_text = deletion_text.replace('\u2212', '-')
                deletion_size = int(''.join(c for c in normalized_text if c.isdigit() or c == '-'))

                # Skip if deletion size is not greater than 100 characters
                if abs(deletion_size) <= 100:
                    logger.info(f"Skipping deletion with size {deletion_size} (not > 100 characters)")
                    continue

                # Get the page title and URL
                title_element = line.select_one('.mw-changeslist-title')
                if title_element:
                    page_title = title_element.text.strip()

                    # Skip if page is already in the JSON file
                    if page_title in existing_pages:
                        logger.info(f"Skipping {page_title} (already in JSON file)")
                        continue

                    page_url = title_element.get('href', '')
                    if not page_url.startswith('http'):
                        page_url = f"{WIKI_BASE_URL}{page_url}"

                    # Extract oldid from the URL if available
                    oldid = None
                    if 'oldid=' in page_url:
                        parsed_url = urlparse(page_url)
                        query_params = parse_qs(parsed_url.query)
                        if 'oldid' in query_params:
                            oldid = query_params['oldid'][0]

                    # Fetch the page content to count characters
                    page_html = fetch_page_content(page_title)
                    total_chars = count_page_characters(page_html)

                    # Calculate deletion percentage
                    deletion_percentage = 0
                    if total_chars > 0:
                        deletion_percentage = (abs(deletion_size) / total_chars) * 100

                    # If deletion percentage is significant
                    if deletion_percentage > DELETION_THRESHOLD_PERCENT:
                        # Get the timestamp
                        timestamp_element = line.select_one('.mw-changeslist-date')
                        timestamp = timestamp_element.text.strip() if timestamp_element else ""

                        # Get the user who made the change
                        user_element = line.select_one('.mw-userlink')
                        user = user_element.text.strip() if user_element else "Unknown"

                        # Get the comment if available
                        comment_element = line.select_one('.comment')
                        comment = comment_element.text.strip() if comment_element else ""

                        # Generate diff and history URLs
                        diff_url = generate_diff_url(page_title, oldid) if oldid else ""
                        history_url = generate_history_url(page_title)

                        suspicious_deletions.append({
                            'page_title': page_title,
                            'page_url': page_url,
                            'diff_url': diff_url,
                            'history_url': history_url,
                            'deletion_size': deletion_size,
                            'total_chars': total_chars,
                            'deletion_percentage': round(deletion_percentage, 2),
                            'timestamp': timestamp,
                            'user': user,
                            'comment': comment
                        })
                        logger.info(f"Found suspicious deletion: {page_title} ({deletion_size} chars, {deletion_percentage:.2f}% of content)")
            except ValueError:
                logger.warning(f"Could not parse deletion size from: {deletion_text}")

    return suspicious_deletions

def save_suspicious_deletions(suspicious_deletions):
    """
    Save the suspicious deletions to a JSON file
    """
    output_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'suspicious_deletions.json')

    # Add timestamp to the data
    data = {
        'last_updated': datetime.now().isoformat(),
        'deletions': suspicious_deletions
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    logger.info(f"Saved {len(suspicious_deletions)} suspicious deletions to {output_file}")
    return output_file

def main():
    parser = argparse.ArgumentParser(description='Detect suspicious deletions in OSM Wiki recent changes')
    parser.add_argument('--dry-run', action='store_true', help='Print results without saving to file')
    args = parser.parse_args()

    html_content = fetch_recent_changes()
    if html_content:
        suspicious_deletions = parse_suspicious_deletions(html_content)

        if args.dry_run:
            logger.info(f"Found {len(suspicious_deletions)} suspicious deletions:")
            for deletion in suspicious_deletions:
                logger.info(f"- {deletion['page_title']}: {deletion['deletion_size']} chars by {deletion['user']}")
        else:
            output_file = save_suspicious_deletions(suspicious_deletions)
            logger.info(f"Results saved to {output_file}")
    else:
        logger.error("Failed to fetch recent changes. Exiting.")

if __name__ == "__main__":
    main()
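
Usage note: when run without --dry-run, the script writes suspicious_deletions.json next to itself. Below is a minimal, hypothetical consumer sketch (not part of this commit) that reads that file and prints the flagged pages; the field names match those written by save_suspicious_deletions() above, while the snippet itself and its output format are illustrative assumptions.

# Hypothetical consumer sketch (illustration only, not part of this commit):
# print the pages flagged by detect_suspicious_deletions.py, assuming
# suspicious_deletions.json exists alongside this file and has the structure
# written by save_suspicious_deletions().
import json
import os

json_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'suspicious_deletions.json')

with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Last updated: {data['last_updated']}")
for deletion in data['deletions']:
    # Each entry records the page, how much text was removed and what share
    # of the page's visible content that represents.
    print(f"- {deletion['page_title']}: {deletion['deletion_size']} chars "
          f"({deletion['deletion_percentage']}% of {deletion['total_chars']}), "
          f"by {deletion['user']} at {deletion['timestamp']}")
    print(f"  review: {deletion['diff_url'] or deletion['history_url']}")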