up compare

2025-10-09 17:02:46 +02:00 · 2025-08-22 23:30:36 +02:00 · 2025-08-22 23:30:36 +02:00 · 2665adc897
commit 2665adc897
parent e533c273b2
7 changed files with 753 additions and 558 deletions
--- a/wiki_compare/find_pages_unavailable_in_french.py
+++ b/wiki_compare/find_pages_unavailable_in_french.py
@@ -25,6 +25,8 @@ import argparse
 import logging
 import os
 import re
+import random
+import hashlib
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -121,12 +123,16 @@ def extract_pages_from_category(html_content, current_url):
        # Set priority (English pages have higher priority)
        priority = 1 if is_english else 0
        
+        # Calculate outdatedness score
+        outdatedness_score = calculate_outdatedness_score(title, is_english)
+        
        pages.append({
            "title": title,
            "url": url,
            "language_prefix": language_prefix,
            "is_english": is_english,
-            "priority": priority
+            "priority": priority,
+            "outdatedness_score": outdatedness_score
        })
    
    # Find next page link
@@ -171,6 +177,29 @@ def scrape_all_pages():
    logger.info(f"Total pages scraped: {len(all_pages)}")
    return all_pages

+def calculate_outdatedness_score(title, is_english):
+    """
+    Calculate a deterministic outdatedness score from an MD5 hash of the page title
+    
+    Args:
+        title (str): The page title
+        is_english (bool): Whether the page is in English
+        
+    Returns:
+        int: A score from 1 to 100 (21 to 100 when is_english is true)
+    """
+    # Only the title is hashed (no revision data is read), so a given title always scores the same
+    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)
+    
+    # Reduce the hash to a score between 1 and 100
+    base_score = (hash_value % 100) + 1
+    
+    # English pages get a 20-point boost, capped at 100
+    if is_english:
+        base_score = min(base_score + 20, 100)
+    
+    return base_score
+
 def group_pages_by_language(pages):
    """
    Group pages by language prefix
@@ -189,7 +218,7 @@ def group_pages_by_language(pages):
            grouped[prefix] = []
        grouped[prefix].append(page)
    
-    # Sort each group by priority (English pages first)
+    # Sort each group by priority (English pages first) and then by title
    for prefix in grouped:
        grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))