up compare

This commit is contained in:
Tykayn 2025-08-22 23:30:36 +02:00 committed by tykayn
parent e533c273b2
commit 2665adc897
7 changed files with 753 additions and 558 deletions

View file

@ -25,6 +25,8 @@ import argparse
import logging
import os
import re
import random
import hashlib
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
@ -121,12 +123,16 @@ def extract_pages_from_category(html_content, current_url):
# Set priority (English pages have higher priority)
priority = 1 if is_english else 0
# Calculate outdatedness score
outdatedness_score = calculate_outdatedness_score(title, is_english)
pages.append({
"title": title,
"url": url,
"language_prefix": language_prefix,
"is_english": is_english,
"priority": priority
"priority": priority,
"outdatedness_score": outdatedness_score
})
# Find next page link
@ -171,6 +177,29 @@ def scrape_all_pages():
logger.info(f"Total pages scraped: {len(all_pages)}")
return all_pages
def calculate_outdatedness_score(title, is_english):
    """
    Derive a stable pseudo-score for a page from its title.

    The value is computed only from an MD5 hash of the title, so the same
    title always maps to the same score; no page content is inspected here.

    Args:
        title (str): The page title
        is_english (bool): Whether the page is in English

    Returns:
        int: A score from 1 to 100 (English pages land between 21 and 100)
    """
    # Hash the UTF-8 encoded title and read the hex digest as one big integer
    digest = hashlib.md5(title.encode('utf-8')).hexdigest()
    # Fold the integer into the 1..100 range
    score = int(digest, 16) % 100 + 1

    if not is_english:
        return score

    # English pages receive a 20-point bonus, capped at the maximum of 100
    return min(score + 20, 100)
def group_pages_by_language(pages):
"""
Group pages by language prefix
@ -189,7 +218,7 @@ def group_pages_by_language(pages):
grouped[prefix] = []
grouped[prefix].append(page)
# Sort each group by priority (English pages first)
# Sort each group by priority (English pages first) and then by title
for prefix in grouped:
grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))