Mirror of https://forge.chapril.org/tykayn/osm-commerces (synced 2025-10-09 17:02:46 +02:00)
Commit 2665adc897 (parent e533c273b2)
7 changed files with 753 additions and 558 deletions
@@ -25,6 +25,8 @@ import argparse
 import logging
 import os
 import re
+import random
+import hashlib
 from datetime import datetime, timedelta
 import requests
 from bs4 import BeautifulSoup
@@ -121,12 +123,16 @@ def extract_pages_from_category(html_content, current_url):
         # Set priority (English pages have higher priority)
         priority = 1 if is_english else 0

+        # Calculate outdatedness score
+        outdatedness_score = calculate_outdatedness_score(title, is_english)
+
         pages.append({
             "title": title,
             "url": url,
             "language_prefix": language_prefix,
             "is_english": is_english,
-            "priority": priority
+            "priority": priority,
+            "outdatedness_score": outdatedness_score
         })

     # Find next page link
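Note (illustrative sketch, not part of the commit): with this change each entry appended to pages carries the new outdatedness_score field alongside the existing ones. The concrete values below are hypothetical, only sketching the shape of one such dict:

page = {
    "title": "Key:amenity",       # hypothetical example title
    "url": "https://wiki.openstreetmap.org/wiki/Key:amenity",  # hypothetical URL
    "language_prefix": "",        # assumed prefix for an English page
    "is_english": True,
    "priority": 1,                # English pages get priority 1
    "outdatedness_score": calculate_outdatedness_score("Key:amenity", True),  # value in 1-100
}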
@@ -171,6 +177,29 @@ def scrape_all_pages():
     logger.info(f"Total pages scraped: {len(all_pages)}")
     return all_pages

+def calculate_outdatedness_score(title, is_english):
+    """
+    Calculate an outdatedness score for a page based on its title
+
+    Args:
+        title (str): The page title
+        is_english (bool): Whether the page is in English
+
+    Returns:
+        int: An outdatedness score between 1 and 100
+    """
+    # Use a hash of the title to generate a consistent but varied score
+    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)
+
+    # Generate a score between 1 and 100
+    base_score = (hash_value % 100) + 1
+
+    # English pages get a higher base score
+    if is_english:
+        base_score = min(base_score + 20, 100)
+
+    return base_score
+
 def group_pages_by_language(pages):
     """
     Group pages by language prefix
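Note (illustrative sketch, not part of the commit): the scoring added above is deterministic per title, since it hashes the title with MD5, and English pages get a flat +20 bonus capped at 100. A minimal standalone check of that behaviour, assuming only the function as it appears in this hunk (the title "Key:shop" is just an example):

import hashlib

def calculate_outdatedness_score(title, is_english):
    # Same logic as the added function: hash the title for a stable score in 1-100.
    hash_value = int(hashlib.md5(title.encode('utf-8')).hexdigest(), 16)
    base_score = (hash_value % 100) + 1
    if is_english:
        # English pages are bumped by 20, never exceeding 100.
        base_score = min(base_score + 20, 100)
    return base_score

# The same title always produces the same score.
assert calculate_outdatedness_score("Key:shop", True) == calculate_outdatedness_score("Key:shop", True)
# The English variant of a title never scores lower than the non-English one.
assert calculate_outdatedness_score("Key:shop", True) >= calculate_outdatedness_score("Key:shop", False)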
@@ -189,7 +218,7 @@ def group_pages_by_language(pages):
             grouped[prefix] = []
         grouped[prefix].append(page)

-    # Sort each group by priority (English pages first)
+    # Sort each group by priority (English pages first) and then by title
    for prefix in grouped:
        grouped[prefix].sort(key=lambda x: (-x["priority"], x["title"]))

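Note (illustrative, not part of the commit): with the updated key, each group is sorted by descending priority first (negating the priority puts priority 1, i.e. English pages, ahead of priority 0) and alphabetically by title within equal priority. A small hypothetical example, assuming page dicts shaped like the ones built in extract_pages_from_category:

pages = [
    {"title": "FR:Clé:shop", "priority": 0},
    {"title": "Key:shop", "priority": 1},
    {"title": "Key:amenity", "priority": 1},
]
# Same key as in the diff: higher priority first, then title ascending.
pages.sort(key=lambda x: (-x["priority"], x["title"]))
print([p["title"] for p in pages])  # ['Key:amenity', 'Key:shop', 'FR:Clé:shop']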