add history to article measures

Tykayn 2025-09-08 10:20:51 +02:00 committed by tykayn
parent 1ed74c2e2f
commit 381f378db4
9 changed files with 1678 additions and 195 deletions


@@ -116,7 +116,8 @@ SPECIFIC_PAGES = [
    "Mapping_private_information",
    "Any_tags_you_like",
    "Organised_Editing/Best_Practices",
-   "Map_features"
+   "Map_features",
+   "Wiki"
]

def fetch_desynchronized_pages():
@@ -280,12 +281,104 @@ def save_to_json(data, filename):
    except IOError as e:
        logger.error(f"Error saving data to {filename}: {e}")

def calculate_global_metrics(data):
"""
Calculate global metrics for all pages in the data
Args:
data: Data containing regular_pages and specific_pages
Returns:
dict: Dictionary with global metrics
"""
# Combine regular and specific pages for global metrics
all_pages = data.get('regular_pages', []) + data.get('specific_pages', [])
# Initialize metrics
metrics = {
'total_pages': len(all_pages),
'avg_sections': 0,
'avg_words': 0,
'avg_links': 0,
'avg_images': 0,
'avg_categories': 0,
'avg_staleness': 0,
'pages_with_en_fr': 0,
'pages_missing_fr': 0,
'staleness_distribution': {
'0-20': 0,
'21-40': 0,
'41-60': 0,
'61-80': 0,
'81-100': 0,
'100+': 0
}
}
# Skip if no pages
if not all_pages:
return metrics
# Calculate totals
total_sections = 0
total_words = 0
total_links = 0
total_images = 0
total_categories = 0
total_staleness = 0
for page in all_pages:
# Count pages with/without French version
if page.get('fr_page'):
metrics['pages_with_en_fr'] += 1
else:
metrics['pages_missing_fr'] += 1
# Add to staleness distribution
staleness = page.get('staleness_score', 0)
total_staleness += staleness
if staleness <= 20:
metrics['staleness_distribution']['0-20'] += 1
elif staleness <= 40:
metrics['staleness_distribution']['21-40'] += 1
elif staleness <= 60:
metrics['staleness_distribution']['41-60'] += 1
elif staleness <= 80:
metrics['staleness_distribution']['61-80'] += 1
elif staleness <= 100:
metrics['staleness_distribution']['81-100'] += 1
else:
metrics['staleness_distribution']['100+'] += 1
# Add to totals
total_sections += page.get('section_diff', 0) if 'section_diff' in page else 0
total_words += page.get('word_diff', 0) if 'word_diff' in page else 0
total_links += page.get('link_diff', 0) if 'link_diff' in page else 0
total_images += page.get('media_diff', 0) if 'media_diff' in page else 0
# Count categories if available
if page.get('category_comparison'):
cat_count = len(page['category_comparison'].get('en_only', []))
total_categories += cat_count
# Calculate averages
metrics['avg_sections'] = round(total_sections / len(all_pages), 2)
metrics['avg_words'] = round(total_words / len(all_pages), 2)
metrics['avg_links'] = round(total_links / len(all_pages), 2)
metrics['avg_images'] = round(total_images / len(all_pages), 2)
metrics['avg_categories'] = round(total_categories / len(all_pages), 2)
metrics['avg_staleness'] = round(total_staleness / len(all_pages), 2)
return metrics
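
A minimal usage sketch of the new calculate_global_metrics helper: two hypothetical page entries carrying the fields the loop reads (keys and values below are invented for illustration, not taken from real wiki data):

    sample = {
        'regular_pages': [
            {'key': 'Key:highway', 'fr_page': {'page_title': 'FR:Key:highway'},
             'staleness_score': 35, 'word_diff': 120, 'section_diff': 1,
             'link_diff': 4, 'media_diff': 0},
            {'key': 'Tag:amenity=bench', 'fr_page': None,
             'staleness_score': 120, 'word_diff': 800, 'section_diff': 5,
             'link_diff': 30, 'media_diff': 2},
        ],
        'specific_pages': []
    }
    m = calculate_global_metrics(sample)
    print(m['total_pages'])       # 2
    print(m['avg_staleness'])     # 77.5
    print(m['pages_missing_fr'])  # 1  (the second page has no fr_page)
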
def save_with_history(data, filename):
"""
Save data to a JSON file while preserving history
This function loads existing data from the file (if it exists),
adds the new data to the history, and saves the updated data back to the file.
It also calculates global metrics for the current data.
Args:
data: New data to save
@@ -301,11 +394,15 @@ def save_with_history(data, filename):
        # Initialize history if it doesn't exist
        if 'history' not in existing_data:
            existing_data['history'] = {}

+       # Calculate global metrics for the current data
+       global_metrics = calculate_global_metrics(data)

-       # Add current regular_pages and specific_pages to history
+       # Add current regular_pages, specific_pages, and global metrics to history
        history_entry = {
            'regular_pages': data.get('regular_pages', []),
-           'specific_pages': data.get('specific_pages', [])
+           'specific_pages': data.get('specific_pages', []),
+           'global_metrics': global_metrics
        }

        # Add the entry to history with timestamp as key
@@ -314,6 +411,7 @@ def save_with_history(data, filename):
        # Update the current data
        existing_data['regular_pages'] = data.get('regular_pages', [])
        existing_data['specific_pages'] = data.get('specific_pages', [])
+       existing_data['global_metrics'] = global_metrics
        existing_data['last_updated'] = current_timestamp

        # Save the updated data
@@ -321,10 +419,119 @@ def save_with_history(data, filename):
            json.dump(existing_data, f, indent=2, ensure_ascii=False)
        logger.info(f"Data with history saved to {filename}")

+       # Also save a separate ranking history file
+       save_ranking_history(existing_data, "page_rankings.json")

    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error saving data with history to {filename}: {e}")
        # Fallback to regular save if there's an error
        save_to_json(data, filename)
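
For reference, the file written by save_with_history ends up shaped roughly as below; the keys come from the assignments above, while the timestamp format is an assumption (whatever current_timestamp contains):

    example_output = {
        'regular_pages': [...],    # current snapshot
        'specific_pages': [...],
        'global_metrics': {...},   # result of calculate_global_metrics(data)
        'last_updated': '2025-09-08T10:20:51',
        'history': {
            '2025-09-08T10:20:51': {
                'regular_pages': [...],
                'specific_pages': [...],
                'global_metrics': {...}
            }
        }
    }
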
def save_ranking_history(data, filename):
"""
Save ranking history to a separate JSON file
This function extracts ranking data from the history and saves it in a format
optimized for displaying ranking evolution over time.
Args:
data: Data containing history entries
filename (str): Name of the file to save rankings
"""
    try:
        # Initialize ranking data structure
        ranking_data = {
            'timestamps': [],
            'pages': {},
            'global_metrics': {}
        }

        # Extract history entries
        history = data.get('history', {})

        # Sort timestamps chronologically
        sorted_timestamps = sorted(history.keys())
        ranking_data['timestamps'] = sorted_timestamps

        # First, collect all unique page keys across all history entries
        all_page_keys = set()
        for timestamp in sorted_timestamps:
            entry = history[timestamp]

            # Add global metrics for this timestamp
            if 'global_metrics' in entry:
                ranking_data['global_metrics'][timestamp] = entry['global_metrics']

            # Collect page keys from regular and specific pages
            for page in entry.get('regular_pages', []) + entry.get('specific_pages', []):
                all_page_keys.add(page['key'])

        # Initialize data structure for each page
        for page_key in all_page_keys:
            ranking_data['pages'][page_key] = {
                'title': page_key,
                'metrics': {}
            }

        # Fill in metrics for each page at each timestamp
        for timestamp in sorted_timestamps:
            entry = history[timestamp]

            # Regular and specific pages are handled identically
            for page in entry.get('regular_pages', []) + entry.get('specific_pages', []):
                page_key = page['key']

                # Extract the metrics we want to track
                metrics = {
                    'staleness_score': page.get('staleness_score', 0),
                    'word_diff': page.get('word_diff', 0),
                    'section_diff': page.get('section_diff', 0),
                    'link_diff': page.get('link_diff', 0),
                    'media_diff': page.get('media_diff', 0)
                }

                # Store metrics for this timestamp
                ranking_data['pages'][page_key]['metrics'][timestamp] = metrics

                # Store page title if available
                if page.get('en_page'):
                    ranking_data['pages'][page_key]['title'] = page['en_page'].get('page_title', page_key)

        # Save the ranking data
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(ranking_data, f, indent=2, ensure_ascii=False)
        logger.info(f"Ranking history saved to {filename}")
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error saving ranking history to {filename}: {e}")
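
A minimal sketch of how a chart frontend might consume page_rankings.json to plot staleness evolution for one page; the page key 'Key:highway' is illustrative and not guaranteed to exist in the data:

    import json

    with open('page_rankings.json', encoding='utf-8') as f:
        ranking = json.load(f)

    # Chronological staleness series for one page; timestamps where the
    # page is missing from history yield None.
    page = ranking['pages'].get('Key:highway', {'metrics': {}})
    series = [
        page['metrics'].get(ts, {}).get('staleness_score')
        for ts in ranking['timestamps']
    ]
    print(series)  # e.g. [35, 42, 41]
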
def check_grammar_with_grammalecte(text):
"""