add history to article measures
parent 1ed74c2e2f
commit 381f378db4
9 changed files with 1678 additions and 195 deletions
Binary file not shown.
@@ -116,7 +116,8 @@ SPECIFIC_PAGES = [
     "Mapping_private_information",
     "Any_tags_you_like",
     "Organised_Editing/Best_Practices",
-    "Map_features"
+    "Map_features",
+    "Wiki"
 ]
 
 def fetch_desynchronized_pages():
@@ -280,12 +281,104 @@ def save_to_json(data, filename):
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
 
+def calculate_global_metrics(data):
+    """
+    Calculate global metrics for all pages in the data
+
+    Args:
+        data: Data containing regular_pages and specific_pages
+
+    Returns:
+        dict: Dictionary with global metrics
+    """
+    # Combine regular and specific pages for global metrics
+    all_pages = data.get('regular_pages', []) + data.get('specific_pages', [])
+
+    # Initialize metrics
+    metrics = {
+        'total_pages': len(all_pages),
+        'avg_sections': 0,
+        'avg_words': 0,
+        'avg_links': 0,
+        'avg_images': 0,
+        'avg_categories': 0,
+        'avg_staleness': 0,
+        'pages_with_en_fr': 0,
+        'pages_missing_fr': 0,
+        'staleness_distribution': {
+            '0-20': 0,
+            '21-40': 0,
+            '41-60': 0,
+            '61-80': 0,
+            '81-100': 0,
+            '100+': 0
+        }
+    }
+
+    # Skip if no pages
+    if not all_pages:
+        return metrics
+
+    # Calculate totals
+    total_sections = 0
+    total_words = 0
+    total_links = 0
+    total_images = 0
+    total_categories = 0
+    total_staleness = 0
+
+    for page in all_pages:
+        # Count pages with/without French version
+        if page.get('fr_page'):
+            metrics['pages_with_en_fr'] += 1
+        else:
+            metrics['pages_missing_fr'] += 1
+
+        # Add to staleness distribution
+        staleness = page.get('staleness_score', 0)
+        total_staleness += staleness
+
+        if staleness <= 20:
+            metrics['staleness_distribution']['0-20'] += 1
+        elif staleness <= 40:
+            metrics['staleness_distribution']['21-40'] += 1
+        elif staleness <= 60:
+            metrics['staleness_distribution']['41-60'] += 1
+        elif staleness <= 80:
+            metrics['staleness_distribution']['61-80'] += 1
+        elif staleness <= 100:
+            metrics['staleness_distribution']['81-100'] += 1
+        else:
+            metrics['staleness_distribution']['100+'] += 1
+
+        # Add to totals (get() already falls back to 0 when the key is absent)
+        total_sections += page.get('section_diff', 0)
+        total_words += page.get('word_diff', 0)
+        total_links += page.get('link_diff', 0)
+        total_images += page.get('media_diff', 0)
+
+        # Count categories if available
+        if page.get('category_comparison'):
+            cat_count = len(page['category_comparison'].get('en_only', []))
+            total_categories += cat_count
+
+    # Calculate averages
+    metrics['avg_sections'] = round(total_sections / len(all_pages), 2)
+    metrics['avg_words'] = round(total_words / len(all_pages), 2)
+    metrics['avg_links'] = round(total_links / len(all_pages), 2)
+    metrics['avg_images'] = round(total_images / len(all_pages), 2)
+    metrics['avg_categories'] = round(total_categories / len(all_pages), 2)
+    metrics['avg_staleness'] = round(total_staleness / len(all_pages), 2)
+
+    return metrics
+
 def save_with_history(data, filename):
     """
     Save data to a JSON file while preserving history
 
     This function loads existing data from the file (if it exists),
     adds the new data to the history, and saves the updated data back to the file.
+    It also calculates global metrics for the current data.
 
     Args:
         data: New data to save
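Aside: the staleness banding added above can be exercised on its own. The sketch below is not part of the commit; it re-implements just the bucketing with invented sample records whose field names mirror what calculate_global_metrics reads.

def staleness_bucket(staleness):
    # Same six bands used by calculate_global_metrics above
    for limit, label in ((20, '0-20'), (40, '21-40'), (60, '41-60'),
                         (80, '61-80'), (100, '81-100')):
        if staleness <= limit:
            return label
    return '100+'

# Hypothetical records; values are placeholders, not real script output
sample_pages = [
    {'fr_page': True, 'staleness_score': 12.5},
    {'fr_page': None, 'staleness_score': 172.34},
]
print([staleness_bucket(p['staleness_score']) for p in sample_pages])
# -> ['0-20', '100+']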
@@ -301,11 +394,15 @@ def save_with_history(data, filename):
         # Initialize history if it doesn't exist
         if 'history' not in existing_data:
             existing_data['history'] = {}
 
+        # Calculate global metrics for the current data
+        global_metrics = calculate_global_metrics(data)
+
-        # Add current regular_pages and specific_pages to history
+        # Add current regular_pages, specific_pages, and global metrics to history
         history_entry = {
             'regular_pages': data.get('regular_pages', []),
-            'specific_pages': data.get('specific_pages', [])
+            'specific_pages': data.get('specific_pages', []),
+            'global_metrics': global_metrics
         }
 
         # Add the entry to history with timestamp as key
@@ -314,6 +411,7 @@ def save_with_history(data, filename):
         # Update the current data
         existing_data['regular_pages'] = data.get('regular_pages', [])
         existing_data['specific_pages'] = data.get('specific_pages', [])
+        existing_data['global_metrics'] = global_metrics
         existing_data['last_updated'] = current_timestamp
 
         # Save the updated data
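For readers following the data flow: after this change the saved file carries the latest snapshot at the top level plus a timestamp-keyed history. A sketch of the layout implied by the code above, with placeholder values rather than actual output:

expected_layout = {
    'regular_pages': [],    # latest snapshot of each page list
    'specific_pages': [],
    'global_metrics': {'total_pages': 2, 'avg_staleness': 92.4},
    'last_updated': '2025-09-02T12:00:00',   # placeholder timestamp format
    'history': {
        '2025-09-02T12:00:00': {
            'regular_pages': [],
            'specific_pages': [],
            'global_metrics': {'total_pages': 2, 'avg_staleness': 92.4}
        }
    }
}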
@@ -321,10 +419,119 @@ def save_with_history(data, filename):
             json.dump(existing_data, f, indent=2, ensure_ascii=False)
 
         logger.info(f"Data with history saved to {filename}")
 
+        # Also save a separate ranking history file
+        save_ranking_history(existing_data, "page_rankings.json")
+
     except (IOError, json.JSONDecodeError) as e:
         logger.error(f"Error saving data with history to {filename}: {e}")
         # Fallback to regular save if there's an error
         save_to_json(data, filename)
 
+def save_ranking_history(data, filename):
+    """
+    Save ranking history to a separate JSON file
+
+    This function extracts ranking data from the history and saves it in a format
+    optimized for displaying ranking evolution over time.
+
+    Args:
+        data: Data containing history entries
+        filename (str): Name of the file to save rankings
+    """
+    try:
+        # Initialize ranking data structure
+        ranking_data = {
+            'timestamps': [],
+            'pages': {},
+            'global_metrics': {}
+        }
+
+        # Extract history entries
+        history = data.get('history', {})
+
+        # Sort timestamps chronologically
+        sorted_timestamps = sorted(history.keys())
+        ranking_data['timestamps'] = sorted_timestamps
+
+        # Process each page to track its metrics over time
+        all_page_keys = set()
+
+        # First, collect all unique page keys across all history entries
+        for timestamp in sorted_timestamps:
+            entry = history[timestamp]
+
+            # Add global metrics for this timestamp
+            if 'global_metrics' in entry:
+                ranking_data['global_metrics'][timestamp] = entry['global_metrics']
+
+            # Collect page keys from regular pages
+            for page in entry.get('regular_pages', []):
+                all_page_keys.add(page['key'])
+
+            # Collect page keys from specific pages
+            for page in entry.get('specific_pages', []):
+                all_page_keys.add(page['key'])
+
+        # Initialize data structure for each page
+        for page_key in all_page_keys:
+            ranking_data['pages'][page_key] = {
+                'title': page_key,
+                'metrics': {}
+            }
+
+        # Fill in metrics for each page at each timestamp
+        for timestamp in sorted_timestamps:
+            entry = history[timestamp]
+
+            # Process regular pages
+            for page in entry.get('regular_pages', []):
+                page_key = page['key']
+
+                # Extract metrics we want to track
+                metrics = {
+                    'staleness_score': page.get('staleness_score', 0),
+                    'word_diff': page.get('word_diff', 0),
+                    'section_diff': page.get('section_diff', 0),
+                    'link_diff': page.get('link_diff', 0),
+                    'media_diff': page.get('media_diff', 0)
+                }
+
+                # Store metrics for this timestamp
+                ranking_data['pages'][page_key]['metrics'][timestamp] = metrics
+
+                # Store page title if available
+                if 'en_page' in page and page['en_page']:
+                    ranking_data['pages'][page_key]['title'] = page['en_page'].get('page_title', page_key)
+
+            # Process specific pages
+            for page in entry.get('specific_pages', []):
+                page_key = page['key']
+
+                # Extract metrics we want to track
+                metrics = {
+                    'staleness_score': page.get('staleness_score', 0),
+                    'word_diff': page.get('word_diff', 0),
+                    'section_diff': page.get('section_diff', 0),
+                    'link_diff': page.get('link_diff', 0),
+                    'media_diff': page.get('media_diff', 0)
+                }
+
+                # Store metrics for this timestamp
+                ranking_data['pages'][page_key]['metrics'][timestamp] = metrics
+
+                # Store page title if available
+                if 'en_page' in page and page['en_page']:
+                    ranking_data['pages'][page_key]['title'] = page['en_page'].get('page_title', page_key)
+
+        # Save the ranking data
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(ranking_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Ranking history saved to {filename}")
+
+    except (IOError, json.JSONDecodeError) as e:
+        logger.error(f"Error saving ranking history to {filename}: {e}")
+
 def check_grammar_with_grammalecte(text):
     """
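Since save_ranking_history is new in this commit, a short consumer sketch may help. It assumes only the structure built above (a shared timestamps list and per-page metrics keyed by timestamp) and a page_rankings.json written by a prior run:

import json

with open('page_rankings.json', encoding='utf-8') as f:
    rankings = json.load(f)

# One staleness series per page, aligned on the shared timestamp axis;
# None marks runs where the page was absent from the history entry.
for page in rankings['pages'].values():
    series = [page['metrics'].get(ts, {}).get('staleness_score')
              for ts in rankings['timestamps']]
    print(page['title'], series)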
@@ -12,6 +12,8 @@ Key:harassment_prevention,en,https://wiki.openstreetmap.org/wiki/Key:harassment_
 Key:harassment_prevention,fr,https://wiki.openstreetmap.org/wiki/FR:Key:harassment_prevention,2025-07-03,15,328,83,14,66.72,https://wiki.openstreetmap.org/w/images/thumb/7/76/Osm_element_node.svg/30px-Osm_element_node.svg.png
 Proposal process,en,https://wiki.openstreetmap.org/wiki/Proposal process,2025-08-13,46,5292,202,4,172.34,https://wiki.openstreetmap.org/w/images/thumb/c/c2/Save_proposal_first.png/761px-Save_proposal_first.png
 Proposal process,fr,https://wiki.openstreetmap.org/wiki/FR:Proposal process,2023-09-22,15,0,0,0,172.34,
+Outil de Manipulation et d'Organisation,en,https://wiki.openstreetmap.org/wiki/Outil de Manipulation et d'Organisation,2025-09-02,9,0,0,0,0.6,
+Outil de Manipulation et d'Organisation,fr,https://wiki.openstreetmap.org/wiki/FR:Outil de Manipulation et d'Organisation,2025-09-02,13,0,0,0,0.6,
 Automated_Edits_code_of_conduct,en,https://wiki.openstreetmap.org/wiki/Automated_Edits_code_of_conduct,2025-07-26,19,0,0,0,23.1,
 Automated_Edits_code_of_conduct,fr,https://wiki.openstreetmap.org/wiki/FR:Automated_Edits_code_of_conduct,2025-04-03,17,0,0,0,23.1,
 Key:cuisine,en,https://wiki.openstreetmap.org/wiki/Key:cuisine,2025-07-23,17,3422,693,303,107.73,https://upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Food_montage.jpg/200px-Food_montage.jpg
@@ -44,6 +46,8 @@ Any_tags_you_like,fr,https://wiki.openstreetmap.org/wiki/FR:Any_tags_you_like,20
 Organised_Editing/Best_Practices,en,https://wiki.openstreetmap.org/wiki/Organised_Editing/Best_Practices,2025-07-18,16,501,10,1,100,https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Ambox_warning_pn.svg/40px-Ambox_warning_pn.svg.png
 Map_features,en,https://wiki.openstreetmap.org/wiki/Map_features,2025-07-21,125,21926,4255,2222,507.98,https://upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Bar_MXCT.JPG/100px-Bar_MXCT.JPG
 Map_features,fr,https://wiki.openstreetmap.org/wiki/FR:Map_features,2018-12-27,103,23159,5516,3062,507.98,https://wiki.openstreetmap.org/w/images/c/c4/Aerialway_gondola_render.png
+Wiki,en,https://wiki.openstreetmap.org/wiki/Wiki,2025-02-24,16,669,40,1,302.87,https://wiki.openstreetmap.org/w/images/thumb/b/b7/OpenStreetMap_Wiki_MainPage.png/300px-OpenStreetMap_Wiki_MainPage.png
+Wiki,fr,https://wiki.openstreetmap.org/wiki/FR:Wiki,2021-01-04,14,645,37,1,302.87,https://wiki.openstreetmap.org/w/images/thumb/b/b7/OpenStreetMap_Wiki_MainPage.png/300px-OpenStreetMap_Wiki_MainPage.png
 https://wiki.openstreetmap.org/wiki/FR:Quality_Assurance,fr,https://wiki.openstreetmap.org/wiki/FR:Quality_Assurance,2015-05-16,16,0,0,0,0,
 https://wiki.openstreetmap.org/wiki/Quality_Assurance,en,https://wiki.openstreetmap.org/wiki/Quality_Assurance,2025-06-01,19,0,0,0,100,
 https://wiki.openstreetmap.org/wiki/FR:Nominatim/Installation,fr,https://wiki.openstreetmap.org/wiki/FR:Nominatim/Installation,2016-08-22,32,0,0,0,0,
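The CSV rows above carry no header in the diff. From the code and the values, the columns appear to be title, language, URL, last-edit date, section count, word/link/image counts, staleness score, and thumbnail URL; that mapping is inferred, not defined by the file. A parsing sketch built on that inference:

import csv, io

# Column names are guesses from context, not taken from the file
FIELDS = ['title', 'lang', 'url', 'last_edit', 'sections',
          'words', 'links', 'images', 'staleness', 'thumbnail']

row = 'Wiki,en,https://wiki.openstreetmap.org/wiki/Wiki,2025-02-24,16,669,40,1,302.87,'
record = dict(zip(FIELDS, next(csv.reader(io.StringIO(row)))))
print(record['title'], record['staleness'])  # Wiki 302.87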