add history in articles measures

This commit is contained in:
Tykayn 2025-09-08 10:20:51 +02:00 committed by tykayn
parent 1ed74c2e2f
commit 381f378db4
9 changed files with 1678 additions and 195 deletions

View file

@ -116,7 +116,8 @@ SPECIFIC_PAGES = [
"Mapping_private_information",
"Any_tags_you_like",
"Organised_Editing/Best_Practices",
"Map_features"
"Map_features",
"Wiki"
]
def fetch_desynchronized_pages():
@ -280,12 +281,104 @@ def save_to_json(data, filename):
except IOError as e:
logger.error(f"Error saving data to {filename}: {e}")
def calculate_global_metrics(data):
    """
    Calculate global metrics for all pages in the data.

    Regular and specific pages are pooled together. For the pooled pages this
    computes the average of the per-page ``section_diff``, ``word_diff``,
    ``link_diff`` and ``media_diff`` values, the average number of
    ``category_comparison['en_only']`` entries, the average
    ``staleness_score``, how many pages have a truthy ``fr_page``, and a
    histogram of staleness scores.

    Args:
        data: Mapping that may contain 'regular_pages' and 'specific_pages',
            each a list of page dicts. Missing or None values count as empty.

    Returns:
        dict: Dictionary with global metrics. All averages are rounded to two
        decimals; when there are no pages every counter and average is 0.
    """
    # Combine regular and specific pages for global metrics.
    # `or []` also covers a key that is present but explicitly None.
    all_pages = (data.get('regular_pages') or []) + (data.get('specific_pages') or [])
    page_count = len(all_pages)

    # Inclusive upper bound of each staleness bucket, in ascending order.
    # Anything above the last bound falls into '100+'.
    staleness_buckets = (
        (20, '0-20'),
        (40, '21-40'),
        (60, '41-60'),
        (80, '61-80'),
        (100, '81-100'),
    )

    # Initialize metrics (key order is kept stable because the dict is
    # serialized to JSON by the caller).
    metrics = {
        'total_pages': page_count,
        'avg_sections': 0,
        'avg_words': 0,
        'avg_links': 0,
        'avg_images': 0,
        'avg_categories': 0,
        'avg_staleness': 0,
        'pages_with_en_fr': 0,
        'pages_missing_fr': 0,
        'staleness_distribution': {
            '0-20': 0,
            '21-40': 0,
            '41-60': 0,
            '61-80': 0,
            '81-100': 0,
            '100+': 0,
        },
    }

    # Skip if no pages (also avoids dividing by zero below)
    if not all_pages:
        return metrics

    distribution = metrics['staleness_distribution']

    # Running sums, keyed by the page field they are read from
    totals = {
        'section_diff': 0,
        'word_diff': 0,
        'link_diff': 0,
        'media_diff': 0,
    }
    total_categories = 0
    total_staleness = 0

    for page in all_pages:
        # Count pages with/without French version
        if page.get('fr_page'):
            metrics['pages_with_en_fr'] += 1
        else:
            metrics['pages_missing_fr'] += 1

        # A missing or None score is treated as 0 instead of raising TypeError
        staleness = page.get('staleness_score') or 0
        total_staleness += staleness

        # Add to staleness distribution: first bucket whose bound is not exceeded
        label = next(
            (name for upper, name in staleness_buckets if staleness <= upper),
            '100+',
        )
        distribution[label] += 1

        # Add to totals; missing or None values contribute 0
        for field in totals:
            totals[field] += page.get(field) or 0

        # Count categories if available (only the 'en_only' list is counted)
        category_comparison = page.get('category_comparison')
        if category_comparison:
            total_categories += len(category_comparison.get('en_only') or [])

    # Calculate averages
    metrics['avg_sections'] = round(totals['section_diff'] / page_count, 2)
    metrics['avg_words'] = round(totals['word_diff'] / page_count, 2)
    metrics['avg_links'] = round(totals['link_diff'] / page_count, 2)
    metrics['avg_images'] = round(totals['media_diff'] / page_count, 2)
    metrics['avg_categories'] = round(total_categories / page_count, 2)
    metrics['avg_staleness'] = round(total_staleness / page_count, 2)

    return metrics
def save_with_history(data, filename):
"""
Save data to a JSON file while preserving history
This function loads existing data from the file (if it exists),
adds the new data to the history, and saves the updated data back to the file.
It also calculates global metrics for the current data.
Args:
data: New data to save
@ -301,11 +394,15 @@ def save_with_history(data, filename):
# Initialize history if it doesn't exist
if 'history' not in existing_data:
existing_data['history'] = {}
# Calculate global metrics for the current data
global_metrics = calculate_global_metrics(data)
# Add current regular_pages and specific_pages to history
# Add current regular_pages, specific_pages, and global metrics to history
history_entry = {
'regular_pages': data.get('regular_pages', []),
'specific_pages': data.get('specific_pages', [])
'specific_pages': data.get('specific_pages', []),
'global_metrics': global_metrics
}
# Add the entry to history with timestamp as key
@ -314,6 +411,7 @@ def save_with_history(data, filename):
# Update the current data
existing_data['regular_pages'] = data.get('regular_pages', [])
existing_data['specific_pages'] = data.get('specific_pages', [])
existing_data['global_metrics'] = global_metrics
existing_data['last_updated'] = current_timestamp
# Save the updated data
@ -321,10 +419,119 @@ def save_with_history(data, filename):
json.dump(existing_data, f, indent=2, ensure_ascii=False)
logger.info(f"Data with history saved to {filename}")
# Also save a separate ranking history file
save_ranking_history(existing_data, "page_rankings.json")
except (IOError, json.JSONDecodeError) as e:
logger.error(f"Error saving data with history to {filename}: {e}")
# Fallback to regular save if there's an error
save_to_json(data, filename)
def save_ranking_history(data, filename):
    """
    Save ranking history to a separate JSON file.

    This function extracts ranking data from the history and saves it in a
    format optimized for displaying ranking evolution over time. The written
    document has three keys: 'timestamps' (sorted history keys), 'pages'
    (page key -> {'title', 'metrics': {timestamp: metrics}}) and
    'global_metrics' (timestamp -> the entry's 'global_metrics', when present).

    Errors while writing or serializing are logged, not raised.

    Args:
        data: Data containing history entries under the 'history' key
        filename (str): Name of the file to save rankings
    """
    # Per-page fields copied into the ranking file for every timestamp
    tracked_metrics = (
        'staleness_score',
        'word_diff',
        'section_diff',
        'link_diff',
        'media_diff',
    )

    # Extract history entries
    history = data.get('history') or {}

    # Plain string sort of the keys; this is chronological only if the
    # timestamp keys sort lexicographically (e.g. ISO 8601) - TODO confirm
    # against the code that builds the keys.
    sorted_timestamps = sorted(history)

    ranking_data = {
        'timestamps': sorted_timestamps,
        'pages': {},
        'global_metrics': {},
    }
    pages = ranking_data['pages']

    for timestamp in sorted_timestamps:
        entry = history[timestamp]

        # Add global metrics for this timestamp
        if 'global_metrics' in entry:
            ranking_data['global_metrics'][timestamp] = entry['global_metrics']

        # Regular pages first, then specific pages, so that a key present in
        # both lists ends up with the specific page's values.
        entry_pages = (entry.get('regular_pages') or []) + (entry.get('specific_pages') or [])
        for page in entry_pages:
            page_key = page.get('key')
            if page_key is None:
                # A malformed entry must not abort the whole export
                logger.warning(f"Skipping page without 'key' in history entry {timestamp}")
                continue

            # Pages are registered on first sight, which keeps the output
            # order deterministic from one run to the next.
            record = pages.setdefault(page_key, {'title': page_key, 'metrics': {}})

            # Store metrics for this timestamp
            record['metrics'][timestamp] = {
                name: page.get(name, 0) for name in tracked_metrics
            }

            # Store page title if available (the latest timestamp wins)
            en_page = page.get('en_page')
            if en_page:
                record['title'] = en_page.get('page_title', page_key)

    # Save the ranking data
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(ranking_data, f, indent=2, ensure_ascii=False)
    except (IOError, TypeError, ValueError) as e:
        # json.dump signals unserializable content with TypeError/ValueError;
        # json.JSONDecodeError (a ValueError subclass) only occurs on decoding.
        logger.error(f"Error saving ranking history to {filename}: {e}")
    else:
        logger.info(f"Ranking history saved to {filename}")
def check_grammar_with_grammalecte(text):
"""

View file

@ -12,6 +12,8 @@ Key:harassment_prevention,en,https://wiki.openstreetmap.org/wiki/Key:harassment_
Key:harassment_prevention,fr,https://wiki.openstreetmap.org/wiki/FR:Key:harassment_prevention,2025-07-03,15,328,83,14,66.72,https://wiki.openstreetmap.org/w/images/thumb/7/76/Osm_element_node.svg/30px-Osm_element_node.svg.png
Proposal process,en,https://wiki.openstreetmap.org/wiki/Proposal process,2025-08-13,46,5292,202,4,172.34,https://wiki.openstreetmap.org/w/images/thumb/c/c2/Save_proposal_first.png/761px-Save_proposal_first.png
Proposal process,fr,https://wiki.openstreetmap.org/wiki/FR:Proposal process,2023-09-22,15,0,0,0,172.34,
Outil de Manipulation et d'Organisation,en,https://wiki.openstreetmap.org/wiki/Outil de Manipulation et d'Organisation,2025-09-02,9,0,0,0,0.6,
Outil de Manipulation et d'Organisation,fr,https://wiki.openstreetmap.org/wiki/FR:Outil de Manipulation et d'Organisation,2025-09-02,13,0,0,0,0.6,
Automated_Edits_code_of_conduct,en,https://wiki.openstreetmap.org/wiki/Automated_Edits_code_of_conduct,2025-07-26,19,0,0,0,23.1,
Automated_Edits_code_of_conduct,fr,https://wiki.openstreetmap.org/wiki/FR:Automated_Edits_code_of_conduct,2025-04-03,17,0,0,0,23.1,
Key:cuisine,en,https://wiki.openstreetmap.org/wiki/Key:cuisine,2025-07-23,17,3422,693,303,107.73,https://upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Food_montage.jpg/200px-Food_montage.jpg
@ -44,6 +46,8 @@ Any_tags_you_like,fr,https://wiki.openstreetmap.org/wiki/FR:Any_tags_you_like,20
Organised_Editing/Best_Practices,en,https://wiki.openstreetmap.org/wiki/Organised_Editing/Best_Practices,2025-07-18,16,501,10,1,100,https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Ambox_warning_pn.svg/40px-Ambox_warning_pn.svg.png
Map_features,en,https://wiki.openstreetmap.org/wiki/Map_features,2025-07-21,125,21926,4255,2222,507.98,https://upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Bar_MXCT.JPG/100px-Bar_MXCT.JPG
Map_features,fr,https://wiki.openstreetmap.org/wiki/FR:Map_features,2018-12-27,103,23159,5516,3062,507.98,https://wiki.openstreetmap.org/w/images/c/c4/Aerialway_gondola_render.png
Wiki,en,https://wiki.openstreetmap.org/wiki/Wiki,2025-02-24,16,669,40,1,302.87,https://wiki.openstreetmap.org/w/images/thumb/b/b7/OpenStreetMap_Wiki_MainPage.png/300px-OpenStreetMap_Wiki_MainPage.png
Wiki,fr,https://wiki.openstreetmap.org/wiki/FR:Wiki,2021-01-04,14,645,37,1,302.87,https://wiki.openstreetmap.org/w/images/thumb/b/b7/OpenStreetMap_Wiki_MainPage.png/300px-OpenStreetMap_Wiki_MainPage.png
https://wiki.openstreetmap.org/wiki/FR:Quality_Assurance,fr,https://wiki.openstreetmap.org/wiki/FR:Quality_Assurance,2015-05-16,16,0,0,0,0,
https://wiki.openstreetmap.org/wiki/Quality_Assurance,en,https://wiki.openstreetmap.org/wiki/Quality_Assurance,2025-06-01,19,0,0,0,100,
https://wiki.openstreetmap.org/wiki/FR:Nominatim/Installation,fr,https://wiki.openstreetmap.org/wiki/FR:Nominatim/Installation,2016-08-22,32,0,0,0,0,

1 key language url last_modified sections word_count link_count media_count staleness_score description_img_url
12 Key:harassment_prevention fr https://wiki.openstreetmap.org/wiki/FR:Key:harassment_prevention 2025-07-03 15 328 83 14 66.72 https://wiki.openstreetmap.org/w/images/thumb/7/76/Osm_element_node.svg/30px-Osm_element_node.svg.png
13 Proposal process en https://wiki.openstreetmap.org/wiki/Proposal process 2025-08-13 46 5292 202 4 172.34 https://wiki.openstreetmap.org/w/images/thumb/c/c2/Save_proposal_first.png/761px-Save_proposal_first.png
14 Proposal process fr https://wiki.openstreetmap.org/wiki/FR:Proposal process 2023-09-22 15 0 0 0 172.34
15 Outil de Manipulation et d'Organisation en https://wiki.openstreetmap.org/wiki/Outil de Manipulation et d'Organisation 2025-09-02 9 0 0 0 0.6
16 Outil de Manipulation et d'Organisation fr https://wiki.openstreetmap.org/wiki/FR:Outil de Manipulation et d'Organisation 2025-09-02 13 0 0 0 0.6
17 Automated_Edits_code_of_conduct en https://wiki.openstreetmap.org/wiki/Automated_Edits_code_of_conduct 2025-07-26 19 0 0 0 23.1
18 Automated_Edits_code_of_conduct fr https://wiki.openstreetmap.org/wiki/FR:Automated_Edits_code_of_conduct 2025-04-03 17 0 0 0 23.1
19 Key:cuisine en https://wiki.openstreetmap.org/wiki/Key:cuisine 2025-07-23 17 3422 693 303 107.73 https://upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Food_montage.jpg/200px-Food_montage.jpg
46 Organised_Editing/Best_Practices en https://wiki.openstreetmap.org/wiki/Organised_Editing/Best_Practices 2025-07-18 16 501 10 1 100 https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Ambox_warning_pn.svg/40px-Ambox_warning_pn.svg.png
47 Map_features en https://wiki.openstreetmap.org/wiki/Map_features 2025-07-21 125 21926 4255 2222 507.98 https://upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Bar_MXCT.JPG/100px-Bar_MXCT.JPG
48 Map_features fr https://wiki.openstreetmap.org/wiki/FR:Map_features 2018-12-27 103 23159 5516 3062 507.98 https://wiki.openstreetmap.org/w/images/c/c4/Aerialway_gondola_render.png
49 Wiki en https://wiki.openstreetmap.org/wiki/Wiki 2025-02-24 16 669 40 1 302.87 https://wiki.openstreetmap.org/w/images/thumb/b/b7/OpenStreetMap_Wiki_MainPage.png/300px-OpenStreetMap_Wiki_MainPage.png
50 Wiki fr https://wiki.openstreetmap.org/wiki/FR:Wiki 2021-01-04 14 645 37 1 302.87 https://wiki.openstreetmap.org/w/images/thumb/b/b7/OpenStreetMap_Wiki_MainPage.png/300px-OpenStreetMap_Wiki_MainPage.png
51 https://wiki.openstreetmap.org/wiki/FR:Quality_Assurance fr https://wiki.openstreetmap.org/wiki/FR:Quality_Assurance 2015-05-16 16 0 0 0 0
52 https://wiki.openstreetmap.org/wiki/Quality_Assurance en https://wiki.openstreetmap.org/wiki/Quality_Assurance 2025-06-01 19 0 0 0 100
53 https://wiki.openstreetmap.org/wiki/FR:Nominatim/Installation fr https://wiki.openstreetmap.org/wiki/FR:Nominatim/Installation 2016-08-22 32 0 0 0 0