up wiki compare

This commit is contained in:
Tykayn 2025-08-31 17:57:28 +02:00 committed by tykayn
parent d2936d5730
commit 1535cf8ee3
8 changed files with 1036 additions and 79 deletions

View file

@ -26,6 +26,8 @@ import os
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
# Configure logging
logging.basicConfig(
@ -42,6 +44,7 @@ WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
# Bare file names (no directory part) for the files this script works with.
# NOTE(review): TOP_KEYS_FILE and OUTDATED_PAGES_FILE are presumably JSON
# data files given their extensions; their usage is not visible here - confirm.
TOP_KEYS_FILE = "top_keys.json"
# CSV file that main() opens for writing to save the processed wiki pages
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
# Image file that generate_staleness_histogram() saves its chart to
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 100
@ -255,6 +258,67 @@ def fetch_wiki_page(key, language='en'):
logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
return None
def generate_staleness_histogram(wiki_pages):
    """
    Generate a histogram of staleness scores by 10% ranges

    Scores are counted in 10-point-wide bins starting at 0, drawn as a bar
    chart and saved to STALENESS_HISTOGRAM_FILE. At least one bin (0-10%) is
    always produced, so a data set where every score is 0 still yields a
    visible bar instead of an empty chart.

    Args:
        wiki_pages (list): List of dictionaries containing page information with staleness scores.
            Falsy entries and entries without a 'staleness_score' key are ignored.

    Returns:
        None: Saves the histogram to a file
    """
    logger.info("Generating histogram of staleness scores by 10% ranges...")

    # Extract staleness scores, skipping empty entries and pages without a score
    staleness_scores = [
        page['staleness_score']
        for page in wiki_pages
        if page and 'staleness_score' in page
    ]

    if not staleness_scores:
        logger.warning("No staleness scores found. Cannot generate histogram.")
        return

    bin_width = 10

    # Round the maximum score up to the next multiple of the bin width so all
    # scores are included. Clamp to at least one full bin: with a maximum of 0
    # the edge list would otherwise be just [0], which defines no bin at all
    # and produces an empty chart.
    max_bin_edge = max(
        np.ceil(max(staleness_scores) / bin_width) * bin_width,
        bin_width,
    )

    # Bin edges 0, 10, 20, ..., max_bin_edge (np.arange excludes the stop
    # value, hence the extra bin_width)
    bins = np.arange(0, max_bin_edge + bin_width, bin_width)

    # Count scores in each bin
    hist, bin_edges = np.histogram(staleness_scores, bins=bins)

    fig = plt.figure(figsize=(12, 6))
    try:
        # One bar per bin, placed at consecutive integer positions
        positions = range(len(hist))
        plt.bar(positions, hist, align='center')

        # Set x-axis labels for each bin, e.g. "0-10%"
        bin_labels = [
            f"{int(low)}-{int(high)}%"
            for low, high in zip(bin_edges[:-1], bin_edges[1:])
        ]
        plt.xticks(positions, bin_labels, rotation=45)

        # Set labels and title
        plt.xlabel('Tranches de score de décrépitude (en %)')
        plt.ylabel('Nombre de pages')
        plt.title('Répartition du score de décrépitude par tranches de 10%')

        # Add grid for better readability
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Adjust layout
        plt.tight_layout()

        # Save figure
        plt.savefig(STALENESS_HISTOGRAM_FILE)
        logger.info(f"Histogram saved to {STALENESS_HISTOGRAM_FILE}")
    finally:
        # Close the figure to free memory, even if plotting or saving failed
        plt.close(fig)
def analyze_wiki_pages(pages):
"""
Analyze wiki pages to determine which ones need updating
@ -621,6 +685,9 @@ def main():
fr_page['staleness_score'] = 0
processed_wiki_pages.append(fr_page)
# Generate histogram of staleness scores
generate_staleness_histogram(processed_wiki_pages)
# Save processed wiki pages to CSV
try:
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f: