up wiki compare
This commit is contained in:
parent
d2936d5730
commit
1535cf8ee3
8 changed files with 1036 additions and 79 deletions
|
@ -26,6 +26,8 @@ import os
|
|||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
|
@ -42,6 +44,7 @@ WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
|
|||
# File names for the data artifacts handled by this script.
# WIKI_PAGES_CSV is opened for writing in main(); the code that uses
# TOP_KEYS_FILE and OUTDATED_PAGES_FILE is not visible here — TODO confirm their roles.
TOP_KEYS_FILE = "top_keys.json"
|
||||
WIKI_PAGES_CSV = "wiki_pages.csv"
|
||||
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
||||
# Path passed to plt.savefig() by generate_staleness_histogram().
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
|
||||
# Number of wiki pages to examine
|
||||
NUM_WIKI_PAGES = 100
|
||||
|
||||
|
@ -255,6 +258,67 @@ def fetch_wiki_page(key, language='en'):
|
|||
logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
|
||||
return None
|
||||
|
||||
def generate_staleness_histogram(wiki_pages):
    """
    Generate a histogram of staleness scores by 10% ranges

    Scores are grouped into 10-point-wide bins starting at 0 and the
    resulting bar chart is written to STALENESS_HISTOGRAM_FILE.

    Args:
        wiki_pages (list): List of dictionaries containing page information
            with staleness scores. Falsy entries, entries without a
            'staleness_score' key, and entries whose score is None are skipped.

    Returns:
        None: Saves the histogram to a file. Nothing is written (and a
        warning is logged) when no usable score is found.
    """
    logger.info("Generating histogram of staleness scores by 10% ranges...")

    # Extract staleness scores, ignoring pages without a usable score
    # (a None score would make max() below raise TypeError).
    staleness_scores = [
        page['staleness_score']
        for page in wiki_pages
        if page
        and 'staleness_score' in page
        and page['staleness_score'] is not None
    ]

    if not staleness_scores:
        logger.warning("No staleness scores found. Cannot generate histogram.")
        return

    # Round the maximum up to the nearest 10 so every score is included.
    # Keep at least one bin (0-10%): when all scores are 0 the rounded edge
    # would be 0, np.arange would yield a single edge and the chart would
    # end up with no bars at all.
    max_score = max(staleness_scores)
    max_bin_edge = max(10, int(np.ceil(max_score / 10)) * 10)

    # Create bins for 10% ranges; the extra +10 makes the stop value
    # exclusive-safe so max_bin_edge itself is the last edge.
    bins = np.arange(0, max_bin_edge + 10, 10)

    # Count scores in each bin
    hist, bin_edges = np.histogram(staleness_scores, bins=bins)

    fig = plt.figure(figsize=(12, 6))
    try:
        # One bar per bin, positioned on consecutive integers
        positions = range(len(hist))
        plt.bar(positions, hist, align='center')

        # Set x-axis labels for each bin, e.g. "0-10%"
        bin_labels = [
            f"{int(low)}-{int(high)}%"
            for low, high in zip(bin_edges[:-1], bin_edges[1:])
        ]
        plt.xticks(positions, bin_labels, rotation=45)

        # Set labels and title
        plt.xlabel('Tranches de score de décrépitude (en %)')
        plt.ylabel('Nombre de pages')
        plt.title('Répartition du score de décrépitude par tranches de 10%')

        # Add grid for better readability
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Adjust layout
        plt.tight_layout()

        # Save figure
        plt.savefig(STALENESS_HISTOGRAM_FILE)
        logger.info(f"Histogram saved to {STALENESS_HISTOGRAM_FILE}")
    finally:
        # Close the figure to free memory, even if plotting or saving failed
        plt.close(fig)
|
||||
|
||||
def analyze_wiki_pages(pages):
|
||||
"""
|
||||
Analyze wiki pages to determine which ones need updating
|
||||
|
@ -621,6 +685,9 @@ def main():
|
|||
fr_page['staleness_score'] = 0
|
||||
processed_wiki_pages.append(fr_page)
|
||||
|
||||
# Generate histogram of staleness scores
|
||||
generate_staleness_histogram(processed_wiki_pages)
|
||||
|
||||
# Save processed wiki pages to CSV
|
||||
try:
|
||||
with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue