#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
wiki_compare.py

This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.

The script also compares a specific list of wiki pages defined in the
SPECIFIC_PAGES constant. This list can include regular page titles,
full URLs, or pages with FR: prefix.

Usage:
    python wiki_compare.py

Output:
    - top_keys.json: JSON file containing the most used OSM keys
    - wiki_pages.csv: CSV file with information about each wiki page
    - outdated_pages.json: JSON file containing pages that need updating
    - staleness_histogram.png: Histogram of staleness scores
    - A console output listing the wiki pages that need updating
"""

import json
import csv
import requests
import re
import os
import subprocess
import tempfile
import hashlib
import argparse
from datetime import datetime
from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Try to import nltk, but make it optional
try:
    import nltk
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
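
# With this configuration a log line looks like the following (illustrative example,
# timestamp invented): 2025-08-01 12:00:00 - INFO - Starting wiki_compare.py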

# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
TAGINFO_FRANCE_API_URL = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
WIKI_DEADEND_PAGES_URL = "https://wiki.openstreetmap.org/w/index.php?title=Special:DeadendPages&limit=500&offset=1000"
TOP_KEYS_FILE = "top_keys.json"
KEYS_WITHOUT_WIKI_FILE = "keys_without_wiki.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
DEADEND_PAGES_FILE = "deadend_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 2
# HTML cache folder
HTML_CACHE_DIR = "html_cache"

# Initialize NLTK for sentence tokenization if available
if NLTK_AVAILABLE:
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    # Also download punkt_tab resource which is needed for sent_tokenize
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')

# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)

# List of specific pages to compare (in addition to top keys)
# This list can include:
# 1. Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
# 2. Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
SPECIFIC_PAGES = [
    "Anatomie_des_étiquettes_osm",
    "Tag:leisure=children_club",
    "Tag:harassment_prevention=ask_angela",
    "Key:harassment_prevention",
    "Proposal process",
    "Outil de Manipulation et d'Organisation",
    "Automated_Edits_code_of_conduct",
    "Key:cuisine",
    "Libre_Charge_Map",
    "OSM_Mon_Commerce",
    "Complète_Tes_Commerces",
    "Tag:amenity=charging_station",
    "Organised_Editing/Activities/MapYourGrid_Initiative",
    "Key:highway",
    "Quality_assurance",
    "Verifiability",
    "Good_practice",
    "Mapping_parties",
    "State_of_the_Map",
    "Diversity",
    "Mapping_private_information",
    "Any_tags_you_like",
    "Organised_Editing/Best_Practices",
    "Map_features",
    "Wiki"
]

def fetch_desynchronized_pages():
    """
    Fetch pages from the FR:Traductions_désynchronisées category

    Returns:
        list: List of page URLs from the category
    """
    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")

    try:
        response = requests.get(WIKI_CATEGORY_URL)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links to French pages in the category
        page_links = []
        for link in soup.select('a[href^="/wiki/FR:"]'):
            href = link.get('href', '')
            # Skip if it's a category link or a language link
            if '/Category:' in href or 'action=edit' in href:
                continue

            # Get the full URL
            full_url = 'https://wiki.openstreetmap.org' + href
            page_links.append(full_url)

        logger.info(f"Found {len(page_links)} pages in the category")
        return page_links

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching category page: {e}")
        return []

def suggest_categories(page_title, page_url):
|
|
"""
|
|
Suggest categories for an uncategorized page based on its title and content
|
|
|
|
Args:
|
|
page_title (str): Title of the page
|
|
page_url (str): URL of the page
|
|
|
|
Returns:
|
|
list: List of suggested categories
|
|
"""
|
|
logger.info(f"Suggesting categories for page: {page_title}")
|
|
|
|
suggested_categories = []
|
|
|
|
# Common categories for French OSM wiki pages
|
|
common_categories = [
|
|
"Documentation OSM en français",
|
|
"Cartographie",
|
|
"Contributeurs",
|
|
"Développeurs",
|
|
"Éléments cartographiés",
|
|
"Imports",
|
|
"Logiciels",
|
|
"Projets",
|
|
"Rencontres",
|
|
"Utilisateurs"
|
|
]
|
|
|
|
# Add geography-related categories for pages about France
|
|
if "France" in page_title:
|
|
suggested_categories.append("France")
|
|
|
|
# Check for specific regions or departments
|
|
regions = [
|
|
"Auvergne-Rhône-Alpes", "Bourgogne-Franche-Comté", "Bretagne",
|
|
"Centre-Val de Loire", "Corse", "Grand Est", "Hauts-de-France",
|
|
"Île-de-France", "Normandie", "Nouvelle-Aquitaine",
|
|
"Occitanie", "Pays de la Loire", "Provence-Alpes-Côte d'Azur"
|
|
]
|
|
|
|
for region in regions:
|
|
if region in page_title:
|
|
suggested_categories.append(region)
|
|
|
|
# Try to fetch the page content to make better suggestions
|
|
try:
|
|
response = requests.get(page_url)
|
|
response.raise_for_status()
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Get the main content
|
|
content = soup.select_one('#mw-content-text')
|
|
if content:
|
|
text = content.get_text(separator=' ', strip=True).lower()
|
|
|
|
# Check for keywords related to common categories
|
|
if any(keyword in text for keyword in ["carte", "cartographie", "mapper"]):
|
|
suggested_categories.append("Cartographie")
|
|
|
|
if any(keyword in text for keyword in ["contribuer", "contributeur", "éditer"]):
|
|
suggested_categories.append("Contributeurs")
|
|
|
|
if any(keyword in text for keyword in ["développeur", "programmer", "code", "api"]):
|
|
suggested_categories.append("Développeurs")
|
|
|
|
if any(keyword in text for keyword in ["tag", "clé", "valeur", "élément", "nœud", "way", "relation"]):
|
|
suggested_categories.append("Éléments cartographiés")
|
|
|
|
if any(keyword in text for keyword in ["import", "données", "dataset"]):
|
|
suggested_categories.append("Imports")
|
|
|
|
if any(keyword in text for keyword in ["logiciel", "application", "outil"]):
|
|
suggested_categories.append("Logiciels")
|
|
|
|
if any(keyword in text for keyword in ["projet", "initiative"]):
|
|
suggested_categories.append("Projets")
|
|
|
|
if any(keyword in text for keyword in ["rencontre", "réunion", "événement", "conférence"]):
|
|
suggested_categories.append("Rencontres")
|
|
|
|
if any(keyword in text for keyword in ["utiliser", "utilisateur", "usage"]):
|
|
suggested_categories.append("Utilisateurs")
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
logger.warning(f"Error fetching page content for category suggestions: {e}")
|
|
# If we can't fetch the content, suggest common categories based on title only
|
|
if "projet" in page_title.lower():
|
|
suggested_categories.append("Projets")
|
|
elif "logiciel" in page_title.lower() or "application" in page_title.lower():
|
|
suggested_categories.append("Logiciels")
|
|
elif "rencontre" in page_title.lower() or "réunion" in page_title.lower():
|
|
suggested_categories.append("Rencontres")
|
|
|
|
# Always suggest the general French documentation category
|
|
suggested_categories.append("Documentation OSM en français")
|
|
|
|
# Remove duplicates while preserving order
|
|
seen = set()
|
|
unique_categories = []
|
|
for cat in suggested_categories:
|
|
if cat not in seen:
|
|
seen.add(cat)
|
|
unique_categories.append(cat)
|
|
|
|
logger.info(f"Suggested {len(unique_categories)} categories for {page_title}: {', '.join(unique_categories)}")
|
|
return unique_categories
|
|
|
|
def fetch_deadend_pages():
|
|
"""
|
|
Fetch pages starting with "France" from the DeadendPages list
|
|
|
|
Returns:
|
|
list: List of dictionaries containing page information
|
|
"""
|
|
logger.info(f"Fetching pages from DeadendPages list: {WIKI_DEADEND_PAGES_URL}")
|
|
|
|
try:
|
|
response = requests.get(WIKI_DEADEND_PAGES_URL)
|
|
response.raise_for_status()
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Find all links in the DeadendPages list
|
|
page_links = []
|
|
for link in soup.select('.mw-spcontent li a'):
|
|
href = link.get('href', '')
|
|
title = link.get_text(strip=True)
|
|
|
|
# Skip if it's not a wiki page or if it's a special page
|
|
if not href.startswith('/wiki/') or 'Special:' in href:
|
|
continue
|
|
|
|
# Filter pages that start with "France"
|
|
if title.startswith('France'):
|
|
# Get the full URL
|
|
full_url = 'https://wiki.openstreetmap.org' + href
|
|
|
|
# Suggest categories for this page
|
|
suggested_categories = suggest_categories(title, full_url)
|
|
|
|
page_links.append({
|
|
'title': title,
|
|
'url': full_url,
|
|
'suggested_categories': suggested_categories
|
|
})
|
|
|
|
logger.info(f"Found {len(page_links)} pages starting with 'France' in the DeadendPages list")
|
|
return page_links
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f"Error fetching DeadendPages list: {e}")
|
|
return []
|
|
|
|
def fetch_top_keys(limit=NUM_WIKI_PAGES):
    """
    Fetch the most used OSM keys from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys from TagInfo API...")

    params = {
        'page': 1,
        'rp': limit,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }
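    # Illustrative resulting request (with the default limit of NUM_WIKI_PAGES = 2):
    # https://taginfo.openstreetmap.org/api/4/keys/all?page=1&rp=2&sortname=count_all&sortorder=desc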

    try:
        response = requests.get(TAGINFO_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        top_keys = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(top_keys)} keys")
        return top_keys

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

def fetch_keys_without_wiki_page(limit=36):
    """
    Fetch keys used in France that are missing a wiki page from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys without wiki pages used in France...")

    params = {
        'page': 1,
        'rp': limit,
        'english': 0,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_FRANCE_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        keys_without_wiki = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(keys_without_wiki)} keys without wiki pages")
        return keys_without_wiki

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo France API: {e}")
        return []

def load_json_data(filename):
    """
    Load data from a JSON file

    Args:
        filename (str): Name of the file

    Returns:
        dict: Data loaded from the file or empty dict if file doesn't exist
    """
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            logger.info(f"Data loaded from {filename}")
            return data
        else:
            logger.info(f"File {filename} doesn't exist, returning empty dict")
            return {}
    except (IOError, json.JSONDecodeError) as e:
        logger.error(f"Error loading data from {filename}: {e}")
        return {}

def save_to_json(data, filename):
|
|
"""
|
|
Save data to a JSON file
|
|
|
|
Args:
|
|
data: Data to save
|
|
filename (str): Name of the file
|
|
"""
|
|
try:
|
|
# Convert data to JSON string
|
|
json_str = json.dumps(data, indent=2, ensure_ascii=False)
|
|
|
|
# Print the JSON string for debugging
|
|
logger.info(f"JSON string to be written to {filename}:")
|
|
|
|
# Check if data is a dictionary before trying to access keys
|
|
if isinstance(data, dict):
|
|
logger.info(f"JSON keys at top level: {list(data.keys())}")
|
|
if 'translations' in data:
|
|
logger.info(f"JSON keys in translations: {list(data['translations'].keys())}")
|
|
if 'type' in data['translations']:
|
|
logger.info(f"'type' key exists in translations")
|
|
if 'type_key' in data['translations']:
|
|
logger.info(f"'type_key' key exists in translations")
|
|
elif isinstance(data, list):
|
|
logger.info(f"Data is a list with {len(data)} items")
|
|
|
|
# Write the JSON string to the file
|
|
with open(filename, 'w', encoding='utf-8') as f:
|
|
f.write(json_str)
|
|
|
|
logger.info(f"Data saved to {filename}")
|
|
except IOError as e:
|
|
logger.error(f"Error saving data to {filename}: {e}")
|
|
|
|
def calculate_global_metrics(data):
|
|
"""
|
|
Calculate global metrics for all pages in the data
|
|
|
|
Args:
|
|
data: Data containing regular_pages and specific_pages
|
|
|
|
Returns:
|
|
dict: Dictionary with global metrics
|
|
"""
|
|
# Combine regular and specific pages for global metrics
|
|
all_pages = data.get('regular_pages', []) + data.get('specific_pages', [])
|
|
|
|
# Initialize metrics
|
|
metrics = {
|
|
'total_pages': len(all_pages),
|
|
'avg_sections': 0,
|
|
'avg_words': 0,
|
|
'avg_links': 0,
|
|
'avg_images': 0,
|
|
'avg_categories': 0,
|
|
'avg_staleness': 0,
|
|
'pages_with_en_fr': 0,
|
|
'pages_missing_fr': 0,
|
|
'staleness_distribution': {
|
|
'0-20': 0,
|
|
'21-40': 0,
|
|
'41-60': 0,
|
|
'61-80': 0,
|
|
'81-100': 0,
|
|
'100+': 0
|
|
}
|
|
}
|
|
|
|
# Skip if no pages
|
|
if not all_pages:
|
|
return metrics
|
|
|
|
# Calculate totals
|
|
total_sections = 0
|
|
total_words = 0
|
|
total_links = 0
|
|
total_images = 0
|
|
total_categories = 0
|
|
total_staleness = 0
|
|
|
|
for page in all_pages:
|
|
# Count pages with/without French version
|
|
if page.get('fr_page'):
|
|
metrics['pages_with_en_fr'] += 1
|
|
else:
|
|
metrics['pages_missing_fr'] += 1
|
|
|
|
# Add to staleness distribution
|
|
staleness = page.get('staleness_score', 0)
|
|
total_staleness += staleness
|
|
|
|
if staleness <= 20:
|
|
metrics['staleness_distribution']['0-20'] += 1
|
|
elif staleness <= 40:
|
|
metrics['staleness_distribution']['21-40'] += 1
|
|
elif staleness <= 60:
|
|
metrics['staleness_distribution']['41-60'] += 1
|
|
elif staleness <= 80:
|
|
metrics['staleness_distribution']['61-80'] += 1
|
|
elif staleness <= 100:
|
|
metrics['staleness_distribution']['81-100'] += 1
|
|
else:
|
|
metrics['staleness_distribution']['100+'] += 1
|
|
|
|
# Add to totals
|
|
total_sections += page.get('section_diff', 0) if 'section_diff' in page else 0
|
|
total_words += page.get('word_diff', 0) if 'word_diff' in page else 0
|
|
total_links += page.get('link_diff', 0) if 'link_diff' in page else 0
|
|
total_images += page.get('media_diff', 0) if 'media_diff' in page else 0
|
|
|
|
# Count categories if available
|
|
if page.get('category_comparison'):
|
|
cat_count = len(page['category_comparison'].get('en_only', []))
|
|
total_categories += cat_count
|
|
|
|
# Calculate averages
|
|
metrics['avg_sections'] = round(total_sections / len(all_pages), 2)
|
|
metrics['avg_words'] = round(total_words / len(all_pages), 2)
|
|
metrics['avg_links'] = round(total_links / len(all_pages), 2)
|
|
metrics['avg_images'] = round(total_images / len(all_pages), 2)
|
|
metrics['avg_categories'] = round(total_categories / len(all_pages), 2)
|
|
metrics['avg_staleness'] = round(total_staleness / len(all_pages), 2)
|
|
|
|
return metrics
|
|
|
|
def save_with_history(data, filename):
|
|
"""
|
|
Save data to a JSON file while preserving history
|
|
|
|
This function loads existing data from the file (if it exists),
|
|
adds the new data to the history, and saves the updated data back to the file.
|
|
It also calculates global metrics for the current data.
|
|
|
|
Args:
|
|
data: New data to save
|
|
filename (str): Name of the file
|
|
"""
|
|
try:
|
|
# Load existing data
|
|
existing_data = load_json_data(filename)
|
|
|
|
# Create a timestamp for the current data
|
|
current_timestamp = datetime.now().isoformat()
|
|
|
|
# Initialize history if it doesn't exist
|
|
if 'history' not in existing_data:
|
|
existing_data['history'] = {}
|
|
|
|
# Calculate global metrics for the current data
|
|
global_metrics = calculate_global_metrics(data)
|
|
|
|
# Add current regular_pages, specific_pages, and global metrics to history
|
|
history_entry = {
|
|
'regular_pages': data.get('regular_pages', []),
|
|
'specific_pages': data.get('specific_pages', []),
|
|
'global_metrics': global_metrics
|
|
}
|
|
|
|
# Add the entry to history with timestamp as key
|
|
existing_data['history'][current_timestamp] = history_entry
|
|
|
|
# Update the current data
|
|
existing_data['regular_pages'] = data.get('regular_pages', [])
|
|
existing_data['specific_pages'] = data.get('specific_pages', [])
|
|
existing_data['global_metrics'] = global_metrics
|
|
existing_data['last_updated'] = current_timestamp
|
|
|
|
# Save the updated data
|
|
with open(filename, 'w', encoding='utf-8') as f:
|
|
json.dump(existing_data, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Data with history saved to {filename}")
|
|
|
|
# Also save a separate ranking history file
|
|
save_ranking_history(existing_data, "page_rankings.json")
|
|
|
|
except (IOError, json.JSONDecodeError) as e:
|
|
logger.error(f"Error saving data with history to {filename}: {e}")
|
|
# Fallback to regular save if there's an error
|
|
save_to_json(data, filename)
|
|
|
|
def save_ranking_history(data, filename):
|
|
"""
|
|
Save ranking history to a separate JSON file
|
|
|
|
This function extracts ranking data from the history and saves it in a format
|
|
optimized for displaying ranking evolution over time.
|
|
|
|
Args:
|
|
data: Data containing history entries
|
|
filename (str): Name of the file to save rankings
|
|
"""
|
|
try:
|
|
# Initialize ranking data structure
|
|
ranking_data = {
|
|
'timestamps': [],
|
|
'pages': {},
|
|
'global_metrics': {}
|
|
}
|
|
|
|
# Extract history entries
|
|
history = data.get('history', {})
|
|
|
|
# Sort timestamps chronologically
|
|
sorted_timestamps = sorted(history.keys())
|
|
ranking_data['timestamps'] = sorted_timestamps
|
|
|
|
# Process each page to track its metrics over time
|
|
all_page_keys = set()
|
|
|
|
# First, collect all unique page keys across all history entries
|
|
for timestamp in sorted_timestamps:
|
|
entry = history[timestamp]
|
|
|
|
# Add global metrics for this timestamp
|
|
if 'global_metrics' in entry:
|
|
ranking_data['global_metrics'][timestamp] = entry['global_metrics']
|
|
|
|
# Collect page keys from regular pages
|
|
for page in entry.get('regular_pages', []):
|
|
all_page_keys.add(page['key'])
|
|
|
|
# Collect page keys from specific pages
|
|
for page in entry.get('specific_pages', []):
|
|
all_page_keys.add(page['key'])
|
|
|
|
# Initialize data structure for each page
|
|
for page_key in all_page_keys:
|
|
ranking_data['pages'][page_key] = {
|
|
'title': page_key,
|
|
'metrics': {}
|
|
}
|
|
|
|
# Fill in metrics for each page at each timestamp
|
|
for timestamp in sorted_timestamps:
|
|
entry = history[timestamp]
|
|
|
|
# Process regular pages
|
|
for page in entry.get('regular_pages', []):
|
|
page_key = page['key']
|
|
|
|
# Extract metrics we want to track
|
|
metrics = {
|
|
'staleness_score': page.get('staleness_score', 0),
|
|
'word_diff': page.get('word_diff', 0),
|
|
'section_diff': page.get('section_diff', 0),
|
|
'link_diff': page.get('link_diff', 0),
|
|
'media_diff': page.get('media_diff', 0)
|
|
}
|
|
|
|
# Store metrics for this timestamp
|
|
ranking_data['pages'][page_key]['metrics'][timestamp] = metrics
|
|
|
|
# Store page title if available
|
|
if 'en_page' in page and page['en_page']:
|
|
ranking_data['pages'][page_key]['title'] = page['en_page'].get('page_title', page_key)
|
|
|
|
# Process specific pages
|
|
for page in entry.get('specific_pages', []):
|
|
page_key = page['key']
|
|
|
|
# Extract metrics we want to track
|
|
metrics = {
|
|
'staleness_score': page.get('staleness_score', 0),
|
|
'word_diff': page.get('word_diff', 0),
|
|
'section_diff': page.get('section_diff', 0),
|
|
'link_diff': page.get('link_diff', 0),
|
|
'media_diff': page.get('media_diff', 0)
|
|
}
|
|
|
|
# Store metrics for this timestamp
|
|
ranking_data['pages'][page_key]['metrics'][timestamp] = metrics
|
|
|
|
# Store page title if available
|
|
if 'en_page' in page and page['en_page']:
|
|
ranking_data['pages'][page_key]['title'] = page['en_page'].get('page_title', page_key)
|
|
|
|
# Save the ranking data
|
|
with open(filename, 'w', encoding='utf-8') as f:
|
|
json.dump(ranking_data, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"Ranking history saved to {filename}")
|
|
|
|
except (IOError, json.JSONDecodeError) as e:
|
|
logger.error(f"Error saving ranking history to {filename}: {e}")
|
|
|
|
def check_grammar_with_grammalecte(text):
    """
    Check grammar in French text using grammalecte-cli

    Args:
        text (str): French text to check

    Returns:
        list: List of grammar suggestions
    """
    if not text or len(text.strip()) == 0:
        logger.warning("Empty text provided for grammar checking")
        return []

    # Check if grammalecte-cli is available
    try:
        subprocess.run(['which', 'grammalecte-cli'], capture_output=True, check=True)
    except subprocess.CalledProcessError:
        logger.warning("grammalecte-cli not found, skipping grammar check")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    try:
        # Create a temporary file with the text
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.txt', delete=False) as temp_file:
            temp_file.write(text)
            temp_file_path = temp_file.name

        # Run grammalecte-cli on the temporary file
        cmd = ['grammalecte-cli', '-f', temp_file_path, '-j', '-ctx', '-wss']
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)

        # Parse the JSON output
        grammar_data = json.loads(result.stdout)
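        # Simplified sketch of the parsed output, as consumed by the loops below:
        #   {"data": [{"iParagraph": 0,
        #              "lGrammarErrors": [...], "lSpellingErrors": [...]}, ...]}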

        # Extract grammar errors from all paragraphs
        grammar_suggestions = []
        for paragraph in grammar_data.get('data', []):
            paragraph_index = paragraph.get('iParagraph', 0)

            # Process grammar errors
            for error in paragraph.get('lGrammarErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': error.get('sType', ''),
                    'message': error.get('sMessage', ''),
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

            # Process spelling errors
            for error in paragraph.get('lSpellingErrors', []):
                suggestion = {
                    'paragraph': paragraph_index,
                    'start': error.get('nStart', 0),
                    'end': error.get('nEnd', 0),
                    'type': 'spelling',
                    'message': 'Erreur d\'orthographe',
                    'suggestions': error.get('aSuggestions', []),
                    'text': error.get('sUnderlined', ''),
                    'before': error.get('sBefore', ''),
                    'after': error.get('sAfter', '')
                }
                grammar_suggestions.append(suggestion)

        # Clean up the temporary file
        os.unlink(temp_file_path)

        logger.info(f"Found {len(grammar_suggestions)} grammar/spelling suggestions")
        return grammar_suggestions

    except subprocess.CalledProcessError as e:
        logger.error(f"Error running grammalecte-cli: {e}")
        logger.error(f"stdout: {e.stdout}")
        logger.error(f"stderr: {e.stderr}")
        return []

    except json.JSONDecodeError as e:
        logger.error(f"Error parsing grammalecte-cli output: {e}")
        return []

    except Exception as e:
        logger.error(f"Unexpected error during grammar checking: {e}")
        return []

def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
|
|
"""
|
|
Fetch wiki page for a given key or specific page
|
|
|
|
This function handles different types of wiki pages:
|
|
1. Regular OSM key pages (e.g., "building", "highway")
|
|
2. Specific wiki pages that can be in various formats:
|
|
- Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
|
|
- Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
|
|
- Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
|
|
|
|
Args:
|
|
key (str): OSM key or specific page title/URL
|
|
language (str): Language code ('en' or 'fr')
|
|
is_specific_page (bool): Whether this is a specific page rather than a key
|
|
check_grammar (bool): Whether to check grammar for French pages
|
|
|
|
Returns:
|
|
dict: Dictionary with page information or None if page doesn't exist
|
|
"""
|
|
# Skip pages with "FR:User:" or "FR:Réunions"
|
|
if "FR:User:" in key or "FR:Réunions" in key:
|
|
logger.info(f"Skipping excluded page: {key}")
|
|
return None
|
|
# Handle different URL formats
|
|
if is_specific_page:
|
|
# Case 1: Full URL
|
|
if key.startswith('http'):
|
|
url = key
|
|
# Extract the page title from the URL
|
|
page_title = key.split('/')[-1]
|
|
# Determine language from URL
|
|
if 'FR:' in key or '/FR:' in key:
|
|
language = 'fr'
|
|
else:
|
|
language = 'en'
|
|
# Case 2: Page with FR: prefix
|
|
elif key.startswith('FR:'):
|
|
url = f"{WIKI_BASE_URL}{key}"
|
|
page_title = key[3:] # Remove FR: prefix for title
|
|
language = 'fr'
|
|
# Case 3: Regular page title
|
|
else:
|
|
if language == 'fr':
|
|
url = f"{WIKI_BASE_URL}FR:{key}"
|
|
else:
|
|
url = f"{WIKI_BASE_URL}{key}"
|
|
page_title = key
|
|
else:
|
|
# Regular key page
|
|
base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
|
|
url = f"{base_url}{key}"
|
|
page_title = key
|
|
|
|
    # Create a unique cache filename based on the URL
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
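    # Example (hash value illustrative): the URL
    # https://wiki.openstreetmap.org/wiki/Key:highway is cached as
    # html_cache/<md5-of-url>.html, so later runs can reuse the downloaded HTML.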
|
|
|
|
html_content = None
|
|
|
|
# Try to load from cache first
|
|
if cache_file.exists():
|
|
logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
|
|
try:
|
|
with open(cache_file, 'r', encoding='utf-8') as f:
|
|
html_content = f.read()
|
|
except Exception as e:
|
|
logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
|
|
html_content = None
|
|
|
|
# If not in cache or cache read failed, fetch from web
|
|
if html_content is None:
|
|
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
|
|
try:
|
|
response = requests.get(url)
|
|
|
|
# Check if page exists
|
|
if response.status_code == 404:
|
|
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
|
|
return None
|
|
|
|
response.raise_for_status()
|
|
html_content = response.text
|
|
|
|
# Save to cache
|
|
try:
|
|
with open(cache_file, 'w', encoding='utf-8') as f:
|
|
f.write(html_content)
|
|
logger.info(f"Saved {language} wiki page to cache for {'page' if is_specific_page else 'key'} '{key}'")
|
|
except Exception as e:
|
|
logger.warning(f"Error saving to cache: {e}")
|
|
except requests.exceptions.RequestException as e:
|
|
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
|
|
return None
|
|
|
|
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
# Get last modification date
|
|
last_modified = None
|
|
footer_info = soup.select_one('#footer-info-lastmod')
|
|
if footer_info:
|
|
date_text = footer_info.text
|
|
# Extract date using regex
|
|
date_match = re.search(r'(\d{1,2} \w+ \d{4})', date_text)
|
|
if date_match:
|
|
date_str = date_match.group(1)
|
|
try:
|
|
# Parse date (format may vary based on wiki language)
|
|
last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
|
|
except ValueError:
|
|
logger.warning(f"Could not parse date: {date_str}")
|
|
|
|
# Extract sections (h2, h3, h4)
|
|
section_elements = soup.select('h2, h3, h4')
|
|
sections = len(section_elements)
|
|
|
|
# Extract section titles
|
|
section_titles = []
|
|
for section_elem in section_elements:
|
|
# Skip sections that are part of the table of contents, navigation, or DescriptionBox
|
|
if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
|
|
continue
|
|
|
|
# Skip sections that are inside a table with class DescriptionBox
|
|
if section_elem.find_parent('table', class_='DescriptionBox'):
|
|
continue
|
|
|
|
# Get the text of the section title, removing any edit links
|
|
for edit_link in section_elem.select('.mw-editsection'):
|
|
edit_link.extract()
|
|
|
|
section_title = section_elem.get_text(strip=True)
|
|
section_level = int(section_elem.name[1]) # h2 -> 2, h3 -> 3, h4 -> 4
|
|
|
|
section_titles.append({
|
|
'title': section_title,
|
|
'level': section_level
|
|
})
|
|
|
|
# Count words and sentences in the content
|
|
content = soup.select_one('#mw-content-text')
|
|
clean_text = ""
|
|
if content:
|
|
# Remove script and style elements
|
|
for script in content.select('script, style'):
|
|
script.extract()
|
|
|
|
# Remove .languages elements
|
|
for languages_elem in content.select('.languages'):
|
|
languages_elem.extract()
|
|
|
|
# Get text and count words
|
|
clean_text = content.get_text(separator=' ', strip=True)
|
|
word_count = len(clean_text.split())
|
|
|
|
# Count sentences using NLTK if available, otherwise use a simple approximation
|
|
if NLTK_AVAILABLE and check_grammar:
|
|
sentences = nltk.sent_tokenize(clean_text)
|
|
sentence_count = len(sentences)
|
|
else:
|
|
# Simple approximation: count periods, exclamation marks, and question marks
|
|
sentence_count = len(re.findall(r'[.!?]+', clean_text))
|
|
|
|
# Check grammar for French pages
|
|
grammar_suggestions = []
|
|
if language == 'fr' and check_grammar:
|
|
logger.info(f"Checking grammar for French page: {key}")
|
|
grammar_suggestions = check_grammar_with_grammalecte(clean_text)
|
|
elif language == 'fr' and not check_grammar:
|
|
logger.info(f"Grammar checking disabled for French page: {key}")
|
|
|
|
# Extract links
|
|
links = content.select('a')
|
|
link_count = len(links)
|
|
|
|
# Get link details (text and href)
|
|
link_details = []
|
|
for link in links:
|
|
href = link.get('href', '')
|
|
# Skip edit section links and other non-content links
|
|
if 'action=edit' in href or 'redlink=1' in href or not href:
|
|
continue
|
|
|
|
# Make relative URLs absolute
|
|
if href.startswith('/'):
|
|
href = 'https://wiki.openstreetmap.org' + href
|
|
|
|
link_text = link.get_text(strip=True)
|
|
if link_text: # Only include links with text
|
|
link_details.append({
|
|
'text': link_text,
|
|
'href': href
|
|
})
|
|
|
|
# Extract media (images)
|
|
media_elements = content.select('img')
|
|
media_count = len(media_elements)
|
|
|
|
# Get media details (src and alt text)
|
|
media_details = []
|
|
|
|
# Extract description image specifically
|
|
# Try multiple selectors to find the description image
|
|
description_img = None
|
|
|
|
# Debug: Log the key we're processing
|
|
logger.info(f"Looking for description image for key '{key}' in {language}")
|
|
|
|
# Function to filter out OSM logo and small icons
|
|
def is_relevant_image(img):
|
|
src = img.get('src', '')
|
|
# Skip OSM logo
|
|
if 'osm_logo' in src:
|
|
return False
|
|
# Skip small icons (usually less than 30px)
|
|
width = img.get('width')
|
|
if width and int(width) < 30:
|
|
return False
|
|
height = img.get('height')
|
|
if height and int(height) < 30:
|
|
return False
|
|
return True
|
|
|
|
# Special case for highway key - directly target the image we want
|
|
if key == 'highway':
|
|
# Try to find the specific image in figure elements
|
|
highway_img_elements = content.select('figure.mw-halign-center img')
|
|
logger.info(f" Highway specific selector 'figure.mw-halign-center img' found {len(highway_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in highway_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images for highway")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using highway-specific image: {description_img.get('src', '')}")
|
|
|
|
# If not found with highway-specific selector, try the td.d_image selector
|
|
if not description_img:
|
|
description_img_elements = content.select('td.d_image img')
|
|
logger.info(f" Selector 'td.d_image img' found {len(description_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in td.d_image")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using image from 'td.d_image img': {description_img.get('src', '')}")
|
|
|
|
# If still not found, try the specific selector for .description img.mw-file-element
|
|
if not description_img:
|
|
description_img_elements = content.select('.description img.mw-file-element')
|
|
logger.info(f" Selector '.description img.mw-file-element' found {len(description_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in .description")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using image from '.description img.mw-file-element': {description_img.get('src', '')}")
|
|
|
|
# If still not found, try images in figures within the description box
|
|
if not description_img:
|
|
description_img_elements = content.select('.description figure img')
|
|
logger.info(f" Selector '.description figure img' found {len(description_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in .description figure")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using image from '.description figure img': {description_img.get('src', '')}")
|
|
|
|
# If still not found, try any image in the description box
|
|
if not description_img:
|
|
description_img_elements = content.select('.description img')
|
|
logger.info(f" Selector '.description img' found {len(description_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in .description general")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using image from '.description img': {description_img.get('src', '')}")
|
|
|
|
# If still not found, try images in the DescriptionBox table
|
|
if not description_img:
|
|
description_img_elements = content.select('table.DescriptionBox img')
|
|
logger.info(f" Selector 'table.DescriptionBox img' found {len(description_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in DescriptionBox")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using image from 'table.DescriptionBox img': {description_img.get('src', '')}")
|
|
|
|
# If still not found, try images in figure elements anywhere in the content
|
|
if not description_img:
|
|
description_img_elements = content.select('figure img')
|
|
logger.info(f" Selector 'figure img' found {len(description_img_elements)} elements")
|
|
|
|
# Filter for relevant images
|
|
relevant_images = [img for img in description_img_elements if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in figure elements")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using image from 'figure img': {description_img.get('src', '')}")
|
|
|
|
# If we still don't have an image, use any image that's not the OSM logo
|
|
if not description_img:
|
|
all_images = content.select('img')
|
|
relevant_images = [img for img in all_images if is_relevant_image(img)]
|
|
logger.info(f" Found {len(relevant_images)} relevant images in the entire page")
|
|
|
|
if relevant_images:
|
|
description_img = relevant_images[0]
|
|
logger.info(f" Using fallback image: {description_img.get('src', '')}")
|
|
|
|
# Process the found image
|
|
description_img_url = None
|
|
if description_img:
|
|
src = description_img.get('src', '')
|
|
if src:
|
|
# Make relative URLs absolute
|
|
if src.startswith('//'):
|
|
src = 'https:' + src
|
|
elif src.startswith('/'):
|
|
src = 'https://wiki.openstreetmap.org' + src
|
|
|
|
description_img_url = src
|
|
|
|
# Process all images
|
|
for img in media_elements:
|
|
src = img.get('src', '')
|
|
if src:
|
|
# Make relative URLs absolute
|
|
if src.startswith('//'):
|
|
src = 'https:' + src
|
|
elif src.startswith('/'):
|
|
src = 'https://wiki.openstreetmap.org' + src
|
|
|
|
alt_text = img.get('alt', '')
|
|
media_details.append({
|
|
'src': src,
|
|
'alt': alt_text
|
|
})
|
|
|
|
# Extract categories
|
|
categories = []
|
|
category_links = soup.select('#mw-normal-catlinks li a')
|
|
for cat_link in category_links:
|
|
categories.append(cat_link.get_text(strip=True))
|
|
    else:
        # No content element found: default every metric so the return dict below
        # is complete (sentence_count and description_img_url included, otherwise
        # the return statement would raise a NameError)
        word_count = 0
        sentence_count = 0
        link_count = 0
        link_details = []
        media_count = 0
        media_details = []
        categories = []
        description_img_url = None
        grammar_suggestions = []
|
|
|
|
return {
|
|
'key': key,
|
|
'page_title': page_title,
|
|
'language': language,
|
|
'url': url,
|
|
'last_modified': last_modified,
|
|
'sections': sections,
|
|
'section_titles': section_titles,
|
|
'word_count': word_count,
|
|
'sentence_count': sentence_count,
|
|
'link_count': link_count,
|
|
'link_details': link_details,
|
|
'media_count': media_count,
|
|
'media_details': media_details,
|
|
'categories': categories,
|
|
'description_img_url': description_img_url,
|
|
'is_specific_page': is_specific_page,
|
|
'grammar_suggestions': grammar_suggestions,
|
|
'html_content': html_content
|
|
}
|
|
|
|
def generate_staleness_histogram(wiki_pages):
|
|
"""
|
|
Generate a histogram of staleness scores by 10% ranges
|
|
|
|
Args:
|
|
wiki_pages (list): List of dictionaries containing page information with staleness scores
|
|
|
|
Returns:
|
|
None: Saves the histogram to a file
|
|
"""
|
|
logger.info("Generating histogram of staleness scores by 10% ranges...")
|
|
|
|
# Extract staleness scores
|
|
staleness_scores = []
|
|
for page in wiki_pages:
|
|
if page and 'staleness_score' in page:
|
|
staleness_scores.append(page['staleness_score'])
|
|
|
|
if not staleness_scores:
|
|
logger.warning("No staleness scores found. Cannot generate histogram.")
|
|
return
|
|
|
|
    # Determine the maximum score for binning
    max_score = max(staleness_scores)
    # Round up to the nearest 10 to ensure all scores are included
    max_bin_edge = np.ceil(max_score / 10) * 10

    # Create bins for 10% ranges
    bins = np.arange(0, max_bin_edge + 10, 10)
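    # Worked example with an assumed max_score of 57: max_bin_edge = 60 and
    # bins = [0, 10, 20, 30, 40, 50, 60], i.e. six 10%-wide ranges.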
|
|
|
|
# Count scores in each bin
|
|
hist, bin_edges = np.histogram(staleness_scores, bins=bins)
|
|
|
|
# Create histogram
|
|
plt.figure(figsize=(12, 6))
|
|
|
|
# Create bar chart
|
|
plt.bar(range(len(hist)), hist, align='center')
|
|
|
|
# Set x-axis labels for each bin
|
|
bin_labels = [f"{int(bin_edges[i])}-{int(bin_edges[i+1])}%" for i in range(len(bin_edges)-1)]
|
|
plt.xticks(range(len(hist)), bin_labels, rotation=45)
|
|
|
|
# Set labels and title
|
|
plt.xlabel('Tranches de score de décrépitude (en %)')
|
|
plt.ylabel('Nombre de pages')
|
|
plt.title('Répartition du score de décrépitude par tranches de 10%')
|
|
|
|
# Add grid for better readability
|
|
plt.grid(axis='y', linestyle='--', alpha=0.7)
|
|
|
|
# Adjust layout
|
|
plt.tight_layout()
|
|
|
|
# Save figure
|
|
plt.savefig(STALENESS_HISTOGRAM_FILE)
|
|
logger.info(f"Histogram saved to {STALENESS_HISTOGRAM_FILE}")
|
|
|
|
# Close the figure to free memory
|
|
plt.close()
|
|
|
|
def analyze_wiki_pages(pages):
|
|
"""
|
|
Analyze wiki pages to determine which ones need updating
|
|
|
|
Args:
|
|
pages (list): List of dictionaries containing page information
|
|
|
|
Returns:
|
|
list: List of pages that need updating, sorted by priority
|
|
"""
|
|
logger.info("Analyzing wiki pages to identify those needing updates...")
|
|
|
|
# Group pages by key
|
|
pages_by_key = {}
|
|
for page in pages:
|
|
if page is None:
|
|
continue
|
|
|
|
key = page['key']
|
|
if key not in pages_by_key:
|
|
pages_by_key[key] = {}
|
|
|
|
pages_by_key[key][page['language']] = page
|
|
|
|
# Analyze each key's pages
|
|
needs_update = []
|
|
|
|
for key, lang_pages in pages_by_key.items():
|
|
# Skip if either language is missing
|
|
if 'en' not in lang_pages or 'fr' not in lang_pages:
|
|
if 'en' in lang_pages:
|
|
# French page is missing
|
|
# For missing French pages, calculate a high staleness score
|
|
# Use word count as the main factor (50% weight)
|
|
missing_staleness_score = (
|
|
30 * 0.2 + # Assume 30 days outdated (20%)
|
|
lang_pages['en']['word_count'] / 100 * 0.5 + # Word count (50%)
|
|
lang_pages['en']['sections'] * 0.15 + # Sections (15%)
|
|
lang_pages['en']['link_count'] / 10 * 0.15 # Links (15%)
|
|
)
|
|
|
|
# Round to 2 decimal places and ensure it's high
|
|
missing_staleness_score = max(100, round(missing_staleness_score, 2))
|
|
|
|
# Get media count or default to 0
|
|
media_count = lang_pages['en'].get('media_count', 0)
|
|
|
|
needs_update.append({
|
|
'key': key,
|
|
'reason': 'French page missing',
|
|
'en_page': lang_pages['en'],
|
|
'fr_page': None,
|
|
'date_diff': 0,
|
|
'word_diff': lang_pages['en']['word_count'],
|
|
'section_diff': lang_pages['en']['sections'],
|
|
'link_diff': lang_pages['en']['link_count'],
|
|
'media_diff': media_count,
|
|
'staleness_score': missing_staleness_score,
|
|
'priority': missing_staleness_score, # Use staleness score as priority
|
|
'section_comparison': None, # No comparison possible
|
|
'link_comparison': None, # No comparison possible
|
|
'media_comparison': None, # No comparison possible
|
|
'category_comparison': None # No comparison possible
|
|
})
|
|
continue
|
|
|
|
en_page = lang_pages['en']
|
|
fr_page = lang_pages['fr']
|
|
|
|
# Skip if dates are missing
|
|
if not en_page['last_modified'] or not fr_page['last_modified']:
|
|
continue
|
|
|
|
# Calculate date difference in days
|
|
en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
|
|
fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
|
|
date_diff = (en_date - fr_date).days
|
|
|
|
# Calculate content differences
|
|
word_diff = en_page['word_count'] - fr_page['word_count']
|
|
section_diff = en_page['sections'] - fr_page['sections']
|
|
link_diff = en_page['link_count'] - fr_page['link_count']
|
|
media_diff = en_page.get('media_count', 0) - fr_page.get('media_count', 0)
|
|
|
|
        # Calculate staleness score (higher means more outdated/stale)
        # Weight factors adjusted to emphasize word count differences
        staleness_score = (
            abs(date_diff) * 0.2 +        # Date difference (20%)
            abs(word_diff) / 100 * 0.5 +  # Word count difference (normalized) (50%)
            abs(section_diff) * 0.15 +    # Section difference (15%)
            abs(link_diff) / 10 * 0.15    # Link count difference (normalized) (15%)
        )

        # Round to 2 decimal places for display
        staleness_score = round(staleness_score, 2)
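        # Worked example with assumed differences: date_diff=60 days, word_diff=500 words,
        # section_diff=3, link_diff=40 links ->
        # 60*0.2 + 500/100*0.5 + 3*0.15 + 40/10*0.15 = 12 + 2.5 + 0.45 + 0.6 = 15.55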
|
|
|
|
# Compare sections between English and French pages
|
|
section_comparison = {
|
|
'en_only': [],
|
|
'fr_only': [],
|
|
'common': []
|
|
}
|
|
|
|
# Group sections by their level for hierarchical comparison
|
|
en_sections_by_level = {}
|
|
fr_sections_by_level = {}
|
|
|
|
# Organize English sections by level
|
|
for section in en_page.get('section_titles', []):
|
|
level = section['level']
|
|
if level not in en_sections_by_level:
|
|
en_sections_by_level[level] = []
|
|
en_sections_by_level[level].append(section)
|
|
|
|
# Organize French sections by level
|
|
for section in fr_page.get('section_titles', []):
|
|
level = section['level']
|
|
if level not in fr_sections_by_level:
|
|
fr_sections_by_level[level] = []
|
|
fr_sections_by_level[level].append(section)
|
|
|
|
# Process each level to find matching sections
|
|
all_levels = set(list(en_sections_by_level.keys()) + list(fr_sections_by_level.keys()))
|
|
|
|
for level in all_levels:
|
|
en_level_sections = en_sections_by_level.get(level, [])
|
|
fr_level_sections = fr_sections_by_level.get(level, [])
|
|
|
|
# Create dictionaries for easier lookup, using lowercase titles
|
|
en_dict = {section['title'].lower(): section for section in en_level_sections}
|
|
fr_dict = {section['title'].lower(): section for section in fr_level_sections}
|
|
|
|
# Find sections at this level only in English
|
|
for title, section in en_dict.items():
|
|
if title not in fr_dict:
|
|
section_comparison['en_only'].append(section)
|
|
|
|
# Find sections at this level only in French
|
|
for title, section in fr_dict.items():
|
|
if title not in en_dict:
|
|
section_comparison['fr_only'].append(section)
|
|
|
|
# Find common sections at this level
|
|
for title in en_dict.keys():
|
|
if title in fr_dict:
|
|
section_comparison['common'].append({
|
|
'en': en_dict[title],
|
|
'fr': fr_dict[title]
|
|
})
|
|
|
|
# Compare links between English and French pages
|
|
link_comparison = {
|
|
'en_only': [],
|
|
'fr_only': [],
|
|
'common': []
|
|
}
|
|
|
|
# Extract link texts for comparison (case insensitive)
|
|
en_links = {link['text'].lower(): link for link in en_page.get('link_details', [])}
|
|
fr_links = {link['text'].lower(): link for link in fr_page.get('link_details', [])}
|
|
|
|
# Find links only in English
|
|
for text, link in en_links.items():
|
|
if text not in fr_links:
|
|
link_comparison['en_only'].append(link)
|
|
|
|
# Find links only in French
|
|
for text, link in fr_links.items():
|
|
if text not in en_links:
|
|
link_comparison['fr_only'].append(link)
|
|
|
|
# Find common links
|
|
for text in en_links.keys():
|
|
if text in fr_links:
|
|
link_comparison['common'].append({
|
|
'en': en_links[text],
|
|
'fr': fr_links[text]
|
|
})
|
|
|
|
# Compare media between English and French pages
|
|
media_comparison = {
|
|
'en_only': [],
|
|
'fr_only': [],
|
|
'common': []
|
|
}
|
|
|
|
# Extract media alt texts for comparison (case insensitive)
|
|
en_media = {media['alt'].lower(): media for media in en_page.get('media_details', []) if media['alt']}
|
|
fr_media = {media['alt'].lower(): media for media in fr_page.get('media_details', []) if media['alt']}
|
|
|
|
# Find media only in English
|
|
for alt, media in en_media.items():
|
|
if alt not in fr_media:
|
|
media_comparison['en_only'].append(media)
|
|
|
|
# Find media only in French
|
|
for alt, media in fr_media.items():
|
|
if alt not in en_media:
|
|
media_comparison['fr_only'].append(media)
|
|
|
|
# Find common media
|
|
for alt in en_media.keys():
|
|
if alt in fr_media:
|
|
media_comparison['common'].append({
|
|
'en': en_media[alt],
|
|
'fr': fr_media[alt]
|
|
})
|
|
|
|
# Add media without alt text to their respective language-only lists
|
|
for media in en_page.get('media_details', []):
|
|
if not media['alt'] or media['alt'].lower() not in en_media:
|
|
media_comparison['en_only'].append(media)
|
|
|
|
for media in fr_page.get('media_details', []):
|
|
if not media['alt'] or media['alt'].lower() not in fr_media:
|
|
media_comparison['fr_only'].append(media)
|
|
|
|
# Compare categories between English and French pages
|
|
category_comparison = {
|
|
'en_only': [],
|
|
'fr_only': [],
|
|
'common': []
|
|
}
|
|
|
|
# Extract categories for comparison (case insensitive)
|
|
en_categories = [cat.lower() for cat in en_page.get('categories', [])]
|
|
fr_categories = [cat.lower() for cat in fr_page.get('categories', [])]
|
|
|
|
# Find categories only in English
|
|
for cat in en_page.get('categories', []):
|
|
if cat.lower() not in fr_categories:
|
|
category_comparison['en_only'].append(cat)
|
|
|
|
# Find categories only in French
|
|
for cat in fr_page.get('categories', []):
|
|
if cat.lower() not in en_categories:
|
|
category_comparison['fr_only'].append(cat)
|
|
|
|
# Find common categories
|
|
for cat in en_page.get('categories', []):
|
|
if cat.lower() in fr_categories:
|
|
category_comparison['common'].append(cat)
|
|
|
|
if date_diff > 30 or word_diff > 200 or section_diff > 2 or link_diff > 20 or fr_page['word_count'] < en_page['word_count'] * 0.7:
|
|
reason = []
|
|
if date_diff > 30:
|
|
reason.append(f"La version Française est datée de {date_diff} jours")
|
|
if word_diff > 200:
|
|
reason.append(f"La version Anglaise a {word_diff} mots de plus")
|
|
if section_diff > 2:
|
|
reason.append(f"La version Anglaise a {section_diff} sections de plus")
|
|
if link_diff > 20:
|
|
reason.append(f"La version Anglaise a {link_diff} liens de plus")
|
|
if media_diff > 5:
|
|
reason.append(f"La version Anglaise a {media_diff} images de plus")
|
|
            if fr_page['word_count'] < en_page['word_count'] * 0.7:
                reason.append(f"La version Française a seulement {fr_page['word_count'] / en_page['word_count']:.0%} du contenu en Anglais.")
|
|
|
|
needs_update.append({
|
|
'key': key,
|
|
'reason': ', '.join(reason),
|
|
'en_page': en_page,
|
|
'fr_page': fr_page,
|
|
'date_diff': date_diff,
|
|
'word_diff': word_diff,
|
|
'section_diff': section_diff,
|
|
'link_diff': link_diff,
|
|
'media_diff': media_diff,
|
|
'staleness_score': staleness_score,
|
|
'priority': staleness_score, # Use staleness score as priority
|
|
'section_comparison': section_comparison,
|
|
'link_comparison': link_comparison,
|
|
'media_comparison': media_comparison,
|
|
'category_comparison': category_comparison
|
|
})
|
|
|
|
# Sort by priority (descending)
|
|
needs_update.sort(key=lambda x: x['priority'], reverse=True)
|
|
|
|
return needs_update
|
|
|
|
def main():
|
|
"""
|
|
Main function to execute the script
|
|
|
|
This function:
|
|
1. Fetches the top OSM keys from TagInfo API
|
|
2. Fetches keys used in France that are missing a wiki page from TagInfo API
|
|
3. Fetches and processes wiki pages for these keys
|
|
4. Processes specific wiki pages listed in SPECIFIC_PAGES
|
|
5. Processes pages from the FR:Traductions_désynchronisées category
|
|
6. Processes pages starting with "France" from the DeadendPages list
|
|
7. Calculates staleness scores for all pages
|
|
8. Generates a histogram of staleness scores
|
|
9. Saves the results to CSV and JSON files
|
|
10. Prints a list of pages that need updating
|
|
"""
|
|
# Parse command-line arguments
|
|
parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
|
|
parser.add_argument('--no-grammar-check', action='store_true',
|
|
help='Disable grammar checking for French pages', default=False)
|
|
args = parser.parse_args()
|
|
|
|
# Whether to check grammar for French pages
|
|
check_grammar = not args.no_grammar_check
|
|
|
|
logger.info("Starting wiki_compare.py")
|
|
logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
|
|
|
|
# Create output directory if it doesn't exist
|
|
os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
|
|
|
|
# Fetch top keys
|
|
top_keys = fetch_top_keys(NUM_WIKI_PAGES)
|
|
|
|
if not top_keys:
|
|
logger.error("Failed to fetch top keys. Exiting.")
|
|
return
|
|
|
|
# Save top keys to JSON
|
|
save_to_json(top_keys, TOP_KEYS_FILE)
|
|
|
|
# Fetch keys without wiki pages used in France
|
|
keys_without_wiki = fetch_keys_without_wiki_page()
|
|
|
|
if keys_without_wiki:
|
|
# Save keys without wiki pages to JSON
|
|
save_to_json(keys_without_wiki, KEYS_WITHOUT_WIKI_FILE)
|
|
logger.info(f"Saved {len(keys_without_wiki)} keys without wiki pages to {KEYS_WITHOUT_WIKI_FILE}")
|
|
else:
|
|
logger.warning("No keys without wiki pages were fetched.")
|
|
|
|
# Fetch pages starting with "France" from the DeadendPages list
|
|
deadend_pages = fetch_deadend_pages()
|
|
|
|
if deadend_pages:
|
|
# Load existing deadend pages data to compare with history
|
|
existing_data = load_json_data(DEADEND_PAGES_FILE)
|
|
|
|
# Initialize history if it doesn't exist
|
|
if 'history' not in existing_data:
|
|
existing_data['history'] = {}
|
|
|
|
# Get the most recent history entry
|
|
sorted_timestamps = sorted(existing_data.get('history', {}).keys())
|
|
previous_pages = []
|
|
if sorted_timestamps:
|
|
latest_timestamp = sorted_timestamps[-1]
|
|
previous_pages = existing_data['history'][latest_timestamp].get('pages', [])
|
|
|
|
# Find pages that were in the previous list but are no longer in the current list
|
|
previous_urls = [page['url'] for page in previous_pages]
|
|
current_urls = [page['url'] for page in deadend_pages]
|
|
|
|
categorized_pages = []
|
|
for url in previous_urls:
|
|
if url not in current_urls:
|
|
# Find the page in previous_pages
|
|
for page in previous_pages:
|
|
if page['url'] == url:
|
|
# This page is no longer in the DeadendPages list, which means it has been categorized
|
|
categorized_pages.append(page)
|
|
break
|
|
|
|
# Create a timestamp for the current data
|
|
current_timestamp = datetime.now().isoformat()
|
|
|
|
# Create the history entry
|
|
history_entry = {
|
|
'pages': deadend_pages,
|
|
'categorized_pages': categorized_pages
|
|
}
|
|
|
|
# Add the entry to history with timestamp as key
|
|
existing_data['history'][current_timestamp] = history_entry
|
|
|
|
# Update the current data
|
|
existing_data['pages'] = deadend_pages
|
|
existing_data['categorized_pages'] = categorized_pages
|
|
existing_data['last_updated'] = current_timestamp
|
|
|
|
# Save the updated data
|
|
save_to_json(existing_data, DEADEND_PAGES_FILE)
|
|
logger.info(f"Saved {len(deadend_pages)} deadend pages to {DEADEND_PAGES_FILE}")
|
|
logger.info(f"Found {len(categorized_pages)} pages that have been categorized since the last run")
|
|
else:
|
|
logger.warning("No deadend pages were fetched.")
|
|
|
|
# Fetch wiki pages for each key
|
|
wiki_pages = []
|
|
|
|
# Process top keys
|
|
logger.info("Processing top keys...")
|
|
for key_info in top_keys:
|
|
key = key_info['key']
|
|
|
|
# Fetch English page
|
|
en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
|
|
if en_page:
|
|
wiki_pages.append(en_page)
|
|
|
|
# Fetch French page
|
|
fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
|
|
if fr_page:
|
|
wiki_pages.append(fr_page)
|
|
|
|

    # Process specific pages from the SPECIFIC_PAGES list
    # These are additional pages to compare beyond the top keys from TagInfo
    logger.info("Processing specific pages...")
    for page in SPECIFIC_PAGES:
        # For specific pages, we need to handle different formats

        # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
        if page.startswith('http'):
            # For full URLs, we directly fetch the page
            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
            if page_info:
                wiki_pages.append(page_info)

                # If it's a French page, try to find the English equivalent
                if page_info['language'] == 'fr':
                    # Try to get the English version by removing the FR: prefix
                    en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                    en_url = f"{WIKI_BASE_URL}{en_title}"
                    logger.info(f"Trying to find English equivalent for {page}: {en_url}")
                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                    if en_page:
                        wiki_pages.append(en_page)
                # If it's an English page, try to find the French equivalent
                else:
                    # Try to get the French version by adding the FR: prefix
                    fr_title = f"FR:{page_info['page_title']}"
                    fr_url = f"{WIKI_BASE_URL}{fr_title}"
                    logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                    if fr_page:
                        wiki_pages.append(fr_page)

        # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
        elif page.startswith('FR:'):
            # Fetch the French page
            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
            if fr_page:
                wiki_pages.append(fr_page)

                # Try to get the English version by removing the FR: prefix
                en_title = page[3:]  # Remove FR: prefix
                en_url = f"{WIKI_BASE_URL}{en_title}"
                logger.info(f"Trying to find English equivalent for {page}: {en_url}")
                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                if en_page:
                    wiki_pages.append(en_page)

        # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
        else:
            # Fetch the English page
            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
            if en_page:
                wiki_pages.append(en_page)

            # Fetch the French page (by adding the FR: prefix)
            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
            if fr_page:
                wiki_pages.append(fr_page)

    # Process pages from the FR:Traductions_désynchronisées category
    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
    desynchronized_pages = fetch_desynchronized_pages()
    for page_url in desynchronized_pages:
        # Fetch the French page
        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
        if fr_page:
            wiki_pages.append(fr_page)

            # Try to find the English equivalent
            if fr_page['page_title'].startswith('FR:'):
                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
            else:
                en_title = fr_page['page_title']

            en_url = f"{WIKI_BASE_URL}{en_title}"
            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
            if en_page:
                wiki_pages.append(en_page)

    # Process wiki pages to add staleness score
    processed_wiki_pages = []
    pages_by_key = {}

    # Group pages by key
    for page in wiki_pages:
        if page is None:
            continue

        key = page['key']
        if key not in pages_by_key:
            pages_by_key[key] = {}

        pages_by_key[key][page['language']] = page
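
    # pages_by_key now maps each key to its pages by language, e.g. (illustrative):
    #   {'building': {'en': {...}, 'fr': {...}}, 'highway': {'en': {...}}}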

    # Calculate staleness score for each pair of pages
    for key, lang_pages in pages_by_key.items():
        # Add English page with staleness score
        if 'en' in lang_pages:
            en_page = lang_pages['en'].copy()

            # If French page exists, calculate staleness score
            if 'fr' in lang_pages:
                fr_page = lang_pages['fr']

                # Only compute the score when both modification dates are present
                if en_page['last_modified'] and fr_page['last_modified']:
                    # Calculate date difference in days
                    en_date = datetime.strptime(en_page['last_modified'], '%Y-%m-%d')
                    fr_date = datetime.strptime(fr_page['last_modified'], '%Y-%m-%d')
                    date_diff = (en_date - fr_date).days

                    # Calculate content differences
                    word_diff = en_page['word_count'] - fr_page['word_count']
                    section_diff = en_page['sections'] - fr_page['sections']
                    link_diff = en_page['link_count'] - fr_page['link_count']

                    # Calculate staleness score
                    staleness_score = (
                        abs(date_diff) * 0.2 +
                        abs(word_diff) / 100 * 0.5 +
                        abs(section_diff) * 0.15 +
                        abs(link_diff) / 10 * 0.15
                    )

                    # Round to 2 decimal places
                    staleness_score = round(staleness_score, 2)
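                    # Worked example with illustrative values: a 100-day gap, 500 extra
                    # words, 3 extra sections and 20 extra links would give
                    # 100*0.2 + 500/100*0.5 + 3*0.15 + 20/10*0.15 = 23.25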

                    en_page['staleness_score'] = staleness_score
                    fr_page['staleness_score'] = staleness_score
                else:
                    en_page['staleness_score'] = 0
                    fr_page['staleness_score'] = 0

                processed_wiki_pages.append(en_page)
                processed_wiki_pages.append(fr_page)
            else:
                # French page is missing, so assign a high staleness score
                missing_staleness_score = (
                    30 * 0.2 +
                    en_page['word_count'] / 100 * 0.5 +
                    en_page['sections'] * 0.15 +
                    en_page['link_count'] / 10 * 0.15
                )

                # Round to 2 decimal places, then floor at 100 so a missing
                # translation always scores at least 100
                missing_staleness_score = max(100, round(missing_staleness_score, 2))
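                # Illustrative example: word_count=2000, sections=10 and link_count=50
                # give 6 + 10 + 1.5 + 0.75 = 18.25, which the floor above raises to 100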

                en_page['staleness_score'] = missing_staleness_score
                processed_wiki_pages.append(en_page)

        # Add French page without English counterpart (rare case)
        elif 'fr' in lang_pages:
            fr_page = lang_pages['fr'].copy()
            fr_page['staleness_score'] = 0
            processed_wiki_pages.append(fr_page)

    # Generate histogram of staleness scores
    generate_staleness_histogram(processed_wiki_pages)

    # Save processed wiki pages to CSV
    try:
        with open(WIKI_PAGES_CSV, 'w', newline='', encoding='utf-8') as f:
            # Basic fields for CSV (detailed content will be in JSON only)
            fieldnames = ['key', 'language', 'url', 'last_modified', 'sections', 'word_count', 'link_count', 'media_count', 'staleness_score', 'description_img_url']
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()
            for page in processed_wiki_pages:
                if page:  # Skip None values
                    # Create a copy with only the CSV fields
                    csv_page = {field: page.get(field, '') for field in fieldnames if field in page}
                    writer.writerow(csv_page)

        logger.info(f"Wiki page data saved to {WIKI_PAGES_CSV}")

    except IOError as e:
        logger.error(f"Error saving data to {WIKI_PAGES_CSV}: {e}")
        return
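
    # For reference, the resulting CSV starts with this header row:
    #   key,language,url,last_modified,sections,word_count,link_count,media_count,staleness_score,description_img_url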

    # Analyze pages to find those needing updates
    pages_to_update = analyze_wiki_pages(wiki_pages)

    # Separate regular pages and specific pages
    regular_pages = []
    specific_pages = []

    for page in pages_to_update:
        # Check if either English or French page is marked as specific
        is_specific = False
        if page['en_page'] and page['en_page'].get('is_specific_page', False):
            is_specific = True
        elif page['fr_page'] and page['fr_page'].get('is_specific_page', False):
            is_specific = True

        if is_specific:
            specific_pages.append(page)
        else:
            regular_pages.append(page)

    # Create a structured output with separate sections
    output_data = {
        "regular_pages": regular_pages,
        "specific_pages": specific_pages,
        "last_updated": datetime.now().isoformat()
    }

    # Save pages that need updating to JSON with history
    save_with_history(output_data, OUTDATED_PAGES_FILE)

    # Print the top pages needing updates
    print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")

    for i, page in enumerate(pages_to_update[:NUM_WIKI_PAGES], 1):
        key = page['key']
        reason = page['reason']
        en_url = page['en_page']['url'] if page['en_page'] else "N/A"
        fr_url = page['fr_page']['url'] if page['fr_page'] else "N/A"

        print(f"{i}. Key: {key}")
        print(f" Reason: {reason}")
        print(f" English: {en_url}")
        print(f" French: {fr_url}")
        print()
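
    # Example of the output printed above (key and reason text are illustrative):
    #   1. Key: building
    #    Reason: French page is significantly older than the English page
    #    English: https://wiki.openstreetmap.org/wiki/Key:building
    #    French: https://wiki.openstreetmap.org/wiki/FR:Key:building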
logger.info("Script completed successfully")
|
|
|
|


if __name__ == "__main__":
    main()