add tracking of specific translation pages

Tykayn 2025-09-01 00:29:17 +02:00 committed by tykayn
parent 7a7704bc01
commit bd3d14e9f8
11 changed files with 48190 additions and 268240 deletions


@@ -8,6 +8,10 @@ This script fetches the most used OpenStreetMap keys from TagInfo,
compares their English and French wiki pages, and identifies which pages
need updating based on modification dates and content analysis.
The script also compares a specific list of wiki pages defined in the
SPECIFIC_PAGES constant. This list can include regular page titles,
full URLs, or pages with the FR: prefix.
Usage:
python wiki_compare.py
@@ -15,6 +19,7 @@ Output:
- top_keys.json: JSON file containing the most used OSM keys
- wiki_pages.csv: CSV file with information about each wiki page
- outdated_pages.json: JSON file containing pages that need updating
- staleness_histogram.png: Histogram of staleness scores (see the plotting sketch below)
- A console output listing the wiki pages that need updating
"""
@@ -47,14 +52,21 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
# Number of wiki pages to examine
NUM_WIKI_PAGES = 100
NUM_WIKI_PAGES = 1
# List of specific pages to compare
# List of specific pages to compare (in addition to top keys); see the decoding note after the list
# This list can include:
# 1. Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
# 2. Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
SPECIFIC_PAGES = [
"Anatomie_des_étiquettes_osm",
"https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois",
"FR:Tag:leisure%3Dchildren_club",
"FR:Tag:harassment_prevention%3Dask_angela"
"FR:Tag:harassment_prevention%3Dask_angela",
"Key:harassment_prevention",
"Proposal process",
"Automated_Edits_code_of_conduct",
"Key:cuisine"
]
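# Note: some entries above are URL-encoded ("%3D" is "="). A minimal
# decoding sketch using the standard library (normalize_entry is a
# hypothetical helper, not part of this commit):
#
#     from urllib.parse import unquote
#
#     def normalize_entry(entry):
#         if entry.startswith('http'):
#             entry = entry.rsplit('/wiki/', 1)[-1]  # keep only the page title
#         return unquote(entry)  # "leisure%3Dchildren_club" -> "leisure=children_club"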
def fetch_top_keys(limit=NUM_WIKI_PAGES):
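
The body of fetch_top_keys is elided between these hunks; fetching the most-used keys from TagInfo generally looks like the sketch below (a minimal version, not the committed implementation; /api/4/keys/all and its parameters are taginfo's documented public API, and error handling is omitted):

import requests

def fetch_top_keys_sketch(limit=100):
    """Return the `limit` most-used OSM keys, most common first."""
    response = requests.get(
        "https://taginfo.openstreetmap.org/api/4/keys/all",
        params={"page": 1, "rp": limit, "sortname": "count_all", "sortorder": "desc"},
    )
    response.raise_for_status()
    return [item["key"] for item in response.json()["data"]]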
@@ -110,6 +122,13 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
"""
Fetch wiki page for a given key or specific page
This function handles different types of wiki pages (URL resolution is sketched after this hunk):
1. Regular OSM key pages (e.g., "building", "highway")
2. Specific wiki pages that can be in various formats:
- Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
- Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
- Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
Args:
key (str): OSM key or specific page title/URL
language (str): Language code ('en' or 'fr')
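
Resolving those input shapes to a concrete URL could look like this. A minimal sketch of the URL resolution only; build_wiki_url and its exact rules are hypothetical, not the committed code, and the WIKI_BASE_URL value is inferred from the constant used later in this diff:

WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"  # as in the script's constants

def build_wiki_url(key, language='en'):
    """Build the wiki URL to fetch for an OSM key or a specific page."""
    if key.startswith('http'):
        return key                          # already a full URL
    if key.startswith('FR:'):
        return f"{WIKI_BASE_URL}{key}"      # explicit FR: page title
    prefix = 'FR:' if language == 'fr' else ''
    # Regular OSM keys would additionally need their "Key:" namespace here.
    return f"{WIKI_BASE_URL}{prefix}{key}"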
@@ -780,7 +799,18 @@ def analyze_wiki_pages(pages):
return needs_update
def main():
"""Main function to execute the script"""
"""
Main function to execute the script
This function:
1. Fetches the top OSM keys from TagInfo API
2. Fetches and processes wiki pages for these keys
3. Processes specific wiki pages listed in SPECIFIC_PAGES
4. Calculates staleness scores for all pages
5. Generates a histogram of staleness scores
6. Saves the results to CSV and JSON files
7. Prints a list of pages that need updating
"""
logger.info("Starting wiki_compare.py")
# Create output directory if it doesn't exist
@@ -814,12 +844,13 @@ def main():
if fr_page:
wiki_pages.append(fr_page)
# Process specific pages
# Process specific pages from the SPECIFIC_PAGES list
# These are additional pages to compare beyond the top keys from TagInfo
logger.info("Processing specific pages...")
for page in SPECIFIC_PAGES:
# For specific pages, we need to handle different formats
# Case 1: Full URL
# Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
if page.startswith('http'):
# For full URLs, we directly fetch the page
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
@@ -831,6 +862,7 @@ def main():
# Try to get the English version by removing FR: prefix
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
@@ -839,11 +871,12 @@ def main():
# Try to get the French version by adding FR: prefix
fr_title = f"FR:{page_info['page_title']}"
fr_url = f"{WIKI_BASE_URL}{fr_title}"
logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
# Case 2: Page with FR: prefix
# Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
elif page.startswith('FR:'):
# Fetch the French page
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
@@ -853,18 +886,19 @@ def main():
# Try to get the English version by removing FR: prefix
en_title = page[3:] # Remove FR: prefix
en_url = f"{WIKI_BASE_URL}{en_title}"
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# Case 3: Regular page title
# Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
else:
# Fetch the English page
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
if en_page:
wiki_pages.append(en_page)
# Fetch the French page
# Fetch the French page (by adding FR: prefix)
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
if fr_page:
wiki_pages.append(fr_page)
@@ -972,8 +1006,32 @@ def main():
# Analyze pages to find those needing updates
pages_to_update = analyze_wiki_pages(wiki_pages)
# Separate regular pages and specific pages
regular_pages = []
specific_pages = []
for page in pages_to_update:
# Check if either English or French page is marked as specific
is_specific = False
if page['en_page'] and page['en_page'].get('is_specific_page', False):
is_specific = True
elif page['fr_page'] and page['fr_page'].get('is_specific_page', False):
is_specific = True
if is_specific:
specific_pages.append(page)
else:
regular_pages.append(page)
# Create a structured output with separate sections
output_data = {
"regular_pages": regular_pages,
"specific_pages": specific_pages,
"last_updated": datetime.now().isoformat()
}
# Save pages that need updating to JSON
save_to_json(pages_to_update, OUTDATED_PAGES_FILE)
save_to_json(output_data, OUTDATED_PAGES_FILE)
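# With this change, outdated_pages.json carries the sections built above;
# a consumer could read it roughly like this (a sketch, not part of this script):
#
#     import json
#     with open(OUTDATED_PAGES_FILE) as f:
#         data = json.load(f)
#     print(len(data["regular_pages"]), len(data["specific_pages"]),
#           data["last_updated"])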
# Print the top pages needing updates
print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")