Add specific tracking of translation pages
This commit is contained in:
parent
7a7704bc01
commit
bd3d14e9f8
11 changed files with 48190 additions and 268240 deletions
|
@ -8,6 +8,10 @@ This script fetches the most used OpenStreetMap keys from TagInfo,
|
|||
compares their English and French wiki pages, and identifies which pages
|
||||
need updating based on modification dates and content analysis.
|
||||
|
||||
The script also compares a specific list of wiki pages defined in the
|
||||
SPECIFIC_PAGES constant. This list can include regular page titles,
|
||||
full URLs, or pages with FR: prefix.
|
||||
|
||||
Usage:
|
||||
python wiki_compare.py
|
||||
|
||||
|
@ -15,6 +19,7 @@ Output:
|
|||
- top_keys.json: JSON file containing the most used OSM keys
|
||||
- wiki_pages.csv: CSV file with information about each wiki page
|
||||
- outdated_pages.json: JSON file containing pages that need updating
|
||||
- staleness_histogram.png: Histogram of staleness scores
|
||||
- A console output listing the wiki pages that need updating
|
||||
"""
|
||||
|
||||
|
@ -47,14 +52,21 @@ WIKI_PAGES_CSV = "wiki_pages.csv"
|
|||
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
||||
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
|
||||
# Number of wiki pages to examine
|
||||
NUM_WIKI_PAGES = 100
|
||||
NUM_WIKI_PAGES = 1
|
||||
|
||||
# List of specific pages to compare
|
||||
# List of specific pages to compare (in addition to top keys)
|
||||
# This list can include:
|
||||
# 1. Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
|
||||
# 2. Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
|
||||
# 3. Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
|
||||
SPECIFIC_PAGES = [
|
||||
"Anatomie_des_étiquettes_osm",
|
||||
"https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois",
|
||||
"FR:Tag:leisure%3Dchildren_club",
|
||||
"FR:Tag:harassment_prevention%3Dask_angela"
|
||||
"FR:Tag:harassment_prevention%3Dask_angela",
|
||||
"Key:harassment_prevention",
|
||||
"Proposal process",
|
||||
"Automated_Edits_code_of_conduct",
|
||||
"Key:cuisine"
|
||||
]
|
||||
|
||||
def fetch_top_keys(limit=NUM_WIKI_PAGES):
|
||||
|
@ -110,6 +122,13 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
|
|||
"""
|
||||
Fetch wiki page for a given key or specific page
|
||||
|
||||
This function handles different types of wiki pages:
|
||||
1. Regular OSM key pages (e.g., "building", "highway")
|
||||
2. Specific wiki pages that can be in various formats:
|
||||
- Regular page titles (e.g., "Anatomie_des_étiquettes_osm")
|
||||
- Full URLs (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
|
||||
- Pages with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
|
||||
|
||||
Args:
|
||||
key (str): OSM key or specific page title/URL
|
||||
language (str): Language code ('en' or 'fr')
|
||||
|
@ -780,7 +799,18 @@ def analyze_wiki_pages(pages):
|
|||
return needs_update
|
||||
|
||||
def main():
|
||||
"""Main function to execute the script"""
|
||||
"""
|
||||
Main function to execute the script
|
||||
|
||||
This function:
|
||||
1. Fetches the top OSM keys from TagInfo API
|
||||
2. Fetches and processes wiki pages for these keys
|
||||
3. Processes specific wiki pages listed in SPECIFIC_PAGES
|
||||
4. Calculates staleness scores for all pages
|
||||
5. Generates a histogram of staleness scores
|
||||
6. Saves the results to CSV and JSON files
|
||||
7. Prints a list of pages that need updating
|
||||
"""
|
||||
logger.info("Starting wiki_compare.py")
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
|
@ -814,12 +844,13 @@ def main():
|
|||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Process specific pages
|
||||
# Process specific pages from the SPECIFIC_PAGES list
|
||||
# These are additional pages to compare beyond the top keys from TagInfo
|
||||
logger.info("Processing specific pages...")
|
||||
for page in SPECIFIC_PAGES:
|
||||
# For specific pages, we need to handle different formats
|
||||
|
||||
# Case 1: Full URL
|
||||
# Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
|
||||
if page.startswith('http'):
|
||||
# For full URLs, we directly fetch the page
|
||||
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
|
||||
|
@ -831,6 +862,7 @@ def main():
|
|||
# Try to get the English version by removing FR: prefix
|
||||
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
@ -839,11 +871,12 @@ def main():
|
|||
# Try to get the French version by adding FR: prefix
|
||||
fr_title = f"FR:{page_info['page_title']}"
|
||||
fr_url = f"{WIKI_BASE_URL}{fr_title}"
|
||||
logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
|
||||
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Case 2: Page with FR: prefix
|
||||
# Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
|
||||
elif page.startswith('FR:'):
|
||||
# Fetch the French page
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
|
||||
|
@ -853,18 +886,19 @@ def main():
|
|||
# Try to get the English version by removing FR: prefix
|
||||
en_title = page[3:] # Remove FR: prefix
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
logger.info(f"Trying to find English equivalent for {page}: {en_url}")
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Case 3: Regular page title
|
||||
# Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
|
||||
else:
|
||||
# Fetch the English page
|
||||
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Fetch the French page
|
||||
# Fetch the French page (by adding FR: prefix)
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
@ -972,8 +1006,32 @@ def main():
|
|||
# Analyze pages to find those needing updates
|
||||
pages_to_update = analyze_wiki_pages(wiki_pages)
|
||||
|
||||
# Separate regular pages and specific pages
|
||||
regular_pages = []
|
||||
specific_pages = []
|
||||
|
||||
for page in pages_to_update:
|
||||
# Check if either English or French page is marked as specific
|
||||
is_specific = False
|
||||
if page['en_page'] and page['en_page'].get('is_specific_page', False):
|
||||
is_specific = True
|
||||
elif page['fr_page'] and page['fr_page'].get('is_specific_page', False):
|
||||
is_specific = True
|
||||
|
||||
if is_specific:
|
||||
specific_pages.append(page)
|
||||
else:
|
||||
regular_pages.append(page)
|
||||
|
||||
# Create a structured output with separate sections
|
||||
output_data = {
|
||||
"regular_pages": regular_pages,
|
||||
"specific_pages": specific_pages,
|
||||
"last_updated": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Save pages that need updating to JSON
|
||||
save_to_json(pages_to_update, OUTDATED_PAGES_FILE)
|
||||
save_to_json(output_data, OUTDATED_PAGES_FILE)
|
||||
|
||||
# Print the top pages needing updates
|
||||
print(f"\n===== TOP {min(NUM_WIKI_PAGES, len(pages_to_update))} WIKI PAGES NEEDING UPDATES =====")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue