add set of pages to watch
This commit is contained in:
parent
77ad76cc7e
commit
7a7704bc01
22 changed files with 216839 additions and 6049 deletions
|
@ -41,12 +41,21 @@ logger = logging.getLogger(__name__)
|
|||
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
|
||||
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
|
||||
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
|
||||
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
|
||||
TOP_KEYS_FILE = "top_keys.json"
|
||||
WIKI_PAGES_CSV = "wiki_pages.csv"
|
||||
OUTDATED_PAGES_FILE = "outdated_pages.json"
|
||||
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
|
||||
# Number of wiki pages to examine
|
||||
NUM_WIKI_PAGES = 5
|
||||
NUM_WIKI_PAGES = 100
|
||||
|
||||
# List of specific pages to compare
|
||||
SPECIFIC_PAGES = [
|
||||
"Anatomie_des_étiquettes_osm",
|
||||
"https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois",
|
||||
"FR:Tag:leisure%3Dchildren_club",
|
||||
"FR:Tag:harassment_prevention%3Dask_angela"
|
||||
]
|
||||
|
||||
def fetch_top_keys(limit=NUM_WIKI_PAGES):
|
||||
"""
|
||||
|
@ -97,28 +106,56 @@ def save_to_json(data, filename):
|
|||
except IOError as e:
|
||||
logger.error(f"Error saving data to {filename}: {e}")
|
||||
|
||||
def fetch_wiki_page(key, language='en'):
|
||||
def fetch_wiki_page(key, language='en', is_specific_page=False):
|
||||
"""
|
||||
Fetch wiki page for a given key
|
||||
Fetch wiki page for a given key or specific page
|
||||
|
||||
Args:
|
||||
key (str): OSM key
|
||||
key (str): OSM key or specific page title/URL
|
||||
language (str): Language code ('en' or 'fr')
|
||||
is_specific_page (bool): Whether this is a specific page rather than a key
|
||||
|
||||
Returns:
|
||||
dict: Dictionary with page information or None if page doesn't exist
|
||||
"""
|
||||
base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
|
||||
url = f"{base_url}{key}"
|
||||
# Handle different URL formats
|
||||
if is_specific_page:
|
||||
# Case 1: Full URL
|
||||
if key.startswith('http'):
|
||||
url = key
|
||||
# Extract the page title from the URL
|
||||
page_title = key.split('/')[-1]
|
||||
# Determine language from URL
|
||||
if 'FR:' in key or '/FR:' in key:
|
||||
language = 'fr'
|
||||
else:
|
||||
language = 'en'
|
||||
# Case 2: Page with FR: prefix
|
||||
elif key.startswith('FR:'):
|
||||
url = f"{WIKI_BASE_URL}{key}"
|
||||
page_title = key[3:] # Remove FR: prefix for title
|
||||
language = 'fr'
|
||||
# Case 3: Regular page title
|
||||
else:
|
||||
if language == 'fr':
|
||||
url = f"{WIKI_BASE_URL}FR:{key}"
|
||||
else:
|
||||
url = f"{WIKI_BASE_URL}{key}"
|
||||
page_title = key
|
||||
else:
|
||||
# Regular key page
|
||||
base_url = WIKI_BASE_URL_EN if language == 'en' else WIKI_BASE_URL_FR
|
||||
url = f"{base_url}{key}"
|
||||
page_title = key
|
||||
|
||||
logger.info(f"Fetching {language} wiki page for key '{key}': {url}")
|
||||
logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
|
||||
|
||||
try:
|
||||
response = requests.get(url)
|
||||
|
||||
# Check if page exists
|
||||
if response.status_code == 404:
|
||||
logger.warning(f"Wiki page for key '{key}' in {language} does not exist")
|
||||
logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
|
||||
return None
|
||||
|
||||
response.raise_for_status()
|
||||
|
@ -380,6 +417,7 @@ def fetch_wiki_page(key, language='en'):
|
|||
|
||||
return {
|
||||
'key': key,
|
||||
'page_title': page_title,
|
||||
'language': language,
|
||||
'url': url,
|
||||
'last_modified': last_modified,
|
||||
|
@ -391,11 +429,12 @@ def fetch_wiki_page(key, language='en'):
|
|||
'media_count': media_count,
|
||||
'media_details': media_details,
|
||||
'categories': categories,
|
||||
'description_img_url': description_img_url
|
||||
'description_img_url': description_img_url,
|
||||
'is_specific_page': is_specific_page
|
||||
}
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Error fetching wiki page for key '{key}' in {language}: {e}")
|
||||
logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
|
||||
return None
|
||||
|
||||
def generate_staleness_histogram(wiki_pages):
|
||||
|
@ -562,27 +601,52 @@ def analyze_wiki_pages(pages):
|
|||
'common': []
|
||||
}
|
||||
|
||||
# Extract section titles for comparison
|
||||
en_sections = {section['title'].lower(): section for section in en_page.get('section_titles', [])}
|
||||
fr_sections = {section['title'].lower(): section for section in fr_page.get('section_titles', [])}
|
||||
# Group sections by their level for hierarchical comparison
|
||||
en_sections_by_level = {}
|
||||
fr_sections_by_level = {}
|
||||
|
||||
# Find sections only in English
|
||||
for title, section in en_sections.items():
|
||||
if title not in fr_sections:
|
||||
section_comparison['en_only'].append(section)
|
||||
# Organize English sections by level
|
||||
for section in en_page.get('section_titles', []):
|
||||
level = section['level']
|
||||
if level not in en_sections_by_level:
|
||||
en_sections_by_level[level] = []
|
||||
en_sections_by_level[level].append(section)
|
||||
|
||||
# Organize French sections by level
|
||||
for section in fr_page.get('section_titles', []):
|
||||
level = section['level']
|
||||
if level not in fr_sections_by_level:
|
||||
fr_sections_by_level[level] = []
|
||||
fr_sections_by_level[level].append(section)
|
||||
|
||||
# Find sections only in French
|
||||
for title, section in fr_sections.items():
|
||||
if title not in en_sections:
|
||||
section_comparison['fr_only'].append(section)
|
||||
# Process each level to find matching sections
|
||||
all_levels = set(list(en_sections_by_level.keys()) + list(fr_sections_by_level.keys()))
|
||||
|
||||
# Find common sections
|
||||
for title in en_sections.keys():
|
||||
if title in fr_sections:
|
||||
section_comparison['common'].append({
|
||||
'en': en_sections[title],
|
||||
'fr': fr_sections[title]
|
||||
})
|
||||
for level in all_levels:
|
||||
en_level_sections = en_sections_by_level.get(level, [])
|
||||
fr_level_sections = fr_sections_by_level.get(level, [])
|
||||
|
||||
# Create dictionaries for easier lookup, using lowercase titles
|
||||
en_dict = {section['title'].lower(): section for section in en_level_sections}
|
||||
fr_dict = {section['title'].lower(): section for section in fr_level_sections}
|
||||
|
||||
# Find sections at this level only in English
|
||||
for title, section in en_dict.items():
|
||||
if title not in fr_dict:
|
||||
section_comparison['en_only'].append(section)
|
||||
|
||||
# Find sections at this level only in French
|
||||
for title, section in fr_dict.items():
|
||||
if title not in en_dict:
|
||||
section_comparison['fr_only'].append(section)
|
||||
|
||||
# Find common sections at this level
|
||||
for title in en_dict.keys():
|
||||
if title in fr_dict:
|
||||
section_comparison['common'].append({
|
||||
'en': en_dict[title],
|
||||
'fr': fr_dict[title]
|
||||
})
|
||||
|
||||
# Compare links between English and French pages
|
||||
link_comparison = {
|
||||
|
@ -735,6 +799,8 @@ def main():
|
|||
# Fetch wiki pages for each key
|
||||
wiki_pages = []
|
||||
|
||||
# Process top keys
|
||||
logger.info("Processing top keys...")
|
||||
for key_info in top_keys:
|
||||
key = key_info['key']
|
||||
|
||||
|
@ -748,6 +814,61 @@ def main():
|
|||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Process specific pages
|
||||
logger.info("Processing specific pages...")
|
||||
for page in SPECIFIC_PAGES:
|
||||
# For specific pages, we need to handle different formats
|
||||
|
||||
# Case 1: Full URL
|
||||
if page.startswith('http'):
|
||||
# For full URLs, we directly fetch the page
|
||||
page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
|
||||
if page_info:
|
||||
wiki_pages.append(page_info)
|
||||
|
||||
# If it's a French page, try to find the English equivalent
|
||||
if page_info['language'] == 'fr':
|
||||
# Try to get the English version by removing FR: prefix
|
||||
en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
# If it's an English page, try to find the French equivalent
|
||||
else:
|
||||
# Try to get the French version by adding FR: prefix
|
||||
fr_title = f"FR:{page_info['page_title']}"
|
||||
fr_url = f"{WIKI_BASE_URL}{fr_title}"
|
||||
fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Case 2: Page with FR: prefix
|
||||
elif page.startswith('FR:'):
|
||||
# Fetch the French page
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Try to get the English version by removing FR: prefix
|
||||
en_title = page[3:] # Remove FR: prefix
|
||||
en_url = f"{WIKI_BASE_URL}{en_title}"
|
||||
en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Case 3: Regular page title
|
||||
else:
|
||||
# Fetch the English page
|
||||
en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
|
||||
if en_page:
|
||||
wiki_pages.append(en_page)
|
||||
|
||||
# Fetch the French page
|
||||
fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
|
||||
if fr_page:
|
||||
wiki_pages.append(fr_page)
|
||||
|
||||
# Process wiki pages to add staleness score
|
||||
processed_wiki_pages = []
|
||||
pages_by_key = {}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue