add missing wiki pages from taginfo fr

Tykayn 2025-09-05 11:37:19 +02:00 committed by tykayn
parent e056cfc8fa
commit dffb21b56e
8 changed files with 469 additions and 131 deletions


@@ -37,9 +37,15 @@ from bs4 import BeautifulSoup
import logging
import matplotlib.pyplot as plt
import numpy as np
import nltk
from pathlib import Path
# Try to import nltk, but make it optional
try:
    import nltk
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False

# Configure logging
logging.basicConfig(
    level=logging.INFO,
@@ -50,11 +56,13 @@ logger = logging.getLogger(__name__)
# Constants
TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
TAGINFO_FRANCE_API_URL = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
TOP_KEYS_FILE = "top_keys.json"
KEYS_WITHOUT_WIKI_FILE = "keys_without_wiki.json"
WIKI_PAGES_CSV = "wiki_pages.csv"
OUTDATED_PAGES_FILE = "outdated_pages.json"
STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
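As a point of reference, here is a minimal sketch (not part of the commit) of querying the new France-specific taginfo endpoint; it assumes the Geofabrik instance answers with the usual taginfo shape, a 'data' array of entries carrying 'key' and 'count_all', which is what fetch_keys_without_wiki_page below reads:

import requests

url = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
params = {'page': 1, 'rp': 5, 'english': 0, 'sortname': 'count_all', 'sortorder': 'desc'}
resp = requests.get(url, params=params, timeout=30)
resp.raise_for_status()
for item in resp.json()['data']:
    # Each entry is expected to expose the key name and its usage count in France.
    print(item['key'], item['count_all'])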
@@ -63,17 +71,18 @@ NUM_WIKI_PAGES = 2
# HTML cache folder
HTML_CACHE_DIR = "html_cache"
# Initialize NLTK for sentence tokenization
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Initialize NLTK for sentence tokenization if available
if NLTK_AVAILABLE:
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

# Also download punkt_tab resource which is needed for sent_tokenize
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

    # Also download punkt_tab resource which is needed for sent_tokenize
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        nltk.download('punkt_tab')
# Create HTML cache directory if it doesn't exist
Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -177,6 +186,41 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
        logger.error(f"Error fetching data from TagInfo API: {e}")
        return []

def fetch_keys_without_wiki_page(limit=36):
    """
    Fetch keys used in France that are missing a wiki page from TagInfo API

    Args:
        limit (int): Number of keys to fetch

    Returns:
        list: List of dictionaries containing key information
    """
    logger.info(f"Fetching top {limit} OSM keys without wiki pages used in France...")

    params = {
        'page': 1,
        'rp': limit,
        'english': 0,
        'sortname': 'count_all',
        'sortorder': 'desc'
    }

    try:
        response = requests.get(TAGINFO_FRANCE_API_URL, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract just the key names and counts
        keys_without_wiki = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]

        logger.info(f"Successfully fetched {len(keys_without_wiki)} keys without wiki pages")
        return keys_without_wiki
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching data from TagInfo France API: {e}")
        return []

def load_json_data(filename):
    """
    Load data from a JSON file
@@ -295,6 +339,13 @@ def check_grammar_with_grammalecte(text):
        logger.warning("Empty text provided for grammar checking")
        return []

    # Check if grammalecte-cli is available
    try:
        subprocess.run(['which', 'grammalecte-cli'], capture_output=True, check=True)
    except subprocess.CalledProcessError:
        logger.warning("grammalecte-cli not found, skipping grammar check")
        return []

    logger.info("Checking grammar with grammalecte-cli...")

    try:
@@ -520,9 +571,13 @@ def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=Tr
        clean_text = content.get_text(separator=' ', strip=True)
        word_count = len(clean_text.split())

        # Count sentences using NLTK
        sentences = nltk.sent_tokenize(clean_text)
        sentence_count = len(sentences)

        # Count sentences using NLTK if available, otherwise use a simple approximation
        if NLTK_AVAILABLE and check_grammar:
            sentences = nltk.sent_tokenize(clean_text)
            sentence_count = len(sentences)
        else:
            # Simple approximation: count periods, exclamation marks, and question marks
            sentence_count = len(re.findall(r'[.!?]+', clean_text))

        # Check grammar for French pages
        grammar_suggestions = []
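Note that the regex fallback only approximates sentence boundaries. A small illustration (made-up text, assuming the fallback path is taken) of how it can overcount on pages containing URLs:

import re

text = "Bonjour. Voir https://wiki.openstreetmap.org. Merci !"
# Every run of '.', '!' or '?' is treated as a sentence end, so the dots inside
# the URL are counted too: this prints 5, whereas there are only about 3 sentences.
print(len(re.findall(r'[.!?]+', text)))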
@@ -1098,18 +1153,19 @@ def main():
    This function:
    1. Fetches the top OSM keys from TagInfo API
    2. Fetches and processes wiki pages for these keys
    3. Processes specific wiki pages listed in SPECIFIC_PAGES
    4. Processes pages from the FR:Traductions_désynchronisées category
    5. Calculates staleness scores for all pages
    6. Generates a histogram of staleness scores
    7. Saves the results to CSV and JSON files
    8. Prints a list of pages that need updating
    2. Fetches keys used in France that are missing a wiki page from TagInfo API
    3. Fetches and processes wiki pages for these keys
    4. Processes specific wiki pages listed in SPECIFIC_PAGES
    5. Processes pages from the FR:Traductions_désynchronisées category
    6. Calculates staleness scores for all pages
    7. Generates a histogram of staleness scores
    8. Saves the results to CSV and JSON files
    9. Prints a list of pages that need updating
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
    parser.add_argument('--no-grammar-check', action='store_true',
                        help='Disable grammar checking for French pages')
                        help='Disable grammar checking for French pages', default=False)
    args = parser.parse_args()

    # Whether to check grammar for French pages
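A hypothetical invocation of the new flag (the script's filename does not appear in this hunk, so the name below is illustrative); note that store_true already defaults to False, so the added default=False is redundant but harmless:

python compare_wiki_pages.py --no-grammar-check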
@@ -1131,6 +1187,16 @@ def main():
    # Save top keys to JSON
    save_to_json(top_keys, TOP_KEYS_FILE)

    # Fetch keys without wiki pages used in France
    keys_without_wiki = fetch_keys_without_wiki_page()

    if keys_without_wiki:
        # Save keys without wiki pages to JSON
        save_to_json(keys_without_wiki, KEYS_WITHOUT_WIKI_FILE)
        logger.info(f"Saved {len(keys_without_wiki)} keys without wiki pages to {KEYS_WITHOUT_WIKI_FILE}")
    else:
        logger.warning("No keys without wiki pages were fetched.")

    # Fetch wiki pages for each key
    wiki_pages = []
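For reference, and assuming save_to_json serializes the list as-is, keys_without_wiki.json should end up looking roughly like this (key names and counts are made up for illustration):

[
  {"key": "example:key_one", "count": 12345},
  {"key": "example:key_two", "count": 678}
]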