add missing wiki pages from taginfo fr
parent e056cfc8fa
commit dffb21b56e
8 changed files with 469 additions and 131 deletions
@@ -37,9 +37,15 @@ from bs4 import BeautifulSoup
 import logging
 import matplotlib.pyplot as plt
 import numpy as np
-import nltk
 from pathlib import Path
 
+# Try to import nltk, but make it optional
+try:
+    import nltk
+    NLTK_AVAILABLE = True
+except ImportError:
+    NLTK_AVAILABLE = False
+
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -50,11 +56,13 @@ logger = logging.getLogger(__name__)
 
 # Constants
 TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
+TAGINFO_FRANCE_API_URL = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
 WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
 WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
 WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
 TOP_KEYS_FILE = "top_keys.json"
+KEYS_WITHOUT_WIKI_FILE = "keys_without_wiki.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
 STALENESS_HISTOGRAM_FILE = "staleness_histogram.png"
@@ -63,17 +71,18 @@ NUM_WIKI_PAGES = 2
 # HTML cache folder
 HTML_CACHE_DIR = "html_cache"
 
-# Initialize NLTK for sentence tokenization
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
+# Initialize NLTK for sentence tokenization if available
+if NLTK_AVAILABLE:
+    try:
+        nltk.data.find('tokenizers/punkt')
+    except LookupError:
+        nltk.download('punkt')
 
-# Also download punkt_tab resource which is needed for sent_tokenize
-try:
-    nltk.data.find('tokenizers/punkt_tab')
-except LookupError:
-    nltk.download('punkt_tab')
+    # Also download punkt_tab resource which is needed for sent_tokenize
+    try:
+        nltk.data.find('tokenizers/punkt_tab')
+    except LookupError:
+        nltk.download('punkt_tab')
 
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
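The guarded block above repeats NLTK's "look up the resource, download it only on failure" idiom once per tokenizer resource. A minimal sketch of the same idiom factored into a helper (the function name is illustrative, not part of this commit):

    import nltk

    # Hypothetical helper mirroring the pattern above: probe nltk's data path
    # and download the resource only when the lookup raises LookupError.
    def ensure_nltk_resource(find_path, download_name):
        try:
            nltk.data.find(find_path)
        except LookupError:
            nltk.download(download_name)

    ensure_nltk_resource('tokenizers/punkt', 'punkt')
    ensure_nltk_resource('tokenizers/punkt_tab', 'punkt_tab')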
@@ -177,6 +186,41 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
         logger.error(f"Error fetching data from TagInfo API: {e}")
         return []
 
+def fetch_keys_without_wiki_page(limit=36):
+    """
+    Fetch keys used in France that are missing a wiki page from TagInfo API
+
+    Args:
+        limit (int): Number of keys to fetch
+
+    Returns:
+        list: List of dictionaries containing key information
+    """
+    logger.info(f"Fetching top {limit} OSM keys without wiki pages used in France...")
+
+    params = {
+        'page': 1,
+        'rp': limit,
+        'english': 0,
+        'sortname': 'count_all',
+        'sortorder': 'desc'
+    }
+
+    try:
+        response = requests.get(TAGINFO_FRANCE_API_URL, params=params)
+        response.raise_for_status()
+        data = response.json()
+
+        # Extract just the key names and counts
+        keys_without_wiki = [{'key': item['key'], 'count': item['count_all']} for item in data['data']]
+
+        logger.info(f"Successfully fetched {len(keys_without_wiki)} keys without wiki pages")
+        return keys_without_wiki
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching data from TagInfo France API: {e}")
+        return []
+
 def load_json_data(filename):
     """
     Load data from a JSON file
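For reference, a standalone sketch of the same taginfo query the new helper issues, assuming the Geofabrik endpoint keeps the response shape the list comprehension relies on (a "data" list whose items carry "key" and "count_all"); the French page URL is built from the WIKI_BASE_URL_FR prefix added above:

    import requests

    URL = "https://taginfo.geofabrik.de/europe:france/api/4/keys/without_wiki_page"
    params = {"page": 1, "rp": 5, "english": 0,
              "sortname": "count_all", "sortorder": "desc"}

    resp = requests.get(URL, params=params, timeout=30)
    resp.raise_for_status()
    for item in resp.json()["data"]:
        # Candidate French wiki page that is still missing for this key
        print(item["key"], item["count_all"],
              "https://wiki.openstreetmap.org/wiki/FR:Key:" + item["key"])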
@@ -295,6 +339,13 @@ def check_grammar_with_grammalecte(text):
         logger.warning("Empty text provided for grammar checking")
         return []
 
+    # Check if grammalecte-cli is available
+    try:
+        subprocess.run(['which', 'grammalecte-cli'], capture_output=True, check=True)
+    except subprocess.CalledProcessError:
+        logger.warning("grammalecte-cli not found, skipping grammar check")
+        return []
+
     logger.info("Checking grammar with grammalecte-cli...")
 
     try:
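Shelling out to `which` only works where that binary exists; the standard library's shutil.which performs the same PATH lookup portably. A sketch of an equivalent check, not the code this commit adds:

    import shutil

    # shutil.which returns the executable's full path, or None when it is not on PATH
    if shutil.which("grammalecte-cli") is None:
        print("grammalecte-cli not found, skipping grammar check")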
@@ -520,9 +571,13 @@ def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=Tr
         clean_text = content.get_text(separator=' ', strip=True)
         word_count = len(clean_text.split())
 
-        # Count sentences using NLTK
-        sentences = nltk.sent_tokenize(clean_text)
-        sentence_count = len(sentences)
+        # Count sentences using NLTK if available, otherwise use a simple approximation
+        if NLTK_AVAILABLE and check_grammar:
+            sentences = nltk.sent_tokenize(clean_text)
+            sentence_count = len(sentences)
+        else:
+            # Simple approximation: count periods, exclamation marks, and question marks
+            sentence_count = len(re.findall(r'[.!?]+', clean_text))
 
         # Check grammar for French pages
         grammar_suggestions = []
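The regex fallback counts runs of terminal punctuation rather than real sentence boundaries, which is usually close enough for a staleness heuristic. A small illustration on a made-up sample:

    import re

    text = "Première phrase. Deuxième phrase ! Une troisième ?"

    # Approximate sentence count: one match per run of '.', '!' or '?'
    print(len(re.findall(r'[.!?]+', text)))  # 3

    # With NLTK installed, nltk.sent_tokenize(text) should also yield three
    # sentences, at the cost of the punkt/punkt_tab downloads handled earlier.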
@@ -1098,18 +1153,19 @@ def main():
 
     This function:
     1. Fetches the top OSM keys from TagInfo API
-    2. Fetches and processes wiki pages for these keys
-    3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Processes pages from the FR:Traductions_désynchronisées category
-    5. Calculates staleness scores for all pages
-    6. Generates a histogram of staleness scores
-    7. Saves the results to CSV and JSON files
-    8. Prints a list of pages that need updating
+    2. Fetches keys used in France that are missing a wiki page from TagInfo API
+    3. Fetches and processes wiki pages for these keys
+    4. Processes specific wiki pages listed in SPECIFIC_PAGES
+    5. Processes pages from the FR:Traductions_désynchronisées category
+    6. Calculates staleness scores for all pages
+    7. Generates a histogram of staleness scores
+    8. Saves the results to CSV and JSON files
+    9. Prints a list of pages that need updating
     """
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
     parser.add_argument('--no-grammar-check', action='store_true',
-                        help='Disable grammar checking for French pages')
+                        help='Disable grammar checking for French pages', default=False)
     args = parser.parse_args()
 
     # Whether to check grammar for French pages
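One detail of the argparse change: with action='store_true' the option already defaults to False, so the added default=False is redundant but harmless. A minimal reproduction:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--no-grammar-check', action='store_true',
                        help='Disable grammar checking for French pages')

    print(parser.parse_args([]).no_grammar_check)                      # False
    print(parser.parse_args(['--no-grammar-check']).no_grammar_check)  # True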
@@ -1131,6 +1187,16 @@ def main():
     # Save top keys to JSON
     save_to_json(top_keys, TOP_KEYS_FILE)
 
+    # Fetch keys without wiki pages used in France
+    keys_without_wiki = fetch_keys_without_wiki_page()
+
+    if keys_without_wiki:
+        # Save keys without wiki pages to JSON
+        save_to_json(keys_without_wiki, KEYS_WITHOUT_WIKI_FILE)
+        logger.info(f"Saved {len(keys_without_wiki)} keys without wiki pages to {KEYS_WITHOUT_WIKI_FILE}")
+    else:
+        logger.warning("No keys without wiki pages were fetched.")
+
     # Fetch wiki pages for each key
     wiki_pages = []
 
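Assuming save_to_json writes the list as plain JSON (its implementation is not shown in this diff), the saved keys_without_wiki.json can be inspected afterwards like this:

    import json

    # Reload the keys saved by main() and print the most frequent ones
    with open("keys_without_wiki.json", encoding="utf-8") as f:
        keys = json.load(f)

    for entry in keys[:10]:
        print(f"{entry['key']}: {entry['count']} uses in France")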