up fetch desynchronised pages
This commit is contained in:
parent 8008e0291e
commit 685efd6710
4 changed files with 140 additions and 62 deletions
(new image file, 167 KiB; binary file not shown)
@@ -1,5 +1,5 @@
 qualiwiki.cipherbliss.com {
-	root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
+	root * /home/poule/encrypted/qualiwiki/public

 	# serve files directly if they can be found (e.g. CSS or JS files in public/)
 	encode zstd gzip
BIN  wiki_compare/__pycache__/wiki_compare.cpython-312.pyc  (new file; binary file not shown)
@@ -31,6 +31,7 @@ import os
 import subprocess
 import tempfile
 import hashlib
+import argparse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
 WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
 WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
+WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
 TOP_KEYS_FILE = "top_keys.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
@@ -72,7 +74,7 @@ try:
     nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
     nltk.download('punkt_tab')


 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
     "Key:cuisine",
     "Libre_Charge_Map",
     "OSM_Mon_Commerce",
     "Complète_Tes_Commerces",
     "Tag:amenity=charging_station",
     "Organised_Editing/Activities/MapYourGrid_Initiative",
     "Key:highway",
-    "Quality_assurance"
+    "Quality_assurance",
+    "Verifiability",
+    "Good_practice",
+    "Mapping_parties",
+    "State_of_the_Map",
+    "Diversity"
 ]

+def fetch_desynchronized_pages():
+    """
+    Fetch pages from the FR:Traductions_désynchronisées category
+
+    Returns:
+        list: List of page URLs from the category
+    """
+    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
+
+    try:
+        response = requests.get(WIKI_CATEGORY_URL)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all links to French pages in the category
+        page_links = []
+        for link in soup.select('a[href^="/wiki/FR:"]'):
+            href = link.get('href', '')
+            # Skip if it's a category link or a language link
+            if '/Category:' in href or 'action=edit' in href:
+                continue
+
+            # Get the full URL
+            full_url = 'https://wiki.openstreetmap.org' + href
+            page_links.append(full_url)
+
+        logger.info(f"Found {len(page_links)} pages in the category")
+        return page_links
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching category page: {e}")
+        return []
+
+
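# Usage sketch (illustrative, not part of this commit): the new helper depends
# only on the WIKI_CATEGORY_URL constant added above and can be tried on its own.
#
#   for url in fetch_desynchronized_pages()[:5]:
#       print(url)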
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
     """
     Fetch the most used OSM keys from TagInfo API
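# Illustrative sketch (an assumption, not this commit's actual code): the body
# of fetch_top_keys() is not shown in this diff; it presumably queries
# TAGINFO_API_URL for the most-used keys, roughly along these lines. Parameter
# names follow TagInfo's documented "keys/all" endpoint.
def fetch_top_keys_sketch(limit=10):
    params = {"page": 1, "rp": limit, "sortname": "count_all", "sortorder": "desc"}
    resp = requests.get(TAGINFO_API_URL, params=params)
    resp.raise_for_status()
    # Return a simplified list of the most-used keys with their usage counts
    return [{"key": d["key"], "count": d["count_all"]} for d in resp.json()["data"]]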
@@ -133,10 +176,10 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
 def load_json_data(filename):
     """
     Load data from a JSON file

     Args:
         filename (str): Name of the file

     Returns:
         dict: Data loaded from the file or empty dict if file doesn't exist
     """
@@ -164,7 +207,7 @@ def save_to_json(data, filename):
     try:
         # Convert data to JSON string
         json_str = json.dumps(data, indent=2, ensure_ascii=False)

         # Print the JSON string for debugging
         logger.info(f"JSON string to be written to {filename}:")
         logger.info(f"JSON keys at top level: {list(data.keys())}")
@@ -174,22 +217,22 @@ def save_to_json(data, filename):
             logger.info(f"'type' key exists in translations")
             if 'type_key' in data['translations']:
                 logger.info(f"'type_key' key exists in translations")

         # Write the JSON string to the file
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(json_str)

         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")


 def save_with_history(data, filename):
     """
     Save data to a JSON file while preserving history

     This function loads existing data from the file (if it exists),
     adds the new data to the history, and saves the updated data back to the file.

     Args:
         data: New data to save
         filename (str): Name of the file
@@ -197,32 +240,32 @@ def save_with_history(data, filename):
     try:
         # Load existing data
        existing_data = load_json_data(filename)

         # Create a timestamp for the current data
         current_timestamp = datetime.now().isoformat()

         # Initialize history if it doesn't exist
         if 'history' not in existing_data:
             existing_data['history'] = {}

         # Add current regular_pages and specific_pages to history
         history_entry = {
             'regular_pages': data.get('regular_pages', []),
             'specific_pages': data.get('specific_pages', [])
         }

         # Add the entry to history with timestamp as key
         existing_data['history'][current_timestamp] = history_entry

         # Update the current data
         existing_data['regular_pages'] = data.get('regular_pages', [])
         existing_data['specific_pages'] = data.get('specific_pages', [])
         existing_data['last_updated'] = current_timestamp

         # Save the updated data
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(existing_data, f, indent=2, ensure_ascii=False)

         logger.info(f"Data with history saved to {filename}")
     except (IOError, json.JSONDecodeError) as e:
         logger.error(f"Error saving data with history to {filename}: {e}")
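# Illustrative sketch (not part of this commit): the shape of the JSON document
# that save_with_history() writes, inferred from the code above. The timestamp
# and page entries are made-up placeholders.
example_history_file = {
    "regular_pages": ["..."],
    "specific_pages": ["..."],
    "last_updated": "2025-01-01T12:00:00",
    "history": {
        "2025-01-01T12:00:00": {
            "regular_pages": ["..."],
            "specific_pages": ["..."],
        },
    },
}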
@@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
         logger.error(f"Unexpected error during grammar checking: {e}")
         return []

-def fetch_wiki_page(key, language='en', is_specific_page=False):
+def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
     """
     Fetch wiki page for a given key or specific page
@@ -328,7 +371,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         key (str): OSM key or specific page title/URL
         language (str): Language code ('en' or 'fr')
         is_specific_page (bool): Whether this is a specific page rather than a key
+        check_grammar (bool): Whether to check grammar for French pages

     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
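# Illustrative sketch (an assumption, not code from this commit): approximate
# shape of the dictionary fetch_wiki_page() returns, inferred from the fields
# handled further down; only 'page_title' is confirmed by its use in main(),
# so the other key names and the sample values are guesses.
example_page_info = {
    "page_title": "FR:Key:highway",
    "language": "fr",
    "last_modified": "2025-01-01",
    "sections": 12,
    "section_titles": [{"title": "Valeurs", "level": 2}],
    "word_count": 1500,
    "sentence_count": 90,
    "link_count": 40,
    "media_count": 5,
    "grammar_suggestions": [],
}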
@@ -369,9 +413,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     # Create a unique cache filename based on the URL
     cache_key = hashlib.md5(url.encode()).hexdigest()
     cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

     html_content = None

     # Try to load from cache first
     if cache_file.exists():
         logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
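# Worked example (illustrative): the cache file name is simply the MD5 hex
# digest of the page URL plus ".html", so the same URL always maps to the same
# file inside HTML_CACHE_DIR.
example_cache_name = hashlib.md5(
    "https://wiki.openstreetmap.org/wiki/FR:Key:highway".encode()
).hexdigest() + ".html"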
@@ -381,21 +425,21 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except Exception as e:
             logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
             html_content = None

     # If not in cache or cache read failed, fetch from web
     if html_content is None:
         logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
         try:
             response = requests.get(url)

             # Check if page exists
             if response.status_code == 404:
                 logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
                 return None

             response.raise_for_status()
             html_content = response.text

             # Save to cache
             try:
                 with open(cache_file, 'w', encoding='utf-8') as f:
@@ -406,9 +450,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except requests.exceptions.RequestException as e:
             logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
             return None

     soup = BeautifulSoup(html_content, 'html.parser')

     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
@@ -423,29 +467,29 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
             last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
         except ValueError:
             logger.warning(f"Could not parse date: {date_str}")

     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)

     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue

         # Skip sections that are inside a table with class DescriptionBox
         if section_elem.find_parent('table', class_='DescriptionBox'):
             continue

         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()

         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4

         section_titles.append({
             'title': section_title,
             'level': section_level
@@ -458,29 +502,31 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     # Remove script and style elements
     for script in content.select('script, style'):
         script.extract()

     # Remove .languages elements
     for languages_elem in content.select('.languages'):
         languages_elem.extract()

     # Get text and count words
     clean_text = content.get_text(separator=' ', strip=True)
     word_count = len(clean_text.split())

     # Count sentences using NLTK
     sentences = nltk.sent_tokenize(clean_text)
     sentence_count = len(sentences)

     # Check grammar for French pages
     grammar_suggestions = []
-    # if language == 'fr':
-    #     logger.info(f"Checking grammar for French page: {key}")
-    #     grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+    if language == 'fr' and check_grammar:
+        logger.info(f"Checking grammar for French page: {key}")
+        grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+    elif language == 'fr' and not check_grammar:
+        logger.info(f"Grammar checking disabled for French page: {key}")

     # Extract links
     links = content.select('a')
     link_count = len(links)

     # Get link details (text and href)
     link_details = []
     for link in links:
@@ -488,22 +534,22 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Skip edit section links and other non-content links
         if 'action=edit' in href or 'redlink=1' in href or not href:
             continue

         # Make relative URLs absolute
         if href.startswith('/'):
             href = 'https://wiki.openstreetmap.org' + href

         link_text = link.get_text(strip=True)
         if link_text:  # Only include links with text
             link_details.append({
                 'text': link_text,
                 'href': href
             })

     # Extract media (images)
     media_elements = content.select('img')
     media_count = len(media_elements)

     # Get media details (src and alt text)
     media_details = []
@@ -1045,13 +1091,24 @@ def main():
     1. Fetches the top OSM keys from TagInfo API
     2. Fetches and processes wiki pages for these keys
     3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Calculates staleness scores for all pages
-    5. Generates a histogram of staleness scores
-    6. Saves the results to CSV and JSON files
-    7. Prints a list of pages that need updating
+    4. Processes pages from the FR:Traductions_désynchronisées category
+    5. Calculates staleness scores for all pages
+    6. Generates a histogram of staleness scores
+    7. Saves the results to CSV and JSON files
+    8. Prints a list of pages that need updating
     """
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
+    parser.add_argument('--no-grammar-check', action='store_true',
+                        help='Disable grammar checking for French pages')
+    args = parser.parse_args()
+
+    # Whether to check grammar for French pages
+    check_grammar = not args.no_grammar_check
+
     logger.info("Starting wiki_compare.py")

+    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
+
     # Create output directory if it doesn't exist
     os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
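# Usage sketch (illustrative; the path assumes the script lives at
# wiki_compare/wiki_compare.py, matching the compiled .pyc added by this commit):
#
#   python wiki_compare/wiki_compare.py                      # grammar checking enabled (default)
#   python wiki_compare/wiki_compare.py --no-grammar-check   # skip Grammalecte on French pages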
@@ -1074,12 +1131,12 @@ def main():
         key = key_info['key']

         # Fetch English page
-        en_page = fetch_wiki_page(key, 'en')
+        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
         if en_page:
             wiki_pages.append(en_page)

         # Fetch French page
-        fr_page = fetch_wiki_page(key, 'fr')
+        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
         if fr_page:
             wiki_pages.append(fr_page)
@@ -1092,7 +1149,7 @@ def main():
         # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
         if page.startswith('http'):
             # For full URLs, we directly fetch the page
-            page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
+            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if page_info:
                 wiki_pages.append(page_info)
@@ -1102,7 +1159,7 @@ def main():
                 en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                 en_url = f"{WIKI_BASE_URL}{en_title}"
                 logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                 if en_page:
                     wiki_pages.append(en_page)
                 # If it's an English page, try to find the French equivalent
@@ -1111,14 +1168,14 @@ def main():
                 fr_title = f"FR:{page_info['page_title']}"
                 fr_url = f"{WIKI_BASE_URL}{fr_title}"
                 logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
-                fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
+                fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                 if fr_page:
                     wiki_pages.append(fr_page)

         # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
         elif page.startswith('FR:'):
             # Fetch the French page
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
@@ -1126,22 +1183,43 @@ def main():
             en_title = page[3:]  # Remove FR: prefix
             en_url = f"{WIKI_BASE_URL}{en_title}"
             logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

         # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
         else:
             # Fetch the English page
-            en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)

             # Fetch the French page (by adding FR: prefix)
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
+    # Process pages from the FR:Traductions_désynchronisées category
+    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
+    desynchronized_pages = fetch_desynchronized_pages()
+    for page_url in desynchronized_pages:
+        # Fetch the French page
+        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
+        if fr_page:
+            wiki_pages.append(fr_page)
+
+            # Try to find the English equivalent
+            if fr_page['page_title'].startswith('FR:'):
+                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
+            else:
+                en_title = fr_page['page_title']
+
+            en_url = f"{WIKI_BASE_URL}{en_title}"
+            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
+            if en_page:
+                wiki_pages.append(en_page)
+
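# Worked example (illustrative): for a category entry such as
# "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois", the fetched French
# page title "FR:Projet_du_mois" loses its "FR:" prefix, giving the English
# candidate URL f"{WIKI_BASE_URL}Projet_du_mois",
# i.e. "https://wiki.openstreetmap.org/wiki/Projet_du_mois".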
     # Process wiki pages to add staleness score
     processed_wiki_pages = []
     pages_by_key = {}