diff --git a/public/assets/img/Screenshot 2025-09-02 at 10-12-37 Propositions archivées OSM.png b/public/assets/img/Screenshot 2025-09-02 at 10-12-37 Propositions archivées OSM.png
new file mode 100644
index 0000000..f195cfd
Binary files /dev/null and b/public/assets/img/Screenshot 2025-09-02 at 10-12-37 Propositions archivées OSM.png differ
diff --git a/cipherbliss-osm-commerce.config.caddy b/qualiwiki.cipherbliss.com.config.caddy
similarity index 86%
rename from cipherbliss-osm-commerce.config.caddy
rename to qualiwiki.cipherbliss.com.config.caddy
index 8b7a41c..2bb0a3a 100644
--- a/cipherbliss-osm-commerce.config.caddy
+++ b/qualiwiki.cipherbliss.com.config.caddy
@@ -1,5 +1,5 @@
 qualiwiki.cipherbliss.com {
-    root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
+    root * /home/poule/encrypted/qualiwiki/public
 
     # serve files directly if they can be found (e.g. CSS or JS files in public/)
     encode zstd gzip
diff --git a/wiki_compare/__pycache__/wiki_compare.cpython-312.pyc b/wiki_compare/__pycache__/wiki_compare.cpython-312.pyc
new file mode 100644
index 0000000..734b24f
Binary files /dev/null and b/wiki_compare/__pycache__/wiki_compare.cpython-312.pyc differ
diff --git a/wiki_compare/wiki_compare.py b/wiki_compare/wiki_compare.py
index 6f14f3a..0d30495 100755
--- a/wiki_compare/wiki_compare.py
+++ b/wiki_compare/wiki_compare.py
@@ -31,6 +31,7 @@ import os
 import subprocess
 import tempfile
 import hashlib
+import argparse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
 WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
 WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
+WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
 TOP_KEYS_FILE = "top_keys.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
@@ -72,7 +74,7 @@
 try:
     nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
     nltk.download('punkt_tab')
-    
+
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
@@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
     "Key:cuisine",
     "Libre_Charge_Map",
     "OSM_Mon_Commerce",
+    "Complète_Tes_Commerces",
     "Tag:amenity=charging_station",
+    "Organised_Editing/Activities/MapYourGrid_Initiative",
     "Key:highway",
-    "Quality_assurance"
+    "Quality_assurance",
+    "Verifiability",
+    "Good_practice",
+    "Mapping_parties",
+    "State_of_the_Map",
+    "Diversity"
 ]
 
+def fetch_desynchronized_pages():
+    """
+    Fetch pages from the FR:Traductions_désynchronisées category
+
+    Returns:
+        list: List of page URLs from the category
+    """
+    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
+
+    try:
+        response = requests.get(WIKI_CATEGORY_URL)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all links to French pages in the category
+        page_links = []
+        for link in soup.select('a[href^="/wiki/FR:"]'):
+            href = link.get('href', '')
+            # Skip if it's a category link or a language link
+            if '/Category:' in href or 'action=edit' in href:
+                continue
+
+            # Get the full URL
+            full_url = 'https://wiki.openstreetmap.org' + href
+            page_links.append(full_url)
+
+        logger.info(f"Found {len(page_links)} pages in the category")
+        return page_links
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching category page: {e}")
+        return []
+
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
     """
     Fetch the most used OSM keys from TagInfo API
@@ -133,10 +176,10 @@
 def load_json_data(filename):
     """
     Load data from a JSON file
-    
+
     Args:
        filename (str): Name of the file
-    
+
     Returns:
         dict: Data loaded from the file or empty dict if file doesn't exist
     """
@@ -164,7 +207,7 @@ def save_to_json(data, filename):
     try:
         # Convert data to JSON string
         json_str = json.dumps(data, indent=2, ensure_ascii=False)
-        
+
         # Print the JSON string for debugging
         logger.info(f"JSON string to be written to {filename}:")
         logger.info(f"JSON keys at top level: {list(data.keys())}")
@@ -174,22 +217,22 @@
                 logger.info(f"'type' key exists in translations")
             if 'type_key' in data['translations']:
                 logger.info(f"'type_key' key exists in translations")
-        
+
         # Write the JSON string to the file
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(json_str)
-        
+
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
-    
+
 def save_with_history(data, filename):
     """
     Save data to a JSON file while preserving history
-    
+
     This function loads existing data from the file (if it exists),
     adds the new data to the history, and saves the updated data back to the file.
-    
+
     Args:
         data: New data to save
         filename (str): Name of the file
@@ -197,32 +240,32 @@
     try:
         # Load existing data
         existing_data = load_json_data(filename)
-        
+
         # Create a timestamp for the current data
         current_timestamp = datetime.now().isoformat()
-        
+
         # Initialize history if it doesn't exist
         if 'history' not in existing_data:
             existing_data['history'] = {}
-        
+
         # Add current regular_pages and specific_pages to history
         history_entry = {
             'regular_pages': data.get('regular_pages', []),
             'specific_pages': data.get('specific_pages', [])
         }
-        
+
         # Add the entry to history with timestamp as key
         existing_data['history'][current_timestamp] = history_entry
-        
+
         # Update the current data
         existing_data['regular_pages'] = data.get('regular_pages', [])
         existing_data['specific_pages'] = data.get('specific_pages', [])
         existing_data['last_updated'] = current_timestamp
-        
+
         # Save the updated data
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(existing_data, f, indent=2, ensure_ascii=False)
-        
+
         logger.info(f"Data with history saved to {filename}")
     except (IOError, json.JSONDecodeError) as e:
         logger.error(f"Error saving data with history to {filename}: {e}")
@@ -313,7 +356,7 @@
         logger.error(f"Unexpected error during grammar checking: {e}")
         return []
 
-def fetch_wiki_page(key, language='en', is_specific_page=False):
+def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
     """
     Fetch wiki page for a given key or specific page
 
@@ -328,7 +371,8 @@
         key (str): OSM key or specific page title/URL
         language (str): Language code ('en' or 'fr')
         is_specific_page (bool): Whether this is a specific page rather than a key
-    
+        check_grammar (bool): Whether to check grammar for French pages
+
     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
@@ -369,9 +413,9 @@
     # Create a unique cache filename based on the URL
     cache_key = hashlib.md5(url.encode()).hexdigest()
     cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
-    
+
     html_content = None
-    
+
     # Try to load from cache first
     if cache_file.exists():
         logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
@@ -381,21 +425,21 @@
         except Exception as e:
             logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
             html_content = None
-    
+
     # If not in cache or cache read failed, fetch from web
     if html_content is None:
         logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
         try:
             response = requests.get(url)
-            
+
             # Check if page exists
             if response.status_code == 404:
                 logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
                 return None
-            
+
             response.raise_for_status()
             html_content = response.text
-            
+
             # Save to cache
             try:
                 with open(cache_file, 'w', encoding='utf-8') as f:
@@ -406,9 +450,9 @@
         except requests.exceptions.RequestException as e:
             logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
             return None
-    
+
     soup = BeautifulSoup(html_content, 'html.parser')
-    
+
     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
@@ -423,29 +467,29 @@
                 last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
             except ValueError:
                 logger.warning(f"Could not parse date: {date_str}")
-    
+
     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)
-    
+
     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue
-        
+
         # Skip sections that are inside a table with class DescriptionBox
        if section_elem.find_parent('table', class_='DescriptionBox'):
             continue
-        
+
         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()
-        
+
         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4
-        
+
         section_titles.append({
             'title': section_title,
             'level': section_level
@@ -458,29 +502,31 @@
         # Remove script and style elements
         for script in content.select('script, style'):
             script.extract()
-        
+
         # Remove .languages elements
         for languages_elem in content.select('.languages'):
             languages_elem.extract()
-        
+
         # Get text and count words
         clean_text = content.get_text(separator=' ', strip=True)
         word_count = len(clean_text.split())
-        
+
         # Count sentences using NLTK
         sentences = nltk.sent_tokenize(clean_text)
         sentence_count = len(sentences)
-        
+
         # Check grammar for French pages
         grammar_suggestions = []
-# if language == 'fr':
-#     logger.info(f"Checking grammar for French page: {key}")
-#     grammar_suggestions = check_grammar_with_grammalecte(clean_text)
-        
+        if language == 'fr' and check_grammar:
+            logger.info(f"Checking grammar for French page: {key}")
+            grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+        elif language == 'fr' and not check_grammar:
+            logger.info(f"Grammar checking disabled for French page: {key}")
+
         # Extract links
         links = content.select('a')
         link_count = len(links)
-        
+
         # Get link details (text and href)
         link_details = []
         for link in links:
@@ -488,22 +534,22 @@
             # Skip edit section links and other non-content links
             if 'action=edit' in href or 'redlink=1' in href or not href:
                 continue
-            
+
             # Make relative URLs absolute
             if href.startswith('/'):
                 href = 'https://wiki.openstreetmap.org' + href
-            
+
             link_text = link.get_text(strip=True)
             if link_text:  # Only include links with text
                 link_details.append({
                     'text': link_text,
                     'href': href
                 })
-        
+
         # Extract media (images)
         media_elements = content.select('img')
         media_count = len(media_elements)
-        
+
         # Get media details (src and alt text)
         media_details = []
 
@@ -1045,13 +1091,24 @@ def main():
     1. Fetches the top OSM keys from TagInfo API
     2. Fetches and processes wiki pages for these keys
     3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Calculates staleness scores for all pages
-    5. Generates a histogram of staleness scores
-    6. Saves the results to CSV and JSON files
-    7. Prints a list of pages that need updating
+    4. Processes pages from the FR:Traductions_désynchronisées category
+    5. Calculates staleness scores for all pages
+    6. Generates a histogram of staleness scores
+    7. Saves the results to CSV and JSON files
+    8. Prints a list of pages that need updating
     """
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
+    parser.add_argument('--no-grammar-check', action='store_true',
+                        help='Disable grammar checking for French pages')
+    args = parser.parse_args()
+
+    # Whether to check grammar for French pages
+    check_grammar = not args.no_grammar_check
+
     logger.info("Starting wiki_compare.py")
-    
+    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
+
     # Create output directory if it doesn't exist
     os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
 
@@ -1074,12 +1131,12 @@
         key = key_info['key']
 
         # Fetch English page
-        en_page = fetch_wiki_page(key, 'en')
+        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
         if en_page:
             wiki_pages.append(en_page)
 
         # Fetch French page
-        fr_page = fetch_wiki_page(key, 'fr')
+        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
         if fr_page:
             wiki_pages.append(fr_page)
 
@@ -1092,7 +1149,7 @@
         # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
         if page.startswith('http'):
             # For full URLs, we directly fetch the page
-            page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
+            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if page_info:
                 wiki_pages.append(page_info)
 
@@ -1102,7 +1159,7 @@
                     en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                     en_url = f"{WIKI_BASE_URL}{en_title}"
                     logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+                    en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                     if en_page:
                         wiki_pages.append(en_page)
                 # If it's an English page, try to find the French equivalent
@@ -1111,14 +1168,14 @@
                     fr_title = f"FR:{page_info['page_title']}"
                     fr_url = f"{WIKI_BASE_URL}{fr_title}"
                     logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
-                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
+                    fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                     if fr_page:
                         wiki_pages.append(fr_page)
 
         # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
        elif page.startswith('FR:'):
             # Fetch the French page
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
 
@@ -1126,22 +1183,43 @@
             en_title = page[3:]  # Remove FR: prefix
             en_url = f"{WIKI_BASE_URL}{en_title}"
             logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)
 
         # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
        else:
             # Fetch the English page
-            en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)
 
             # Fetch the French page (by adding FR: prefix)
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
-    
+
+    # Process pages from the FR:Traductions_désynchronisées category
+    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
+    desynchronized_pages = fetch_desynchronized_pages()
+    for page_url in desynchronized_pages:
+        # Fetch the French page
+        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
+        if fr_page:
+            wiki_pages.append(fr_page)
+
+            # Try to find the English equivalent
+            if fr_page['page_title'].startswith('FR:'):
+                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
+            else:
+                en_title = fr_page['page_title']
+
+            en_url = f"{WIKI_BASE_URL}{en_title}"
+            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
+            if en_page:
+                wiki_pages.append(en_page)
+
     # Process wiki pages to add staleness score
     processed_wiki_pages = []
     pages_by_key = {}