up fetch desynchronised pages
This commit is contained in:
parent 8008e0291e
commit 685efd6710

4 changed files with 140 additions and 62 deletions

Binary file not shown. (image added; size after: 167 KiB)
@@ -1,5 +1,5 @@
 qualiwiki.cipherbliss.com {
-    root * /home/poule/encrypted/stockage-syncable/www/development/html/qualiwiki/public
+    root * /home/poule/encrypted/qualiwiki/public
 
     # serve files directly if they can be found (e.g. CSS or JS files in public/)
     encode zstd gzip
BIN wiki_compare/__pycache__/wiki_compare.cpython-312.pyc (new file)
Binary file not shown.

wiki_compare/wiki_compare.py
@@ -31,6 +31,7 @@ import os
 import subprocess
 import tempfile
 import hashlib
+import argparse
 from datetime import datetime
 from bs4 import BeautifulSoup
 import logging
@@ -52,6 +53,7 @@ TAGINFO_API_URL = "https://taginfo.openstreetmap.org/api/4/keys/all"
 WIKI_BASE_URL_EN = "https://wiki.openstreetmap.org/wiki/Key:"
 WIKI_BASE_URL_FR = "https://wiki.openstreetmap.org/wiki/FR:Key:"
 WIKI_BASE_URL = "https://wiki.openstreetmap.org/wiki/"
+WIKI_CATEGORY_URL = "https://wiki.openstreetmap.org/wiki/Category:FR:Traductions_d%C3%A9synchronis%C3%A9es"
 TOP_KEYS_FILE = "top_keys.json"
 WIKI_PAGES_CSV = "wiki_pages.csv"
 OUTDATED_PAGES_FILE = "outdated_pages.json"
@@ -72,7 +74,7 @@ try:
     nltk.data.find('tokenizers/punkt_tab')
 except LookupError:
     nltk.download('punkt_tab')
 
 # Create HTML cache directory if it doesn't exist
 Path(HTML_CACHE_DIR).mkdir(exist_ok=True)
 
@@ -91,11 +93,52 @@ SPECIFIC_PAGES = [
     "Key:cuisine",
     "Libre_Charge_Map",
     "OSM_Mon_Commerce",
+    "Complète_Tes_Commerces",
     "Tag:amenity=charging_station",
+    "Organised_Editing/Activities/MapYourGrid_Initiative",
     "Key:highway",
-    "Quality_assurance"
+    "Quality_assurance",
+    "Verifiability",
+    "Good_practice",
+    "Mapping_parties",
+    "State_of_the_Map",
+    "Diversity"
 ]
 
+def fetch_desynchronized_pages():
+    """
+    Fetch pages from the FR:Traductions_désynchronisées category
+
+    Returns:
+        list: List of page URLs from the category
+    """
+    logger.info(f"Fetching pages from category: {WIKI_CATEGORY_URL}")
+
+    try:
+        response = requests.get(WIKI_CATEGORY_URL)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all links to French pages in the category
+        page_links = []
+        for link in soup.select('a[href^="/wiki/FR:"]'):
+            href = link.get('href', '')
+            # Skip if it's a category link or a language link
+            if '/Category:' in href or 'action=edit' in href:
+                continue
+
+            # Get the full URL
+            full_url = 'https://wiki.openstreetmap.org' + href
+            page_links.append(full_url)
+
+        logger.info(f"Found {len(page_links)} pages in the category")
+        return page_links
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching category page: {e}")
+        return []
+
 def fetch_top_keys(limit=NUM_WIKI_PAGES):
     """
     Fetch the most used OSM keys from TagInfo API
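Note: the hunk above introduces fetch_desynchronized_pages() and wires it to the new WIKI_CATEGORY_URL constant. A minimal sketch of how it is meant to be consumed, outside the patch itself; the variable names are invented and the example URL only shows the plausible shape of a category entry:

    # Illustrative sketch only, not part of the patch
    desync_urls = fetch_desynchronized_pages()
    # e.g. ['https://wiki.openstreetmap.org/wiki/FR:Key:building', ...] depending on the live category
    for url in desync_urls:
        page = fetch_wiki_page(url, 'fr', is_specific_page=True)  # grammar checking stays on by default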
@@ -133,10 +176,10 @@ def fetch_top_keys(limit=NUM_WIKI_PAGES):
 def load_json_data(filename):
     """
     Load data from a JSON file
 
     Args:
         filename (str): Name of the file
 
     Returns:
         dict: Data loaded from the file or empty dict if file doesn't exist
     """
@@ -164,7 +207,7 @@ def save_to_json(data, filename):
     try:
         # Convert data to JSON string
         json_str = json.dumps(data, indent=2, ensure_ascii=False)
 
         # Print the JSON string for debugging
         logger.info(f"JSON string to be written to {filename}:")
         logger.info(f"JSON keys at top level: {list(data.keys())}")
@@ -174,22 +217,22 @@ def save_to_json(data, filename):
                 logger.info(f"'type' key exists in translations")
                 if 'type_key' in data['translations']:
                     logger.info(f"'type_key' key exists in translations")
 
         # Write the JSON string to the file
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(json_str)
 
         logger.info(f"Data saved to {filename}")
     except IOError as e:
         logger.error(f"Error saving data to {filename}: {e}")
 
 def save_with_history(data, filename):
     """
     Save data to a JSON file while preserving history
 
     This function loads existing data from the file (if it exists),
     adds the new data to the history, and saves the updated data back to the file.
 
     Args:
         data: New data to save
         filename (str): Name of the file
@@ -197,32 +240,32 @@ def save_with_history(data, filename):
     try:
         # Load existing data
         existing_data = load_json_data(filename)
 
         # Create a timestamp for the current data
         current_timestamp = datetime.now().isoformat()
 
         # Initialize history if it doesn't exist
         if 'history' not in existing_data:
             existing_data['history'] = {}
 
         # Add current regular_pages and specific_pages to history
         history_entry = {
             'regular_pages': data.get('regular_pages', []),
             'specific_pages': data.get('specific_pages', [])
         }
 
         # Add the entry to history with timestamp as key
         existing_data['history'][current_timestamp] = history_entry
 
         # Update the current data
         existing_data['regular_pages'] = data.get('regular_pages', [])
         existing_data['specific_pages'] = data.get('specific_pages', [])
         existing_data['last_updated'] = current_timestamp
 
         # Save the updated data
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(existing_data, f, indent=2, ensure_ascii=False)
 
         logger.info(f"Data with history saved to {filename}")
     except (IOError, json.JSONDecodeError) as e:
         logger.error(f"Error saving data with history to {filename}: {e}")
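As context for the unchanged save_with_history() shown above: judging from its code, the file it writes should have roughly the shape sketched below (timestamps and page entries are invented for illustration):

    {
      "regular_pages": [...],
      "specific_pages": [...],
      "last_updated": "2024-01-01T12:00:00",
      "history": {
        "2024-01-01T12:00:00": {"regular_pages": [...], "specific_pages": [...]}
      }
    }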
@@ -313,7 +356,7 @@ def check_grammar_with_grammalecte(text):
         logger.error(f"Unexpected error during grammar checking: {e}")
         return []
 
-def fetch_wiki_page(key, language='en', is_specific_page=False):
+def fetch_wiki_page(key, language='en', is_specific_page=False, check_grammar=True):
     """
     Fetch wiki page for a given key or specific page
 
@@ -328,7 +371,8 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         key (str): OSM key or specific page title/URL
         language (str): Language code ('en' or 'fr')
         is_specific_page (bool): Whether this is a specific page rather than a key
+        check_grammar (bool): Whether to check grammar for French pages
 
     Returns:
         dict: Dictionary with page information or None if page doesn't exist
     """
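The new keyword argument defaults to True, so existing callers keep their behaviour. A quick sketch of a call that skips the Grammalecte pass (the key and page names are just examples taken from elsewhere in the script):

    # Illustrative sketch only
    fr_page = fetch_wiki_page('highway', 'fr', check_grammar=False)  # page is still parsed and counted, no grammar suggestions
    en_page = fetch_wiki_page('Quality_assurance', 'en', is_specific_page=True)  # unaffected: the check only applies to 'fr'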
@@ -369,9 +413,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     # Create a unique cache filename based on the URL
     cache_key = hashlib.md5(url.encode()).hexdigest()
     cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
 
     html_content = None
 
     # Try to load from cache first
     if cache_file.exists():
         logger.info(f"Loading {language} wiki page from cache for {'page' if is_specific_page else 'key'} '{key}'")
@@ -381,21 +425,21 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except Exception as e:
             logger.warning(f"Error reading from cache: {e}. Will fetch from web instead.")
             html_content = None
 
     # If not in cache or cache read failed, fetch from web
     if html_content is None:
         logger.info(f"Fetching {language} wiki page for {'page' if is_specific_page else 'key'} '{key}': {url}")
         try:
             response = requests.get(url)
 
             # Check if page exists
             if response.status_code == 404:
                 logger.warning(f"Wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language} does not exist")
                 return None
 
             response.raise_for_status()
             html_content = response.text
 
             # Save to cache
             try:
                 with open(cache_file, 'w', encoding='utf-8') as f:
@@ -406,9 +450,9 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         except requests.exceptions.RequestException as e:
             logger.error(f"Error fetching wiki page for {'page' if is_specific_page else 'key'} '{key}' in {language}: {e}")
             return None
 
     soup = BeautifulSoup(html_content, 'html.parser')
 
     # Get last modification date
     last_modified = None
     footer_info = soup.select_one('#footer-info-lastmod')
@@ -423,29 +467,29 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
                 last_modified = datetime.strptime(date_str, '%d %B %Y').strftime('%Y-%m-%d')
             except ValueError:
                 logger.warning(f"Could not parse date: {date_str}")
 
     # Extract sections (h2, h3, h4)
     section_elements = soup.select('h2, h3, h4')
     sections = len(section_elements)
 
     # Extract section titles
     section_titles = []
     for section_elem in section_elements:
         # Skip sections that are part of the table of contents, navigation, or DescriptionBox
         if section_elem.parent and section_elem.parent.get('id') in ['toc', 'mw-navigation']:
             continue
 
         # Skip sections that are inside a table with class DescriptionBox
         if section_elem.find_parent('table', class_='DescriptionBox'):
             continue
 
         # Get the text of the section title, removing any edit links
         for edit_link in section_elem.select('.mw-editsection'):
             edit_link.extract()
 
         section_title = section_elem.get_text(strip=True)
         section_level = int(section_elem.name[1])  # h2 -> 2, h3 -> 3, h4 -> 4
 
         section_titles.append({
             'title': section_title,
             'level': section_level
@@ -458,29 +502,31 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
     # Remove script and style elements
     for script in content.select('script, style'):
         script.extract()
 
     # Remove .languages elements
     for languages_elem in content.select('.languages'):
         languages_elem.extract()
 
     # Get text and count words
     clean_text = content.get_text(separator=' ', strip=True)
     word_count = len(clean_text.split())
 
     # Count sentences using NLTK
     sentences = nltk.sent_tokenize(clean_text)
     sentence_count = len(sentences)
 
     # Check grammar for French pages
     grammar_suggestions = []
-    # if language == 'fr':
-    #     logger.info(f"Checking grammar for French page: {key}")
-    #     grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+    if language == 'fr' and check_grammar:
+        logger.info(f"Checking grammar for French page: {key}")
+        grammar_suggestions = check_grammar_with_grammalecte(clean_text)
+    elif language == 'fr' and not check_grammar:
+        logger.info(f"Grammar checking disabled for French page: {key}")
 
     # Extract links
     links = content.select('a')
     link_count = len(links)
 
     # Get link details (text and href)
     link_details = []
     for link in links:
@@ -488,22 +534,22 @@ def fetch_wiki_page(key, language='en', is_specific_page=False):
         # Skip edit section links and other non-content links
         if 'action=edit' in href or 'redlink=1' in href or not href:
             continue
 
         # Make relative URLs absolute
         if href.startswith('/'):
             href = 'https://wiki.openstreetmap.org' + href
 
         link_text = link.get_text(strip=True)
         if link_text:  # Only include links with text
             link_details.append({
                 'text': link_text,
                 'href': href
             })
 
     # Extract media (images)
     media_elements = content.select('img')
     media_count = len(media_elements)
 
     # Get media details (src and alt text)
     media_details = []
 
@@ -1045,13 +1091,24 @@ def main():
     1. Fetches the top OSM keys from TagInfo API
     2. Fetches and processes wiki pages for these keys
     3. Processes specific wiki pages listed in SPECIFIC_PAGES
-    4. Calculates staleness scores for all pages
-    5. Generates a histogram of staleness scores
-    6. Saves the results to CSV and JSON files
-    7. Prints a list of pages that need updating
+    4. Processes pages from the FR:Traductions_désynchronisées category
+    5. Calculates staleness scores for all pages
+    6. Generates a histogram of staleness scores
+    7. Saves the results to CSV and JSON files
+    8. Prints a list of pages that need updating
     """
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(description='Compare OpenStreetMap wiki pages in English and French.')
+    parser.add_argument('--no-grammar-check', action='store_true',
+                        help='Disable grammar checking for French pages')
+    args = parser.parse_args()
+
+    # Whether to check grammar for French pages
+    check_grammar = not args.no_grammar_check
+
     logger.info("Starting wiki_compare.py")
+    logger.info(f"Grammar checking is {'disabled' if args.no_grammar_check else 'enabled'}")
 
     # Create output directory if it doesn't exist
     os.makedirs(os.path.dirname(os.path.abspath(__file__)), exist_ok=True)
 
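With main() now parsing arguments, the script gains exactly one optional flag. A likely invocation, assuming the script is run from the wiki_compare directory:

    python wiki_compare.py                      # grammar checking of French pages enabled (default)
    python wiki_compare.py --no-grammar-check   # skip the Grammalecte pass entirely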
@@ -1074,12 +1131,12 @@ def main():
         key = key_info['key']
 
         # Fetch English page
-        en_page = fetch_wiki_page(key, 'en')
+        en_page = fetch_wiki_page(key, 'en', check_grammar=check_grammar)
         if en_page:
             wiki_pages.append(en_page)
 
         # Fetch French page
-        fr_page = fetch_wiki_page(key, 'fr')
+        fr_page = fetch_wiki_page(key, 'fr', check_grammar=check_grammar)
         if fr_page:
             wiki_pages.append(fr_page)
 
@@ -1092,7 +1149,7 @@ def main():
         # Case 1: Full URL (e.g., "https://wiki.openstreetmap.org/wiki/FR:Projet_du_mois")
         if page.startswith('http'):
             # For full URLs, we directly fetch the page
-            page_info = fetch_wiki_page(page, 'en', is_specific_page=True)
+            page_info = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if page_info:
                 wiki_pages.append(page_info)
 
@@ -1102,7 +1159,7 @@ def main():
                 en_title = page_info['page_title'].replace('FR:', '').replace('fr:', '')
                 en_url = f"{WIKI_BASE_URL}{en_title}"
                 logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+                en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
                 if en_page:
                     wiki_pages.append(en_page)
             # If it's an English page, try to find the French equivalent
@@ -1111,14 +1168,14 @@ def main():
                 fr_title = f"FR:{page_info['page_title']}"
                 fr_url = f"{WIKI_BASE_URL}{fr_title}"
                 logger.info(f"Trying to find French equivalent for {page}: {fr_url}")
-                fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True)
+                fr_page = fetch_wiki_page(fr_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
                 if fr_page:
                     wiki_pages.append(fr_page)
 
         # Case 2: Page with FR: prefix (e.g., "FR:Tag:leisure%3Dchildren_club")
         elif page.startswith('FR:'):
             # Fetch the French page
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
 
@@ -1126,22 +1183,43 @@ def main():
             en_title = page[3:]  # Remove FR: prefix
             en_url = f"{WIKI_BASE_URL}{en_title}"
             logger.info(f"Trying to find English equivalent for {page}: {en_url}")
-            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)
 
         # Case 3: Regular page title (e.g., "Anatomie_des_étiquettes_osm")
         else:
             # Fetch the English page
-            en_page = fetch_wiki_page(page, 'en', is_specific_page=True)
+            en_page = fetch_wiki_page(page, 'en', is_specific_page=True, check_grammar=check_grammar)
             if en_page:
                 wiki_pages.append(en_page)
 
             # Fetch the French page (by adding FR: prefix)
-            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True)
+            fr_page = fetch_wiki_page(page, 'fr', is_specific_page=True, check_grammar=check_grammar)
             if fr_page:
                 wiki_pages.append(fr_page)
 
+    # Process pages from the FR:Traductions_désynchronisées category
+    logger.info("Processing pages from FR:Traductions_désynchronisées category...")
+    desynchronized_pages = fetch_desynchronized_pages()
+    for page_url in desynchronized_pages:
+        # Fetch the French page
+        fr_page = fetch_wiki_page(page_url, 'fr', is_specific_page=True, check_grammar=check_grammar)
+        if fr_page:
+            wiki_pages.append(fr_page)
+
+            # Try to find the English equivalent
+            if fr_page['page_title'].startswith('FR:'):
+                en_title = fr_page['page_title'][3:]  # Remove FR: prefix
+            else:
+                en_title = fr_page['page_title']
+
+            en_url = f"{WIKI_BASE_URL}{en_title}"
+            logger.info(f"Trying to find English equivalent for {page_url}: {en_url}")
+            en_page = fetch_wiki_page(en_url, 'en', is_specific_page=True, check_grammar=check_grammar)
+            if en_page:
+                wiki_pages.append(en_page)
+
     # Process wiki pages to add staleness score
     processed_wiki_pages = []
     pages_by_key = {}