add missing wiki pages from taginfo fr
parent e056cfc8fa
commit dffb21b56e
8 changed files with 469 additions and 131 deletions
@@ -778,6 +778,7 @@ class WikiController extends AbstractController
         $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
         $englishHtml = null;
         $frenchHtml = null;
+        $frenchCacheExists = false;

         if (file_exists($scriptPath)) {
             // Create a temporary Python script to fetch the page content
@@ -788,25 +789,52 @@ class WikiController extends AbstractController

 import sys
 import json
-from wiki_compare import fetch_wiki_page
+import hashlib
+from pathlib import Path
+from wiki_compare import fetch_wiki_page, HTML_CACHE_DIR

 # Get the key from command line arguments
 key = sys.argv[1]
 language = sys.argv[2]

-# Fetch the page
-page = fetch_wiki_page(key, language)
+# Check if we're just checking cache existence
+check_cache_only = len(sys.argv) > 3 and sys.argv[3] == 'check_cache'

-# Output the HTML content
-if page and 'html_content' in page:
-    print(page['html_content'])
+if check_cache_only and language == 'fr':
+    # For French pages, construct the URL to check cache
+    if key.startswith('http'):
+        url = key
+    else:
+        url = f"https://wiki.openstreetmap.org/wiki/FR:{key}"
+
+    # Create cache key
+    cache_key = hashlib.md5(url.encode()).hexdigest()
+    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
+
+    # Check if cache exists
+    if cache_file.exists():
+        print("CACHE_EXISTS")
+    else:
+        print("CACHE_MISSING")
 else:
-    print("")
+    # Normal fetch operation
+    page = fetch_wiki_page(key, language)
+
+    # Output the HTML content
+    if page and 'html_content' in page:
+        print(page['html_content'])
+    else:
+        print("")
 EOT;

             file_put_contents($tempScriptPath, $pythonCode);
             chmod($tempScriptPath, 0755);

+            // First check if French page exists in cache
+            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr check_cache";
+            $cacheCheckResult = trim(shell_exec($command));
+            $frenchCacheExists = ($cacheCheckResult === "CACHE_EXISTS");
+
             // Fetch English page
             $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
             $englishHtml = shell_exec($command);
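For reference, the cache probe added above mirrors the naming scheme used by wiki_compare.py: the French page URL is hashed with MD5 and looked up as <hash>.html inside the script's HTML cache directory (HTML_CACHE_DIR). A minimal PHP sketch of the same check, not part of this commit and assuming a hypothetical $htmlCacheDir that points at the same directory as HTML_CACHE_DIR, would look like this:

    // Sketch only: same md5(url) file naming as wiki_compare.py.
    // $htmlCacheDir is a hypothetical parameter; it must match HTML_CACHE_DIR on the Python side.
    function frenchCacheExists(string $key, string $htmlCacheDir): bool
    {
        $url = (strpos($key, 'http') === 0)
            ? $key
            : "https://wiki.openstreetmap.org/wiki/FR:{$key}";

        // PHP's md5() returns the same hex digest as Python's hashlib.md5(...).hexdigest()
        $cacheFile = rtrim($htmlCacheDir, '/') . '/' . md5($url) . '.html';

        return is_file($cacheFile);
    }

Shelling out to the temporary Python script instead, as the commit does, keeps the hashing and cache-directory logic defined in a single place.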
@@ -834,7 +862,8 @@ EOT;
                 'english_url' => $englishUrl,
                 'french_edit_url' => $frenchEditUrl,
                 'english_html' => $englishHtml,
-                'french_html' => $frenchHtml
+                'french_html' => $frenchHtml,
+                'french_cache_exists' => $frenchCacheExists
             ]);
         }

@@ -1104,7 +1133,25 @@ EOT;
         if (file_exists($pagesUnavailableInEnglishFile)) {
             $pagesUnavailableInEnglishData = json_decode(file_get_contents($pagesUnavailableInEnglishFile), true);
             if (isset($pagesUnavailableInEnglishData['pages']) && is_array($pagesUnavailableInEnglishData['pages'])) {
-                $pagesUnavailableInEnglish = $pagesUnavailableInEnglishData['pages'];
+                // Deduplicate pages based on URL
+                $uniquePages = [];
+                $seenUrls = [];
+
+                foreach ($pagesUnavailableInEnglishData['pages'] as $page) {
+                    if (isset($page['url'])) {
+                        // Use URL as the key for deduplication
+                        $url = $page['url'];
+                        if (!isset($seenUrls[$url])) {
+                            $seenUrls[$url] = true;
+                            $uniquePages[] = $page;
+                        }
+                    } else {
+                        // If no URL, keep the page (shouldn't happen, but just in case)
+                        $uniquePages[] = $page;
+                    }
+                }
+
+                $pagesUnavailableInEnglish = $uniquePages;
             }
         }

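The deduplication loop above keeps the first page seen for each URL. An equivalent, more compact variant, shown only as a sketch of the same idea rather than what the commit does, uses the URL itself as the array key; pages without a 'url' field are still kept, but end up after the deduplicated ones instead of in their original position:

    // Sketch: first occurrence wins, keyed by URL.
    $byUrl = [];
    $withoutUrl = [];
    foreach ($pagesUnavailableInEnglishData['pages'] as $page) {
        if (!isset($page['url'])) {
            $withoutUrl[] = $page;          // keep pages without a URL, as above
        } elseif (!isset($byUrl[$page['url']])) {
            $byUrl[$page['url']] = $page;   // later duplicates are ignored
        }
    }
    $pagesUnavailableInEnglish = array_merge(array_values($byUrl), $withoutUrl);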
@@ -1112,10 +1159,10 @@ EOT;
         $specificPages = [];
         $outdatedPagesFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/outdated_pages.json';
         if (file_exists($outdatedPagesFile)) {
-            $outdatedPagesData = json_decode(file_get_contents($outdatedPagesFile), true);
-            if (isset($outdatedPagesData['specific_pages']) && is_array($outdatedPagesData['specific_pages'])) {
-                $specificPages = $outdatedPagesData['specific_pages'];
-            }
+            // Use a memory-efficient approach to extract only the specific_pages array
+            // without loading the entire file into memory
+            $maxPages = 100; // Limit the number of pages to prevent memory exhaustion
+            $specificPages = $this->extractSpecificPagesFromJson($outdatedPagesFile, $maxPages);
         }

         // Load newly created French pages
@@ -1137,6 +1184,16 @@ EOT;
                 $availableTranslations = $translationsData['translations'];
             }
         }

+        // Load keys without wiki pages
+        $keysWithoutWiki = [];
+        $keysWithoutWikiFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/keys_without_wiki.json';
+        if (file_exists($keysWithoutWikiFile)) {
+            $keysWithoutWikiData = json_decode(file_get_contents($keysWithoutWikiFile), true);
+            if (is_array($keysWithoutWikiData)) {
+                $keysWithoutWiki = $keysWithoutWikiData;
+            }
+        }
+
         return $this->render('admin/wiki.html.twig', [
             'wiki_pages' => $wikiPages,
@@ -1147,7 +1204,8 @@ EOT;
             'newly_created_pages' => $newlyCreatedPages,
             'staleness_stats' => $stalenessStats,
             'wiki_pages_stats' => $wikiPagesStats,
-            'available_translations' => $availableTranslations
+            'available_translations' => $availableTranslations,
+            'keys_without_wiki' => $keysWithoutWiki
         ]);
     }

@@ -1791,4 +1849,124 @@ EOT;

         return $contentHtml;
     }
+
+    /**
+     * Extracts the specific_pages array from a large JSON file without loading the entire file into memory
+     *
+     * @param string $filePath Path to the JSON file
+     * @param int $maxPages Maximum number of pages to extract (to prevent memory exhaustion)
+     * @return array The extracted specific_pages array
+     */
+    private function extractSpecificPagesFromJson(string $filePath, int $maxPages = 100): array
+    {
+        $specificPages = [];
+
+        // For very large files, we'll use a more direct approach
+        // Instead of parsing the entire JSON structure, we'll extract just what we need
+
+        // First, check if the file exists and is readable
+        if (!is_readable($filePath)) {
+            return $specificPages;
+        }
+
+        // Get the file size
+        $fileSize = filesize($filePath);
+        if ($fileSize === false || $fileSize === 0) {
+            return $specificPages;
+        }
+
+        // For very large files, we'll use a more efficient approach
+        // We'll search for the "specific_pages" key directly
+        $handle = fopen($filePath, 'r');
+        if (!$handle) {
+            return $specificPages;
+        }
+
+        // Variables to track parsing state
+        $inSpecificPages = false;
+        $bracketCount = 0;
+        $buffer = '';
+        $pageCount = 0;
+        $lineCount = 0;
+
+        // Skip ahead to find the specific_pages key more quickly
+        // This is a simple optimization for this specific file structure
+        $found = false;
+        while (!$found && ($line = fgets($handle)) !== false) {
+            $lineCount++;
+            if (strpos($line, '"specific_pages"') !== false) {
+                $found = true;
+                $inSpecificPages = true;
+
+                // Find the opening bracket of the array
+                if (strpos($line, '[') !== false) {
+                    $bracketCount = 1;
+                    $buffer = '['; // Start the buffer with an opening bracket
+                } else {
+                    // If the opening bracket is on the next line
+                    $nextLine = fgets($handle);
+                    if ($nextLine !== false && strpos($nextLine, '[') !== false) {
+                        $bracketCount = 1;
+                        $buffer = '['; // Start the buffer with an opening bracket
+                    }
+                }
+                break;
+            }
+        }
+
+        // If we didn't find the specific_pages key, return empty array
+        if (!$found) {
+            fclose($handle);
+            return $specificPages;
+        }
+
+        // Now process the specific_pages array
+        while (($line = fgets($handle)) !== false) {
+            // Count opening and closing brackets to track array nesting
+            $openBrackets = substr_count($line, '[') + substr_count($line, '{');
+            $closeBrackets = substr_count($line, ']') + substr_count($line, '}');
+            $bracketCount += $openBrackets - $closeBrackets;
+
+            // Add the line to our buffer
+            $buffer .= $line;
+
+            // If we've reached the end of the array (bracketCount = 0)
+            if ($bracketCount === 0) {
+                // Parse the buffer as JSON
+                $parsedData = json_decode($buffer, true);
+                if (is_array($parsedData)) {
+                    // Limit the number of pages to prevent memory exhaustion
+                    $specificPages = array_slice($parsedData, 0, $maxPages);
+                } else {
+                    // If parsing fails, log the error but don't crash
+                    error_log('Failed to parse specific_pages JSON data in ' . $filePath);
+                }
+                break;
+            }
+
+            // Check if we've found a complete page object (when we see a closing brace followed by a comma)
+            if (preg_match('/\}\s*,\s*$/m', $line)) {
+                $pageCount++;
+                // If we've reached the maximum number of pages, stop processing
+                if ($pageCount >= $maxPages) {
+                    // Close the array properly
+                    $buffer = rtrim($buffer, ",\r\n") . ']';
+                    // Parse the buffer as JSON
+                    $parsedData = json_decode($buffer, true);
+                    if (is_array($parsedData)) {
+                        $specificPages = $parsedData;
+                    } else {
+                        // If parsing fails, log the error but don't crash
+                        error_log('Failed to parse specific_pages JSON data in ' . $filePath . ' after reaching max pages');
+                    }
+                    break;
+                }
+            }
+        }
+
+        // Close the file
+        fclose($handle);
+
+        return $specificPages;
+    }
 }
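The hand-rolled reader above goes through the file line by line, so it relies on the JSON being spread over multiple lines and on each page object ending with the '},' pattern it matches. If the project can take a dependency, a streaming parser gives the same bounded-memory behaviour without manual bracket counting. A sketch using halaxa/json-machine follows; the library is not part of this commit, and the ^1.0 Items API is an assumption:

    use JsonMachine\Items;

    // Sketch only: stream the specific_pages array and stop after $maxPages entries.
    $pages = [];
    foreach (Items::fromFile($filePath, ['pointer' => '/specific_pages']) as $page) {
        // Items yields stdClass objects by default; a decoder option can yield assoc arrays instead.
        $pages[] = (array) $page;
        if (count($pages) >= $maxPages) {
            break;
        }
    }

Either way, the caller shown in the earlier hunk never holds more than $maxPages (100) page entries in memory at once.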