From 1ed74c2e2f2bd0672bc5f6683c85f7aa9a755858 Mon Sep 17 00:00:00 2001 From: Tykayn Date: Fri, 5 Sep 2025 15:58:26 +0200 Subject: [PATCH] up pages --- src/Controller/WikiController.php | 636 ++++++++++++------ templates/admin/wiki.html.twig | 72 +- templates/admin/wiki_decrepitude.html.twig | 140 +++- .../admin/wiki_random_suggestion.html.twig | 61 +- templates/public/wiki.html.twig | 84 ++- .../public/wiki_random_suggestion.html.twig | 22 +- test_compare_route.php | 435 ++++++++++++ test_decrepitude.php | 464 +++++++++++++ wiki_compare/wiki_compare.py | 1 + 9 files changed, 1610 insertions(+), 305 deletions(-) create mode 100644 test_compare_route.php create mode 100644 test_decrepitude.php diff --git a/src/Controller/WikiController.php b/src/Controller/WikiController.php index c43a7fa..26b1fda 100644 --- a/src/Controller/WikiController.php +++ b/src/Controller/WikiController.php @@ -23,19 +23,17 @@ class WikiController extends AbstractController $histogramExists = file_exists($histogramFile); if (file_exists($outdatedPagesFile)) { - $outdatedPagesData = json_decode(file_get_contents($outdatedPagesFile), true); + // Use memory-efficient approach to extract data from the large JSON file + $maxPages = 100; // Limit the number of pages to prevent memory exhaustion - if (isset($outdatedPagesData['regular_pages']) && is_array($outdatedPagesData['regular_pages'])) { - $regularPages = $outdatedPagesData['regular_pages']; - } + // Extract regular_pages array + $regularPages = $this->extractJsonArrayByKey($outdatedPagesFile, 'regular_pages', $maxPages); - if (isset($outdatedPagesData['specific_pages']) && is_array($outdatedPagesData['specific_pages'])) { - $specificPages = $outdatedPagesData['specific_pages']; - } + // Extract specific_pages array + $specificPages = $this->extractJsonArrayByKey($outdatedPagesFile, 'specific_pages', $maxPages); - if (isset($outdatedPagesData['last_updated'])) { - $lastUpdated = $outdatedPagesData['last_updated']; - } + // Extract last_updated value + $lastUpdated = $this->extractJsonScalarByKey($outdatedPagesFile, 'last_updated'); } return $this->render('admin/wiki_decrepitude.html.twig', [ @@ -738,21 +736,15 @@ class WikiController extends AbstractController return $this->redirectToRoute('app_admin_wiki'); } - $jsonData = json_decode(file_get_contents($jsonFile), true); - - if (empty($jsonData)) { - $this->addFlash('error', 'Aucune page à améliorer n\'a été trouvée.'); - return $this->redirectToRoute('app_admin_wiki'); - } - - // Combine regular_pages and specific_pages into a single array - $allPages = []; - if (isset($jsonData['regular_pages']) && is_array($jsonData['regular_pages'])) { - $allPages = array_merge($allPages, $jsonData['regular_pages']); - } - if (isset($jsonData['specific_pages']) && is_array($jsonData['specific_pages'])) { - $allPages = array_merge($allPages, $jsonData['specific_pages']); - } + // Use memory-efficient approach to extract only the necessary data + $maxItems = 100; // Limit the number of items to prevent memory exhaustion + + // Extract regular_pages and specific_pages arrays + $regularPages = $this->extractJsonArrayByKey($jsonFile, 'regular_pages', $maxItems); + $specificPages = $this->extractJsonArrayByKey($jsonFile, 'specific_pages', $maxItems); + + // Combine them into a single array + $allPages = array_merge($regularPages, $specificPages); if (empty($allPages)) { $this->addFlash('error', 'Aucune page à améliorer n\'a été trouvée.'); @@ -893,13 +885,17 @@ EOT; // Check if the archived proposals file exists and load it if (file_exists($jsonFile)) { - $proposalsData = json_decode(file_get_contents($jsonFile), true); - - if (isset($proposalsData['proposals']) && is_array($proposalsData['proposals'])) { - $proposals = $proposalsData['proposals']; - $statistics = $proposalsData['statistics'] ?? []; - $lastUpdated = isset($proposalsData['last_updated']) ? $proposalsData['last_updated'] : null; - } + // Use memory-efficient approach to extract only the necessary data + $maxItems = 100; // Limit the number of items to prevent memory exhaustion + + // Extract proposals array + $proposals = $this->extractJsonArrayByKey($jsonFile, 'proposals', $maxItems); + + // Extract statistics object + $statistics = $this->extractJsonArrayByKey($jsonFile, 'statistics', $maxItems); + + // Extract last_updated value + $lastUpdated = $this->extractJsonScalarByKey($jsonFile, 'last_updated'); // Check if the data is older than 1 day if ($lastUpdated) { @@ -1312,73 +1308,76 @@ EOT; $historyData = null; if (file_exists($jsonFile)) { - $jsonData = json_decode(file_get_contents($jsonFile), true); - + // Use memory-efficient approach to extract only the necessary data + $maxItems = 100; // Limit the number of items to prevent memory exhaustion + // Extract history data if available $historyData = []; - if (isset($jsonData['history']) && is_array($jsonData['history'])) { - // Process history data for the current key - foreach ($jsonData['history'] as $timestamp => $entry) { - $historyEntry = [ - 'timestamp' => $timestamp, - 'date' => (new \DateTime($timestamp))->format('Y-m-d'), - 'metrics' => [] - ]; - - // Check regular_pages - if (isset($entry['regular_pages']) && is_array($entry['regular_pages'])) { - foreach ($entry['regular_pages'] as $page) { - if (isset($page['key']) && $page['key'] === $key) { - // Extract metrics - $historyEntry['metrics'] = [ - 'staleness_score' => $page['staleness_score'] ?? 0, - 'date_diff' => $page['date_diff'] ?? 0, - 'word_diff' => $page['word_diff'] ?? 0, - 'section_diff' => $page['section_diff'] ?? 0, - 'link_diff' => $page['link_diff'] ?? 0, - 'media_diff' => $page['media_diff'] ?? 0 - ]; - $historyData[] = $historyEntry; - break; - } - } - } - - // If not found in regular_pages, check specific_pages - if (empty($historyEntry['metrics']) && isset($entry['specific_pages']) && is_array($entry['specific_pages'])) { - foreach ($entry['specific_pages'] as $page) { - if (isset($page['key']) && $page['key'] === $key) { - // Extract metrics - $historyEntry['metrics'] = [ - 'staleness_score' => $page['staleness_score'] ?? 0, - 'date_diff' => $page['date_diff'] ?? 0, - 'word_diff' => $page['word_diff'] ?? 0, - 'section_diff' => $page['section_diff'] ?? 0, - 'link_diff' => $page['link_diff'] ?? 0, - 'media_diff' => $page['media_diff'] ?? 0 - ]; - $historyData[] = $historyEntry; - break; - } + + // Get history data from the JSON file + $historyEntries = $this->extractJsonArrayByKey($jsonFile, 'history', $maxItems); + + // Process history data for the current key + foreach ($historyEntries as $timestamp => $entry) { + $historyEntry = [ + 'timestamp' => $timestamp, + 'date' => is_string($timestamp) && !empty($timestamp) && $timestamp !== '0' ? + (new \DateTime($timestamp))->format('Y-m-d') : 'N/A', + 'metrics' => [] + ]; + + // Check regular_pages + if (isset($entry['regular_pages']) && is_array($entry['regular_pages'])) { + foreach ($entry['regular_pages'] as $page) { + if (isset($page['key']) && $page['key'] === $key) { + // Extract metrics + $historyEntry['metrics'] = [ + 'staleness_score' => $page['staleness_score'] ?? 0, + 'date_diff' => $page['date_diff'] ?? 0, + 'word_diff' => $page['word_diff'] ?? 0, + 'section_diff' => $page['section_diff'] ?? 0, + 'link_diff' => $page['link_diff'] ?? 0, + 'media_diff' => $page['media_diff'] ?? 0 + ]; + $historyData[] = $historyEntry; + break; } } } - // Sort history data by timestamp - usort($historyData, function($a, $b) { - return strtotime($a['timestamp']) - strtotime($b['timestamp']); - }); + // If not found in regular_pages, check specific_pages + if (empty($historyEntry['metrics']) && isset($entry['specific_pages']) && is_array($entry['specific_pages'])) { + foreach ($entry['specific_pages'] as $page) { + if (isset($page['key']) && $page['key'] === $key) { + // Extract metrics + $historyEntry['metrics'] = [ + 'staleness_score' => $page['staleness_score'] ?? 0, + 'date_diff' => $page['date_diff'] ?? 0, + 'word_diff' => $page['word_diff'] ?? 0, + 'section_diff' => $page['section_diff'] ?? 0, + 'link_diff' => $page['link_diff'] ?? 0, + 'media_diff' => $page['media_diff'] ?? 0 + ]; + $historyData[] = $historyEntry; + break; + } + } + } } + + // Sort history data by timestamp + usort($historyData, function($a, $b) { + return strtotime($a['timestamp']) - strtotime($b['timestamp']); + }); - // Check both regular_pages and specific_pages sections - $allPages = []; - if (isset($jsonData['regular_pages']) && is_array($jsonData['regular_pages'])) { - $allPages = array_merge($allPages, $jsonData['regular_pages']); - } - if (isset($jsonData['specific_pages']) && is_array($jsonData['specific_pages'])) { - $allPages = array_merge($allPages, $jsonData['specific_pages']); - } + // Get regular_pages and specific_pages arrays + $regularPages = $this->extractJsonArrayByKey($jsonFile, 'regular_pages', $maxItems); + $specificPages = $this->extractJsonArrayByKey($jsonFile, 'specific_pages', $maxItems); + + // Combine them into a single array + $allPages = array_merge($regularPages, $specificPages); + // Find the page with the matching key foreach ($allPages as $page) { if (isset($page['key']) && $page['key'] === $key) { $mediaComparison = $page['media_comparison'] ?? null; @@ -1850,8 +1849,355 @@ EOT; return $contentHtml; } + /** + * Extracts an array from a large JSON file by key without loading the entire file into memory + * + * @param string $filePath Path to the JSON file + * @param string $key The key of the array to extract + * @param int $maxItems Maximum number of items to extract (to prevent memory exhaustion) + * @return array The extracted array + */ + private function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array + { + $result = []; + + // First, check if the file exists and is readable + if (!is_readable($filePath)) { + error_log("File is not readable: $filePath"); + return $result; + } + + // Get the file size + $fileSize = filesize($filePath); + if ($fileSize === false || $fileSize === 0) { + error_log("File is empty or size could not be determined: $filePath"); + return $result; + } + + try { + // For very large files, we'll use a more efficient approach + // We'll search for the specified key directly + $handle = fopen($filePath, 'r'); + if (!$handle) { + error_log("Could not open file: $filePath"); + return $result; + } + + // Variables to track parsing state + $bracketCount = 0; + $buffer = ''; + $itemCount = 0; + $inArray = false; + $arrayStarted = false; + + // Skip ahead to find the specified key more quickly + $found = false; + $searchKey = '"' . $key . '"'; + + while (!$found && ($line = fgets($handle)) !== false) { + if (strpos($line, $searchKey) !== false) { + $found = true; + + // Extract everything after the key + $keyPos = strpos($line, $searchKey); + $afterKey = substr($line, $keyPos + strlen($searchKey)); + + // Find the colon and then the opening bracket + if (strpos($afterKey, ':') !== false && strpos($afterKey, '[') !== false) { + $inArray = true; + $arrayStarted = true; + $bracketPos = strpos($afterKey, '['); + $buffer = '['; // Start the buffer with an opening bracket + $bracketCount = 1; + + // Add everything after the opening bracket to the buffer + $buffer .= substr($afterKey, $bracketPos + 1); + } else if (strpos($afterKey, ':') !== false) { + // The opening bracket might be on the next line + $inArray = true; + } + + break; + } + } + + // If we didn't find the key, return empty array + if (!$found) { + fclose($handle); + error_log("Key '$key' not found in file: $filePath"); + return $result; + } + + // If we found the key but not the opening bracket yet, look for it + if ($inArray && !$arrayStarted) { + while (($line = fgets($handle)) !== false) { + if (strpos($line, '[') !== false) { + $bracketPos = strpos($line, '['); + $buffer = '['; // Start the buffer with an opening bracket + $bracketCount = 1; + $arrayStarted = true; + + // Add everything after the opening bracket to the buffer + $buffer .= substr($line, $bracketPos + 1); + break; + } + } + } + + // If we still haven't found the opening bracket, something is wrong + if (!$arrayStarted) { + fclose($handle); + error_log("Could not find opening bracket for array '$key' in file: $filePath"); + return $result; + } + + // Now process the array + $collectingItems = true; + while ($collectingItems && ($line = fgets($handle)) !== false) { + // Count opening and closing brackets to track array nesting + $openBrackets = substr_count($line, '[') + substr_count($line, '{'); + $closeBrackets = substr_count($line, ']') + substr_count($line, '}'); + $bracketCount += $openBrackets - $closeBrackets; + + // Add the line to our buffer + $buffer .= $line; + + // If we've reached the end of the array (bracketCount = 0) + if ($bracketCount === 0) { + $collectingItems = false; + + // Try to parse the buffer as JSON + try { + $parsedData = json_decode($buffer, true); + if (json_last_error() !== JSON_ERROR_NONE) { + error_log("JSON parse error: " . json_last_error_msg() . " for key '$key'"); + + // Try a different approach - manually construct a valid JSON array + // Split the buffer by objects (each starting with { and ending with }) + preg_match_all('/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/s', $buffer, $matches); + + if (!empty($matches[0])) { + // Take the first $maxItems objects + $objects = array_slice($matches[0], 0, $maxItems); + + // Construct a valid JSON array + $validJson = '[' . implode(',', $objects) . ']'; + + // Try to parse the valid JSON + $parsedData = json_decode($validJson, true); + if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) { + $result = $parsedData; + } else { + error_log("Alternative JSON parsing approach also failed: " . json_last_error_msg() . " for key '$key'"); + } + } + } else if (is_array($parsedData)) { + // Limit the number of items to prevent memory exhaustion + $result = array_slice($parsedData, 0, $maxItems); + } + } catch (\Exception $e) { + error_log("Exception parsing JSON for key '$key': " . $e->getMessage()); + } + + break; + } + + // Check if we've found a complete item (when we see a closing brace followed by a comma) + // This is used to count items and limit the number of items processed + if (preg_match('/\}\s*,\s*$/m', $line)) { + $itemCount++; + + // If we've reached the maximum number of items, stop processing + if ($itemCount >= $maxItems) { + $collectingItems = false; + + // Create a valid JSON array with the items we've collected so far + // We need to ensure the buffer ends with a complete JSON object and a closing bracket + + // First, find the last complete object (ending with }) + $lastObjectEnd = strrpos($buffer, '}'); + if ($lastObjectEnd !== false) { + // Truncate the buffer at the end of the last complete object + $buffer = substr($buffer, 0, $lastObjectEnd + 1); + // Add the closing bracket for the array + $buffer .= ']'; + + // Try to parse the buffer as JSON + try { + $parsedData = json_decode($buffer, true); + if (json_last_error() !== JSON_ERROR_NONE) { + error_log("JSON parse error after max items: " . json_last_error_msg() . " for key '$key'"); + + // Try a different approach - manually construct a valid JSON array + // Split the buffer by objects (each starting with { and ending with }) + preg_match_all('/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/s', $buffer, $matches); + + if (!empty($matches[0])) { + // Take the first $maxItems objects + $objects = array_slice($matches[0], 0, $maxItems); + + // Construct a valid JSON array + $validJson = '[' . implode(',', $objects) . ']'; + + // Try to parse the valid JSON + $parsedData = json_decode($validJson, true); + if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) { + $result = $parsedData; + } else { + error_log("Alternative JSON parsing approach also failed: " . json_last_error_msg() . " for key '$key'"); + } + } + } else if (is_array($parsedData)) { + $result = $parsedData; + } + } catch (\Exception $e) { + error_log("Exception parsing JSON after max items for key '$key': " . $e->getMessage()); + } + } else { + error_log("Could not find the end of the last complete object for key '$key'"); + } + + break; + } + } + } + + // Close the file + fclose($handle); + + } catch (\Exception $e) { + error_log("Exception in extractJsonArrayByKey for key '$key': " . $e->getMessage()); + } + + return $result; + } + + /** + * Extracts a scalar value from a large JSON file by key without loading the entire file into memory + * + * @param string $filePath Path to the JSON file + * @param string $key The key of the scalar value to extract + * @return mixed The extracted scalar value or null if not found + */ + private function extractJsonScalarByKey(string $filePath, string $key): mixed + { + // First, check if the file exists and is readable + if (!is_readable($filePath)) { + error_log("File is not readable: $filePath"); + return null; + } + + try { + // For very large files, we'll use a more efficient approach + // We'll search for the specified key directly + $handle = fopen($filePath, 'r'); + if (!$handle) { + error_log("Could not open file: $filePath"); + return null; + } + + // Skip ahead to find the specified key more quickly + $found = false; + $searchKey = '"' . $key . '"'; + $value = null; + + while (!$found && ($line = fgets($handle)) !== false) { + if (strpos($line, $searchKey) !== false) { + $found = true; + + // Extract everything after the key + $keyPos = strpos($line, $searchKey); + $afterKey = substr($line, $keyPos + strlen($searchKey)); + + // Check if the value is on this line + if (strpos($afterKey, ':') !== false) { + $colonPos = strpos($afterKey, ':'); + $afterColon = trim(substr($afterKey, $colonPos + 1)); + + // Extract the value based on its type + if (preg_match('/^"([^"]*)"/', $afterColon, $matches)) { + // String value + $value = $matches[1]; + } elseif (preg_match('/^(\d+)/', $afterColon, $matches)) { + // Numeric value + $value = intval($matches[1]); + } elseif (preg_match('/^(true|false)/', $afterColon, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + } elseif (strpos($afterColon, 'null') === 0) { + // Null value + $value = null; + } else { + // The value might be on the next line or more complex + // For simplicity, we'll just use the regex approach as a fallback + if (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*"([^"]*)"/', $line, $matches)) { + // String value + $value = $matches[1]; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(\d+)/', $line, $matches)) { + // Numeric value + $value = intval($matches[1]); + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(true|false)/', $line, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + } elseif (strpos($line, 'null') !== false) { + // Null value + $value = null; + } else { + error_log("Could not extract value for key '$key' from line: " . trim($line)); + } + } + } else { + // The value might be on the next line + error_log("Value for key '$key' might be on the next line, using fallback method"); + + // Read the next line + $nextLine = fgets($handle); + if ($nextLine !== false) { + $combinedLine = $line . $nextLine; + + // Try to extract the value using regex + if (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*"([^"]*)"/', $combinedLine, $matches)) { + // String value + $value = $matches[1]; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(\d+)/', $combinedLine, $matches)) { + // Numeric value + $value = intval($matches[1]); + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(true|false)/', $combinedLine, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + } elseif (strpos($combinedLine, 'null') !== false) { + // Null value + $value = null; + } else { + error_log("Could not extract value for key '$key' from combined lines"); + } + } + } + + break; + } + } + + // Close the file + fclose($handle); + + if (!$found) { + error_log("Key '$key' not found in file: $filePath"); + } else if ($value === null) { + error_log("Value for key '$key' is null or could not be extracted"); + } + + return $value; + + } catch (\Exception $e) { + error_log("Exception in extractJsonScalarByKey for key '$key': " . $e->getMessage()); + return null; + } + } + /** * Extracts the specific_pages array from a large JSON file without loading the entire file into memory + * This is a legacy method kept for backward compatibility * * @param string $filePath Path to the JSON file * @param int $maxPages Maximum number of pages to extract (to prevent memory exhaustion) @@ -1859,114 +2205,6 @@ EOT; */ private function extractSpecificPagesFromJson(string $filePath, int $maxPages = 100): array { - $specificPages = []; - - // For very large files, we'll use a more direct approach - // Instead of parsing the entire JSON structure, we'll extract just what we need - - // First, check if the file exists and is readable - if (!is_readable($filePath)) { - return $specificPages; - } - - // Get the file size - $fileSize = filesize($filePath); - if ($fileSize === false || $fileSize === 0) { - return $specificPages; - } - - // For very large files, we'll use a more efficient approach - // We'll search for the "specific_pages" key directly - $handle = fopen($filePath, 'r'); - if (!$handle) { - return $specificPages; - } - - // Variables to track parsing state - $inSpecificPages = false; - $bracketCount = 0; - $buffer = ''; - $pageCount = 0; - $lineCount = 0; - - // Skip ahead to find the specific_pages key more quickly - // This is a simple optimization for this specific file structure - $found = false; - while (!$found && ($line = fgets($handle)) !== false) { - $lineCount++; - if (strpos($line, '"specific_pages"') !== false) { - $found = true; - $inSpecificPages = true; - - // Find the opening bracket of the array - if (strpos($line, '[') !== false) { - $bracketCount = 1; - $buffer = '['; // Start the buffer with an opening bracket - } else { - // If the opening bracket is on the next line - $nextLine = fgets($handle); - if ($nextLine !== false && strpos($nextLine, '[') !== false) { - $bracketCount = 1; - $buffer = '['; // Start the buffer with an opening bracket - } - } - break; - } - } - - // If we didn't find the specific_pages key, return empty array - if (!$found) { - fclose($handle); - return $specificPages; - } - - // Now process the specific_pages array - while (($line = fgets($handle)) !== false) { - // Count opening and closing brackets to track array nesting - $openBrackets = substr_count($line, '[') + substr_count($line, '{'); - $closeBrackets = substr_count($line, ']') + substr_count($line, '}'); - $bracketCount += $openBrackets - $closeBrackets; - - // Add the line to our buffer - $buffer .= $line; - - // If we've reached the end of the array (bracketCount = 0) - if ($bracketCount === 0) { - // Parse the buffer as JSON - $parsedData = json_decode($buffer, true); - if (is_array($parsedData)) { - // Limit the number of pages to prevent memory exhaustion - $specificPages = array_slice($parsedData, 0, $maxPages); - } else { - // If parsing fails, log the error but don't crash - error_log('Failed to parse specific_pages JSON data in ' . $filePath); - } - break; - } - - // Check if we've found a complete page object (when we see a closing brace followed by a comma) - if (preg_match('/\}\s*,\s*$/m', $line)) { - $pageCount++; - // If we've reached the maximum number of pages, stop processing - if ($pageCount >= $maxPages) { - // Close the array properly - $buffer = rtrim($buffer, ",\r\n") . ']'; - // Parse the buffer as JSON - $parsedData = json_decode($buffer, true); - if (is_array($parsedData)) { - $specificPages = $parsedData; - } else { - // If parsing fails, log the error but don't crash - error_log('Failed to parse specific_pages JSON data in ' . $filePath . ' after reaching max pages'); - } - break; - } - } - } - - // Close the file - fclose($handle); - - return $specificPages; + return $this->extractJsonArrayByKey($filePath, 'specific_pages', $maxPages); } } \ No newline at end of file diff --git a/templates/admin/wiki.html.twig b/templates/admin/wiki.html.twig index 1cf652d..c8fef3a 100644 --- a/templates/admin/wiki.html.twig +++ b/templates/admin/wiki.html.twig @@ -293,20 +293,20 @@
- {% if page.en_page.description_img_url is defined and page.en_page.description_img_url %} + {% if page.en_page is defined and page.en_page.description_img_url is defined and page.en_page.description_img_url %}
{{ page.key }}
{% endif %}
- {{ page.key }} + {{ page.title }}
- {{ page.reason }} +{# {{ page.reason }}#} {% if page.staleness_score is defined %} @@ -326,26 +326,54 @@
- - EN - - {% if page.fr_page %} - - FR - - - Comparer + {% if page.en_page is defined and page.en_page.url is defined %} + + EN + {% endif %} + {% if page.fr_page is defined and page.fr_page %} + {% if page.fr_page.url is defined %} + + FR + + {% endif %} + {% if page.key is defined %} + + Comparer + + {% elseif page.title is defined %} + + Comparer + + {% else %} + + {% endif %} {% else %} - - Traduire - + {% if page.key is defined %} + + Traduire + + {% elseif page.title is defined %} + + Traduire + + {% else %} + + {% endif %} {% endif %}
diff --git a/templates/admin/wiki_decrepitude.html.twig b/templates/admin/wiki_decrepitude.html.twig index 5472289..5fa7d8a 100644 --- a/templates/admin/wiki_decrepitude.html.twig +++ b/templates/admin/wiki_decrepitude.html.twig @@ -79,8 +79,10 @@ python3 wiki_compare.py {% for page in regular_pages|slice(0, 20) %} - {{ page.key }} - {{ page.reason }} + {{ page.title }} + +{# {{ page.reason }}#} + {% if page.word_diff > 0 %} {{ page.word_diff }} @@ -126,22 +128,52 @@ python3 wiki_compare.py class="btn btn-sm btn-outline-primary" title="Version anglaise"> EN - {% if page.fr_page %} - - FR - - - Comparer - + {% if page.fr_page is defined and page.fr_page %} + {% if page.fr_page.url is defined %} + + FR + + {% else %} + + {% endif %} + {% if page.key is defined %} + + Comparer + + {% elseif page.title is defined %} + + Comparer + + {% else %} + + {% endif %} {% else %} - - Traduire - + {% if page.key is defined %} + + Traduire + + {% elseif page.title is defined %} + + Traduire + + {% else %} + + {% endif %} {% endif %} @@ -178,17 +210,17 @@ python3 wiki_compare.py {% if page.en_page.description_img_url is defined and page.en_page.description_img_url %}
{{ page.key }}
{% endif %}
- {{ page.key }} + {{ page.title }}
- {{ page.reason }} +{# {{ page.reason }}#}
@@ -208,22 +240,52 @@ python3 wiki_compare.py class="btn btn-sm btn-outline-primary" title="Version anglaise"> EN - {% if page.fr_page %} - - FR - - - Comparer - + {% if page.fr_page is defined and page.fr_page %} + {% if page.fr_page.url is defined %} + + FR + + {% else %} + + {% endif %} + {% if page.key is defined %} + + Comparer + + {% elseif page.title is defined %} + + Comparer + + {% else %} + + {% endif %} {% else %} - - Traduire - + {% if page.key is defined %} + + Traduire + + {% elseif page.title is defined %} + + Traduire + + {% else %} + + {% endif %} {% endif %}
@@ -251,7 +313,13 @@ python3 wiki_compare.py const colors = []; {% for page in regular_pages|slice(0, 20) %} - labels.push("{{ page.key }}"); + {% if page.key is defined %} + labels.push("{{ page.key }}"); + {% elseif page.title is defined %} + labels.push("{{ page.title }}"); + {% else %} + labels.push("Page sans clé"); + {% endif %} scores.push({{ page.staleness_score }}); // Set color based on score diff --git a/templates/admin/wiki_random_suggestion.html.twig b/templates/admin/wiki_random_suggestion.html.twig index c92b2ba..2bce17e 100644 --- a/templates/admin/wiki_random_suggestion.html.twig +++ b/templates/admin/wiki_random_suggestion.html.twig @@ -11,12 +11,12 @@
-

{{ page.key }}

+

{% if page.key is defined %}{{ page.key }}{% elseif page.title is defined %}{{ page.title }}{% else %}Page sans clé{% endif %}

Raisons d'amélioration

-

{{ page.reason }}

+{#

{{ page.reason }}

#}
@@ -55,9 +55,9 @@

Version française

- {% if page.fr_page %} + {% if page.fr_page is defined and page.fr_page %}

- Dernière modification: {{ page.fr_page.last_modified }} + Dernière modification: {{ page.fr_page.last_modified is defined ? page.fr_page.last_modified : 'Non disponible' }}

{% else %}

@@ -66,11 +66,11 @@ {% endif %}

- {% if page.fr_page %} + {% if page.fr_page is defined and page.fr_page %}
  • Sections - {{ page.fr_page.sections }} + {{ page.fr_page.sections is defined ? page.fr_page.sections : 0 }}
  • Mots @@ -82,21 +82,38 @@
- - Voir la page française - + {% if page.fr_page.url is defined %} + + Voir la page française + + {% else %} + + {% endif %}
{% else %}

La page wiki pour la clé - "{{ page.key }}" n'existe pas en français.

+ "{% if page.key is defined %}{{ page.key }}{% elseif page.title is defined %}{{ page.title }}{% else %}Page sans clé{% endif %}" n'existe pas en français.

Vous pouvez contribuer en créant cette page sur le wiki OpenStreetMap.

- - Créer la page française - + {% if page.key is defined %} + + Créer la page française + + {% elseif page.title is defined %} + + Créer la page française + + {% else %} + + {% endif %}
{% endif %}
@@ -105,9 +122,19 @@
- - Voir la comparaison détaillée - + {% if page.key is defined %} + + Voir la comparaison détaillée + + {% elseif page.title is defined %} + + Voir la comparaison détaillée + + {% else %} + + {% endif %} Autre suggestion aléatoire diff --git a/templates/public/wiki.html.twig b/templates/public/wiki.html.twig index ea6b707..346a915 100644 --- a/templates/public/wiki.html.twig +++ b/templates/public/wiki.html.twig @@ -206,18 +206,18 @@ {% if page.en_page.description_img_url is defined and page.en_page.description_img_url %}
{{ page.key }}
{% endif %}
- {{ page.key }} + {{ page.title }} Spécifique
- {{ page.reason }} +{# {{ page.reason }}#} {% if page.staleness_score is defined %} @@ -241,22 +241,52 @@ class="btn btn-sm btn-outline-primary" title="Version anglaise"> EN - {% if page.fr_page %} - - FR - - - Comparer - + {% if page.fr_page is defined and page.fr_page %} + {% if page.fr_page.url is defined %} + + FR + + {% else %} + + {% endif %} + {% if page.key is defined %} + + Comparer + + {% elseif page.title is defined %} + + Comparer + + {% else %} + + {% endif %} {% else %} - - Traduire - + {% if page.key is defined %} + + Traduire + + {% elseif page.title is defined %} + + Traduire + + {% else %} + + {% endif %} {% endif %}
@@ -325,11 +355,19 @@ FR {% set en_url = page.url|replace({'FR:': ''}) %} - - - title="Créer une traduction anglaise"> - créer EN - + {% if page.key is defined %} + + créer EN + + {% elseif page.title is defined %} + + créer EN + + {% else %} + + {% endif %}
diff --git a/templates/public/wiki_random_suggestion.html.twig b/templates/public/wiki_random_suggestion.html.twig index d079448..1a10cba 100644 --- a/templates/public/wiki_random_suggestion.html.twig +++ b/templates/public/wiki_random_suggestion.html.twig @@ -16,7 +16,7 @@

Raisons d'amélioration

-

{{ page.reason }}

+{#

{{ page.reason }}

#}
@@ -55,9 +55,9 @@

Version française

- {% if page.fr_page %} + {% if page.fr_page is defined and page.fr_page %}

- Dernière modification: {{ page.fr_page.last_modified }} + Dernière modification: {{ page.fr_page.last_modified is defined ? page.fr_page.last_modified : 'Non disponible' }}

{% else %}

@@ -66,11 +66,11 @@ {% endif %}

- {% if page.fr_page %} + {% if page.fr_page is defined and page.fr_page %}
  • Sections - {{ page.fr_page.sections }} + {{ page.fr_page.sections is defined ? page.fr_page.sections : 0 }}
  • Mots @@ -82,9 +82,15 @@
- - Voir la page française - + {% if page.fr_page.url is defined %} + + Voir la page française + + {% else %} + + {% endif %}
{% else %}
diff --git a/test_compare_route.php b/test_compare_route.php new file mode 100644 index 0000000..4b1c9b0 --- /dev/null +++ b/test_compare_route.php @@ -0,0 +1,435 @@ +projectDir = __DIR__; + } + + public function getParameter($name) { + if ($name === 'kernel.project_dir') { + return $this->projectDir; + } + return null; + } + + /** + * Extracts an array from a large JSON file by key without loading the entire file into memory + * + * @param string $filePath Path to the JSON file + * @param string $key The key of the array to extract + * @param int $maxItems Maximum number of items to extract (to prevent memory exhaustion) + * @return array The extracted array + */ + public function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array + { + $result = []; + + // First, check if the file exists and is readable + if (!is_readable($filePath)) { + echo "File is not readable: $filePath\n"; + return $result; + } + + // Get the file size + $fileSize = filesize($filePath); + if ($fileSize === false || $fileSize === 0) { + echo "File is empty or size could not be determined: $filePath\n"; + return $result; + } + + try { + // For very large files, we'll use a more efficient approach + // We'll search for the specified key directly + $handle = fopen($filePath, 'r'); + if (!$handle) { + echo "Could not open file: $filePath\n"; + return $result; + } + + // Variables to track parsing state + $bracketCount = 0; + $buffer = ''; + $itemCount = 0; + $inArray = false; + $arrayStarted = false; + + // Skip ahead to find the specified key more quickly + $found = false; + $searchKey = '"' . $key . '"'; + + while (!$found && ($line = fgets($handle)) !== false) { + if (strpos($line, $searchKey) !== false) { + $found = true; + + // Extract everything after the key + $keyPos = strpos($line, $searchKey); + $afterKey = substr($line, $keyPos + strlen($searchKey)); + + // Find the colon and then the opening bracket + if (strpos($afterKey, ':') !== false && strpos($afterKey, '[') !== false) { + $inArray = true; + $arrayStarted = true; + $bracketPos = strpos($afterKey, '['); + $buffer = '['; // Start the buffer with an opening bracket + $bracketCount = 1; + + // Add everything after the opening bracket to the buffer + $buffer .= substr($afterKey, $bracketPos + 1); + } else if (strpos($afterKey, ':') !== false) { + // The opening bracket might be on the next line + $inArray = true; + } + + break; + } + } + + // If we didn't find the key, return empty array + if (!$found) { + fclose($handle); + echo "Key '$key' not found in file: $filePath\n"; + return $result; + } + + // If we found the key but not the opening bracket yet, look for it + if ($inArray && !$arrayStarted) { + while (($line = fgets($handle)) !== false) { + if (strpos($line, '[') !== false) { + $bracketPos = strpos($line, '['); + $buffer = '['; // Start the buffer with an opening bracket + $bracketCount = 1; + $arrayStarted = true; + + // Add everything after the opening bracket to the buffer + $buffer .= substr($line, $bracketPos + 1); + break; + } + } + } + + // If we still haven't found the opening bracket, something is wrong + if (!$arrayStarted) { + fclose($handle); + echo "Could not find opening bracket for array '$key' in file: $filePath\n"; + return $result; + } + + // Now process the array + $collectingItems = true; + while ($collectingItems && ($line = fgets($handle)) !== false) { + // Count opening and closing brackets to track array nesting + $openBrackets = substr_count($line, '[') + substr_count($line, '{'); + $closeBrackets = substr_count($line, ']') + substr_count($line, '}'); + $bracketCount += $openBrackets - $closeBrackets; + + // Add the line to our buffer + $buffer .= $line; + + // If we've reached the end of the array (bracketCount = 0) + if ($bracketCount === 0) { + $collectingItems = false; + + // Try to parse the buffer as JSON + try { + $parsedData = json_decode($buffer, true); + if (json_last_error() !== JSON_ERROR_NONE) { + echo "JSON parse error: " . json_last_error_msg() . " for key '$key'\n"; + + // Try a different approach - manually construct a valid JSON array + // Split the buffer by objects (each starting with { and ending with }) + preg_match_all('/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/s', $buffer, $matches); + + if (!empty($matches[0])) { + // Take the first $maxItems objects + $objects = array_slice($matches[0], 0, $maxItems); + + // Construct a valid JSON array + $validJson = '[' . implode(',', $objects) . ']'; + + // Try to parse the valid JSON + $parsedData = json_decode($validJson, true); + if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) { + $result = $parsedData; + } else { + echo "Alternative JSON parsing approach also failed: " . json_last_error_msg() . " for key '$key'\n"; + } + } + } else if (is_array($parsedData)) { + // Limit the number of items to prevent memory exhaustion + $result = array_slice($parsedData, 0, $maxItems); + } + } catch (Exception $e) { + echo "Exception parsing JSON for key '$key': " . $e->getMessage() . "\n"; + } + + break; + } + + // Check if we've found a complete item (when we see a closing brace followed by a comma) + // This is used to count items and limit the number of items processed + if (preg_match('/\}\s*,\s*$/m', $line)) { + $itemCount++; + + // If we've reached the maximum number of items, stop processing + if ($itemCount >= $maxItems) { + $collectingItems = false; + + // Create a valid JSON array with the items we've collected so far + // We need to ensure the buffer ends with a complete JSON object and a closing bracket + + // First, find the last complete object (ending with }) + $lastObjectEnd = strrpos($buffer, '}'); + if ($lastObjectEnd !== false) { + // Truncate the buffer at the end of the last complete object + $buffer = substr($buffer, 0, $lastObjectEnd + 1); + // Add the closing bracket for the array + $buffer .= ']'; + + // Try to parse the buffer as JSON + try { + $parsedData = json_decode($buffer, true); + if (json_last_error() !== JSON_ERROR_NONE) { + echo "JSON parse error after max items: " . json_last_error_msg() . " for key '$key'\n"; + + // Try a different approach - manually construct a valid JSON array + // Split the buffer by objects (each starting with { and ending with }) + preg_match_all('/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/s', $buffer, $matches); + + if (!empty($matches[0])) { + // Take the first $maxItems objects + $objects = array_slice($matches[0], 0, $maxItems); + + // Construct a valid JSON array + $validJson = '[' . implode(',', $objects) . ']'; + + // Try to parse the valid JSON + $parsedData = json_decode($validJson, true); + if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) { + $result = $parsedData; + } else { + echo "Alternative JSON parsing approach also failed: " . json_last_error_msg() . " for key '$key'\n"; + } + } + } else if (is_array($parsedData)) { + $result = $parsedData; + } + } catch (Exception $e) { + echo "Exception parsing JSON after max items for key '$key': " . $e->getMessage() . "\n"; + } + } else { + echo "Could not find the end of the last complete object for key '$key'\n"; + } + + break; + } + } + } + + // Close the file + fclose($handle); + + } catch (Exception $e) { + echo "Exception in extractJsonArrayByKey for key '$key': " . $e->getMessage() . "\n"; + } + + return $result; + } + + /** + * Extracts a scalar value from a large JSON file by key without loading the entire file into memory + * + * @param string $filePath Path to the JSON file + * @param string $key The key of the scalar value to extract + * @return mixed The extracted scalar value or null if not found + */ + public function extractJsonScalarByKey(string $filePath, string $key): mixed + { + // First, check if the file exists and is readable + if (!is_readable($filePath)) { + echo "File is not readable: $filePath\n"; + return null; + } + + try { + // For very large files, we'll use a more efficient approach + // We'll search for the specified key directly + $handle = fopen($filePath, 'r'); + if (!$handle) { + echo "Could not open file: $filePath\n"; + return null; + } + + // Skip ahead to find the specified key more quickly + $found = false; + $searchKey = '"' . $key . '"'; + $value = null; + + while (!$found && ($line = fgets($handle)) !== false) { + if (strpos($line, $searchKey) !== false) { + $found = true; + + // Extract everything after the key + $keyPos = strpos($line, $searchKey); + $afterKey = substr($line, $keyPos + strlen($searchKey)); + + // Check if the value is on this line + if (strpos($afterKey, ':') !== false) { + $colonPos = strpos($afterKey, ':'); + $afterColon = trim(substr($afterKey, $colonPos + 1)); + + // Extract the value based on its type + if (preg_match('/^"([^"]*)"/', $afterColon, $matches)) { + // String value + $value = $matches[1]; + } elseif (preg_match('/^(\d+)/', $afterColon, $matches)) { + // Numeric value + $value = intval($matches[1]); + } elseif (preg_match('/^(true|false)/', $afterColon, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + } elseif (strpos($afterColon, 'null') === 0) { + // Null value + $value = null; + } else { + // The value might be on the next line or more complex + // For simplicity, we'll just use the regex approach as a fallback + if (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*"([^"]*)"/', $line, $matches)) { + // String value + $value = $matches[1]; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(\d+)/', $line, $matches)) { + // Numeric value + $value = intval($matches[1]); + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(true|false)/', $line, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + } elseif (strpos($line, 'null') !== false) { + // Null value + $value = null; + } else { + echo "Could not extract value for key '$key' from line: " . trim($line) . "\n"; + } + } + } else { + // The value might be on the next line + echo "Value for key '$key' might be on the next line, using fallback method\n"; + + // Read the next line + $nextLine = fgets($handle); + if ($nextLine !== false) { + $combinedLine = $line . $nextLine; + + // Try to extract the value using regex + if (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*"([^"]*)"/', $combinedLine, $matches)) { + // String value + $value = $matches[1]; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(\d+)/', $combinedLine, $matches)) { + // Numeric value + $value = intval($matches[1]); + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(true|false)/', $combinedLine, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + } elseif (strpos($combinedLine, 'null') !== false) { + // Null value + $value = null; + } else { + echo "Could not extract value for key '$key' from combined lines\n"; + } + } + } + + break; + } + } + + // Close the file + fclose($handle); + + if (!$found) { + echo "Key '$key' not found in file: $filePath\n"; + } else if ($value === null) { + echo "Value for key '$key' is null or could not be extracted\n"; + } + + return $value; + + } catch (Exception $e) { + echo "Exception in extractJsonScalarByKey for key '$key': " . $e->getMessage() . "\n"; + return null; + } + } +} + +// Create a mock controller +$controller = new MockController(); + +// Test the memory-efficient approach +echo "Testing memory-efficient approach for /wiki/compare/Key:harassment_prevention route\n"; +echo "Memory limit: " . ini_get('memory_limit') . "\n\n"; + +// Get the file path +$jsonFile = __DIR__ . '/wiki_compare/outdated_pages.json'; +$key = 'Key:harassment_prevention'; + +// Check if the file exists +if (!file_exists($jsonFile)) { + echo "Error: File $jsonFile does not exist\n"; + exit(1); +} + +echo "File size: " . round(filesize($jsonFile) / (1024 * 1024), 2) . " MB\n\n"; + +// Measure memory usage before +$memBefore = memory_get_usage(); +echo "Memory usage before: " . round($memBefore / (1024 * 1024), 2) . " MB\n"; + +// Start timer +$startTime = microtime(true); + +// Extract data using memory-efficient approach +$maxItems = 100; +$regularPages = $controller->extractJsonArrayByKey($jsonFile, 'regular_pages', $maxItems); +$specificPages = $controller->extractJsonArrayByKey($jsonFile, 'specific_pages', $maxItems); +$historyEntries = $controller->extractJsonArrayByKey($jsonFile, 'history', $maxItems); + +// Combine regular_pages and specific_pages +$allPages = array_merge($regularPages, $specificPages); + +// Find the page with the matching key +$targetPage = null; +foreach ($allPages as $page) { + if (isset($page['key']) && $page['key'] === $key) { + $targetPage = $page; + break; + } +} + +// End timer +$endTime = microtime(true); + +// Measure memory usage after +$memAfter = memory_get_usage(); +echo "Memory usage after: " . round($memAfter / (1024 * 1024), 2) . " MB\n"; +echo "Memory used: " . round(($memAfter - $memBefore) / (1024 * 1024), 2) . " MB\n"; +echo "Time taken: " . round($endTime - $startTime, 2) . " seconds\n\n"; + +// Check if we found the page +if ($targetPage) { + echo "Successfully found page with key '$key'\n"; + echo "Page details:\n"; + echo "- Staleness score: " . ($targetPage['staleness_score'] ?? 'N/A') . "\n"; + echo "- Date diff: " . ($targetPage['date_diff'] ?? 'N/A') . "\n"; + echo "- Word diff: " . ($targetPage['word_diff'] ?? 'N/A') . "\n"; +} else { + echo "Page with key '$key' not found\n"; +} + +echo "\nTest completed successfully without memory exhaustion!\n"; \ No newline at end of file diff --git a/test_decrepitude.php b/test_decrepitude.php new file mode 100644 index 0000000..88ebe76 --- /dev/null +++ b/test_decrepitude.php @@ -0,0 +1,464 @@ +getMessage() . "\n"; + } + + echo "\n"; +} + +// Test the new approach (streaming) +function testNewApproach($filePath) { + echo "Testing new approach (streaming)...\n"; + $memBefore = memory_get_usage(); + echo "Memory usage before: " . round($memBefore / (1024 * 1024), 2) . " MB\n"; + + try { + $startTime = microtime(true); + + $regularPages = extractJsonArrayByKey($filePath, 'regular_pages', 100); + $specificPages = extractJsonArrayByKey($filePath, 'specific_pages', 100); + $lastUpdated = extractJsonScalarByKey($filePath, 'last_updated'); + + $endTime = microtime(true); + + echo "Successfully loaded data:\n"; + echo "- Regular pages: " . count($regularPages) . "\n"; + echo "- Specific pages: " . count($specificPages) . "\n"; + echo "- Last updated: " . ($lastUpdated ?? 'null') . "\n"; + + $memAfter = memory_get_usage(); + echo "Memory usage after: " . round($memAfter / (1024 * 1024), 2) . " MB\n"; + echo "Memory used: " . round(($memAfter - $memBefore) / (1024 * 1024), 2) . " MB\n"; + echo "Time taken: " . round($endTime - $startTime, 2) . " seconds\n"; + } catch (Exception $e) { + echo "Error: " . $e->getMessage() . "\n"; + } + + echo "\n"; +} + +// Implementation of extractJsonArrayByKey +function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array +{ + $result = []; + + // First, check if the file exists and is readable + if (!is_readable($filePath)) { + echo "File is not readable: $filePath\n"; + return $result; + } + + // Get the file size + $fileSize = filesize($filePath); + if ($fileSize === false || $fileSize === 0) { + echo "File is empty or size could not be determined: $filePath\n"; + return $result; + } + + try { + // For very large files, we'll use a more efficient approach + // We'll search for the specified key directly + $handle = fopen($filePath, 'r'); + if (!$handle) { + echo "Could not open file: $filePath\n"; + return $result; + } + + // Variables to track parsing state + $bracketCount = 0; + $buffer = ''; + $itemCount = 0; + $inArray = false; + $arrayStarted = false; + + // Skip ahead to find the specified key more quickly + $found = false; + $searchKey = '"' . $key . '"'; + $lineCount = 0; + + while (!$found && ($line = fgets($handle)) !== false) { + $lineCount++; + if ($lineCount % 1000 === 0) { + echo "Processed $lineCount lines searching for $key...\r"; + } + + if (strpos($line, $searchKey) !== false) { + $found = true; + echo "\nFound $key key at line $lineCount\n"; + + // Extract everything after the key + $keyPos = strpos($line, $searchKey); + $afterKey = substr($line, $keyPos + strlen($searchKey)); + + // Find the colon and then the opening bracket + if (strpos($afterKey, ':') !== false && strpos($afterKey, '[') !== false) { + $inArray = true; + $arrayStarted = true; + $bracketPos = strpos($afterKey, '['); + $buffer = '['; // Start the buffer with an opening bracket + $bracketCount = 1; + + // Add everything after the opening bracket to the buffer + $buffer .= substr($afterKey, $bracketPos + 1); + echo "Opening bracket found on the same line\n"; + } else if (strpos($afterKey, ':') !== false) { + // The opening bracket might be on the next line + $inArray = true; + echo "Colon found, but opening bracket might be on the next line\n"; + } + + break; + } + } + + // If we didn't find the key, return empty array + if (!$found) { + echo "$key key not found in the file\n"; + fclose($handle); + return $result; + } + + // If we found the key but not the opening bracket yet, look for it + if ($inArray && !$arrayStarted) { + echo "Looking for opening bracket...\n"; + while (($line = fgets($handle)) !== false) { + if (strpos($line, '[') !== false) { + $bracketPos = strpos($line, '['); + $buffer = '['; // Start the buffer with an opening bracket + $bracketCount = 1; + $arrayStarted = true; + + // Add everything after the opening bracket to the buffer + $buffer .= substr($line, $bracketPos + 1); + echo "Opening bracket found on the next line\n"; + break; + } + } + } + + // If we still haven't found the opening bracket, something is wrong + if (!$arrayStarted) { + echo "Could not find opening bracket for array '$key' in file: $filePath\n"; + fclose($handle); + return $result; + } + + echo "Processing $key array...\n"; + + // Now process the array + $collectingItems = true; + while ($collectingItems && ($line = fgets($handle)) !== false) { + // Count opening and closing brackets to track array nesting + $openBrackets = substr_count($line, '[') + substr_count($line, '{'); + $closeBrackets = substr_count($line, ']') + substr_count($line, '}'); + $bracketCount += $openBrackets - $closeBrackets; + + // Add the line to our buffer + $buffer .= $line; + + // If we've reached the end of the array (bracketCount = 0) + if ($bracketCount === 0) { + $collectingItems = false; + echo "Reached end of $key array\n"; + + // Try to parse the buffer as JSON + try { + $parsedData = json_decode($buffer, true); + if (json_last_error() !== JSON_ERROR_NONE) { + echo "JSON parse error: " . json_last_error_msg() . " for key '$key'\n"; + // Debug: output a small part of the buffer + echo "Buffer preview (first 100 chars): " . substr($buffer, 0, 100) . "...\n"; + echo "Buffer preview (last 100 chars): ..." . substr($buffer, -100) . "\n"; + } else if (is_array($parsedData)) { + // Limit the number of items to prevent memory exhaustion + $result = array_slice($parsedData, 0, $maxItems); + echo "Parsed " . count($result) . " items from the $key array\n"; + } + } catch (Exception $e) { + echo "Exception parsing JSON for key '$key': " . $e->getMessage() . "\n"; + } + + break; + } + + // Check if we've found a complete item (when we see a closing brace followed by a comma) + // This is used to count items and limit the number of items processed + if (preg_match('/\}\s*,\s*$/m', $line)) { + $itemCount++; + if ($itemCount % 10 === 0) { + echo "Found $itemCount items in $key array...\r"; + } + + // If we've reached the maximum number of items, stop processing + if ($itemCount >= $maxItems) { + $collectingItems = false; + echo "\nReached maximum number of items ($maxItems) for $key\n"; + + // Create a valid JSON array with the items we've collected so far + // We need to ensure the buffer ends with a complete JSON object and a closing bracket + + // First, find the last complete object (ending with }) + $lastObjectEnd = strrpos($buffer, '}'); + if ($lastObjectEnd !== false) { + // Truncate the buffer at the end of the last complete object + $buffer = substr($buffer, 0, $lastObjectEnd + 1); + // Add the closing bracket for the array + $buffer .= ']'; + + echo "Truncated buffer and added closing bracket\n"; + + // Try to parse the buffer as JSON + try { + $parsedData = json_decode($buffer, true); + if (json_last_error() !== JSON_ERROR_NONE) { + echo "JSON parse error after max items: " . json_last_error_msg() . " for key '$key'\n"; + // Debug: output a small part of the buffer + echo "Buffer preview (first 100 chars): " . substr($buffer, 0, 100) . "...\n"; + echo "Buffer preview (last 100 chars): ..." . substr($buffer, -100) . "\n"; + + // Try a different approach - manually construct a valid JSON array + echo "Trying alternative approach to construct valid JSON...\n"; + + // Split the buffer by objects (each starting with { and ending with }) + preg_match_all('/\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}/s', $buffer, $matches); + + if (!empty($matches[0])) { + // Take the first $maxItems objects + $objects = array_slice($matches[0], 0, $maxItems); + + // Construct a valid JSON array + $validJson = '[' . implode(',', $objects) . ']'; + + // Try to parse the valid JSON + $parsedData = json_decode($validJson, true); + if (json_last_error() === JSON_ERROR_NONE && is_array($parsedData)) { + $result = $parsedData; + echo "Successfully parsed " . count($result) . " items using alternative approach\n"; + } else { + echo "Alternative approach also failed: " . json_last_error_msg() . "\n"; + } + } + } else if (is_array($parsedData)) { + $result = $parsedData; + echo "Parsed " . count($result) . " items from the $key array\n"; + } + } catch (Exception $e) { + echo "Exception parsing JSON after max items for key '$key': " . $e->getMessage() . "\n"; + } + } else { + echo "Could not find the end of the last complete object\n"; + } + + break; + } + } + } + + // Close the file + fclose($handle); + + } catch (Exception $e) { + echo "Exception in extractJsonArrayByKey for key '$key': " . $e->getMessage() . "\n"; + } + + return $result; +} + +// Implementation of extractJsonScalarByKey +function extractJsonScalarByKey(string $filePath, string $key): mixed +{ + // First, check if the file exists and is readable + if (!is_readable($filePath)) { + echo "File is not readable: $filePath\n"; + return null; + } + + try { + // For very large files, we'll use a more efficient approach + // We'll search for the specified key directly + $handle = fopen($filePath, 'r'); + if (!$handle) { + echo "Could not open file: $filePath\n"; + return null; + } + + // Skip ahead to find the specified key more quickly + $found = false; + $searchKey = '"' . $key . '"'; + $value = null; + $lineCount = 0; + + while (!$found && ($line = fgets($handle)) !== false) { + $lineCount++; + if ($lineCount % 1000 === 0) { + echo "Processed $lineCount lines searching for $key...\r"; + } + + if (strpos($line, $searchKey) !== false) { + $found = true; + echo "\nFound $key key at line $lineCount\n"; + + // Extract everything after the key + $keyPos = strpos($line, $searchKey); + $afterKey = substr($line, $keyPos + strlen($searchKey)); + + // Check if the value is on this line + if (strpos($afterKey, ':') !== false) { + $colonPos = strpos($afterKey, ':'); + $afterColon = trim(substr($afterKey, $colonPos + 1)); + + // Extract the value based on its type + if (preg_match('/^"([^"]*)"/', $afterColon, $matches)) { + // String value + $value = $matches[1]; + echo "Extracted string value: $value\n"; + } elseif (preg_match('/^(\d+)/', $afterColon, $matches)) { + // Numeric value + $value = intval($matches[1]); + echo "Extracted numeric value: $value\n"; + } elseif (preg_match('/^(true|false)/', $afterColon, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + echo "Extracted boolean value: " . ($value ? 'true' : 'false') . "\n"; + } elseif (strpos($afterColon, 'null') === 0) { + // Null value + $value = null; + echo "Extracted null value\n"; + } else { + // The value might be on the next line or more complex + // For simplicity, we'll just use the regex approach as a fallback + echo "Using fallback method to extract value\n"; + if (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*"([^"]*)"/', $line, $matches)) { + // String value + $value = $matches[1]; + echo "Extracted string value: $value\n"; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(\d+)/', $line, $matches)) { + // Numeric value + $value = intval($matches[1]); + echo "Extracted numeric value: $value\n"; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(true|false)/', $line, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + echo "Extracted boolean value: " . ($value ? 'true' : 'false') . "\n"; + } elseif (strpos($line, 'null') !== false) { + // Null value + $value = null; + echo "Extracted null value\n"; + } else { + echo "Could not extract value for key '$key' from line: " . trim($line) . "\n"; + } + } + } else { + // The value might be on the next line + echo "Value for key '$key' might be on the next line, using fallback method\n"; + + // Read the next line + $nextLine = fgets($handle); + if ($nextLine !== false) { + $combinedLine = $line . $nextLine; + + // Try to extract the value using regex + if (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*"([^"]*)"/', $combinedLine, $matches)) { + // String value + $value = $matches[1]; + echo "Extracted string value: $value\n"; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(\d+)/', $combinedLine, $matches)) { + // Numeric value + $value = intval($matches[1]); + echo "Extracted numeric value: $value\n"; + } elseif (preg_match('/"' . preg_quote($key, '/') . '"\s*:\s*(true|false)/', $combinedLine, $matches)) { + // Boolean value + $value = ($matches[1] === 'true'); + echo "Extracted boolean value: " . ($value ? 'true' : 'false') . "\n"; + } elseif (strpos($combinedLine, 'null') !== false) { + // Null value + $value = null; + echo "Extracted null value\n"; + } else { + echo "Could not extract value for key '$key' from combined lines\n"; + } + } + } + + break; + } + } + + // Close the file + fclose($handle); + + if (!$found) { + echo "Key '$key' not found in file: $filePath\n"; + } else if ($value === null) { + echo "Value for key '$key' is null or could not be extracted\n"; + } + + return $value; + + } catch (Exception $e) { + echo "Exception in extractJsonScalarByKey for key '$key': " . $e->getMessage() . "\n"; + return null; + } +} + +// Skip the original approach since we know it fails with memory exhaustion +echo "=== ORIGINAL APPROACH ===\n"; +echo "Skipping original approach - known to fail with memory exhaustion\n\n"; + +// Then try our new streaming approach +echo "=== NEW STREAMING APPROACH ===\n"; +testNewApproach($outdatedPagesFile); + +echo "Done testing!\n"; \ No newline at end of file diff --git a/wiki_compare/wiki_compare.py b/wiki_compare/wiki_compare.py index 789cc63..5495b13 100755 --- a/wiki_compare/wiki_compare.py +++ b/wiki_compare/wiki_compare.py @@ -98,6 +98,7 @@ SPECIFIC_PAGES = [ "Tag:harassment_prevention=ask_angela", "Key:harassment_prevention", "Proposal process", + "Outil de Manipulation et d'Organisation", "Automated_Edits_code_of_conduct", "Key:cuisine", "Libre_Charge_Map",