<?php

// Cap memory at 128 MB so the script runs under the same limit at which the
// original memory-exhaustion error was observed.
ini_set('memory_limit', '128M');

// Load the project's autoloader (presumably Composer's; the classes exercised
// below are defined in this file, so this mainly mirrors the real bootstrap).
require_once __DIR__ . '/vendor/autoload.php';
/**
 * Stand-in for the real controller: exposes the parameter lookup and the two
 * streaming JSON extraction helpers so they can be exercised in isolation.
 */
class MockController
{
    private string $projectDir;

    public function __construct()
    {
        $this->projectDir = __DIR__;
    }

    /**
     * Minimal parameter lookup; only 'kernel.project_dir' is known.
     *
     * @param string $name Parameter name
     * @return string|null The directory of this script for 'kernel.project_dir', otherwise null
     */
    public function getParameter(string $name): ?string
    {
        return $name === 'kernel.project_dir' ? $this->projectDir : null;
    }

    /**
     * Extracts an array from a large JSON file by key without decoding the whole file.
     *
     * The file is read line by line. Once `"key": [` is located, the array is
     * scanned character by character with string/escape awareness and each
     * element is decoded on its own, so at most one element plus the collected
     * results are held in memory. Occurrences of the key whose value is not an
     * array are skipped.
     *
     * Failures are reported with `echo` and yield an empty (or partial) array;
     * nothing is thrown to the caller.
     *
     * @param string $filePath Path to the JSON file
     * @param string $key      The key of the array to extract
     * @param int    $maxItems Maximum number of elements to return (to prevent memory exhaustion)
     * @return array The first $maxItems decoded elements of the array
     */
    public function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array
    {
        if (!is_readable($filePath)) {
            echo "File is not readable: $filePath\n";
            return [];
        }

        $fileSize = filesize($filePath);
        if ($fileSize === false || $fileSize === 0) {
            echo "File is empty or size could not be determined: $filePath\n";
            return [];
        }

        $handle = fopen($filePath, 'r');
        if (!$handle) {
            echo "Could not open file: $filePath\n";
            return [];
        }

        try {
            $keySeen = false;
            foreach ($this->findKeyValues($handle, $key) as [$line, $offset]) {
                $keySeen = true;
                if ($line[$offset] === '[') {
                    return $this->readArrayItems($handle, $line, $offset + 1, $key, $maxItems);
                }
            }

            if ($keySeen) {
                echo "Could not find opening bracket for array '$key' in file: $filePath\n";
            } else {
                echo "Key '$key' not found in file: $filePath\n";
            }

            return [];
        } catch (Exception $e) {
            echo "Exception in extractJsonArrayByKey for key '$key': " . $e->getMessage() . "\n";
            return [];
        } finally {
            // Always release the handle, including on early return or exception.
            fclose($handle);
        }
    }

    /**
     * Extracts a scalar value from a large JSON file by key without decoding the whole file.
     *
     * The first occurrence of `"key":` decides the result. Strings (including
     * escaped quotes and \u escapes), integers, negative numbers, floats,
     * booleans and null are supported.
     *
     * @param string $filePath Path to the JSON file
     * @param string $key      The key of the scalar value to extract
     * @return mixed The extracted scalar value, or null if not found / not a scalar
     */
    public function extractJsonScalarByKey(string $filePath, string $key): mixed
    {
        if (!is_readable($filePath)) {
            echo "File is not readable: $filePath\n";
            return null;
        }

        $handle = fopen($filePath, 'r');
        if (!$handle) {
            echo "Could not open file: $filePath\n";
            return null;
        }

        $found = false;
        $value = null;

        try {
            foreach ($this->findKeyValues($handle, $key) as [$line, $offset]) {
                $found = true;
                $value = $this->parseScalarAt($line, $offset, $key);
                break;
            }
        } catch (Exception $e) {
            echo "Exception in extractJsonScalarByKey for key '$key': " . $e->getMessage() . "\n";
            return null;
        } finally {
            fclose($handle);
        }

        if (!$found) {
            echo "Key '$key' not found in file: $filePath\n";
        } elseif ($value === null) {
            echo "Value for key '$key' is null or could not be extracted\n";
        }

        return $value;
    }

    /**
     * Lazily yields every place in the stream where `"key":` is followed by a value.
     *
     * Each yielded pair is [text, offset] where text[offset] is the first
     * non-whitespace character of the value. When the key or its colon ends a
     * line, following lines are appended until the value's first character is
     * available. After a yield the handle is positioned just past the text
     * that was yielded, so the consumer may keep reading from it.
     *
     * @param resource $handle Open, readable file handle
     * @param string   $key    Key name without surrounding quotes
     * @return Generator<int, array{0: string, 1: int}>
     */
    private function findKeyValues($handle, string $key): Generator
    {
        $pattern = '/"' . preg_quote($key, '/') . '"\s*(:\s*)?/';
        $line = fgets($handle);

        while ($line !== false) {
            $searchFrom = 0;

            while (preg_match($pattern, $line, $match, PREG_OFFSET_CAPTURE, $searchFrom) === 1) {
                $matchStart = $match[0][1];
                $valueStart = $matchStart + strlen($match[0][0]);

                if ($valueStart >= strlen($line)) {
                    // The colon and/or the value continue on the next line: join and retry.
                    $next = fgets($handle);
                    if ($next === false) {
                        return;
                    }
                    $line = substr($line, $matchStart) . $next;
                    $searchFrom = 0;
                    continue;
                }

                // Without a colon the quoted text was a value, not a key.
                if (isset($match[1]) && $match[1][1] !== -1) {
                    yield [$line, $valueStart];
                }

                $searchFrom = $valueStart;
            }

            $line = fgets($handle);
        }
    }

    /**
     * Reads array elements starting just after the opening '[' and decodes them one by one.
     *
     * Brackets, braces and commas inside string literals are ignored, and the
     * nesting depth is tracked per character, so elements may be nested
     * arbitrarily and may share lines with the key or with each other.
     *
     * @param resource $handle   Open file handle positioned after $line
     * @param string   $line     Text containing the opening bracket
     * @param int      $offset   Offset in $line of the first character after '['
     * @param string   $key      Key name, used only in diagnostics
     * @param int      $maxItems Maximum number of elements to decode
     * @return array Decoded elements (partial if the input is malformed or truncated)
     */
    private function readArrayItems($handle, string $line, int $offset, string $key, int $maxItems): array
    {
        $items = [];
        if ($maxItems <= 0) {
            return $items;
        }

        $element = '';
        $depth = 0;
        $inString = false;
        $escaped = false;

        while ($line !== false) {
            $length = strlen($line);

            for ($i = $offset; $i < $length; $i++) {
                $char = $line[$i];

                if ($inString) {
                    $element .= $char;
                    if ($escaped) {
                        $escaped = false;
                    } elseif ($char === '\\') {
                        $escaped = true;
                    } elseif ($char === '"') {
                        $inString = false;
                    }
                    continue;
                }

                // A top-level ',' or ']' terminates the current element.
                if ($depth === 0 && ($char === ',' || $char === ']')) {
                    if (trim($element) !== '') {
                        try {
                            $items[] = json_decode($element, true, 512, JSON_THROW_ON_ERROR);
                        } catch (JsonException $e) {
                            echo "JSON parse error: " . $e->getMessage() . " for key '$key'\n";
                            return $items;
                        }
                    }
                    $element = '';

                    if ($char === ']' || count($items) >= $maxItems) {
                        return $items;
                    }
                    continue;
                }

                if ($char === '"') {
                    $inString = true;
                } elseif ($char === '[' || $char === '{') {
                    $depth++;
                } elseif ($char === ']' || $char === '}') {
                    if ($depth === 0) {
                        // A closing brace with nothing open: the input is malformed.
                        echo "JSON parse error: Syntax error for key '$key'\n";
                        return $items;
                    }
                    $depth--;
                }

                $element .= $char;
            }

            $line = fgets($handle);
            $offset = 0;
        }

        echo "Unexpected end of file while reading array '$key'\n";

        return $items;
    }

    /**
     * Decodes the JSON scalar that starts at $offset in $line.
     *
     * @param string $line   Text containing the value
     * @param int    $offset Offset of the value's first character
     * @param string $key    Key name, used only in diagnostics
     * @return mixed The decoded string/int/float/bool, or null for JSON null or a non-scalar value
     */
    private function parseScalarAt(string $line, int $offset, string $key): mixed
    {
        // One JSON scalar token anchored at $offset: string with escapes, number, true/false/null.
        $scalarPattern = '/\G(?:"(?:[^"\\\\]|\\\\.)*"|-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?|true|false|null)/';

        if (preg_match($scalarPattern, $line, $match, 0, $offset) === 1) {
            try {
                return json_decode($match[0], true, 512, JSON_THROW_ON_ERROR);
            } catch (JsonException $e) {
                // Fall through to the diagnostic below.
            }
        }

        echo "Could not extract value for key '$key' from line: " . trim($line) . "\n";

        return null;
    }
}
// Create a mock controller
$controller = new MockController();

// Test the memory-efficient approach
echo "Testing memory-efficient approach for /wiki/compare/Key:harassment_prevention route\n";
echo "Memory limit: " . ini_get('memory_limit') . "\n\n";

// Input file and the page key this route would look up
$jsonFile = __DIR__ . '/wiki_compare/outdated_pages.json';
$key = 'Key:harassment_prevention';

// Abort early (non-zero exit) if the fixture is missing
if (!file_exists($jsonFile)) {
    echo "Error: File $jsonFile does not exist\n";
    exit(1);
}

echo "File size: " . round(filesize($jsonFile) / (1024 * 1024), 2) . " MB\n\n";

// Measure memory usage before
$memBefore = memory_get_usage();
echo "Memory usage before: " . round($memBefore / (1024 * 1024), 2) . " MB\n";

// Start timer
$startTime = microtime(true);

// Extract data using the memory-efficient approach. 'history' is extracted as
// well so its cost is included in the measurements, even though only the page
// lists are searched below.
$maxItems = 100;
$regularPages = $controller->extractJsonArrayByKey($jsonFile, 'regular_pages', $maxItems);
$specificPages = $controller->extractJsonArrayByKey($jsonFile, 'specific_pages', $maxItems);
$historyEntries = $controller->extractJsonArrayByKey($jsonFile, 'history', $maxItems);

// Combine regular_pages and specific_pages
$allPages = array_merge($regularPages, $specificPages);

// Find the page with the matching key
$targetPage = null;
foreach ($allPages as $page) {
    if (isset($page['key']) && $page['key'] === $key) {
        $targetPage = $page;
        break;
    }
}

// End timer
$endTime = microtime(true);

// Measure memory usage after
$memAfter = memory_get_usage();
echo "Memory usage after: " . round($memAfter / (1024 * 1024), 2) . " MB\n";
echo "Memory used: " . round(($memAfter - $memBefore) / (1024 * 1024), 2) . " MB\n";
// The before/after delta misses buffers freed during extraction; the peak is
// the figure that matters when checking against the memory limit.
echo "Peak memory usage: " . round(memory_get_peak_usage() / (1024 * 1024), 2) . " MB\n";
echo "Time taken: " . round($endTime - $startTime, 2) . " seconds\n\n";

// Check if we found the page
if ($targetPage) {
    echo "Successfully found page with key '$key'\n";
    echo "Page details:\n";
    echo "- Staleness score: " . ($targetPage['staleness_score'] ?? 'N/A') . "\n";
    echo "- Date diff: " . ($targetPage['date_diff'] ?? 'N/A') . "\n";
    echo "- Word diff: " . ($targetPage['word_diff'] ?? 'N/A') . "\n";
} else {
    echo "Page with key '$key' not found\n";
}

echo "\nTest completed successfully without memory exhaustion!\n";