qualiwiki/test_decrepitude.php
2025-09-05 15:58:26 +02:00

464 lines
No EOL
21 KiB
PHP

<?php
// Cap memory at 128 MB so the run matches the error condition being investigated.
ini_set('memory_limit', '128M');
// Path to the large JSON file
$outdatedPagesFile = __DIR__ . '/wiki_compare/outdated_pages.json';
// Histogram image path; not referenced anywhere in the visible part of this script.
$histogramFile = __DIR__ . '/wiki_compare/staleness_histogram.png';
echo "Testing memory usage for decrepitudeScores() method\n";
// filesize() emits a warning and returns false for a missing file, which used
// to be printed as a misleading "0 MB"; report the situation explicitly instead.
$outdatedPagesBytes = is_file($outdatedPagesFile) ? filesize($outdatedPagesFile) : false;
if ($outdatedPagesBytes === false) {
    echo "File size: unavailable (file not found: $outdatedPagesFile)\n";
} else {
    echo "File size: " . round($outdatedPagesBytes / (1024 * 1024), 2) . " MB\n";
}
echo "Memory limit: " . ini_get('memory_limit') . "\n\n";
// Test the original approach (loading entire file)
/**
 * Baseline measurement: read the whole file, decode it in one go, and report
 * how many pages were found together with the time and memory it took.
 *
 * Everything is printed to standard output; nothing is returned.
 *
 * A missing/unreadable file, a failed read, invalid JSON, or a JSON document
 * whose root is not an object/array is reported as "Error: ..." instead of
 * being presented as a successful load with zero pages.
 *
 * @param string $filePath Path of the JSON file to load.
 */
function testOriginalApproach($filePath): void {
    echo "Testing original approach (loading entire file)...\n";
    $memBefore = memory_get_usage();
    echo "Memory usage before: " . round($memBefore / (1024 * 1024), 2) . " MB\n";
    try {
        $startTime = microtime(true);
        $regularPages = [];
        $specificPages = [];
        $lastUpdated = null;

        if (!is_file($filePath) || !is_readable($filePath)) {
            throw new RuntimeException("File not found or not readable: $filePath");
        }
        $rawJson = file_get_contents($filePath);
        if ($rawJson === false) {
            throw new RuntimeException("Could not read file: $filePath");
        }
        // JSON_THROW_ON_ERROR turns malformed input into a JsonException, which
        // the catch below reports; without it json_decode() just returns null.
        $outdatedPagesData = json_decode($rawJson, true, 512, JSON_THROW_ON_ERROR);
        // Release the raw text so the "after" reading only reflects the decoded
        // data, as it did when the text was a temporary expression.
        unset($rawJson);
        if (!is_array($outdatedPagesData)) {
            throw new RuntimeException("Unexpected JSON structure in file: $filePath");
        }

        if (isset($outdatedPagesData['regular_pages']) && is_array($outdatedPagesData['regular_pages'])) {
            $regularPages = $outdatedPagesData['regular_pages'];
        }
        if (isset($outdatedPagesData['specific_pages']) && is_array($outdatedPagesData['specific_pages'])) {
            $specificPages = $outdatedPagesData['specific_pages'];
        }
        if (isset($outdatedPagesData['last_updated'])) {
            $lastUpdated = $outdatedPagesData['last_updated'];
        }

        $endTime = microtime(true);
        echo "Successfully loaded data:\n";
        echo "- Regular pages: " . count($regularPages) . "\n";
        echo "- Specific pages: " . count($specificPages) . "\n";
        echo "- Last updated: " . ($lastUpdated ?? 'null') . "\n";
        $memAfter = memory_get_usage();
        echo "Memory usage after: " . round($memAfter / (1024 * 1024), 2) . " MB\n";
        echo "Memory used: " . round(($memAfter - $memBefore) / (1024 * 1024), 2) . " MB\n";
        echo "Time taken: " . round($endTime - $startTime, 2) . " seconds\n";
    } catch (Exception $e) {
        echo "Error: " . $e->getMessage() . "\n";
    }
    echo "\n";
}
// Test the new approach (streaming)
/**
 * Measures the key-based extraction path: pulls at most 100 entries of
 * "regular_pages" and of "specific_pages" plus the "last_updated" value
 * through the extract* helpers, then prints counts, memory figures and the
 * elapsed time to standard output.
 *
 * @param string $filePath Path of the JSON file handed to the extract* helpers.
 */
function testNewApproach($filePath) {
    $mebibyte = 1024 * 1024;

    echo "Testing new approach (streaming)...\n";
    $bytesAtStart = memory_get_usage();
    echo "Memory usage before: " . round($bytesAtStart / $mebibyte, 2) . " MB\n";

    try {
        $clockStart = microtime(true);
        $regular = extractJsonArrayByKey($filePath, 'regular_pages', 100);
        $specific = extractJsonArrayByKey($filePath, 'specific_pages', 100);
        $timestamp = extractJsonScalarByKey($filePath, 'last_updated');
        $clockStop = microtime(true);

        echo "Successfully loaded data:\n";
        echo "- Regular pages: " . count($regular) . "\n";
        echo "- Specific pages: " . count($specific) . "\n";
        echo "- Last updated: " . ($timestamp ?? 'null') . "\n";

        // Sampled after the summary lines, while the extracted data is still held.
        $bytesAtEnd = memory_get_usage();
        $bytesGrown = $bytesAtEnd - $bytesAtStart;
        $elapsed = $clockStop - $clockStart;
        echo "Memory usage after: " . round($bytesAtEnd / $mebibyte, 2) . " MB\n";
        echo "Memory used: " . round($bytesGrown / $mebibyte, 2) . " MB\n";
        echo "Time taken: " . round($elapsed, 2) . " seconds\n";
    } catch (Exception $failure) {
        echo "Error: " . $failure->getMessage() . "\n";
    }

    echo "\n";
}
// Implementation of extractJsonArrayByKey
/**
 * Extract up to $maxItems elements of the JSON array stored under $key,
 * without decoding the whole document.
 *
 * The file is read in fixed-size chunks. Once `"<key>" : [` is located, the
 * array is scanned character-wise with JSON string/escape awareness and each
 * top-level element is decoded on its own, so:
 *  - compact (single-line) and pretty-printed files both work,
 *  - brackets or braces inside string values do not confuse the scan,
 *  - nested objects/arrays inside an element are never counted as elements,
 *  - reading stops as soon as $maxItems elements have been decoded.
 *
 * The first occurrence of the key whose value is an array is used, at any
 * nesting depth. Progress and problems are echoed to standard output.
 *
 * @param string $filePath Path of the JSON file.
 * @param string $key      Object key whose array value is wanted (matched literally between double quotes).
 * @param int    $maxItems Maximum number of elements to return; values <= 0 yield an empty array.
 *
 * @return array Decoded elements (associative arrays for objects). Empty when
 *               the file is unreadable/empty or the key is not found with an
 *               array value. If the array is malformed or the file ends
 *               early, the elements decoded up to that point are returned.
 */
function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array
{
    $result = [];
    // First, check if the file exists and is readable
    if (!is_readable($filePath)) {
        echo "File is not readable: $filePath\n";
        return $result;
    }
    $fileSize = filesize($filePath);
    if ($fileSize === false || $fileSize === 0) {
        echo "File is empty or size could not be determined: $filePath\n";
        return $result;
    }
    if ($maxItems <= 0) {
        return $result;
    }
    $handle = fopen($filePath, 'r');
    if (!$handle) {
        echo "Could not open file: $filePath\n";
        return $result;
    }
    try {
        $pending = seekToJsonArrayByKey($handle, $key);
        if ($pending === null) {
            echo "$key key not found in the file\n";
            return $result;
        }
        echo "Processing $key array...\n";
        $result = readJsonArrayItems($handle, $pending, $maxItems, $key);
        echo "Parsed " . count($result) . " items from the $key array\n";
    } finally {
        // Runs on every exit path above, so the handle is never leaked.
        fclose($handle);
    }
    return $result;
}

/**
 * Advance $handle just past the `[` that opens the array stored under $key.
 *
 * @param resource $handle Open, readable file handle positioned before the key.
 * @param string   $key    Key to look for.
 *
 * @return string|null The bytes already read that follow the opening bracket
 *                     (possibly empty), or null when no `"<key>" : [` exists.
 */
function seekToJsonArrayByKey($handle, string $key): ?string
{
    // Requiring the colon and bracket skips string *values* equal to the key;
    // the look-behind skips a quote that is itself escaped.
    $pattern = '/(?<!\\\\)"' . preg_quote($key, '/') . '"\s*:\s*\[/';
    // Tail kept between chunks so a match straddling a chunk boundary is still
    // seen; this tolerates up to roughly 256 bytes of whitespace around the colon.
    $overlap = strlen($key) + 256;
    $window = '';
    while (!feof($handle)) {
        $chunk = fread($handle, 65536);
        if ($chunk === false || $chunk === '') {
            break;
        }
        $window .= $chunk;
        if (preg_match($pattern, $window, $match, PREG_OFFSET_CAPTURE) === 1) {
            return substr($window, $match[0][1] + strlen($match[0][0]));
        }
        $window = substr($window, -$overlap);
    }
    return null;
}

/**
 * Decode the elements of a JSON array whose opening bracket was just consumed.
 *
 * @param resource $handle   Open file handle positioned right after $pending.
 * @param string   $pending  Bytes already read that follow the opening bracket.
 * @param int      $maxItems Stop after this many elements (must be > 0).
 * @param string   $key      Key name, used only in diagnostic output.
 *
 * @return array Decoded elements, at most $maxItems of them.
 */
function readJsonArrayItems($handle, string $pending, int $maxItems, string $key): array
{
    $items = [];
    $current = '';      // raw JSON text of the element being collected
    $depth = 0;         // nesting depth inside the current element
    $inString = false;  // true while inside a JSON string literal
    $escaped = false;   // true when the previous character was a backslash inside a string
    $pos = 0;
    $length = strlen($pending);

    while (true) {
        if ($pos >= $length) {
            $chunk = feof($handle) ? false : fread($handle, 65536);
            if ($chunk === false || $chunk === '') {
                echo "Unexpected end of file while reading $key array\n";
                return $items;
            }
            $pending = $chunk;
            $length = strlen($chunk);
            $pos = 0;
        }

        if ($escaped) {
            // The character after a backslash is always literal string content.
            $current .= $pending[$pos++];
            $escaped = false;
            continue;
        }

        // Jump over the run of characters that cannot change the scanner state.
        $run = strcspn($pending, $inString ? '"\\' : '"{}[],', $pos);
        if ($run > 0) {
            $current .= substr($pending, $pos, $run);
            $pos += $run;
            if ($pos >= $length) {
                continue;
            }
        }

        $ch = $pending[$pos++];

        if ($inString) {
            $current .= $ch;
            if ($ch === '\\') {
                $escaped = true;
            } else {
                $inString = false; // unescaped closing quote
            }
            continue;
        }

        // At depth 0 a comma separates elements and a closing bracket ends the array.
        if ($depth === 0 && ($ch === ',' || $ch === ']' || $ch === '}')) {
            $raw = trim($current);
            $current = '';
            if ($raw !== '') {
                $decoded = json_decode($raw, true);
                if (json_last_error() !== JSON_ERROR_NONE) {
                    echo "JSON parse error: " . json_last_error_msg() . " for key '$key'\n";
                    return $items;
                }
                $items[] = $decoded;
            }
            if ($ch !== ',') {
                echo "Reached end of $key array\n";
                return $items;
            }
            if (count($items) >= $maxItems) {
                echo "Reached maximum number of items ($maxItems) for $key\n";
                return $items;
            }
            continue;
        }

        $current .= $ch;
        if ($ch === '"') {
            $inString = true;
        } elseif ($ch === '{' || $ch === '[') {
            $depth++;
        } elseif ($ch === '}' || $ch === ']') {
            $depth--;
        }
        // A comma at depth > 0 belongs to the element and was appended above.
    }
}
// Implementation of extractJsonScalarByKey
/**
 * Read the scalar value stored under $key without decoding the whole file.
 *
 * The file is scanned line by line for `"<key>" : <scalar>`. The scalar token
 * (string, number, true, false or null) is decoded with json_decode(), so
 * negative numbers, decimals, exponents and escaped characters in strings are
 * returned correctly. When the colon or the value sits on the following
 * line(s), up to two more lines are pulled in. Lines that merely mention the
 * key text (for example as a string value) or where the key holds an
 * array/object are skipped and the search continues.
 *
 * Progress and problems are echoed to standard output.
 *
 * @param string $filePath Path of the JSON file.
 * @param string $key      Object key to look up (matched literally between double quotes).
 *
 * @return mixed The decoded string|int|float|bool, or null when the value is
 *               JSON null, the key is absent, or no scalar could be extracted.
 */
function extractJsonScalarByKey(string $filePath, string $key): mixed
{
    // First, check if the file exists and is readable
    if (!is_readable($filePath)) {
        echo "File is not readable: $filePath\n";
        return null;
    }
    $handle = fopen($filePath, 'r');
    if (!$handle) {
        echo "Could not open file: $filePath\n";
        return null;
    }

    $searchKey = '"' . $key . '"';
    $quotedKey = preg_quote($searchKey, '/');
    // One JSON scalar token: string (with escapes), number, or a literal.
    $scalarToken = '"(?:[^"\\\\]++|\\\\.)*+"|-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?|true|false|null';
    $valuePattern = '/' . $quotedKey . '\s*:\s*(' . $scalarToken . ')/';
    // Key (optionally followed by its colon) is the last thing read so far.
    $danglingPattern = '/' . $quotedKey . '\s*(?::\s*)?\z/';

    $found = false;    // a scalar value was extracted
    $keySeen = false;  // the key text appeared at least once
    $value = null;
    $lineCount = 0;

    try {
        while (($line = fgets($handle)) !== false) {
            $lineCount++;
            if ($lineCount % 1000 === 0) {
                echo "Processed $lineCount lines searching for $key...\r";
            }
            // Cheap pre-filter before running the regular expressions.
            if (!str_contains($line, $searchKey)) {
                continue;
            }
            $keySeen = true;
            $keyLine = $lineCount;
            $candidate = $line;

            for ($extraLines = 0; ; $extraLines++) {
                if (preg_match($valuePattern, $candidate, $match) === 1) {
                    $decoded = json_decode($match[1], true);
                    if (json_last_error() === JSON_ERROR_NONE) {
                        $found = true;
                        $value = $decoded;
                        echo "\nFound $key key at line $keyLine\n";
                        echo match (true) {
                            is_string($value) => "Extracted string value: $value\n",
                            is_bool($value) => "Extracted boolean value: " . ($value ? 'true' : 'false') . "\n",
                            $value === null => "Extracted null value\n",
                            default => "Extracted numeric value: $value\n",
                        };
                        break 2;
                    }
                    // Looked like a scalar but is not valid JSON (e.g. a bad escape).
                    echo "Could not extract value for key '$key' from line: " . trim($line) . "\n";
                    break;
                }
                // Only read ahead when the key is left hanging at the end of the text.
                if ($extraLines >= 2 || preg_match($danglingPattern, $candidate) !== 1) {
                    break;
                }
                $nextLine = fgets($handle);
                if ($nextLine === false) {
                    break;
                }
                $lineCount++;
                $candidate .= $nextLine;
            }
        }
    } finally {
        // Close the file on every exit path.
        fclose($handle);
    }

    if (!$keySeen) {
        echo "Key '$key' not found in file: $filePath\n";
    } elseif ($value === null) {
        echo "Value for key '$key' is null or could not be extracted\n";
    }
    return $value;
}
// The whole-file variant is intentionally not executed here: it is known to
// fail with memory exhaustion, so only a notice is printed for it.
echo "=== ORIGINAL APPROACH ===\n", "Skipping original approach - known to fail with memory exhaustion\n\n";
// Run the streaming variant against the JSON file configured at the top of the script.
echo "=== NEW STREAMING APPROACH ===\n";
testNewApproach($outdatedPagesFile);
echo "Done testing!\n";