qualiwiki/src/Controller/WikiController.php
2025-09-08 10:20:51 +02:00

2370 lines
No EOL
104 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace App\Controller;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Annotation\Route;
class WikiController extends AbstractController
{
/**
 * Renders the decrepitude-score page from wiki_compare/outdated_pages.json.
 *
 * Up to 100 entries are read from each of the "regular_pages" and
 * "specific_pages" lists, plus the "last_updated" value, using the
 * extractJsonArrayByKey()/extractJsonScalarByKey() helpers (used here as the
 * memory-efficient path for this large file). The template is also told
 * whether the JSON file and the staleness histogram image exist on disk.
 */
#[Route('/wiki/decrepitude', name: 'app_admin_wiki_decrepitude')]
public function decrepitudeScores(): Response
{
    $compareDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $jsonPath = $compareDir . '/outdated_pages.json';
    $histogramPath = $compareDir . '/staleness_histogram.png';

    $histogramPresent = file_exists($histogramPath);

    // Defaults used when the JSON file has not been generated yet.
    $regular = [];
    $specific = [];
    $updatedAt = null;

    if (file_exists($jsonPath)) {
        // Cap on the number of pages pulled from each list to prevent memory exhaustion.
        $pageCap = 100;

        $regular = $this->extractJsonArrayByKey($jsonPath, 'regular_pages', $pageCap);
        $specific = $this->extractJsonArrayByKey($jsonPath, 'specific_pages', $pageCap);
        $updatedAt = $this->extractJsonScalarByKey($jsonPath, 'last_updated');
    }

    $viewData = [
        'regular_pages' => $regular,
        'specific_pages' => $specific,
        'last_updated' => $updatedAt,
        'histogram_exists' => $histogramPresent,
        'json_exists' => file_exists($jsonPath),
    ];

    return $this->render('admin/wiki_decrepitude.html.twig', $viewData);
}
/**
 * Displays the evolution of page rankings over time.
 *
 * Data comes from wiki_compare/page_rankings.json, which is expected to hold
 * "timestamps", "pages" and "global_metrics" entries. The last timestamp is
 * used as the "last updated" marker. If the file is missing, unreadable or
 * not valid JSON, the page is rendered with empty data and the problem is
 * written to the error log.
 */
#[Route('/wiki/rankings', name: 'app_admin_wiki_rankings')]
public function pageRankings(): Response
{
    $rankingsFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/page_rankings.json';
    $timestamps = [];
    $pages = [];
    $globalMetrics = [];
    $lastUpdated = null;

    if (file_exists($rankingsFile)) {
        try {
            $rawJson = file_get_contents($rankingsFile);
            if ($rawJson === false) {
                throw new \RuntimeException('unable to read ' . $rankingsFile);
            }

            // JSON_THROW_ON_ERROR makes malformed JSON reach the catch block
            // instead of silently yielding null.
            $rankingsData = json_decode($rawJson, true, 512, JSON_THROW_ON_ERROR);

            // Anything that is not an array is replaced by an empty list so the
            // template (and end() below) always receive arrays.
            $arrayOrEmpty = static fn ($value): array => is_array($value) ? $value : [];

            if (is_array($rankingsData)) {
                $timestamps = $arrayOrEmpty($rankingsData['timestamps'] ?? []);
                $pages = $arrayOrEmpty($rankingsData['pages'] ?? []);
                $globalMetrics = $arrayOrEmpty($rankingsData['global_metrics'] ?? []);

                // The most recent timestamp doubles as the last-updated marker.
                if ($timestamps !== []) {
                    $lastUpdated = end($timestamps);
                }
            }
        } catch (\Exception $e) {
            error_log("Error loading rankings data: " . $e->getMessage());
        }
    }

    return $this->render('admin/wiki_rankings.html.twig', [
        'timestamps' => $timestamps,
        'pages' => $pages,
        'global_metrics' => $globalMetrics,
        'last_updated' => $lastUpdated,
        'json_exists' => file_exists($rankingsFile),
    ]);
}
/**
 * Finds headings that skip one or more levels relative to the previous heading
 * (for example an h4 placed directly after an h2).
 *
 * Entries whose 'level' is missing or casts to 0 are ignored entirely. The
 * first usable heading only sets the baseline and is never reported.
 *
 * @param array $sections List of sections with 'level' and 'title' keys
 * @return array Keys of $sections at which the level jumps by more than one
 */
private function detectHeadingHierarchyErrors(array $sections): array
{
    $flagged = [];
    $previous = 0; // 0 means "no heading seen yet"

    foreach ($sections as $position => $entry) {
        $level = (int)($entry['level'] ?? 0);
        if ($level === 0) {
            continue;
        }

        // Only compare once a baseline exists; a jump of more than one level is an error.
        if ($previous !== 0 && $level > $previous + 1) {
            $flagged[] = $position;
        }

        $previous = $level;
    }

    return $flagged;
}
/**
 * Builds a two-column (English / French) list of sections that exist in only
 * one language, pairing each real section with an empty placeholder in the
 * other column.
 *
 * English-only sections come first, followed by French-only sections. Sections
 * listed under 'common' are not emitted: the code that handled them is
 * currently disabled.
 *
 * @param array $sectionComparison Section comparison data with 'common', 'en_only', and 'fr_only' keys
 * @return array Rows of ['en' => ..., 'fr' => ...]; placeholders carry 'is_placeholder' => true
 */
private function buildAlignedSectionList(array $sectionComparison): array
{
    $rows = [];

    // Bucket name => language in which the section really exists.
    $buckets = ['en_only' => 'en', 'fr_only' => 'fr'];

    foreach ($buckets as $bucket => $presentLang) {
        if (!isset($sectionComparison[$bucket]) || !is_array($sectionComparison[$bucket])) {
            continue;
        }

        foreach ($sectionComparison[$bucket] as $section) {
            $real = [
                'title' => $section['title'],
                'level' => $section['level'],
            ];
            // The placeholder mirrors the level of the real section so both columns stay aligned.
            $blank = [
                'title' => '',
                'level' => $section['level'],
                'is_placeholder' => true,
            ];

            $rows[] = $presentLang === 'en'
                ? ['en' => $real, 'fr' => $blank]
                : ['en' => $blank, 'fr' => $real];
        }
    }

    return $rows;
}
/**
 * Site root: forwards the visitor to the wiki dashboard route.
 */
#[Route('/', name: 'app_public_index')]
public function accueilAction(): Response
{
    $dashboardRoute = 'app_admin_wiki';

    return $this->redirectToRoute($dashboardRoute);
}
/**
 * Shows the recent wiki changes together with per-contributor statistics.
 *
 * Data is read from wiki_compare/recent_changes.json. When that file does not
 * exist, the fetch script is run once and the freshly generated file is then
 * loaded in the same request. Automatic refresh of stale data is intentionally
 * not performed here (it was disabled in the original implementation).
 */
#[Route('/wiki/recent-changes', name: 'app_admin_wiki_recent_changes')]
public function recentChanges(): Response
{
    $recentChangesFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/recent_changes.json';

    $recentChanges = [];
    $lastUpdated = null;
    $teamMembers = [];

    $fileAvailable = file_exists($recentChangesFile);
    if (!$fileAvailable) {
        // If the file doesn't exist, try to create it by running the script.
        $this->refreshRecentChangesData();
        $fileAvailable = file_exists($recentChangesFile);
        if (!$fileAvailable) {
            $this->addFlash('error', 'Impossible de générer le fichier des changements récents.');
        }
    }

    if ($fileAvailable) {
        $rawJson = file_get_contents($recentChangesFile);
        $recentChangesData = $rawJson === false ? null : json_decode($rawJson, true);

        if (is_array($recentChangesData)
            && isset($recentChangesData['recent_changes'])
            && is_array($recentChangesData['recent_changes'])
        ) {
            $recentChanges = $recentChangesData['recent_changes'];
            $lastUpdated = $recentChangesData['last_updated'] ?? null;
            $teamMembers = $this->processTeamMembersStats($recentChanges);
        }
    }

    return $this->render('admin/wiki_recent_changes.html.twig', [
        'recent_changes' => $recentChanges,
        'last_updated' => $lastUpdated,
        'team_members' => $teamMembers,
    ]);
}
/**
 * Aggregates recent changes into per-user contribution statistics.
 *
 * Each change is expected to carry a 'user' and a 'change_size'. The size may
 * be an integer, a numeric string ("+123", "-45") or a string using the
 * typographic minus sign U+2212 ("−45"). Entries without a usable 'user' are
 * skipped; a missing or unparseable size counts as a contribution with no
 * character delta.
 *
 * @param array $recentChanges Recent changes data
 * @return array List of per-user statistics ('username', 'contributions',
 *               'chars_added', 'chars_changed', 'chars_deleted', 'user_url'),
 *               sorted by contribution count, highest first
 */
private function processTeamMembersStats(array $recentChanges): array
{
    $teamMembers = [];

    foreach ($recentChanges as $change) {
        if (!is_array($change) || !isset($change['user']) || !is_scalar($change['user'])) {
            continue;
        }
        $user = (string)$change['user'];
        $changeSize = $change['change_size'] ?? 0;

        $teamMembers[$user] ??= [
            'username' => $user,
            'contributions' => 0,
            'chars_added' => 0,
            'chars_changed' => 0,
            'chars_deleted' => 0,
            'user_url' => "https://wiki.openstreetmap.org/wiki/User:" . urlencode($user),
        ];
        $teamMembers[$user]['contributions']++;

        // Normalise the size to a signed integer delta.
        $delta = 0;
        if (is_string($changeSize)) {
            $changeSize = trim($changeSize);
        }
        if (is_numeric($changeSize)) {
            // Covers ints as well as "123", "+123" and "-123" (ASCII signs).
            $delta = (int)$changeSize;
        } elseif (is_string($changeSize)
            && preg_match('/^\x{2212}\s*(\d+)$/u', $changeSize, $matches) === 1
        ) {
            // U+2212 MINUS SIGN is not recognised by is_numeric(), so it is handled explicitly.
            $delta = -(int)$matches[1];
        }

        if ($delta > 0) {
            $teamMembers[$user]['chars_added'] += $delta;
        } elseif ($delta < 0) {
            $teamMembers[$user]['chars_deleted'] += -$delta;
        }
        // A delta of 0 leaves all character counters untouched.
    }

    // Convert to an indexed list and sort by contribution count (descending).
    $teamMembers = array_values($teamMembers);
    usort($teamMembers, static fn (array $a, array $b): int => $b['contributions'] <=> $a['contributions']);

    return $teamMembers;
}
/**
 * Refreshes the recent changes data by running fetch_recent_changes.py with --force.
 *
 * Nothing is thrown to the caller: a missing script produces an "error" flash
 * message, and a non-zero exit code produces a "warning" flash message that
 * includes the script's combined stdout/stderr.
 */
private function refreshRecentChangesData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_recent_changes.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_recent_changes.py n\'existe pas.');
            return;
        }

        $output = [];
        $returnCode = 0;
        // The path is quoted so that a project directory containing spaces or
        // shell metacharacters cannot break or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les changements récents. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Lists wiki pages that have no French translation.
 *
 * Data is read from wiki_compare/untranslated_french_pages.json. The file is
 * regenerated by the refresh script when it is missing, or when its
 * "last_updated" value is at least one hour old (or cannot be parsed). After
 * a refresh the file is re-read within the same request instead of
 * redirecting, so a failing script cannot cause an endless redirect loop.
 * Pages are de-duplicated by title and sorted case-insensitively by title.
 */
#[Route('/wiki/missing-translations', name: 'app_admin_wiki_missing_translations')]
public function missingTranslations(): Response
{
    $untranslatedFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/untranslated_french_pages.json';

    // Reads the JSON file and returns [pages, lastUpdated]; empty defaults on any problem.
    $load = static function () use ($untranslatedFile): array {
        $rawJson = file_exists($untranslatedFile) ? file_get_contents($untranslatedFile) : false;
        $data = $rawJson === false ? null : json_decode($rawJson, true);
        if (!is_array($data)
            || !isset($data['untranslated_pages'])
            || !is_array($data['untranslated_pages'])
        ) {
            return [[], null];
        }

        return [$data['untranslated_pages'], $data['last_updated'] ?? null];
    };

    // True when the timestamp is at least one hour old or cannot be parsed.
    $isStale = static function ($stamp): bool {
        $parsed = is_string($stamp) ? strtotime($stamp) : false;

        return $parsed === false || (time() - $parsed) >= 3600;
    };

    $untranslatedPages = [];
    $lastUpdated = null;

    if (file_exists($untranslatedFile)) {
        [$untranslatedPages, $lastUpdated] = $load();

        if ($lastUpdated && $isStale($lastUpdated)) {
            $this->refreshUntranslatedPagesData();
            [$untranslatedPages, $lastUpdated] = $load();
        }
    } else {
        // If the file doesn't exist, try to create it by running the script.
        $this->refreshUntranslatedPagesData();

        if (file_exists($untranslatedFile)) {
            [$untranslatedPages, $lastUpdated] = $load();
        } else {
            $this->addFlash('error', 'Impossible de générer le fichier des pages sans traduction.');
        }
    }

    // Remove duplicates based on page title; entries without a usable title are dropped.
    $uniquePages = [];
    foreach ($untranslatedPages as $page) {
        if (!is_array($page) || !isset($page['title']) || !is_scalar($page['title'])) {
            continue;
        }
        $title = (string)$page['title'];
        if (!isset($uniquePages[$title])) {
            $uniquePages[$title] = $page;
        }
    }
    $uniquePages = array_values($uniquePages);

    // Sort pages by title, ignoring case.
    usort(
        $uniquePages,
        static fn (array $a, array $b): int => strcasecmp((string)$a['title'], (string)$b['title'])
    );

    return $this->render('admin/wiki_missing_translations.html.twig', [
        'untranslated_pages' => $uniquePages,
        'last_updated' => $lastUpdated,
    ]);
}
/**
 * Refreshes the untranslated pages data by running find_untranslated_french_pages.py with --force.
 *
 * Nothing is thrown to the caller: a missing script produces an "error" flash
 * message, and a non-zero exit code produces a "warning" flash message that
 * includes the script's combined stdout/stderr.
 */
private function refreshUntranslatedPagesData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/find_untranslated_french_pages.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script find_untranslated_french_pages.py n\'existe pas.');
            return;
        }

        $output = [];
        $returnCode = 0;
        // The path is quoted so that a project directory containing spaces or
        // shell metacharacters cannot break or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les pages sans traduction. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Lists wiki pages that are not available in French, grouped by language prefix.
 *
 * Data is read from wiki_compare/pages_unavailable_in_french.json. The file is
 * regenerated by the refresh script when it is missing, or when its
 * "last_updated" value is at least one hour old (or cannot be parsed). After
 * a refresh the file is re-read within the same request instead of
 * redirecting, so a failing script cannot cause an endless redirect loop.
 * The "En" group is shown first; the remaining groups are sorted by key.
 */
#[Route('/wiki/pages-unavailable-in-french', name: 'app_admin_wiki_pages_unavailable_in_french')]
public function pagesUnavailableInFrench(): Response
{
    $unavailablePagesFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/pages_unavailable_in_french.json';

    // Reads the JSON file and returns [groupedPages, allPages, lastUpdated]; empty defaults on any problem.
    $load = static function () use ($unavailablePagesFile): array {
        $rawJson = file_exists($unavailablePagesFile) ? file_get_contents($unavailablePagesFile) : false;
        $data = $rawJson === false ? null : json_decode($rawJson, true);
        if (!is_array($data)) {
            return [[], [], null];
        }

        $listOrEmpty = static fn (string $key): array
            => isset($data[$key]) && is_array($data[$key]) ? $data[$key] : [];

        return [$listOrEmpty('grouped_pages'), $listOrEmpty('all_pages'), $data['last_updated'] ?? null];
    };

    // True when the timestamp is at least one hour old or cannot be parsed.
    $isStale = static function ($stamp): bool {
        $parsed = is_string($stamp) ? strtotime($stamp) : false;

        return $parsed === false || (time() - $parsed) >= 3600;
    };

    $groupedPages = [];
    $allPages = [];
    $lastUpdated = null;

    if (file_exists($unavailablePagesFile)) {
        [$groupedPages, $allPages, $lastUpdated] = $load();

        if ($lastUpdated && $isStale($lastUpdated)) {
            $this->refreshPagesUnavailableInFrenchData();
            [$groupedPages, $allPages, $lastUpdated] = $load();
        }
    } else {
        // If the file doesn't exist, try to create it by running the script.
        $this->refreshPagesUnavailableInFrenchData();

        if (file_exists($unavailablePagesFile)) {
            [$groupedPages, $allPages, $lastUpdated] = $load();
        } else {
            $this->addFlash('error', 'Impossible de générer le fichier des pages non disponibles en français.');
        }
    }

    // Pull the English group out, sort the rest by key, then put English back in front.
    $englishPages = $groupedPages['En'] ?? [];
    unset($groupedPages['En']);
    ksort($groupedPages);
    if (!empty($englishPages)) {
        $groupedPages = ['En' => $englishPages] + $groupedPages;
    }

    return $this->render('admin/wiki_pages_unavailable_in_french.html.twig', [
        'grouped_pages' => $groupedPages,
        'all_pages' => $allPages,
        'last_updated' => $lastUpdated,
    ]);
}
/**
 * Refreshes the "pages unavailable in French" data by running
 * find_pages_unavailable_in_french.py with --force.
 *
 * Nothing is thrown to the caller: a missing script produces an "error" flash
 * message, and a non-zero exit code produces a "warning" flash message that
 * includes the script's combined stdout/stderr.
 */
private function refreshPagesUnavailableInFrenchData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/find_pages_unavailable_in_french.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script find_pages_unavailable_in_french.py n\'existe pas.');
            return;
        }

        $output = [];
        $returnCode = 0;
        // The path is quoted so that a project directory containing spaces or
        // shell metacharacters cannot break or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les pages non disponibles en français. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Shows the OSM-FR working groups (grouped by category) and local groups.
 *
 * Data is read from wiki_compare/osm_fr_groups.json. When that file does not
 * exist, the fetch script is run once and the freshly generated file is then
 * loaded in the same request. Automatic refresh of stale data is intentionally
 * not performed here (it was disabled in the original implementation).
 */
#[Route('/wiki/osm-fr-groups', name: 'app_admin_wiki_osm_fr_groups')]
public function osmFrGroups(): Response
{
    $groupsFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/osm_fr_groups.json';
    $defaultUmapUrl = 'https://umap.openstreetmap.fr/fr/map/groupes-locaux-openstreetmap_152488';

    $workingGroups = [];
    $localGroups = [];
    $umapUrl = $defaultUmapUrl;
    $lastUpdated = null;

    $fileAvailable = file_exists($groupsFile);
    if (!$fileAvailable) {
        // If the file doesn't exist, try to create it by running the script.
        $this->refreshOsmFrGroupsData();
        $fileAvailable = file_exists($groupsFile);
        if (!$fileAvailable) {
            $this->addFlash('error', 'Impossible de générer le fichier des groupes OSM-FR.');
        }
    }

    if ($fileAvailable) {
        $rawJson = file_get_contents($groupsFile);
        $groupsData = $rawJson === false ? null : json_decode($rawJson, true);

        if (is_array($groupsData)) {
            if (isset($groupsData['working_groups']) && is_array($groupsData['working_groups'])) {
                $workingGroups = $groupsData['working_groups'];
            }
            if (isset($groupsData['local_groups']) && is_array($groupsData['local_groups'])) {
                $localGroups = $groupsData['local_groups'];
            }
            $umapUrl = $groupsData['umap_url'] ?? $defaultUmapUrl;
            $lastUpdated = $groupsData['last_updated'] ?? null;
        }
    }

    // Group working groups by category; groups without a usable category go under "Autres".
    $groupedWorkingGroups = [];
    foreach ($workingGroups as $group) {
        $category = is_array($group) && isset($group['category']) && is_scalar($group['category'])
            ? (string)$group['category']
            : 'Autres';
        $groupedWorkingGroups[$category][] = $group;
    }

    // Sort categories alphabetically.
    ksort($groupedWorkingGroups);

    return $this->render('admin/wiki_osm_fr_groups.html.twig', [
        'working_groups' => $groupedWorkingGroups,
        'local_groups' => $localGroups,
        'umap_url' => $umapUrl,
        'last_updated' => $lastUpdated,
    ]);
}
/**
 * Refreshes the OSM-FR groups data by running fetch_osm_fr_groups.py with --force.
 *
 * Nothing is thrown to the caller: a missing script produces an "error" flash
 * message, and a non-zero exit code produces a "warning" flash message that
 * includes the script's combined stdout/stderr.
 */
private function refreshOsmFrGroupsData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_osm_fr_groups.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_osm_fr_groups.py n\'existe pas.');
            return;
        }

        $output = [];
        $returnCode = 0;
        // The path is quoted so that a project directory containing spaces or
        // shell metacharacters cannot break or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les groupes OSM-FR. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Shows suspicious deletions from two sources.
 *
 * 1. "recent_deletions": the 'deletions' list of wiki_compare/suspicious_deletions.json.
 *    If that file is missing, detect_suspicious_deletions.py is run once to create it.
 * 2. "suspicious_pages": pages from wiki_compare/outdated_pages.json whose French
 *    version has more than 30% fewer words than the English one, sorted by that
 *    percentage (highest first). Pages are taken from the 'regular_pages' and
 *    'specific_pages' lists when present; otherwise the top-level value is
 *    treated as a flat list of pages.
 */
#[Route('/wiki/suspicious-deletions', name: 'app_admin_wiki_suspicious_deletions')]
public function suspiciousDeletions(): Response
{
    $compareDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $suspiciousDeletesFile = $compareDir . '/suspicious_deletions.json';
    $wordDiffFile = $compareDir . '/outdated_pages.json';

    $suspiciousPages = [];
    $wordDiffPages = [];
    $lastUpdated = null;

    // Decodes a JSON file into an array, or null when unreadable / invalid / not an array.
    $readJson = static function (string $path): ?array {
        $rawJson = file_get_contents($path);
        $decoded = $rawJson === false ? null : json_decode($rawJson, true);

        return is_array($decoded) ? $decoded : null;
    };

    $deletionsAvailable = file_exists($suspiciousDeletesFile);
    if (!$deletionsAvailable) {
        // If the file doesn't exist, try to create it by running the script.
        try {
            $scriptPath = $compareDir . '/detect_suspicious_deletions.py';
            if (file_exists($scriptPath)) {
                $output = [];
                $returnCode = 0;
                // The path is quoted so spaces or shell metacharacters cannot alter the command.
                exec('python3 ' . escapeshellarg($scriptPath) . ' 2>&1', $output, $returnCode);

                $deletionsAvailable = $returnCode === 0 && file_exists($suspiciousDeletesFile);
                if (!$deletionsAvailable) {
                    $this->addFlash('warning', 'Impossible de générer le fichier de suppressions suspectes. Erreur: ' . implode("\n", $output));
                }
            }
        } catch (\Exception $e) {
            $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
        }
    }

    if ($deletionsAvailable) {
        $suspiciousData = $readJson($suspiciousDeletesFile);
        if (isset($suspiciousData['deletions']) && is_array($suspiciousData['deletions'])) {
            $suspiciousPages = $suspiciousData['deletions'];
            $lastUpdated = $suspiciousData['last_updated'] ?? null;
        }
    }

    // Also load the word-count based suspicious pages for comparison.
    if (file_exists($wordDiffFile)) {
        $jsonData = $readJson($wordDiffFile) ?? [];

        // Collect candidate pages from the keyed lists when they exist, otherwise
        // fall back to treating the document itself as the list of pages.
        if (isset($jsonData['regular_pages']) || isset($jsonData['specific_pages'])) {
            $candidates = [];
            foreach (['regular_pages', 'specific_pages'] as $listKey) {
                if (isset($jsonData[$listKey]) && is_array($jsonData[$listKey])) {
                    $candidates = array_merge($candidates, array_values($jsonData[$listKey]));
                }
            }
        } else {
            $candidates = $jsonData;
        }

        foreach ($candidates as $page) {
            if (!is_array($page) || !isset($page['fr_page'], $page['en_page'])) {
                continue;
            }

            $enWordCount = (int)($page['en_page']['word_count'] ?? 0);
            $frWordCount = (int)($page['fr_page']['word_count'] ?? 0);
            $wordDiff = $enWordCount - $frWordCount;

            // English must be longer, French non-empty, and the gap above 30% of the
            // English length. $wordDiff > 0 with $frWordCount > 0 guarantees $enWordCount > 0.
            if ($wordDiff > 0 && $frWordCount > 0 && ($wordDiff / $enWordCount) > 0.3) {
                $page['deletion_percentage'] = round(($wordDiff / $enWordCount) * 100, 2);
                $wordDiffPages[] = $page;
            }
        }

        // Sort by deletion percentage (highest first).
        usort(
            $wordDiffPages,
            static fn (array $a, array $b): int => $b['deletion_percentage'] <=> $a['deletion_percentage']
        );
    }

    return $this->render('admin/wiki_suspicious_deletions.html.twig', [
        'suspicious_pages' => $wordDiffPages,
        'recent_deletions' => $suspiciousPages,
        'last_updated' => $lastUpdated,
    ]);
}
/**
 * Shows tag proposals currently being voted on, followed by recently modified proposals.
 *
 * Data is read from wiki_compare/proposals.json ('voting_proposals',
 * 'recent_proposals', 'last_updated'). When that file does not exist, the
 * fetch script is run once and the freshly generated file is then loaded in
 * the same request. Automatic refresh of stale data is intentionally not
 * performed here (it was disabled in the original implementation).
 */
#[Route('/wiki/tag-proposals', name: 'app_admin_wiki_tag_proposals')]
public function tagProposals(): Response
{
    $proposalsFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/proposals.json';

    $votingProposals = [];
    $recentProposals = [];
    $lastUpdated = null;

    $fileAvailable = file_exists($proposalsFile);
    if (!$fileAvailable) {
        // If the file doesn't exist, try to create it by running the script.
        $this->refreshProposalsData();
        $fileAvailable = file_exists($proposalsFile);
        if (!$fileAvailable) {
            $this->addFlash('error', 'Impossible de générer le fichier de propositions.');
        }
    }

    if ($fileAvailable) {
        $rawJson = file_get_contents($proposalsFile);
        $proposalsData = $rawJson === false ? null : json_decode($rawJson, true);

        if (is_array($proposalsData)) {
            if (isset($proposalsData['voting_proposals']) && is_array($proposalsData['voting_proposals'])) {
                $votingProposals = $proposalsData['voting_proposals'];
            }
            if (isset($proposalsData['recent_proposals']) && is_array($proposalsData['recent_proposals'])) {
                $recentProposals = $proposalsData['recent_proposals'];
            }
            $lastUpdated = $proposalsData['last_updated'] ?? null;
        }
    }

    // Format the proposals for the template; missing fields fall back to empty strings.
    $formattedProposals = [];

    foreach ($votingProposals as $proposal) {
        if (!is_array($proposal)) {
            continue;
        }

        $formattedProposal = [
            'feature' => $proposal['title'] ?? '',
            'url' => $proposal['url'] ?? '',
            'description' => 'Proposition en cours de vote',
            'proposer' => $proposal['proposer'] ?? '',
            'status' => $proposal['status'] ?? 'Voting',
            'type' => 'voting',
        ];

        // Add voting information if available.
        if (isset($proposal['votes'])) {
            $formattedProposal['votes'] = $proposal['votes'];
            $formattedProposal['total_votes'] = $proposal['total_votes'] ?? 0;
            $formattedProposal['approve_percentage'] = $proposal['approve_percentage'] ?? 0;
            $formattedProposal['oppose_percentage'] = $proposal['oppose_percentage'] ?? 0;
            $formattedProposal['abstain_percentage'] = $proposal['abstain_percentage'] ?? 0;
        }

        $formattedProposals[] = $formattedProposal;
    }

    foreach ($recentProposals as $proposal) {
        if (!is_array($proposal)) {
            continue;
        }

        $formattedProposals[] = [
            'feature' => $proposal['title'] ?? '',
            'url' => $proposal['url'] ?? '',
            'description' => 'Dernière modification: ' . ($proposal['last_modified'] ?? ''),
            'proposer' => $proposal['modified_by'] ?? '',
            'status' => 'Draft',
            'type' => 'recent',
        ];
    }

    return $this->render('admin/wiki_tag_proposals.html.twig', [
        'proposals' => $formattedProposals,
        'last_updated' => $lastUpdated,
    ]);
}
/**
 * Refreshes the proposals data by running fetch_proposals.py with --force.
 *
 * Nothing is thrown to the caller: a missing script produces an "error" flash
 * message, and a non-zero exit code produces a "warning" flash message that
 * includes the script's combined stdout/stderr.
 */
private function refreshProposalsData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_proposals.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_proposals.py n\'existe pas.');
            return;
        }

        $output = [];
        $returnCode = 0;
        // The path is quoted so that a project directory containing spaces or
        // shell metacharacters cannot break or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les propositions. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Picks one random page from outdated_pages.json and shows it as a suggestion
 * for improvement.
 *
 * Candidates are the first 100 entries of 'regular_pages' followed by the
 * first 100 entries of 'specific_pages'. If the file is missing or yields no
 * candidates, an error flash is set and the user is sent back to the wiki
 * dashboard.
 */
#[Route('/wiki/random-suggestion', name: 'app_admin_wiki_random_suggestion')]
public function randomSuggestion(): Response
{
    $jsonFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/outdated_pages.json';

    if (!file_exists($jsonFile)) {
        $this->addFlash('error', 'Le fichier outdated_pages.json n\'existe pas.');

        return $this->redirectToRoute('app_admin_wiki');
    }

    // Cap on the number of items pulled from each list to prevent memory exhaustion.
    $perListCap = 100;

    $candidates = [];
    foreach (['regular_pages', 'specific_pages'] as $listKey) {
        $candidates = array_merge(
            $candidates,
            $this->extractJsonArrayByKey($jsonFile, $listKey, $perListCap)
        );
    }

    if (empty($candidates)) {
        $this->addFlash('error', 'Aucune page à améliorer n\'a été trouvée.');

        return $this->redirectToRoute('app_admin_wiki');
    }

    $pickedKey = array_rand($candidates);

    return $this->render('admin/wiki_random_suggestion.html.twig', [
        'page' => $candidates[$pickedKey],
    ]);
}
/**
 * Shows the English wiki page for a key next to its French counterpart (if any),
 * together with a link to the French page creation form.
 *
 * Page HTML is fetched by a small Python program that imports
 * fetch_wiki_page/HTML_CACHE_DIR from wiki_compare.py. The program is passed
 * to `python3 -c` as a process argument and started without a shell, so the
 * user-supplied $key is never interpreted by a shell and no temporary script
 * file is written.
 *
 * @param string $key Wiki page key taken from the URL (any non-empty string)
 */
#[Route('/wiki/create-french/{key}', name: 'app_admin_wiki_create_french', requirements: ['key' => '.+'])]
public function createFrench(string $key): Response
{
    // Construct the URLs for the English page and the French page creation form.
    $englishUrl = "https://wiki.openstreetmap.org/wiki/{$key}";
    $frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";

    $compareDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $scriptPath = $compareDir . '/wiki_compare.py';

    $englishHtml = null;
    $frenchHtml = null;
    $frenchCacheExists = false;

    if (file_exists($scriptPath)) {
        // Nowdoc: the Python source is taken literally, nothing is interpolated by PHP.
        $fetcherCode = <<<'PYTHON'
# -*- coding: utf-8 -*-
import sys
import json
import hashlib
from pathlib import Path
from wiki_compare import fetch_wiki_page, HTML_CACHE_DIR
# Get the key from command line arguments
key = sys.argv[1]
language = sys.argv[2]
# Check if we're just checking cache existence
check_cache_only = len(sys.argv) > 3 and sys.argv[3] == 'check_cache'
if check_cache_only and language == 'fr':
    # For French pages, construct the URL to check cache
    if key.startswith('http'):
        url = key
    else:
        url = f"https://wiki.openstreetmap.org/wiki/FR:{key}"
    # Create cache key
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
    # Check if cache exists
    if cache_file.exists():
        print("CACHE_EXISTS")
    else:
        print("CACHE_MISSING")
else:
    # Normal fetch operation
    page = fetch_wiki_page(key, language)
    # Output the HTML content
    if page and 'html_content' in page:
        print(page['html_content'])
    else:
        print("")
PYTHON;

        // Runs the fetcher with the given arguments from inside wiki_compare/ (so that
        // "from wiki_compare import ..." resolves) and returns its stdout, or null when
        // the process could not be started or printed nothing.
        $runFetcher = static function (string ...$arguments) use ($fetcherCode, $compareDir): ?string {
            // Array form of proc_open(): the program is executed directly, without a shell.
            $process = proc_open(
                array_merge(['python3', '-c', $fetcherCode], $arguments),
                [1 => ['pipe', 'w']],
                $pipes,
                $compareDir
            );
            if (!is_resource($process)) {
                return null;
            }

            $stdout = stream_get_contents($pipes[1]);
            fclose($pipes[1]);
            proc_close($process);

            return ($stdout === false || $stdout === '') ? null : $stdout;
        };

        // First check if the French page exists in cache.
        $frenchCacheExists = trim((string)$runFetcher($key, 'fr', 'check_cache')) === 'CACHE_EXISTS';

        // Fetch the English page and keep only its main content (no headers, footers, etc.).
        $englishHtml = $runFetcher($key, 'en');
        if ($englishHtml) {
            $englishHtml = $this->extractMainContent($englishHtml);
        }

        // Fetch the French page (it might not exist, but we try anyway).
        $frenchHtml = $runFetcher($key, 'fr');
        if ($frenchHtml) {
            $frenchHtml = $this->extractMainContent($frenchHtml);
        }
    }

    return $this->render('admin/wiki_create_french.html.twig', [
        'key' => $key,
        'english_url' => $englishUrl,
        'french_edit_url' => $frenchEditUrl,
        'english_html' => $englishHtml,
        'french_html' => $frenchHtml,
        'french_cache_exists' => $frenchCacheExists,
    ]);
}
/**
 * Shows archived proposals and their statistics from wiki_compare/archived_proposals.json.
 *
 * Query parameters:
 *  - refresh: when present, the data is regenerated and the user is redirected
 *             back to this page without the parameter.
 *  - limit:   optional positive integer forwarded to the fetch script as --limit;
 *             any other value is ignored.
 *
 * The data is also regenerated when the file is missing, or when its
 * "last_updated" value is more than one day old (or cannot be parsed). In the
 * stale case the file is re-read within the same request instead of
 * redirecting, so a failing script cannot cause an endless redirect loop.
 */
#[Route('/wiki/archived-proposals', name: 'app_admin_wiki_archived_proposals')]
public function archivedProposals(\Symfony\Component\HttpFoundation\Request $request): Response
{
    $jsonFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/archived_proposals.json';
    $forceRefresh = $request->query->has('refresh');

    // Only a strictly positive integer is accepted as a limit.
    $requestedLimit = (int)$request->query->get('limit');
    $limit = $requestedLimit > 0 ? $requestedLimit : null;

    // Preserve the limit parameter across redirects when it was provided.
    $redirectParams = $limit !== null ? ['limit' => $limit] : [];

    $proposals = [];
    $statistics = [];
    $lastUpdated = null;

    if ($forceRefresh) {
        $this->refreshArchivedProposalsData($limit);
        $this->addFlash('success', 'Les données des propositions archivées ont été rafraîchies.');

        return $this->redirectToRoute('app_admin_wiki_archived_proposals', $redirectParams);
    }

    // Reads [proposals, statistics, lastUpdated] through the key-based extract helpers,
    // capped at 100 items each to prevent memory exhaustion.
    $load = function () use ($jsonFile): array {
        $maxItems = 100;

        return [
            $this->extractJsonArrayByKey($jsonFile, 'proposals', $maxItems),
            $this->extractJsonArrayByKey($jsonFile, 'statistics', $maxItems),
            $this->extractJsonScalarByKey($jsonFile, 'last_updated'),
        ];
    };

    // True when the timestamp is more than one day old or cannot be parsed.
    $isStale = static function ($stamp): bool {
        $parsed = is_string($stamp) ? strtotime($stamp) : false;

        return $parsed === false || (time() - $parsed) > 86400;
    };

    if (file_exists($jsonFile)) {
        [$proposals, $statistics, $lastUpdated] = $load();

        if ($lastUpdated && $isStale($lastUpdated)) {
            $previousStamp = $lastUpdated;
            $this->refreshArchivedProposalsData($limit);
            [$proposals, $statistics, $lastUpdated] = $load();

            // Only announce an automatic update when the data actually changed;
            // a failed refresh has already produced its own flash message.
            if ($lastUpdated !== $previousStamp) {
                $this->addFlash('info', 'Les données des propositions archivées ont été automatiquement mises à jour car elles dataient de plus d\'un jour.');
            }
        }
    } else {
        // If the file doesn't exist, try to create it by running the script.
        $this->refreshArchivedProposalsData($limit);

        if (file_exists($jsonFile)) {
            $this->addFlash('success', 'Le fichier des propositions archivées a été généré avec succès.');

            return $this->redirectToRoute('app_admin_wiki_archived_proposals', $redirectParams);
        }

        $this->addFlash('error', 'Impossible de générer le fichier des propositions archivées.');
    }

    return $this->render('admin/wiki_archived_proposals.html.twig', [
        'proposals' => $proposals,
        'statistics' => $statistics,
        'last_updated' => $lastUpdated,
        'limit' => $limit,
    ]);
}
/**
 * Refreshes the archived proposals data by running fetch_archived_proposals.py.
 *
 * Nothing is thrown to the caller: a missing script produces an "error" flash
 * message, and a non-zero exit code produces a "warning" flash message that
 * includes the script's combined stdout/stderr.
 *
 * @param int|null $limit Optional limit for the number of proposals to process,
 *                        passed to the script as --limit
 */
private function refreshArchivedProposalsData(?int $limit = null): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_archived_proposals.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_archived_proposals.py n\'existe pas.');
            return;
        }

        // The path is quoted so that a project directory containing spaces or
        // shell metacharacters cannot break or alter the command.
        $command = 'python3 ' . escapeshellarg($scriptPath);
        if ($limit !== null) {
            // $limit is an int by signature, so it is safe to append as-is.
            $command .= ' --limit ' . $limit;
        }

        $output = [];
        $returnCode = 0;
        exec($command . ' 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les propositions archivées. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Wiki dashboard.
 *
 * Reads wiki_compare/wiki_pages.csv (one row per page and language) and derives:
 *  - staleness score statistics (computed on the raw scores from the CSV),
 *  - pages grouped by key and language, sorted by staleness score (descending),
 *  - English pages without a French counterpart,
 *  - French-minus-English differences and per-language statistics for sections, words, links and media.
 * It then loads several optional JSON side files from the same directory and renders admin/wiki.html.twig.
 *
 * Redirects to app_admin_index with an error flash when the CSV file does not exist.
 */
#[Route('/wiki', name: 'app_admin_wiki')]
public function index(): Response
{
    $dataDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $csvFile = $dataDir . '/wiki_pages.csv';
    if (!file_exists($csvFile)) {
        $this->addFlash('error', 'Le fichier wiki_pages.csv n\'existe pas.');
        return $this->redirectToRoute('app_admin_index');
    }

    // file() returns false on a read failure; treat that as an empty CSV instead of crashing in array_map().
    $csvData = array_map('str_getcsv', file($csvFile) ?: []);
    $headers = array_shift($csvData) ?? [];
    $columnCount = count($headers);

    // Build the associative rows once. Rows whose column count differs from the header
    // (blank or truncated lines) are skipped: array_combine() throws a ValueError on them in PHP 8.
    $pages = [];
    foreach ($csvData as $row) {
        if (count($row) === $columnCount) {
            $pages[] = array_combine($headers, $row);
        }
    }

    // count / min / max / mean / population standard deviation of a non-empty list of numbers.
    $describe = static function (array $values): array {
        $count = count($values);
        $mean = array_sum($values) / $count;
        $squaredDeviations = 0;
        foreach ($values as $value) {
            $squaredDeviations += ($value - $mean) ** 2;
        }

        return [
            'count' => $count,
            'min' => min($values),
            'max' => max($values),
            'mean' => round($mean, 2),
            'std_dev' => round(sqrt($squaredDeviations / $count), 2),
        ];
    };

    // Decode a JSON file into an array structure; null when the file is missing or invalid.
    $loadJson = static function (string $path) {
        return file_exists($path) ? json_decode((string) file_get_contents($path), true) : null;
    };

    // Staleness statistics use the raw (signed) scores from the CSV.
    $stalenessScores = [];
    foreach ($pages as $page) {
        if (isset($page['staleness_score']) && is_numeric($page['staleness_score'])) {
            $stalenessScores[] = (float) $page['staleness_score'];
        }
    }
    $stalenessStats = !empty($stalenessScores)
        ? $describe($stalenessScores)
        : ['count' => 0, 'min' => 0, 'max' => 0, 'mean' => 0.0, 'std_dev' => 0.0];

    // Group pages by key, then language; displayed scores are absolute values rounded to 2 decimals.
    $wikiPages = [];
    foreach ($pages as $page) {
        if (!isset($page['key'], $page['language'])) {
            // The CSV has no usable "key"/"language" columns for this row; it cannot be grouped.
            continue;
        }
        if (isset($page['staleness_score']) && is_numeric($page['staleness_score'])) {
            $page['staleness_score'] = round(abs((float) $page['staleness_score']), 2);
        }
        $wikiPages[$page['key']][$page['language']] = $page;
    }

    $missingTranslations = [];
    $pageDifferences = [];
    $stats = [
        'en_sections' => [],
        'fr_sections' => [],
        'en_words' => [],
        'fr_words' => [],
        'en_links' => [],
        'fr_links' => [],
        'en_media' => [],
        'fr_media' => [],
    ];
    $signed = static fn (int $diff): string => ($diff >= 0 ? '+' : '') . $diff;

    foreach ($wikiPages as $pageKey => $languages) {
        if (!isset($languages['en'])) {
            continue;
        }
        $en = $languages['en'];
        if (!isset($languages['fr'])) {
            // English page without a French translation
            $missingTranslations[$pageKey] = $en;
            continue;
        }
        $fr = $languages['fr'];

        $enSections = (int) ($en['sections'] ?? 0);
        $frSections = (int) ($fr['sections'] ?? 0);
        $enWords = (int) ($en['word_count'] ?? 0);
        $frWords = (int) ($fr['word_count'] ?? 0);
        $enLinks = (int) ($en['link_count'] ?? 0);
        $frLinks = (int) ($fr['link_count'] ?? 0);

        // Differences are French minus English; the media difference is 0 unless both sides have a count.
        $sectionDiff = $frSections - $enSections;
        $wordDiff = $frWords - $enWords;
        $linkDiff = $frLinks - $enLinks;
        $mediaDiff = isset($fr['media_count'], $en['media_count'])
            ? (int) $fr['media_count'] - (int) $en['media_count']
            : 0;

        $pageDifferences[$pageKey] = [
            'section_diff' => $sectionDiff,
            'section_diff_formatted' => $signed($sectionDiff),
            'word_diff' => $wordDiff,
            'word_diff_formatted' => $signed($wordDiff),
            'link_diff' => $linkDiff,
            'link_diff_formatted' => $signed($linkDiff),
            'media_diff' => $mediaDiff,
            'media_diff_formatted' => $signed($mediaDiff),
        ];

        $stats['en_sections'][] = $enSections;
        $stats['fr_sections'][] = $frSections;
        $stats['en_words'][] = $enWords;
        $stats['fr_words'][] = $frWords;
        $stats['en_links'][] = $enLinks;
        $stats['fr_links'][] = $frLinks;
        $stats['en_media'][] = isset($en['media_count']) ? (int) $en['media_count'] : 0;
        $stats['fr_media'][] = isset($fr['media_count']) ? (int) $fr['media_count'] : 0;
    }

    // Per-metric statistics; metrics without any value are left out.
    $wikiPagesStats = [];
    foreach ($stats as $metric => $values) {
        if (!empty($values)) {
            $wikiPagesStats[$metric] = $describe($values);
        }
    }

    // Sort by the English staleness score (descending); pages lacking EN or FR count as 0.
    $sortScore = static fn (array $languages): float => isset($languages['en']['staleness_score'], $languages['fr'])
        ? (float) $languages['en']['staleness_score']
        : 0.0;
    uasort($wikiPages, static fn ($a, $b): int => $sortScore($b) <=> $sortScore($a));

    // Pages unavailable in English, de-duplicated on their URL (first occurrence wins).
    $pagesUnavailableInEnglish = [];
    $unavailableData = $loadJson($dataDir . '/pages_unavailable_in_english.json');
    if (isset($unavailableData['pages']) && is_array($unavailableData['pages'])) {
        $seenUrls = [];
        foreach ($unavailableData['pages'] as $page) {
            $url = $page['url'] ?? null;
            if (!is_string($url)) {
                // No usable URL to de-duplicate on: keep the entry as-is.
                $pagesUnavailableInEnglish[] = $page;
                continue;
            }
            if (!isset($seenUrls[$url])) {
                $seenUrls[$url] = true;
                $pagesUnavailableInEnglish[] = $page;
            }
        }
    }

    // Specific pages come from outdated_pages.json through a dedicated extractor, capped at 100 entries.
    $specificPages = [];
    $outdatedPagesFile = $dataDir . '/outdated_pages.json';
    if (file_exists($outdatedPagesFile)) {
        $maxPages = 100;
        $specificPages = $this->extractSpecificPagesFromJson($outdatedPagesFile, $maxPages);
    }

    // Newly created French pages
    $newlyCreatedPages = [];
    $newlyCreatedData = $loadJson($dataDir . '/newly_created_french_pages.json');
    if (isset($newlyCreatedData['created_pages']) && is_array($newlyCreatedData['created_pages'])) {
        $newlyCreatedPages = $newlyCreatedData['created_pages'];
    }

    // Machine translations
    $availableTranslations = [];
    $translationsData = $loadJson($dataDir . '/translations.json');
    if (isset($translationsData['translations']) && is_array($translationsData['translations'])) {
        $availableTranslations = $translationsData['translations'];
    }

    // Keys without wiki pages: the whole decoded document is passed to the template.
    $keysWithoutWikiData = $loadJson($dataDir . '/keys_without_wiki.json');
    $keysWithoutWiki = is_array($keysWithoutWikiData) ? $keysWithoutWikiData : [];

    return $this->render('admin/wiki.html.twig', [
        'wiki_pages' => $wikiPages,
        'missing_translations' => $missingTranslations,
        'page_differences' => $pageDifferences,
        'pages_unavailable_in_english' => $pagesUnavailableInEnglish,
        'specific_pages' => $specificPages,
        'newly_created_pages' => $newlyCreatedPages,
        'staleness_stats' => $stalenessStats,
        'wiki_pages_stats' => $wikiPagesStats,
        'available_translations' => $availableTranslations,
        'keys_without_wiki' => $keysWithoutWiki,
    ]);
}
/**
 * Runs wiki_compare/wiki_translate.py for the given wiki key using the project's virtualenv
 * interpreter, reports the outcome through flash messages and redirects to the wiki dashboard.
 *
 * @param string $key Wiki page key taken from the URL (any non-empty string, slashes included)
 */
#[Route('/wiki/translate/{key}', name: 'app_admin_wiki_translate', requirements: ['key' => '.+'])]
public function translate(string $key): Response
{
    $this->addFlash('info', 'Traduction en cours pour la page ' . $key);
    try {
        $projectDir = $this->getParameter('kernel.project_dir');
        $scriptPath = $projectDir . '/wiki_compare/wiki_translate.py';
        if (file_exists($scriptPath)) {
            $venvPython = $projectDir . '/venv/bin/python';
            // $key comes straight from the URL: every part of the command line is passed through
            // escapeshellarg() so it cannot inject shell syntax ($(...), backticks, quotes, ;, ...).
            // Note: escapeshellarg() may drop non-ASCII bytes when LC_CTYPE is not a UTF-8 locale.
            $command = 'cd ' . escapeshellarg($projectDir)
                . ' && ' . escapeshellarg($venvPython)
                . ' ' . escapeshellarg($scriptPath)
                . ' ' . escapeshellarg($key);
            $output = [];
            $returnVar = 0;
            exec($command, $output, $returnVar);
            if ($returnVar === 0) {
                $this->addFlash('success', 'Traduction réussie pour la page ' . $key);
            } else {
                $this->addFlash('warning', 'Problème lors de la traduction: ' . implode("\n", $output));
            }
        } else {
            $this->addFlash('error', 'Le script wiki_translate.py n\'existe pas.');
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }

    return $this->redirectToRoute('app_admin_wiki');
}
/**
 * Re-runs wiki_compare/wiki_translate.py for the given wiki key using the project's virtualenv
 * interpreter, reports the outcome through flash messages and redirects to the wiki dashboard.
 *
 * NOTE(review): the previous inline comment mentioned an "update flag", but no such flag is
 * passed: the command line is the same one translate() builds. Confirm whether
 * wiki_translate.py expects an extra option for updates.
 *
 * @param string $key Wiki page key taken from the URL (any non-empty string, slashes included)
 */
#[Route('/wiki/update-translation/{key}', name: 'app_admin_wiki_update_translation', requirements: ['key' => '.+'])]
public function updateTranslation(string $key): Response
{
    $this->addFlash('info', 'Mise à jour de la traduction en cours pour la page ' . $key);
    try {
        $projectDir = $this->getParameter('kernel.project_dir');
        $scriptPath = $projectDir . '/wiki_compare/wiki_translate.py';
        if (file_exists($scriptPath)) {
            $venvPython = $projectDir . '/venv/bin/python';
            // $key comes straight from the URL: every part of the command line is passed through
            // escapeshellarg() so it cannot inject shell syntax ($(...), backticks, quotes, ;, ...).
            // Note: escapeshellarg() may drop non-ASCII bytes when LC_CTYPE is not a UTF-8 locale.
            $command = 'cd ' . escapeshellarg($projectDir)
                . ' && ' . escapeshellarg($venvPython)
                . ' ' . escapeshellarg($scriptPath)
                . ' ' . escapeshellarg($key);
            $output = [];
            $returnVar = 0;
            exec($command, $output, $returnVar);
            if ($returnVar === 0) {
                $this->addFlash('success', 'Mise à jour de la traduction réussie pour la page ' . $key);
            } else {
                $this->addFlash('warning', 'Problème lors de la mise à jour de la traduction: ' . implode("\n", $output));
            }
        } else {
            $this->addFlash('error', 'Le script wiki_translate.py n\'existe pas.');
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }

    return $this->redirectToRoute('app_admin_wiki');
}
/**
 * Detailed English/French comparison for one wiki key.
 *
 * Data sources:
 *  - wiki_compare/wiki_pages.csv: the EN and FR rows for the key (page-level counters),
 *  - wiki_compare/outdated_pages.json (optional): history, neighbouring pages in the staleness
 *    ranking, staleness histogram and the detailed section/link/media comparison.
 *
 * Redirects with an error flash when the CSV file is missing (to app_admin_index) or when the
 * key has no English row (to app_admin_wiki). Otherwise renders admin/wiki_compare.html.twig.
 *
 * @param string $key Wiki page key taken from the URL (any non-empty string, slashes included)
 */
#[Route('/wiki/compare/{key}', name: 'app_admin_wiki_compare', requirements: ['key' => '.+'])]
public function compare(string $key): Response
{
    $dataDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $csvFile = $dataDir . '/wiki_pages.csv';
    $jsonFile = $dataDir . '/outdated_pages.json';
    if (!file_exists($csvFile)) {
        $this->addFlash('error', 'Le fichier wiki_pages.csv n\'existe pas.');
        return $this->redirectToRoute('app_admin_index');
    }

    // file() returns false on a read failure; treat that as an empty CSV.
    $csvData = array_map('str_getcsv', file($csvFile) ?: []);
    $headers = array_shift($csvData) ?? [];
    $columnCount = count($headers);

    // Find the EN and FR rows for the requested key (the last matching row of each language wins).
    $enPage = null;
    $frPage = null;
    foreach ($csvData as $row) {
        // Blank or truncated lines would make array_combine() throw a ValueError on PHP 8.
        if (count($row) !== $columnCount) {
            continue;
        }
        $page = array_combine($headers, $row);
        if (($page['key'] ?? null) !== $key) {
            continue;
        }
        $language = $page['language'] ?? null;
        if ($language === 'en') {
            $enPage = $page;
        } elseif ($language === 'fr') {
            $frPage = $page;
        }
    }

    // Without an English page there is nothing to compare against.
    if (!$enPage) {
        $this->addFlash('error', 'La page wiki pour la clé "' . $key . '" n\'existe pas.');
        return $this->redirectToRoute('app_admin_wiki');
    }

    $detailedComparison = null;
    $mediaDiff = 0;
    $historyData = null;
    $prevPage = null;
    $nextPage = null;
    $stalenessDistribution = null;

    if (file_exists($jsonFile)) {
        // Cap on the number of items pulled from the JSON file by each extraction.
        $maxItems = 100;

        // ---- History of the metrics for this key --------------------------------------------
        $historyData = [];
        $historyEntries = $this->extractJsonArrayByKey($jsonFile, 'history', $maxItems);
        foreach ($historyEntries as $timestamp => $entry) {
            $date = 'N/A';
            if (is_string($timestamp) && $timestamp !== '' && $timestamp !== '0') {
                try {
                    $date = (new \DateTime($timestamp))->format('Y-m-d');
                } catch (\Exception $e) {
                    // Unparseable timestamp: keep 'N/A' rather than failing the whole page.
                }
            }

            // Look in regular_pages first, then specific_pages; the first match is recorded.
            foreach (['regular_pages', 'specific_pages'] as $listName) {
                if (!isset($entry[$listName]) || !is_array($entry[$listName])) {
                    continue;
                }
                foreach ($entry[$listName] as $page) {
                    if (isset($page['key']) && $page['key'] === $key) {
                        $historyData[] = [
                            'timestamp' => $timestamp,
                            'date' => $date,
                            'metrics' => [
                                'staleness_score' => $page['staleness_score'] ?? 0,
                                'date_diff' => $page['date_diff'] ?? 0,
                                'word_diff' => $page['word_diff'] ?? 0,
                                'section_diff' => $page['section_diff'] ?? 0,
                                'link_diff' => $page['link_diff'] ?? 0,
                                'media_diff' => $page['media_diff'] ?? 0,
                            ],
                        ];
                        continue 3; // next history entry
                    }
                }
            }
        }
        // Chronological order
        usort($historyData, static function ($a, $b): int {
            return strtotime((string) $a['timestamp']) <=> strtotime((string) $b['timestamp']);
        });

        // ---- Ranking: all pages sorted by staleness score (descending) -----------------------
        $regularPages = $this->extractJsonArrayByKey($jsonFile, 'regular_pages', $maxItems);
        $specificPages = $this->extractJsonArrayByKey($jsonFile, 'specific_pages', $maxItems);
        $allPages = array_merge($regularPages, $specificPages);
        usort($allPages, static function ($a, $b): int {
            return ($b['staleness_score'] ?? 0) <=> ($a['staleness_score'] ?? 0);
        });

        // Position of the current page in the ranking (-1 when absent)
        $currentIndex = -1;
        foreach ($allPages as $index => $page) {
            if (isset($page['key']) && $page['key'] === $key) {
                $currentIndex = $index;
                break;
            }
        }
        if ($currentIndex > 0) {
            $prevPage = $allPages[$currentIndex - 1];
        }
        if ($currentIndex >= 0 && $currentIndex < count($allPages) - 1) {
            $nextPage = $allPages[$currentIndex + 1];
        }

        // ---- Staleness score distribution (10-bin histogram) ---------------------------------
        $stalenessScores = [];
        foreach ($allPages as $page) {
            if (isset($page['staleness_score'])) {
                $stalenessScores[] = $page['staleness_score'];
            }
        }
        if (!empty($stalenessScores)) {
            $min = min($stalenessScores);
            $max = max($stalenessScores);
            $avg = array_sum($stalenessScores) / count($stalenessScores);
            $median = $this->calculateMedian($stalenessScores);

            $binCount = 10;
            $binSize = ($max - $min) / $binCount;
            // Maps a score to its bin, clamped to [0, binCount - 1]. When every score is identical
            // the bin width is 0: everything goes into bin 0 instead of dividing by zero.
            $binIndexFor = static function ($score) use ($min, $binSize, $binCount): int {
                if ($binSize <= 0) {
                    return 0;
                }

                return max(0, min($binCount - 1, (int) floor(($score - $min) / $binSize)));
            };

            $bins = array_fill(0, $binCount, 0);
            $binLabels = [];
            for ($i = 0; $i < $binCount; $i++) {
                $binStart = $min + ($i * $binSize);
                $binLabels[$i] = round($binStart, 1) . ' - ' . round($binStart + $binSize, 1);
            }
            foreach ($stalenessScores as $score) {
                $bins[$binIndexFor($score)]++;
            }

            // Score of the current page (0 when the page is not part of the ranking)
            $currentPageScore = 0;
            foreach ($allPages as $page) {
                if (isset($page['key']) && $page['key'] === $key && isset($page['staleness_score'])) {
                    $currentPageScore = $page['staleness_score'];
                    break;
                }
            }

            $stalenessDistribution = [
                'scores' => $stalenessScores,
                'min' => $min,
                'max' => $max,
                'avg' => $avg,
                'median' => $median,
                'bins' => $bins,
                'binLabels' => $binLabels,
                'currentPageScore' => $currentPageScore,
                'currentPageBin' => $binIndexFor($currentPageScore),
                'totalPages' => count($stalenessScores),
            ];
        }

        // ---- Detailed comparison for the current page ----------------------------------------
        if ($currentIndex >= 0) {
            $page = $allPages[$currentIndex];

            // Media: de-duplicate each "only" list on the image URL and drop images that also
            // appear on the other language's side. Entries without a string 'src' are kept as-is.
            $mediaComparison = $page['media_comparison'] ?? null;
            if ($mediaComparison) {
                $enOnlyImages = $mediaComparison['en_only'] ?? [];
                $frOnlyImages = $mediaComparison['fr_only'] ?? [];
                $commonImages = $mediaComparison['common'] ?? [];

                // Set (url => true) of the image URLs of a list; $side selects the nested 'en'/'fr' entry.
                $urlSet = static function (array $items, ?string $side = null): array {
                    $set = [];
                    foreach ($items as $item) {
                        $src = $side === null ? ($item['src'] ?? null) : ($item[$side]['src'] ?? null);
                        if (is_string($src)) {
                            $set[$src] = true;
                        }
                    }

                    return $set;
                };
                $uniqueImages = static function (array $items, array $otherSideUrls): array {
                    $seen = [];
                    $unique = [];
                    foreach ($items as $item) {
                        $src = $item['src'] ?? null;
                        if (!is_string($src)) {
                            $unique[] = $item;
                            continue;
                        }
                        if (isset($seen[$src]) || isset($otherSideUrls[$src])) {
                            continue;
                        }
                        $seen[$src] = true;
                        $unique[] = $item;
                    }

                    return $unique;
                };

                $frImageUrls = $urlSet($frOnlyImages) + $urlSet($commonImages, 'fr');
                $enImageUrls = $urlSet($enOnlyImages) + $urlSet($commonImages, 'en');

                $mediaComparison['en_only'] = $uniqueImages($enOnlyImages, $frImageUrls);
                $mediaComparison['fr_only'] = $uniqueImages($frOnlyImages, $enImageUrls);
                // NOTE(review): these counts are those of the lists BEFORE de-duplication, as in
                // the previous implementation; confirm that this is what the template expects.
                $mediaComparison['en_only_count'] = count($enOnlyImages);
                $mediaComparison['fr_only_count'] = count($frOnlyImages);
            }

            // Links: sorted alphabetically by URL (common links by their English URL)
            $linkComparison = $page['link_comparison'] ?? null;
            if ($linkComparison) {
                $byHref = static function ($a, $b): int {
                    return strcmp((string) ($a['href'] ?? ''), (string) ($b['href'] ?? ''));
                };
                foreach (['en_only', 'fr_only'] as $group) {
                    if (isset($linkComparison[$group]) && is_array($linkComparison[$group])) {
                        usort($linkComparison[$group], $byHref);
                    }
                }
                if (isset($linkComparison['common']) && is_array($linkComparison['common'])) {
                    usort($linkComparison['common'], static function ($a, $b): int {
                        return strcmp((string) ($a['en']['href'] ?? ''), (string) ($b['en']['href'] ?? ''));
                    });
                }
            }

            // Sections: drop table-of-contents and navigation headings, counting per language
            // how many were removed so the page-level section counters can be adjusted.
            $sectionComparison = $page['section_comparison'] ?? null;
            $excludedSections = [
                'Contents', 'Sommaire',
                'Personal tools', 'Namespaces', 'Views', 'Search', 'Site', 'Tools', 'In other projects'
            ];
            $isExcluded = static fn ($title): bool => is_string($title) && in_array($title, $excludedSections, true);

            $enFilteredCount = 0;
            $frFilteredCount = 0;
            if ($sectionComparison) {
                foreach (['common', 'en_only', 'fr_only'] as $group) {
                    if (!isset($sectionComparison[$group]) || !is_array($sectionComparison[$group])) {
                        continue;
                    }
                    $kept = [];
                    foreach ($sectionComparison[$group] as $section) {
                        $excluded = $group === 'common'
                            ? ($isExcluded($section['en']['title'] ?? null) || $isExcluded($section['fr']['title'] ?? null))
                            : $isExcluded($section['title'] ?? null);
                        if (!$excluded) {
                            $kept[] = $section;
                            continue;
                        }
                        // A removed section only lowers the count of the language(s) it belongs to:
                        // common -> both, en_only -> English, fr_only -> French.
                        if ($group !== 'fr_only') {
                            $enFilteredCount++;
                        }
                        if ($group !== 'en_only') {
                            $frFilteredCount++;
                        }
                    }
                    $sectionComparison[$group] = $kept; // re-indexed list
                }
            }

            // Adjusted section counts; a missing French page counts as 0 sections.
            $enSectionCount = max(0, (int) ($enPage['sections'] ?? 0) - $enFilteredCount);
            $frSectionCount = $frPage ? max(0, (int) ($frPage['sections'] ?? 0) - $frFilteredCount) : 0;

            // Heading hierarchy checks: language-only sections first, then that language's side
            // of the common sections.
            $enHierarchyErrors = [];
            $frHierarchyErrors = [];
            if (isset($sectionComparison['en_only']) && is_array($sectionComparison['en_only'])) {
                $enHierarchyErrors = $this->detectHeadingHierarchyErrors($sectionComparison['en_only']);
            }
            if (isset($sectionComparison['fr_only']) && is_array($sectionComparison['fr_only'])) {
                $frHierarchyErrors = $this->detectHeadingHierarchyErrors($sectionComparison['fr_only']);
            }
            if (isset($sectionComparison['common']) && is_array($sectionComparison['common'])) {
                $commonEnSections = array_map(static fn ($section) => $section['en'], $sectionComparison['common']);
                $commonFrSections = array_map(static fn ($section) => $section['fr'], $sectionComparison['common']);
                $enHierarchyErrors = array_merge($enHierarchyErrors, $this->detectHeadingHierarchyErrors($commonEnSections));
                $frHierarchyErrors = array_merge($frHierarchyErrors, $this->detectHeadingHierarchyErrors($commonFrSections));
            }

            // Aligned section list for visualising missing sections
            $alignedSections = $this->buildAlignedSectionList($sectionComparison);

            $detailedComparison = [
                'section_comparison' => $sectionComparison,
                'aligned_sections' => $alignedSections,
                'link_comparison' => $linkComparison,
                'media_comparison' => $mediaComparison,
                'category_comparison' => $page['category_comparison'] ?? null,
                'grammar_suggestions' => $page['grammar_suggestions'] ?? null,
                'adjusted_en_section_count' => $enSectionCount,
                'adjusted_fr_section_count' => $frSectionCount,
                'en_hierarchy_errors' => $enHierarchyErrors,
                'fr_hierarchy_errors' => $frHierarchyErrors,
            ];
            $mediaDiff = $page['media_diff'] ?? 0;
        }
    }

    // ---- Staleness score components (only when a French page exists) ----------------------------
    $scoreComponents = [];
    if ($frPage) {
        // Whole days between the two last-modified dates (English minus French).
        // The leading "!" zeroes the time part; without it both dates inherit the current time of
        // their own call and the difference can be off by a fraction of a day.
        $dateDiff = 0;
        if (!empty($enPage['last_modified']) && !empty($frPage['last_modified'])) {
            $enDate = \DateTime::createFromFormat('!Y-m-d', $enPage['last_modified']);
            $frDate = \DateTime::createFromFormat('!Y-m-d', $frPage['last_modified']);
            if ($enDate && $frDate) {
                $dateDiff = (int) $frDate->diff($enDate)->format('%r%a');
            }
        }

        // Content differences (English minus French). CSV cells are strings: cast them so an empty
        // or non-numeric cell does not raise a TypeError in the subtraction.
        $wordDiff = (int) ($enPage['word_count'] ?? 0) - (int) ($frPage['word_count'] ?? 0);
        $sectionDiff = (int) ($enPage['sections'] ?? 0) - (int) ($frPage['sections'] ?? 0);
        $linkDiff = (int) ($enPage['link_count'] ?? 0) - (int) ($frPage['link_count'] ?? 0);

        $scoreComponents = [
            'date' => [
                'value' => $dateDiff,
                'weight' => 0.2,
                'component' => abs($dateDiff) * 0.2,
                'description' => 'Différence de date (en jours)'
            ],
            'word' => [
                'value' => $wordDiff,
                'weight' => 0.5,
                'component' => (abs($wordDiff) / 100) * 0.5,
                'description' => 'Différence de nombre de mots'
            ],
            'section' => [
                'value' => $sectionDiff,
                'weight' => 0.15,
                'component' => abs($sectionDiff) * 0.15,
                'description' => 'Différence de nombre de sections'
            ],
            'link' => [
                'value' => $linkDiff,
                'weight' => 0.15,
                'component' => (abs($linkDiff) / 10) * 0.15,
                'description' => 'Différence de nombre de liens'
            ]
        ];

        // Media component, only when both CSV rows carry a media count
        if (isset($enPage['media_count']) && isset($frPage['media_count'])) {
            $scoreComponents['media'] = [
                'value' => $mediaDiff,
                'weight' => 0.1,
                'component' => (abs($mediaDiff) / 5) * 0.1,
                'description' => 'Différence de nombre d\'images'
            ];
            // Displayed weights are adjusted so that the five of them sum to 1.0.
            // NOTE(review): the 'component' values above were computed with the four-factor
            // weights (0.5 for words, 0.15 for links); only the displayed weights change here.
            // Confirm which set of weights the scoring script really uses.
            $scoreComponents['date']['weight'] = 0.2;
            $scoreComponents['word']['weight'] = 0.45;
            $scoreComponents['section']['weight'] = 0.15;
            $scoreComponents['link']['weight'] = 0.1;
        }
    }

    // URL for creating the French page when it does not exist yet
    $createFrUrl = null;
    if (!$frPage) {
        $createFrUrl = 'https://wiki.openstreetmap.org/wiki/FR:' . $key;
    }

    // ---- Plain-text section titles for the copy buttons (wiki heading syntax) -------------------
    $enSections = '';
    $frSections = '';
    if ($detailedComparison && $detailedComparison['section_comparison']) {
        $sections = $detailedComparison['section_comparison'];
        $heading = static function ($section, string $suffix = ''): string {
            $marker = str_repeat('=', max(0, (int) ($section['level'] ?? 0)));

            return $marker . ' ' . ($section['title'] ?? '') . ' ' . $marker . $suffix;
        };

        $enSectionsList = [];
        foreach ($sections['common'] ?? [] as $section) {
            $enSectionsList[] = $heading($section['en'] ?? []);
        }
        foreach ($sections['en_only'] ?? [] as $section) {
            $enSectionsList[] = $heading($section, ' (EN only)');
        }
        $enSections = implode("\n", $enSectionsList);

        if ($frPage) {
            $frSectionsList = [];
            foreach ($sections['common'] ?? [] as $section) {
                $frSectionsList[] = $heading($section['fr'] ?? []);
            }
            foreach ($sections['fr_only'] ?? [] as $section) {
                $frSectionsList[] = $heading($section, ' (FR only)');
            }
            $frSections = implode("\n", $frSectionsList);
        }
    }

    // ---- Plain-text link lists for the copy buttons ----------------------------------------------
    $enLinks = '';
    $frLinks = '';
    if ($detailedComparison && $detailedComparison['link_comparison']) {
        $links = $detailedComparison['link_comparison'];
        $linkLine = static function ($link, string $suffix = ''): string {
            return ($link['text'] ?? '') . ' - ' . ($link['href'] ?? '') . $suffix;
        };

        $enLinksList = [];
        foreach ($links['common'] ?? [] as $link) {
            $enLinksList[] = $linkLine($link['en'] ?? []);
        }
        foreach ($links['en_only'] ?? [] as $link) {
            $enLinksList[] = $linkLine($link, ' (EN only)');
        }
        $enLinks = implode("\n", $enLinksList);

        if ($frPage) {
            $frLinksList = [];
            foreach ($links['common'] ?? [] as $link) {
                $frLinksList[] = $linkLine($link['fr'] ?? []);
            }
            foreach ($links['fr_only'] ?? [] as $link) {
                $frLinksList[] = $linkLine($link, ' (FR only)');
            }
            $frLinks = implode("\n", $frLinksList);
        }
    }

    // Page URLs must be strings for the template (avoids "array to string conversion")
    if ($frPage && isset($frPage['url']) && is_array($frPage['url'])) {
        $frPage['url'] = json_encode($frPage['url']);
    }
    if (isset($enPage['url']) && is_array($enPage['url'])) {
        $enPage['url'] = json_encode($enPage['url']);
    }

    return $this->render('admin/wiki_compare.html.twig', [
        'key' => $key,
        'en_page' => $enPage,
        'fr_page' => $frPage,
        'score_components' => $scoreComponents,
        'create_fr_url' => $createFrUrl,
        'detailed_comparison' => $detailedComparison,
        'en_sections' => $enSections,
        'fr_sections' => $frSections,
        'en_links' => $enLinks,
        'fr_links' => $frLinks,
        'history_data' => $historyData,
        'prev_page' => $prevPage,
        'next_page' => $nextPage,
        'staleness_distribution' => $stalenessDistribution
    ]);
}
/**
 * Extracts the main content from an HTML page, dropping everything outside the content element.
 *
 * The content element is the element with id "mw-content-text" or, failing that, the first
 * element whose class attribute contains "mw-content-ltr". Inside it, <script> and <style>
 * elements and "mw-editsection" spans (section edit links) are removed.
 *
 * @param string $html The full HTML content (assumed to be UTF-8)
 * @return string The serialized content element, or the input unchanged when it is empty or
 *                no content element can be found
 */
private function extractMainContent(string $html): string
{
    // DOMDocument::loadHTML() throws a ValueError on an empty string (PHP 8).
    if (trim($html) === '') {
        return $html;
    }

    $dom = new \DOMDocument();
    // Collect parser warnings about malformed HTML instead of emitting them,
    // then restore the caller's libxml error-handling mode.
    $previousErrorMode = libxml_use_internal_errors(true);
    // The XML-style prolog tells libxml the input is UTF-8; without a hint, non-ASCII
    // characters of a fragment lacking a <meta charset> may be mis-decoded.
    $loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    if (!$loaded) {
        return $html;
    }

    $xpath = new \DOMXPath($dom);

    // Locate the content element: by id first, then by class.
    $contentElement = $dom->getElementById('mw-content-text');
    if (!$contentElement) {
        $candidates = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
        if ($candidates !== false && $candidates->length > 0) {
            $contentElement = $candidates->item(0);
        }
    }
    if (!$contentElement) {
        return $html;
    }

    // Remove unwanted nodes in the DOM rather than with regular expressions: edit-section
    // spans contain nested <span> elements, so a non-greedy "<span ...>.*?</span>" pattern
    // stops at the first inner </span> and leaves fragments behind.
    $unwanted = $xpath->query(
        ".//script | .//style | .//span[contains(concat(' ', normalize-space(@class), ' '), ' mw-editsection ')]",
        $contentElement
    );
    if ($unwanted !== false) {
        // Copy the node list first: removing nodes while iterating a live list skips entries.
        foreach (iterator_to_array($unwanted) as $node) {
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    $contentHtml = $dom->saveHTML($contentElement);

    return $contentHtml === false ? $html : $contentHtml;
}
/**
 * Extracts the first items of an array from a large JSON file by key without decoding the whole file
 *
 * The file is read in fixed-size chunks. The first occurrence of "<key>" that is followed by
 * a colon and an opening bracket marks the array; its elements are then scanned one at a time
 * (tracking string literals and nesting depth, so brackets inside strings are ignored) and
 * each element is decoded individually until $maxItems elements have been collected or the
 * array ends. Failures are reported through error_log() and yield the items collected so far.
 *
 * @param string $filePath Path to the JSON file
 * @param string $key The key of the array to extract
 * @param int $maxItems Maximum number of items to extract (to prevent memory exhaustion)
 * @return array The extracted items, decoded with associative arrays; empty when nothing could be read
 */
private function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array
{
    $result = [];

    if (!is_readable($filePath)) {
        error_log("File is not readable: $filePath");
        return $result;
    }

    $fileSize = filesize($filePath);
    if ($fileSize === false || $fileSize === 0) {
        error_log("File is empty or size could not be determined: $filePath");
        return $result;
    }

    // Nothing was asked for, so there is no reason to touch the file
    if ($maxItems <= 0) {
        return $result;
    }

    $handle = fopen($filePath, 'r');
    if (!$handle) {
        error_log("Could not open file: $filePath");
        return $result;
    }

    try {
        $searchKey = '"' . $key . '"';
        $keyLength = strlen($searchKey);

        // Sliding window over the file: $buffer holds unread text, $cursor is the read position
        $buffer = '';
        $cursor = 0;

        // Drops everything before $cursor and appends the next chunk; false at end of file
        $readMore = static function () use ($handle, &$buffer, &$cursor): bool {
            $chunk = feof($handle) ? '' : fread($handle, 65536);
            if ($chunk === false || $chunk === '') {
                return false;
            }
            $buffer = substr($buffer, $cursor) . $chunk;
            $cursor = 0;
            return true;
        };

        // Moves $cursor past whitespace and returns the character found there, or null at end of file
        $peek = static function () use (&$buffer, &$cursor, $readMore): ?string {
            while (true) {
                if ($cursor < strlen($buffer)) {
                    $cursor += strspn($buffer, " \t\r\n", $cursor);
                }
                if ($cursor < strlen($buffer)) {
                    return $buffer[$cursor];
                }
                if (!$readMore()) {
                    return null;
                }
            }
        };

        // Phase 1: locate `"key" : [`
        $keySeen = false;
        $arrayStarted = false;
        while (!$arrayStarted) {
            $hit = strpos($buffer, $searchKey, $cursor);
            if ($hit === false) {
                // Keep a tail short enough to hold a key split across two chunks
                $cursor = max($cursor, strlen($buffer) - $keyLength + 1);
                if (!$readMore()) {
                    break;
                }
                continue;
            }

            $cursor = $hit + $keyLength;
            if ($peek() !== ':') {
                // The text matched inside a value, not as an object key: keep searching
                continue;
            }
            $keySeen = true;
            $cursor++;
            if ($peek() === '[') {
                $cursor++;
                $arrayStarted = true;
            }
            // Otherwise the key holds a non-array value here; look for a later occurrence
        }

        if (!$arrayStarted) {
            error_log(
                $keySeen
                    ? "Could not find opening bracket for array '$key' in file: $filePath"
                    : "Key '$key' not found in file: $filePath"
            );
            return $result;
        }

        // Phase 2: walk the array one element at a time.
        // $cursor marks the start of the current element, $scan the character being examined.
        $scan = $cursor;
        $depth = 0;          // nesting depth inside the current element
        $inString = false;   // true while inside a JSON string literal
        $escaped = false;    // true right after a backslash inside a string

        while (true) {
            if ($scan >= strlen($buffer)) {
                $alreadyScanned = $scan - $cursor;
                if (!$readMore()) {
                    error_log("Could not find the end of array '$key' in file: $filePath");
                    break;
                }
                // $readMore() rebased the buffer on the element start
                $scan = $alreadyScanned;
                continue;
            }

            $char = $buffer[$scan++];

            if ($inString) {
                if ($escaped) {
                    $escaped = false;
                } elseif ($char === '\\') {
                    $escaped = true;
                } elseif ($char === '"') {
                    $inString = false;
                }
                continue;
            }

            if ($char === '"') {
                $inString = true;
                continue;
            }
            if ($char === '[' || $char === '{') {
                $depth++;
                continue;
            }
            if ($char !== ']' && $char !== '}' && $char !== ',') {
                continue;
            }
            if ($depth > 0) {
                // Separator or closer that belongs to the element itself
                if ($char !== ',') {
                    $depth--;
                }
                continue;
            }

            // At depth 0 a comma ends the current element and a closer ends the whole array
            $rawItem = trim(substr($buffer, $cursor, $scan - 1 - $cursor));
            $cursor = $scan;

            if ($rawItem !== '') {
                $item = json_decode($rawItem, true);
                if (json_last_error() === JSON_ERROR_NONE) {
                    $result[] = $item;
                } else {
                    error_log("JSON parse error: " . json_last_error_msg() . " for key '$key'");
                }
            }

            if ($char !== ',' || count($result) >= $maxItems) {
                break;
            }
        }
    } catch (\Exception $e) {
        error_log("Exception in extractJsonArrayByKey for key '$key': " . $e->getMessage());
    } finally {
        fclose($handle);
    }

    return $result;
}
/**
 * Extracts a scalar value from a large JSON file by key without loading the entire file into memory
 *
 * The file is read line by line. The first occurrence of "<key>" that is followed by a colon
 * is treated as the key; the token after the colon (string, number, true, false or null) is
 * decoded with json_decode(), so escapes, negative numbers and decimals come back correctly
 * typed. The colon and/or the value may sit on the line following the key. Object and array
 * values are not scalars and yield null.
 *
 * @param string $filePath Path to the JSON file
 * @param string $key The key of the scalar value to extract
 * @return mixed The extracted scalar value or null if not found
 */
private function extractJsonScalarByKey(string $filePath, string $key): mixed
{
    if (!is_readable($filePath)) {
        error_log("File is not readable: $filePath");
        return null;
    }

    $handle = fopen($filePath, 'r');
    if (!$handle) {
        error_log("Could not open file: $filePath");
        return null;
    }

    $searchKey = '"' . $key . '"';
    $keyLength = strlen($searchKey);

    // A colon followed by one JSON scalar token: string (with escapes), number, true, false or null
    $scalarPattern = '/^\s*:\s*("[^"\\\\]*(?:\\\\.[^"\\\\]*)*"|-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?|true|false|null)/';

    $found = false;
    $value = null;

    try {
        while (!$found && ($line = fgets($handle)) !== false) {
            $offset = 0;
            while (($keyPos = strpos($line, $searchKey, $offset)) !== false) {
                $offset = $keyPos + $keyLength;
                $rest = substr($line, $offset);

                // Only whitespace (and possibly the colon) remains: the value is on the next line
                if (preg_match('/^\s*:?\s*$/', $rest) === 1) {
                    $nextLine = fgets($handle);
                    if ($nextLine !== false) {
                        $line .= $nextLine;
                        $rest .= $nextLine;
                    }
                }

                // Not followed by a colon: the text matched inside a value, keep searching
                if (preg_match('/^\s*:/', $rest) !== 1) {
                    continue;
                }

                $found = true;
                if (preg_match($scalarPattern, $rest, $matches) === 1) {
                    $decoded = json_decode($matches[1]);
                    if (json_last_error() === JSON_ERROR_NONE) {
                        $value = $decoded;
                    } else {
                        error_log("JSON parse error: " . json_last_error_msg() . " for key '$key'");
                    }
                } else {
                    error_log("Could not extract value for key '$key' from line: " . trim($line));
                }
                break;
            }
        }
    } catch (\Exception $e) {
        error_log("Exception in extractJsonScalarByKey for key '$key': " . $e->getMessage());
        return null;
    } finally {
        fclose($handle);
    }

    if (!$found) {
        error_log("Key '$key' not found in file: $filePath");
    } elseif ($value === null) {
        error_log("Value for key '$key' is null or could not be extracted");
    }

    return $value;
}
/**
 * Legacy wrapper, kept for backward compatibility, that reads the "specific_pages" array
 * from a large JSON file without loading the entire file into memory
 *
 * Delegates to extractJsonArrayByKey() with the key fixed to "specific_pages".
 *
 * @param string $filePath Path to the JSON file
 * @param int $maxPages Maximum number of pages to extract (to prevent memory exhaustion)
 * @return array The extracted specific_pages array
 */
private function extractSpecificPagesFromJson(string $filePath, int $maxPages = 100): array
{
    $specificPages = $this->extractJsonArrayByKey($filePath, 'specific_pages', $maxPages);

    return $specificPages;
}
/**
 * Computes the median of a list of numbers
 *
 * The input is sorted on a local copy. An empty input yields 0; an even-sized input
 * yields the mean of the two central values.
 *
 * @param array $array Array of numbers
 * @return float The median value
 */
private function calculateMedian(array $array): float
{
    $total = count($array);

    // No values at all: the median is defined as zero here
    if ($total === 0) {
        return 0.0;
    }

    // sort() also reindexes, so positions below are plain 0-based offsets
    sort($array);
    $upperMiddle = intdiv($total, 2);

    // Odd size: a single central element exists
    if ($total % 2 !== 0) {
        return $array[$upperMiddle];
    }

    // Even size: average the two elements around the centre
    $lowerValue = $array[$upperMiddle - 1];
    $upperValue = $array[$upperMiddle];

    return ($lowerValue + $upperValue) / 2;
}
}