qualiwiki/src/Controller/WikiController.php

2370 lines
104 KiB
PHP
Raw Normal View History

2025-09-01 18:28:23 +02:00
<?php
namespace App\Controller;
use Symfony\Bundle\FrameworkBundle\Controller\AbstractController;
use Symfony\Component\HttpFoundation\Response;
use Symfony\Component\Routing\Annotation\Route;
class WikiController extends AbstractController
{
2025-09-03 17:18:21 +02:00
/**
 * Renders the decrepitude-score page from wiki_compare/outdated_pages.json.
 *
 * At most 100 entries are pulled from each of the "regular_pages" and
 * "specific_pages" arrays, together with the "last_updated" value. The
 * template is also told whether the JSON export and the staleness
 * histogram image are present on disk.
 */
#[Route('/wiki/decrepitude', name: 'app_admin_wiki_decrepitude')]
public function decrepitudeScores(): Response
{
    $jsonPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/outdated_pages.json';
    $histogramPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/staleness_histogram.png';

    // Defaults used when the JSON export is absent.
    $viewData = [
        'regular_pages' => [],
        'specific_pages' => [],
        'last_updated' => null,
        'histogram_exists' => file_exists($histogramPath),
    ];

    if (file_exists($jsonPath)) {
        // The export is large, so only a capped slice of each list is extracted.
        $entryCap = 100;

        $viewData['regular_pages'] = $this->extractJsonArrayByKey($jsonPath, 'regular_pages', $entryCap);
        $viewData['specific_pages'] = $this->extractJsonArrayByKey($jsonPath, 'specific_pages', $entryCap);
        $viewData['last_updated'] = $this->extractJsonScalarByKey($jsonPath, 'last_updated');
    }

    $viewData['json_exists'] = file_exists($jsonPath);

    return $this->render('admin/wiki_decrepitude.html.twig', $viewData);
}
2025-09-08 10:20:51 +02:00
/**
 * Displays the evolution of page rankings over time.
 *
 * Reads wiki_compare/page_rankings.json and hands its "timestamps", "pages"
 * and "global_metrics" entries to the template. The last element of
 * "timestamps" is used as the "last updated" value. A file that cannot be
 * read or does not contain valid JSON is logged and rendered as empty data.
 */
#[Route('/wiki/rankings', name: 'app_admin_wiki_rankings')]
public function pageRankings(): Response
{
    $rankingsFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/page_rankings.json';

    $timestamps = [];
    $pages = [];
    $globalMetrics = [];
    $lastUpdated = null;

    if (file_exists($rankingsFile)) {
        try {
            $rawJson = file_get_contents($rankingsFile);
            if ($rawJson === false) {
                throw new \RuntimeException('the file could not be read');
            }

            // Without JSON_THROW_ON_ERROR json_decode() never throws, so a
            // corrupt file would be ignored silently instead of being logged.
            $rankingsData = json_decode($rawJson, true, 512, JSON_THROW_ON_ERROR);

            if (is_array($rankingsData)) {
                $timestamps = $rankingsData['timestamps'] ?? [];
                $pages = $rankingsData['pages'] ?? [];
                $globalMetrics = $rankingsData['global_metrics'] ?? [];

                // end() requires an array; anything else is treated as "no timestamps".
                if (!is_array($timestamps)) {
                    $timestamps = [];
                }

                // Get the last timestamp as last_updated
                if (!empty($timestamps)) {
                    $lastUpdated = end($timestamps);
                }
            }
        } catch (\Exception $e) {
            // \JsonException and \RuntimeException both extend \Exception.
            error_log("Error loading rankings data: " . $e->getMessage());
        }
    }

    return $this->render('admin/wiki_rankings.html.twig', [
        'timestamps' => $timestamps,
        'pages' => $pages,
        'global_metrics' => $globalMetrics,
        'last_updated' => $lastUpdated,
        'json_exists' => file_exists($rankingsFile)
    ]);
}
2025-09-01 18:28:23 +02:00
/**
 * Finds headings that skip one or more levels relative to the heading
 * before them (for example an h4 that directly follows an h2).
 *
 * Entries without a 'level' key, or whose level casts to 0, are ignored
 * and do not affect the comparison. Moving back up to a shallower level
 * is never reported.
 *
 * @param array $sections List of sections with 'level' and 'title' keys
 * @return array Keys of $sections whose level is more than one deeper than the previous heading
 */
private function detectHeadingHierarchyErrors(array $sections): array
{
    $flagged = [];
    $previousLevel = 0;

    foreach ($sections as $position => $entry) {
        $level = (int) ($entry['level'] ?? 0);

        if ($level === 0) {
            continue;
        }

        // The very first usable heading has nothing to be compared against.
        if ($previousLevel !== 0 && $level > $previousLevel + 1) {
            $flagged[] = $position;
        }

        $previousLevel = $level;
    }

    return $flagged;
}
/**
 * Builds a two-column (English / French) list of sections in which every
 * row has a real section on one side and an empty placeholder on the other.
 *
 * Rows for the 'en_only' sections come first, followed by rows for the
 * 'fr_only' sections. Sections under the 'common' key are currently not
 * added to the result (that step is disabled).
 *
 * @param array $sectionComparison Section comparison data with 'common', 'en_only', and 'fr_only' keys
 * @return array List of rows, each with an 'en' and an 'fr' cell; placeholder cells carry 'is_placeholder' => true
 */
private function buildAlignedSectionList(array $sectionComparison): array
{
    $rows = [];

    // English-only sections: real English cell, blank French cell at the same level.
    $englishOnly = $sectionComparison['en_only'] ?? null;
    if (is_array($englishOnly)) {
        foreach ($englishOnly as $entry) {
            $realCell = ['title' => $entry['title'], 'level' => $entry['level']];
            $blankCell = ['title' => '', 'level' => $entry['level'], 'is_placeholder' => true];

            $rows[] = ['en' => $realCell, 'fr' => $blankCell];
        }
    }

    // French-only sections: blank English cell, real French cell. These end up last.
    $frenchOnly = $sectionComparison['fr_only'] ?? null;
    if (is_array($frenchOnly)) {
        foreach ($frenchOnly as $entry) {
            $blankCell = ['title' => '', 'level' => $entry['level'], 'is_placeholder' => true];
            $realCell = ['title' => $entry['title'], 'level' => $entry['level']];

            $rows[] = ['en' => $blankCell, 'fr' => $realCell];
        }
    }

    return $rows;
}
/**
 * Site root: redirects to the 'app_admin_wiki' route.
 */
#[Route('/', name: 'app_public_index')]
public function accueilAction(): Response
{
    $targetRoute = 'app_admin_wiki';

    return $this->redirectToRoute($targetRoute);
}
/**
 * Lists the recent wiki changes together with per-contributor statistics.
 *
 * Data is read from wiki_compare/recent_changes.json. When that file does
 * not exist, the fetch script is run once; if it produced the file, the
 * new data is loaded and shown in the same request, otherwise an error
 * flash message is added and the page is rendered empty.
 */
#[Route('/wiki/recent-changes', name: 'app_admin_wiki_recent_changes')]
public function recentChanges(): Response
{
    $recentChangesFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/recent_changes.json';

    // Initialize arrays
    $recentChanges = [];
    $lastUpdated = null;
    $teamMembers = [];

    $fileAvailable = file_exists($recentChangesFile);
    if (!$fileAvailable) {
        // If the file doesn't exist, try to create it by running the script
        $this->refreshRecentChangesData();
        $fileAvailable = file_exists($recentChangesFile);

        if (!$fileAvailable) {
            $this->addFlash('error', 'Impossible de générer le fichier des changements récents.');
        }
    }

    if ($fileAvailable) {
        $rawJson = file_get_contents($recentChangesFile);
        $recentChangesData = $rawJson === false ? null : json_decode($rawJson, true);

        if (is_array($recentChangesData) && is_array($recentChangesData['recent_changes'] ?? null)) {
            $recentChanges = $recentChangesData['recent_changes'];
            $lastUpdated = $recentChangesData['last_updated'] ?? null;

            // Process team members statistics
            $teamMembers = $this->processTeamMembersStats($recentChanges);
        }

        // No automatic refresh of stale data happens here: that step was
        // disabled in this action, so the timestamp is only displayed and
        // is no longer parsed (parsing a malformed value used to throw).
    }

    return $this->render('admin/wiki_recent_changes.html.twig', [
        'recent_changes' => $recentChanges,
        'last_updated' => $lastUpdated,
        'team_members' => $teamMembers
    ]);
}
/**
 * Aggregates recent changes into per-user contribution statistics.
 *
 * Each change is expected to carry a 'user' and a 'change_size' entry.
 * 'change_size' may be a number, an ASCII-signed numeric string ("+123",
 * "-45") or a string that starts with the Unicode minus sign U+2212
 * followed by digits. Positive sizes are added to 'chars_added', negative
 * sizes to 'chars_deleted' (as a positive amount). Changes without a
 * usable 'user' are skipped; an unrecognised size still counts as a
 * contribution but adds no characters.
 *
 * @param array $recentChanges Recent changes data
 * @return array List of user rows ('username', 'contributions', 'chars_added',
 *               'chars_changed', 'chars_deleted', 'user_url'), sorted by
 *               contribution count, highest first
 */
private function processTeamMembersStats(array $recentChanges): array
{
    $teamMembers = [];

    foreach ($recentChanges as $change) {
        $user = $change['user'] ?? null;
        if ((!is_string($user) && !is_int($user)) || $user === '') {
            continue;
        }
        $user = (string) $user;

        // Initialize user data if not exists
        if (!isset($teamMembers[$user])) {
            $teamMembers[$user] = [
                'username' => $user,
                'contributions' => 0,
                'chars_added' => 0,
                'chars_changed' => 0,
                'chars_deleted' => 0,
                'user_url' => "https://wiki.openstreetmap.org/wiki/User:" . urlencode($user)
            ];
        }

        $teamMembers[$user]['contributions']++;

        $changeSize = $change['change_size'] ?? null;
        $delta = 0;

        if (is_numeric($changeSize)) {
            // is_numeric() already accepts "+123", "-45" and plain digits,
            // so no separate pattern is needed for those spellings.
            $delta = (int) $changeSize;
        } elseif (
            is_string($changeSize)
            && preg_match('/^\x{2212}(\d+)$/u', $changeSize, $matches) === 1
        ) {
            // U+2212 is the typographic minus sign, which is not numeric to PHP.
            $delta = -(int) $matches[1];
        }

        if ($delta > 0) {
            $teamMembers[$user]['chars_added'] += $delta;
        } elseif ($delta < 0) {
            $teamMembers[$user]['chars_deleted'] += -$delta;
        }
    }

    // Convert to indexed array and sort by contributions count (descending)
    $teamMembers = array_values($teamMembers);
    usort(
        $teamMembers,
        static fn (array $a, array $b): int => $b['contributions'] <=> $a['contributions']
    );

    return $teamMembers;
}
/**
 * Refreshes the recent changes data by running
 * wiki_compare/fetch_recent_changes.py with the --force flag.
 *
 * Problems are reported through flash messages: 'warning' (with the
 * script output) on a non-zero exit code, 'error' when the script file is
 * missing or an exception is caught.
 */
private function refreshRecentChangesData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_recent_changes.py';

        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_recent_changes.py n\'existe pas.');

            return;
        }

        $output = [];
        $returnCode = 0;
        // Quote the path so that a project directory containing spaces or
        // shell metacharacters cannot split or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les changements récents. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Lists pages that have no French translation, de-duplicated by title and
 * sorted case-insensitively by title.
 *
 * Data is read from wiki_compare/untranslated_french_pages.json. The
 * refresh script is run at most once per request, when the file is missing
 * or its 'last_updated' value is at least one hour away from now (or
 * cannot be parsed). After a refresh the file is re-read in place instead
 * of redirecting, so a failing script cannot cause a redirect loop.
 */
#[Route('/wiki/missing-translations', name: 'app_admin_wiki_missing_translations')]
public function missingTranslations(): Response
{
    $untranslatedFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/untranslated_french_pages.json';

    // Returns [pages, lastUpdated]; empty defaults when the file is missing or unusable.
    $loadExport = static function () use ($untranslatedFile): array {
        if (!file_exists($untranslatedFile)) {
            return [[], null];
        }

        $rawJson = file_get_contents($untranslatedFile);
        $data = $rawJson === false ? null : json_decode($rawJson, true);

        if (!is_array($data) || !is_array($data['untranslated_pages'] ?? null)) {
            return [[], null];
        }

        return [$data['untranslated_pages'], $data['last_updated'] ?? null];
    };

    $fileExisted = file_exists($untranslatedFile);
    [$untranslatedPages, $lastUpdated] = $loadExport();

    $needsRefresh = !$fileExisted;
    if ($fileExisted && is_string($lastUpdated) && $lastUpdated !== '') {
        try {
            // Same rule as before: data at least one hour away from "now" is stale.
            $ageInSeconds = abs(time() - (new \DateTimeImmutable($lastUpdated))->getTimestamp());
            $needsRefresh = $ageInSeconds >= 3600;
        } catch (\Exception $e) {
            // An unparseable timestamp is treated as stale rather than fatal.
            $needsRefresh = true;
        }
    }

    if ($needsRefresh) {
        $this->refreshUntranslatedPagesData();

        if (file_exists($untranslatedFile)) {
            [$untranslatedPages, $lastUpdated] = $loadExport();
        } else {
            $this->addFlash('error', 'Impossible de générer le fichier des pages sans traduction.');
        }
    }

    // Remove duplicates based on page title (first occurrence wins);
    // entries without a string title cannot be compared and are skipped.
    $uniquePages = [];
    foreach ($untranslatedPages as $page) {
        $title = is_array($page) ? ($page['title'] ?? null) : null;
        if (!is_string($title) || isset($uniquePages[$title])) {
            continue;
        }
        $uniquePages[$title] = $page;
    }
    $uniquePages = array_values($uniquePages);

    // Sort pages by title
    usort(
        $uniquePages,
        static fn (array $a, array $b): int => strcasecmp($a['title'], $b['title'])
    );

    return $this->render('admin/wiki_missing_translations.html.twig', [
        'untranslated_pages' => $uniquePages,
        'last_updated' => $lastUpdated
    ]);
}
/**
 * Refreshes the untranslated pages data by running
 * wiki_compare/find_untranslated_french_pages.py with the --force flag.
 *
 * Problems are reported through flash messages: 'warning' (with the
 * script output) on a non-zero exit code, 'error' when the script file is
 * missing or an exception is caught.
 */
private function refreshUntranslatedPagesData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/find_untranslated_french_pages.py';

        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script find_untranslated_french_pages.py n\'existe pas.');

            return;
        }

        $output = [];
        $returnCode = 0;
        // Quote the path so that a project directory containing spaces or
        // shell metacharacters cannot split or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les pages sans traduction. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Lists wiki pages that are not available in French, grouped by language
 * with the 'En' group first and the remaining groups sorted by key.
 *
 * Data is read from wiki_compare/pages_unavailable_in_french.json. The
 * refresh script is run at most once per request, when the file is missing
 * or its 'last_updated' value is at least one hour away from now (or
 * cannot be parsed). After a refresh the file is re-read in place instead
 * of redirecting, so a failing script cannot cause a redirect loop.
 */
#[Route('/wiki/pages-unavailable-in-french', name: 'app_admin_wiki_pages_unavailable_in_french')]
public function pagesUnavailableInFrench(): Response
{
    $unavailablePagesFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/pages_unavailable_in_french.json';

    // Returns [groupedPages, allPages, lastUpdated]; empty defaults when the file is missing or unusable.
    $loadExport = static function () use ($unavailablePagesFile): array {
        if (!file_exists($unavailablePagesFile)) {
            return [[], [], null];
        }

        $rawJson = file_get_contents($unavailablePagesFile);
        $data = $rawJson === false ? null : json_decode($rawJson, true);

        if (!is_array($data)) {
            return [[], [], null];
        }

        return [
            is_array($data['grouped_pages'] ?? null) ? $data['grouped_pages'] : [],
            is_array($data['all_pages'] ?? null) ? $data['all_pages'] : [],
            $data['last_updated'] ?? null,
        ];
    };

    $fileExisted = file_exists($unavailablePagesFile);
    [$groupedPages, $allPages, $lastUpdated] = $loadExport();

    $needsRefresh = !$fileExisted;
    if ($fileExisted && is_string($lastUpdated) && $lastUpdated !== '') {
        try {
            // Same rule as before: data at least one hour away from "now" is stale.
            $ageInSeconds = abs(time() - (new \DateTimeImmutable($lastUpdated))->getTimestamp());
            $needsRefresh = $ageInSeconds >= 3600;
        } catch (\Exception $e) {
            // An unparseable timestamp is treated as stale rather than fatal.
            $needsRefresh = true;
        }
    }

    if ($needsRefresh) {
        $this->refreshPagesUnavailableInFrenchData();

        if (file_exists($unavailablePagesFile)) {
            [$groupedPages, $allPages, $lastUpdated] = $loadExport();
        } else {
            $this->addFlash('error', 'Impossible de générer le fichier des pages non disponibles en français.');
        }
    }

    // Move English pages to the top of the list
    $englishPages = $groupedPages['En'] ?? [];
    unset($groupedPages['En']);

    // Sort other language groups alphabetically
    ksort($groupedPages);

    // Reinsert English pages at the beginning
    if (!empty($englishPages)) {
        $groupedPages = ['En' => $englishPages] + $groupedPages;
    }

    return $this->render('admin/wiki_pages_unavailable_in_french.html.twig', [
        'grouped_pages' => $groupedPages,
        'all_pages' => $allPages,
        'last_updated' => $lastUpdated
    ]);
}
/**
 * Refreshes the "pages unavailable in French" data by running
 * wiki_compare/find_pages_unavailable_in_french.py with the --force flag.
 *
 * Problems are reported through flash messages: 'warning' (with the
 * script output) on a non-zero exit code, 'error' when the script file is
 * missing or an exception is caught.
 */
private function refreshPagesUnavailableInFrenchData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/find_pages_unavailable_in_french.py';

        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script find_pages_unavailable_in_french.py n\'existe pas.');

            return;
        }

        $output = [];
        $returnCode = 0;
        // Quote the path so that a project directory containing spaces or
        // shell metacharacters cannot split or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les pages non disponibles en français. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Shows the OSM-FR working groups (grouped by category, categories sorted
 * by name) and local groups.
 *
 * Data is read from wiki_compare/osm_fr_groups.json. When that file does
 * not exist, the fetch script is run once; if it produced the file, the
 * new data is loaded and shown in the same request, otherwise an error
 * flash message is added. Working groups without a 'category' are placed
 * under 'Autres'.
 */
#[Route('/wiki/osm-fr-groups', name: 'app_admin_wiki_osm_fr_groups')]
public function osmFrGroups(): Response
{
    $groupsFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/osm_fr_groups.json';

    // Initialize arrays
    $workingGroups = [];
    $localGroups = [];
    $umapUrl = 'https://umap.openstreetmap.fr/fr/map/groupes-locaux-openstreetmap_152488';
    $lastUpdated = null;

    $fileAvailable = file_exists($groupsFile);
    if (!$fileAvailable) {
        // If the file doesn't exist, try to create it by running the script
        $this->refreshOsmFrGroupsData();
        $fileAvailable = file_exists($groupsFile);

        if (!$fileAvailable) {
            $this->addFlash('error', 'Impossible de générer le fichier des groupes OSM-FR.');
        }
    }

    if ($fileAvailable) {
        $rawJson = file_get_contents($groupsFile);
        $groupsData = $rawJson === false ? null : json_decode($rawJson, true);

        if (is_array($groupsData)) {
            if (is_array($groupsData['working_groups'] ?? null)) {
                $workingGroups = $groupsData['working_groups'];
            }
            if (is_array($groupsData['local_groups'] ?? null)) {
                $localGroups = $groupsData['local_groups'];
            }
            // Keep the built-in map URL unless the export provides its own.
            $umapUrl = $groupsData['umap_url'] ?? $umapUrl;
            $lastUpdated = $groupsData['last_updated'] ?? null;
        }

        // No automatic refresh of stale data happens here: that step was
        // disabled in this action, so the timestamp is only displayed and
        // is no longer parsed (parsing a malformed value used to throw).
    }

    // Group working groups by category
    $groupedWorkingGroups = [];
    foreach ($workingGroups as $group) {
        $category = is_array($group) ? ($group['category'] ?? 'Autres') : 'Autres';
        $groupedWorkingGroups[$category][] = $group;
    }

    // Sort categories alphabetically
    ksort($groupedWorkingGroups);

    return $this->render('admin/wiki_osm_fr_groups.html.twig', [
        'working_groups' => $groupedWorkingGroups,
        'local_groups' => $localGroups,
        'umap_url' => $umapUrl,
        'last_updated' => $lastUpdated
    ]);
}
/**
 * Refreshes the OSM-FR groups data by running
 * wiki_compare/fetch_osm_fr_groups.py with the --force flag.
 *
 * Problems are reported through flash messages: 'warning' (with the
 * script output) on a non-zero exit code, 'error' when the script file is
 * missing or an exception is caught.
 */
private function refreshOsmFrGroupsData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_osm_fr_groups.py';

        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_osm_fr_groups.py n\'existe pas.');

            return;
        }

        $output = [];
        $returnCode = 0;
        // Quote the path so that a project directory containing spaces or
        // shell metacharacters cannot split or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les groupes OSM-FR. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Shows two views of possibly suspicious content loss:
 *  - 'recent_deletions': the 'deletions' list from
 *    wiki_compare/suspicious_deletions.json (generated on demand by
 *    detect_suspicious_deletions.py when the file is missing);
 *  - 'suspicious_pages': pages from wiki_compare/outdated_pages.json whose
 *    French word count is more than 30% below the English one, sorted by
 *    that percentage, highest first.
 */
#[Route('/wiki/suspicious-deletions', name: 'app_admin_wiki_suspicious_deletions')]
public function suspiciousDeletions(): Response
{
    $projectDir = $this->getParameter('kernel.project_dir');
    $suspiciousDeletesFile = $projectDir . '/wiki_compare/suspicious_deletions.json';
    $wordDiffFile = $projectDir . '/wiki_compare/outdated_pages.json';

    // Initialize arrays
    $suspiciousPages = [];
    $wordDiffPages = [];
    $lastUpdated = null;

    // Decodes a JSON file to an array, or null when it is unreadable or not an array/object.
    $readJsonArray = static function (string $file): ?array {
        $rawJson = file_get_contents($file);
        $decoded = $rawJson === false ? null : json_decode($rawJson, true);

        return is_array($decoded) ? $decoded : null;
    };

    $canLoadDeletions = file_exists($suspiciousDeletesFile);
    if (!$canLoadDeletions) {
        // If the file doesn't exist, try to create it by running the script
        try {
            $scriptPath = $projectDir . '/wiki_compare/detect_suspicious_deletions.py';
            if (file_exists($scriptPath)) {
                $output = [];
                $returnCode = 0;
                // Quote the path so spaces or shell metacharacters in it cannot alter the command.
                exec('python3 ' . escapeshellarg($scriptPath) . ' 2>&1', $output, $returnCode);

                $canLoadDeletions = $returnCode === 0 && file_exists($suspiciousDeletesFile);
                if (!$canLoadDeletions) {
                    $this->addFlash('warning', 'Impossible de générer le fichier de suppressions suspectes. Erreur: ' . implode("\n", $output));
                }
            }
        } catch (\Exception $e) {
            $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
        }
    }

    if ($canLoadDeletions) {
        $suspiciousData = $readJsonArray($suspiciousDeletesFile);
        if (is_array($suspiciousData['deletions'] ?? null)) {
            $suspiciousPages = $suspiciousData['deletions'];
            $lastUpdated = $suspiciousData['last_updated'] ?? null;
        }
    }

    // Also load the word-diff based suspicious pages for comparison
    if (file_exists($wordDiffFile)) {
        $jsonData = $readJsonArray($wordDiffFile) ?? [];

        // Other actions of this controller read this same file through its
        // 'regular_pages' / 'specific_pages' keys, so that layout is accepted
        // in addition to a flat top-level list of pages.
        // NOTE(review): confirm which layout the generator currently writes.
        if (isset($jsonData['regular_pages']) || isset($jsonData['specific_pages'])) {
            $candidates = [];
            foreach (['regular_pages', 'specific_pages'] as $listKey) {
                if (is_array($jsonData[$listKey] ?? null)) {
                    $candidates = array_merge($candidates, array_values($jsonData[$listKey]));
                }
            }
        } else {
            $candidates = $jsonData;
        }

        foreach ($candidates as $page) {
            if (!is_array($page) || !isset($page['fr_page'], $page['en_page'])) {
                continue;
            }

            // Calculate deletion percentage
            $enWordCount = (int) ($page['en_page']['word_count'] ?? 0);
            $frWordCount = (int) ($page['fr_page']['word_count'] ?? 0);
            $wordDiff = $enWordCount - $frWordCount;

            // If English has more words and the difference is significant (>30%).
            // $wordDiff > 0 together with $frWordCount > 0 guarantees $enWordCount > 0.
            if ($wordDiff > 0 && $frWordCount > 0 && ($wordDiff / $enWordCount) > 0.3) {
                $page['deletion_percentage'] = round(($wordDiff / $enWordCount) * 100, 2);
                $wordDiffPages[] = $page;
            }
        }

        // Sort by deletion percentage (highest first)
        usort(
            $wordDiffPages,
            static fn (array $a, array $b): int => $b['deletion_percentage'] <=> $a['deletion_percentage']
        );
    }

    return $this->render('admin/wiki_suspicious_deletions.html.twig', [
        'suspicious_pages' => $wordDiffPages,
        'recent_deletions' => $suspiciousPages,
        'last_updated' => $lastUpdated
    ]);
}
/**
 * Lists tag proposals: those currently being voted on first, then the
 * recently modified ones, all converted to one common row format.
 *
 * Data is read from wiki_compare/proposals.json. When that file does not
 * exist, the fetch script is run once; if it produced the file, the new
 * data is loaded and shown in the same request, otherwise an error flash
 * message is added. Missing optional fields of a proposal are rendered as
 * empty strings (or the defaults below) instead of raising warnings.
 */
#[Route('/wiki/tag-proposals', name: 'app_admin_wiki_tag_proposals')]
public function tagProposals(): Response
{
    $proposalsFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/proposals.json';

    // Initialize arrays
    $votingProposals = [];
    $recentProposals = [];
    $lastUpdated = null;

    $fileAvailable = file_exists($proposalsFile);
    if (!$fileAvailable) {
        // If the file doesn't exist, try to create it by running the script
        $this->refreshProposalsData();
        $fileAvailable = file_exists($proposalsFile);

        if (!$fileAvailable) {
            $this->addFlash('error', 'Impossible de générer le fichier de propositions.');
        }
    }

    if ($fileAvailable) {
        $rawJson = file_get_contents($proposalsFile);
        $proposalsData = $rawJson === false ? null : json_decode($rawJson, true);

        if (is_array($proposalsData)) {
            if (is_array($proposalsData['voting_proposals'] ?? null)) {
                $votingProposals = $proposalsData['voting_proposals'];
            }
            if (is_array($proposalsData['recent_proposals'] ?? null)) {
                $recentProposals = $proposalsData['recent_proposals'];
            }
            $lastUpdated = $proposalsData['last_updated'] ?? null;
        }

        // No automatic refresh of stale data happens here: that step was
        // disabled in this action, so the timestamp is only displayed and
        // is no longer parsed (parsing a malformed value used to throw).
    }

    // Format the proposals for the template
    $formattedProposals = [];

    foreach ($votingProposals as $proposal) {
        if (!is_array($proposal)) {
            continue;
        }

        $formattedProposal = [
            'feature' => $proposal['title'] ?? '',
            'url' => $proposal['url'] ?? '',
            'description' => 'Proposition en cours de vote',
            'proposer' => $proposal['proposer'] ?? '',
            'status' => $proposal['status'] ?? 'Voting',
            'type' => 'voting'
        ];

        // Add voting information if available
        if (isset($proposal['votes'])) {
            $formattedProposal['votes'] = $proposal['votes'];
            $formattedProposal['total_votes'] = $proposal['total_votes'] ?? 0;
            $formattedProposal['approve_percentage'] = $proposal['approve_percentage'] ?? 0;
            $formattedProposal['oppose_percentage'] = $proposal['oppose_percentage'] ?? 0;
            $formattedProposal['abstain_percentage'] = $proposal['abstain_percentage'] ?? 0;
        }

        $formattedProposals[] = $formattedProposal;
    }

    foreach ($recentProposals as $proposal) {
        if (!is_array($proposal)) {
            continue;
        }

        $formattedProposals[] = [
            'feature' => $proposal['title'] ?? '',
            'url' => $proposal['url'] ?? '',
            'description' => 'Dernière modification: ' . ($proposal['last_modified'] ?? ''),
            'proposer' => $proposal['modified_by'] ?? '',
            'status' => 'Draft',
            'type' => 'recent'
        ];
    }

    return $this->render('admin/wiki_tag_proposals.html.twig', [
        'proposals' => $formattedProposals,
        'last_updated' => $lastUpdated
    ]);
}
/**
 * Refreshes the proposals data by running
 * wiki_compare/fetch_proposals.py with the --force flag.
 *
 * Problems are reported through flash messages: 'warning' (with the
 * script output) on a non-zero exit code, 'error' when the script file is
 * missing or an exception is caught.
 */
private function refreshProposalsData(): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_proposals.py';

        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_proposals.py n\'existe pas.');

            return;
        }

        $output = [];
        $returnCode = 0;
        // Quote the path so that a project directory containing spaces or
        // shell metacharacters cannot split or alter the command.
        exec('python3 ' . escapeshellarg($scriptPath) . ' --force 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les propositions. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Picks one page at random from wiki_compare/outdated_pages.json and
 * renders it as an improvement suggestion.
 *
 * Candidates are the first 100 entries of "regular_pages" followed by the
 * first 100 entries of "specific_pages". When the file is missing or no
 * candidate is found, an error flash message is added and the request is
 * redirected to the 'app_admin_wiki' route.
 */
#[Route('/wiki/random-suggestion', name: 'app_admin_wiki_random_suggestion')]
public function randomSuggestion(): Response
{
    $jsonFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/outdated_pages.json';

    if (!file_exists($jsonFile)) {
        $this->addFlash('error', 'Le fichier outdated_pages.json n\'existe pas.');

        return $this->redirectToRoute('app_admin_wiki');
    }

    // The export is large, so only a capped slice of each list is extracted.
    $perListCap = 100;
    $candidates = array_merge(
        $this->extractJsonArrayByKey($jsonFile, 'regular_pages', $perListCap),
        $this->extractJsonArrayByKey($jsonFile, 'specific_pages', $perListCap)
    );

    if ($candidates === []) {
        $this->addFlash('error', 'Aucune page à améliorer n\'a été trouvée.');

        return $this->redirectToRoute('app_admin_wiki');
    }

    $pickedKey = array_rand($candidates);

    return $this->render('admin/wiki_random_suggestion.html.twig', [
        'page' => $candidates[$pickedKey]
    ]);
}
2025-09-05 10:16:40 +02:00
/**
 * Shows the English wiki page for $key next to the French page (when one
 * can be fetched), plus links for creating the French version.
 *
 * The page HTML is obtained by writing a small Python helper next to
 * wiki_compare.py and running it three times: a cache check for the French
 * page, a fetch of the English page and a fetch of the French page.
 *
 * $key comes straight from the URL (route requirement '.+'), so every
 * value placed on the shell command line is passed through
 * escapeshellarg(). The helper script gets a unique file name per request
 * and is removed in a finally block.
 *
 * @param string $key Wiki page key taken from the route
 */
#[Route('/wiki/create-french/{key}', name: 'app_admin_wiki_create_french', requirements: ['key' => '.+'])]
public function createFrench(string $key): Response
{
    // Construct the URLs for the English page and the French page creation form
    $englishUrl = "https://wiki.openstreetmap.org/wiki/{$key}";
    $frenchEditUrl = "https://wiki.openstreetmap.org/w/index.php?title=FR:{$key}&action=edit";

    $wikiCompareDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $scriptPath = $wikiCompareDir . '/wiki_compare.py';

    $englishHtml = null;
    $frenchHtml = null;
    $frenchCacheExists = false;

    if (file_exists($scriptPath)) {
        // The helper imports from wiki_compare.py, so it is written into the
        // same directory. A random suffix keeps concurrent requests from
        // overwriting or deleting each other's copy.
        $tempScriptPath = $wikiCompareDir . '/temp_fetch_page_' . bin2hex(random_bytes(8)) . '.py';

        // Nowdoc: the Python source is taken literally, PHP interpolates nothing.
        $pythonCode = <<<'EOT'
        #!/usr/bin/env python3
        # -*- coding: utf-8 -*-

        import sys
        import json
        import hashlib
        from pathlib import Path
        from wiki_compare import fetch_wiki_page, HTML_CACHE_DIR

        # Get the key from command line arguments
        key = sys.argv[1]
        language = sys.argv[2]

        # Check if we're just checking cache existence
        check_cache_only = len(sys.argv) > 3 and sys.argv[3] == 'check_cache'

        if check_cache_only and language == 'fr':
            # For French pages, construct the URL to check cache
            if key.startswith('http'):
                url = key
            else:
                url = f"https://wiki.openstreetmap.org/wiki/FR:{key}"

            # Create cache key
            cache_key = hashlib.md5(url.encode()).hexdigest()
            cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"

            # Check if cache exists
            if cache_file.exists():
                print("CACHE_EXISTS")
            else:
                print("CACHE_MISSING")
        else:
            # Normal fetch operation
            page = fetch_wiki_page(key, language)

            # Output the HTML content
            if page and 'html_content' in page:
                print(page['html_content'])
            else:
                print("")
        EOT;

        // Runs the helper for $key with the given extra arguments and
        // returns its stdout, or null when nothing could be captured.
        $runHelper = static function (string ...$extraArgs) use ($wikiCompareDir, $tempScriptPath, $key): ?string {
            $command = 'cd ' . escapeshellarg($wikiCompareDir)
                . ' && python3 ' . escapeshellarg($tempScriptPath)
                . ' ' . escapeshellarg($key);

            foreach ($extraArgs as $argument) {
                $command .= ' ' . escapeshellarg($argument);
            }

            $stdout = shell_exec($command);

            return is_string($stdout) ? $stdout : null;
        };

        try {
            if (file_put_contents($tempScriptPath, $pythonCode) !== false) {
                chmod($tempScriptPath, 0755);

                // First check if French page exists in cache
                $cacheCheckResult = trim((string) $runHelper('fr', 'check_cache'));
                $frenchCacheExists = ($cacheCheckResult === "CACHE_EXISTS");

                // Fetch English page and keep only the main content
                // (extractMainContent strips headers, footers, etc.)
                $englishHtml = $runHelper('en');
                if ($englishHtml) {
                    $englishHtml = $this->extractMainContent($englishHtml);
                }

                // Fetch French page (might not exist, but we'll try)
                $frenchHtml = $runHelper('fr');
                if ($frenchHtml) {
                    $frenchHtml = $this->extractMainContent($frenchHtml);
                }
            }
        } finally {
            // Clean up the temporary script even when something above failed
            if (file_exists($tempScriptPath)) {
                unlink($tempScriptPath);
            }
        }
    }

    return $this->render('admin/wiki_create_french.html.twig', [
        'key' => $key,
        'english_url' => $englishUrl,
        'french_edit_url' => $frenchEditUrl,
        'english_html' => $englishHtml,
        'french_html' => $frenchHtml,
        'french_cache_exists' => $frenchCacheExists
    ]);
}
/**
 * Lists archived proposals with their statistics.
 *
 * Query parameters:
 *  - refresh: when present, the data is regenerated and the request is
 *    redirected back to this page without that parameter;
 *  - limit: optional positive integer forwarded to the fetch script and
 *    preserved across the redirect.
 *
 * Data is read from wiki_compare/archived_proposals.json (at most 100
 * entries per extracted list). The fetch script is also run when the file
 * is missing, or when its 'last_updated' value is at least one day away
 * from now (or cannot be parsed). In those cases the file is re-read in
 * place rather than redirecting, so a failing script cannot cause a
 * redirect loop.
 */
#[Route('/wiki/archived-proposals', name: 'app_admin_wiki_archived_proposals')]
public function archivedProposals(\Symfony\Component\HttpFoundation\Request $request): Response
{
    $jsonFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/archived_proposals.json';
    $forceRefresh = $request->query->has('refresh');

    // Only a positive limit is meaningful; anything else means "no limit".
    $limit = $request->query->get('limit') ? (int) $request->query->get('limit') : null;
    if ($limit !== null && $limit <= 0) {
        $limit = null;
    }

    // Initialize arrays
    $proposals = [];
    $statistics = [];
    $lastUpdated = null;

    // Check if we should force a refresh
    if ($forceRefresh) {
        $this->refreshArchivedProposalsData($limit);
        $this->addFlash('success', 'Les données des propositions archivées ont été rafraîchies.');

        // Redirect so that reloading the page does not trigger another
        // refresh; preserve the limit parameter if it was provided.
        return $this->redirectToRoute(
            'app_admin_wiki_archived_proposals',
            $limit !== null ? ['limit' => $limit] : []
        );
    }

    // Use memory-efficient extraction of only the necessary data.
    $loadExport = function () use ($jsonFile): array {
        $maxItems = 100; // Limit the number of items to prevent memory exhaustion

        return [
            $this->extractJsonArrayByKey($jsonFile, 'proposals', $maxItems),
            $this->extractJsonArrayByKey($jsonFile, 'statistics', $maxItems),
            $this->extractJsonScalarByKey($jsonFile, 'last_updated'),
        ];
    };

    $fileExisted = file_exists($jsonFile);

    if (!$fileExisted) {
        // If the file doesn't exist, try to create it by running the script
        $this->refreshArchivedProposalsData($limit);

        if (file_exists($jsonFile)) {
            $this->addFlash('success', 'Le fichier des propositions archivées a été généré avec succès.');
        } else {
            $this->addFlash('error', 'Impossible de générer le fichier des propositions archivées.');
        }
    }

    if (file_exists($jsonFile)) {
        [$proposals, $statistics, $lastUpdated] = $loadExport();

        // A file generated a moment ago is never checked for staleness.
        $isStale = false;
        if ($fileExisted && is_string($lastUpdated) && $lastUpdated !== '') {
            try {
                // "Older than one day" means 24 hours or more; the previous
                // test ($diff->days > 1) only fired after two full days.
                $ageInSeconds = abs(time() - (new \DateTimeImmutable($lastUpdated))->getTimestamp());
                $isStale = $ageInSeconds >= 86400;
            } catch (\Exception $e) {
                // An unparseable timestamp is treated as stale rather than fatal.
                $isStale = true;
            }
        }

        if ($isStale) {
            $this->refreshArchivedProposalsData($limit);
            $this->addFlash('info', 'Les données des propositions archivées ont été automatiquement mises à jour car elles dataient de plus d\'un jour.');

            if (file_exists($jsonFile)) {
                [$proposals, $statistics, $lastUpdated] = $loadExport();
            }
        }
    }

    return $this->render('admin/wiki_archived_proposals.html.twig', [
        'proposals' => $proposals,
        'statistics' => $statistics,
        'last_updated' => $lastUpdated,
        'limit' => $limit
    ]);
}
/**
 * Refreshes the archived proposals data by running the
 * wiki_compare/fetch_archived_proposals.py script with python3.
 *
 * Nothing is returned or thrown to the caller: a missing script, a non-zero
 * exit code or an exception is reported to the user through a flash message.
 *
 * @param int|null $limit Optional limit for the number of proposals to process;
 *                        forwarded to the script as "--limit N" when not null
 */
private function refreshArchivedProposalsData(?int $limit = null): void
{
    try {
        $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/fetch_archived_proposals.py';
        if (!file_exists($scriptPath)) {
            $this->addFlash('error', 'Le script fetch_archived_proposals.py n\'existe pas.');
            return;
        }

        // Quote the path: a project directory containing spaces or shell
        // metacharacters must not split or alter the command line.
        $command = 'python3 ' . escapeshellarg($scriptPath);

        // $limit is a PHP int, so it is safe to append without quoting.
        if ($limit !== null) {
            $command .= ' --limit ' . $limit;
        }

        // exec() appends to an existing $output array, so start from a clean one.
        $output = [];
        $returnCode = 0;
        exec($command . ' 2>&1', $output, $returnCode);

        if ($returnCode !== 0) {
            $this->addFlash('warning', 'Impossible de mettre à jour les propositions archivées. Erreur: ' . implode("\n", $output));
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }
}
/**
 * Renders the wiki overview dashboard.
 *
 * Reads wiki_compare/wiki_pages.csv (one row per key/language pair), derives
 * staleness statistics and English/French size statistics from it, and merges
 * in the optional JSON side files found in the same directory.
 *
 * @return Response The rendered admin/wiki.html.twig page, or a redirect to
 *                  app_admin_index when the CSV file does not exist
 */
#[Route('/wiki', name: 'app_admin_wiki')]
public function index(): Response
{
    $compareDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $csvFile = $compareDir . '/wiki_pages.csv';
    if (!file_exists($csvFile)) {
        $this->addFlash('error', 'Le fichier wiki_pages.csv n\'existe pas.');
        return $this->redirectToRoute('app_admin_index');
    }

    // Parse the CSV with the parser defaults spelled out explicitly.
    $rows = array_map(
        static fn (string $line): array => str_getcsv($line, ',', '"', '\\'),
        file($csvFile) ?: []
    );
    $headers = array_shift($rows) ?? [];
    $columnCount = count($headers);

    // Keep only rows that match the header width: array_combine() throws a
    // ValueError on PHP 8 for blank or truncated lines.
    $pages = [];
    foreach ($rows as $row) {
        if (count($row) === $columnCount) {
            $pages[] = array_combine($headers, $row);
        }
    }

    // Count, min, max, mean and population standard deviation of a list.
    $summarize = static function (array $values): array {
        $count = count($values);
        if ($count === 0) {
            return ['count' => 0, 'min' => 0, 'max' => 0, 'mean' => 0.0, 'std_dev' => 0.0];
        }
        $mean = array_sum($values) / $count;
        $squaredDeviations = 0.0;
        foreach ($values as $value) {
            $squaredDeviations += ($value - $mean) ** 2;
        }
        return [
            'count' => $count,
            'min' => min($values),
            'max' => max($values),
            'mean' => round($mean, 2),
            'std_dev' => round(sqrt($squaredDeviations / $count), 2),
        ];
    };
    // Integer value of a CSV column, 0 when the column is absent.
    $intOf = static fn (array $page, string $field): int => (int) ($page[$field] ?? 0);
    // Difference rendered with an explicit sign, e.g. "+3", "+0", "-2".
    $signed = static fn (int $diff): string => ($diff >= 0 ? '+' : '') . $diff;
    // Decoded content of a JSON file, or null when missing or unreadable.
    $readJson = static function (string $path): mixed {
        if (!file_exists($path)) {
            return null;
        }
        $raw = file_get_contents($path);
        return $raw === false ? null : json_decode($raw, true);
    };

    // Staleness statistics are computed on the raw (signed) scores.
    $stalenessScores = [];
    foreach ($pages as $page) {
        if (isset($page['staleness_score']) && is_numeric($page['staleness_score'])) {
            $stalenessScores[] = (float) $page['staleness_score'];
        }
    }
    $stalenessStats = $summarize($stalenessScores);

    // Index pages by key then language; the displayed score is the absolute
    // value rounded to 2 decimals, without normalization.
    $wikiPages = [];
    foreach ($pages as $page) {
        if (!isset($page['key'], $page['language'])) {
            continue;
        }
        if (isset($page['staleness_score']) && is_numeric($page['staleness_score'])) {
            $page['staleness_score'] = round(abs((float) $page['staleness_score']), 2);
        }
        $wikiPages[$page['key']][$page['language']] = $page;
    }

    // Pages that exist in English but have no French translation.
    $missingTranslations = [];
    foreach ($wikiPages as $key => $languages) {
        if (isset($languages['en']) && !isset($languages['fr'])) {
            $missingTranslations[$key] = $languages['en'];
        }
    }

    // Differences (French - English) and raw values for the size statistics,
    // for every key available in both languages.
    $pageDifferences = [];
    $stats = [
        'en_sections' => [],
        'fr_sections' => [],
        'en_words' => [],
        'fr_words' => [],
        'en_links' => [],
        'fr_links' => [],
        'en_media' => [],
        'fr_media' => [],
    ];
    foreach ($wikiPages as $key => $languages) {
        if (!isset($languages['en'], $languages['fr'])) {
            continue;
        }
        $en = $languages['en'];
        $fr = $languages['fr'];

        $sectionDiff = $intOf($fr, 'sections') - $intOf($en, 'sections');
        $wordDiff = $intOf($fr, 'word_count') - $intOf($en, 'word_count');
        $linkDiff = $intOf($fr, 'link_count') - $intOf($en, 'link_count');
        // The media difference is only meaningful when both sides carry a count.
        $mediaDiff = isset($fr['media_count'], $en['media_count'])
            ? (int) $fr['media_count'] - (int) $en['media_count']
            : 0;

        $pageDifferences[$key] = [
            'section_diff' => $sectionDiff,
            'section_diff_formatted' => $signed($sectionDiff),
            'word_diff' => $wordDiff,
            'word_diff_formatted' => $signed($wordDiff),
            'link_diff' => $linkDiff,
            'link_diff_formatted' => $signed($linkDiff),
            'media_diff' => $mediaDiff,
            'media_diff_formatted' => $signed($mediaDiff),
        ];

        $stats['en_sections'][] = $intOf($en, 'sections');
        $stats['fr_sections'][] = $intOf($fr, 'sections');
        $stats['en_words'][] = $intOf($en, 'word_count');
        $stats['fr_words'][] = $intOf($fr, 'word_count');
        $stats['en_links'][] = $intOf($en, 'link_count');
        $stats['fr_links'][] = $intOf($fr, 'link_count');
        $stats['en_media'][] = $intOf($en, 'media_count');
        $stats['fr_media'][] = $intOf($fr, 'media_count');
    }

    // Series without any value are left out of the statistics.
    $wikiPagesStats = [];
    foreach ($stats as $series => $values) {
        if ($values !== []) {
            $wikiPagesStats[$series] = $summarize($values);
        }
    }

    // Sort by staleness score (descending); keys lacking either language or
    // a score sort as 0.
    $scoreOf = static fn (array $languages): float =>
        isset($languages['en'], $languages['fr'], $languages['en']['staleness_score'])
            ? (float) $languages['en']['staleness_score']
            : 0.0;
    uasort($wikiPages, static fn (array $a, array $b): int => $scoreOf($b) <=> $scoreOf($a));

    // Pages unavailable in English, deduplicated by URL (first occurrence
    // wins); entries without a URL cannot be deduplicated and are all kept.
    $pagesUnavailableInEnglish = [];
    $unavailableData = $readJson($compareDir . '/pages_unavailable_in_english.json');
    if (is_array($unavailableData) && is_array($unavailableData['pages'] ?? null)) {
        $seenUrls = [];
        foreach ($unavailableData['pages'] as $page) {
            if (isset($page['url'])) {
                $url = $page['url'];
                if (isset($seenUrls[$url])) {
                    continue;
                }
                $seenUrls[$url] = true;
            }
            $pagesUnavailableInEnglish[] = $page;
        }
    }

    // Specific pages come from outdated_pages.json through the dedicated
    // extraction helper, capped to limit memory use.
    $specificPages = [];
    $outdatedPagesFile = $compareDir . '/outdated_pages.json';
    if (file_exists($outdatedPagesFile)) {
        $maxPages = 100;
        $specificPages = $this->extractSpecificPagesFromJson($outdatedPagesFile, $maxPages);
    }

    // Newly created French pages.
    $newlyCreatedPages = [];
    $newlyCreatedData = $readJson($compareDir . '/newly_created_french_pages.json');
    if (is_array($newlyCreatedData) && is_array($newlyCreatedData['created_pages'] ?? null)) {
        $newlyCreatedPages = $newlyCreatedData['created_pages'];
    }

    // Machine translations.
    $availableTranslations = [];
    $translationsData = $readJson($compareDir . '/translations.json');
    if (is_array($translationsData) && is_array($translationsData['translations'] ?? null)) {
        $availableTranslations = $translationsData['translations'];
    }

    // Keys without wiki pages: the whole decoded document is passed through.
    $keysWithoutWiki = [];
    $keysWithoutWikiData = $readJson($compareDir . '/keys_without_wiki.json');
    if (is_array($keysWithoutWikiData)) {
        $keysWithoutWiki = $keysWithoutWikiData;
    }

    return $this->render('admin/wiki.html.twig', [
        'wiki_pages' => $wikiPages,
        'missing_translations' => $missingTranslations,
        'page_differences' => $pageDifferences,
        'pages_unavailable_in_english' => $pagesUnavailableInEnglish,
        'specific_pages' => $specificPages,
        'newly_created_pages' => $newlyCreatedPages,
        'staleness_stats' => $stalenessStats,
        'wiki_pages_stats' => $wikiPagesStats,
        'available_translations' => $availableTranslations,
        'keys_without_wiki' => $keysWithoutWiki,
    ]);
}
/**
 * Runs wiki_compare/wiki_translate.py for the given key with the project's
 * virtualenv interpreter, then redirects back to the wiki dashboard.
 *
 * The outcome (success, non-zero exit code, missing script, exception) is
 * reported through flash messages.
 *
 * @param string $key Wiki key taken from the URL; the route accepts any
 *                    non-empty string, so it is treated as untrusted input
 * @return Response Redirect to app_admin_wiki
 */
#[Route('/wiki/translate/{key}', name: 'app_admin_wiki_translate', requirements: ['key' => '.+'])]
public function translate(string $key): Response
{
    $this->addFlash('info', 'Traduction en cours pour la page ' . $key);

    try {
        $projectDir = $this->getParameter('kernel.project_dir');
        $scriptPath = $projectDir . '/wiki_compare/wiki_translate.py';

        if (file_exists($scriptPath)) {
            $venvPython = $projectDir . '/venv/bin/python';

            // Every argument is quoted with escapeshellarg(): $key comes from
            // the URL, and inside plain double quotes the shell would still
            // expand $(...), backticks and embedded quotes.
            $command = 'cd ' . escapeshellarg($projectDir)
                . ' && ' . escapeshellarg($venvPython)
                . ' ' . escapeshellarg($scriptPath)
                . ' ' . escapeshellarg($key);

            $output = [];
            $returnVar = 0;
            exec($command, $output, $returnVar);

            if ($returnVar === 0) {
                $this->addFlash('success', 'Traduction réussie pour la page ' . $key);
            } else {
                $this->addFlash('warning', 'Problème lors de la traduction: ' . implode("\n", $output));
            }
        } else {
            $this->addFlash('error', 'Le script wiki_translate.py n\'existe pas.');
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }

    return $this->redirectToRoute('app_admin_wiki');
}
/**
 * Re-runs wiki_compare/wiki_translate.py for the given key with the project's
 * virtualenv interpreter, then redirects back to the wiki dashboard.
 *
 * The outcome (success, non-zero exit code, missing script, exception) is
 * reported through flash messages.
 *
 * @param string $key Wiki key taken from the URL; the route accepts any
 *                    non-empty string, so it is treated as untrusted input
 * @return Response Redirect to app_admin_wiki
 */
#[Route('/wiki/update-translation/{key}', name: 'app_admin_wiki_update_translation', requirements: ['key' => '.+'])]
public function updateTranslation(string $key): Response
{
    $this->addFlash('info', 'Mise à jour de la traduction en cours pour la page ' . $key);

    try {
        $projectDir = $this->getParameter('kernel.project_dir');
        $scriptPath = $projectDir . '/wiki_compare/wiki_translate.py';

        if (file_exists($scriptPath)) {
            $venvPython = $projectDir . '/venv/bin/python';

            // NOTE(review): the previous comment mentioned an "update flag",
            // but no such flag has ever been passed here; the script receives
            // only the key. Confirm whether wiki_translate.py needs one.
            //
            // Every argument is quoted with escapeshellarg(): $key comes from
            // the URL, and inside plain double quotes the shell would still
            // expand $(...), backticks and embedded quotes.
            $command = 'cd ' . escapeshellarg($projectDir)
                . ' && ' . escapeshellarg($venvPython)
                . ' ' . escapeshellarg($scriptPath)
                . ' ' . escapeshellarg($key);

            $output = [];
            $returnVar = 0;
            exec($command, $output, $returnVar);

            if ($returnVar === 0) {
                $this->addFlash('success', 'Mise à jour de la traduction réussie pour la page ' . $key);
            } else {
                $this->addFlash('warning', 'Problème lors de la mise à jour de la traduction: ' . implode("\n", $output));
            }
        } else {
            $this->addFlash('error', 'Le script wiki_translate.py n\'existe pas.');
        }
    } catch (\Exception $e) {
        $this->addFlash('error', 'Erreur lors de l\'exécution du script: ' . $e->getMessage());
    }

    return $this->redirectToRoute('app_admin_wiki');
}
2025-09-01 18:28:23 +02:00
/**
 * Renders the detailed English/French comparison page for one wiki key.
 *
 * The English and French rows come from wiki_compare/wiki_pages.csv. When
 * wiki_compare/outdated_pages.json exists, it additionally provides the score
 * history, previous/next navigation, the staleness histogram and the detailed
 * section/link/media comparison for the key.
 *
 * @param string $key Wiki key taken from the URL
 * @return Response The rendered admin/wiki_compare.html.twig page, or a
 *                  redirect when the CSV file or the English page is missing
 */
#[Route('/wiki/compare/{key}', name: 'app_admin_wiki_compare', requirements: ['key' => '.+'])]
public function compare(string $key): Response
{
    $compareDir = $this->getParameter('kernel.project_dir') . '/wiki_compare';
    $csvFile = $compareDir . '/wiki_pages.csv';
    $jsonFile = $compareDir . '/outdated_pages.json';

    if (!file_exists($csvFile)) {
        $this->addFlash('error', 'Le fichier wiki_pages.csv n\'existe pas.');
        return $this->redirectToRoute('app_admin_index');
    }

    // Normalizes optional JSON members that are expected to be lists.
    $listOf = static fn (mixed $value): array => is_array($value) ? $value : [];

    $rows = array_map(
        static fn (string $line): array => str_getcsv($line, ',', '"', '\\'),
        file($csvFile) ?: []
    );
    $headers = array_shift($rows) ?? [];
    $columnCount = count($headers);

    // Locate the English and French rows for the requested key. Rows whose
    // width differs from the header are skipped: array_combine() throws a
    // ValueError on PHP 8 for blank or truncated lines.
    $enPage = null;
    $frPage = null;
    foreach ($rows as $row) {
        if (count($row) !== $columnCount) {
            continue;
        }
        $page = array_combine($headers, $row);
        if (($page['key'] ?? null) !== $key) {
            continue;
        }
        $language = $page['language'] ?? null;
        if ($language === 'en') {
            $enPage = $page;
        } elseif ($language === 'fr') {
            $frPage = $page;
        }
    }

    if (!$enPage) {
        $this->addFlash('error', 'La page wiki pour la clé "' . $key . '" n\'existe pas.');
        return $this->redirectToRoute('app_admin_wiki');
    }

    $detailedComparison = null;
    $mediaDiff = 0;
    $historyData = null;
    $prevPage = null;
    $nextPage = null;
    $stalenessDistribution = null;

    if (file_exists($jsonFile)) {
        // Cap on the number of extracted items, to limit memory use.
        $maxItems = 100;

        // ---- Score history for this key --------------------------------
        // Metrics of the first page matching $key in a page list, or null.
        $metricsFor = static function (mixed $pages) use ($key): ?array {
            if (!is_array($pages)) {
                return null;
            }
            foreach ($pages as $page) {
                if (isset($page['key']) && $page['key'] === $key) {
                    return [
                        'staleness_score' => $page['staleness_score'] ?? 0,
                        'date_diff' => $page['date_diff'] ?? 0,
                        'word_diff' => $page['word_diff'] ?? 0,
                        'section_diff' => $page['section_diff'] ?? 0,
                        'link_diff' => $page['link_diff'] ?? 0,
                        'media_diff' => $page['media_diff'] ?? 0,
                    ];
                }
            }
            return null;
        };

        $historyData = [];
        $historyEntries = $this->extractJsonArrayByKey($jsonFile, 'history', $maxItems);
        foreach ($historyEntries as $timestamp => $entry) {
            // regular_pages takes precedence over specific_pages.
            $metrics = $metricsFor($entry['regular_pages'] ?? null)
                ?? $metricsFor($entry['specific_pages'] ?? null);
            if ($metrics === null) {
                continue;
            }

            // A malformed timestamp must not abort the whole page.
            $date = 'N/A';
            if (is_string($timestamp) && $timestamp !== '' && $timestamp !== '0') {
                try {
                    $date = (new \DateTimeImmutable($timestamp))->format('Y-m-d');
                } catch (\Exception) {
                    // Keep 'N/A'.
                }
            }

            $historyData[] = [
                'timestamp' => $timestamp,
                'date' => $date,
                'metrics' => $metrics,
            ];
        }

        usort(
            $historyData,
            static fn (array $a, array $b): int =>
                (int) strtotime((string) $a['timestamp']) <=> (int) strtotime((string) $b['timestamp'])
        );

        // ---- Ranking, navigation and histogram ---------------------------
        $allPages = array_merge(
            $this->extractJsonArrayByKey($jsonFile, 'regular_pages', $maxItems),
            $this->extractJsonArrayByKey($jsonFile, 'specific_pages', $maxItems)
        );

        // Most stale first; pages without a score sort as 0.
        usort(
            $allPages,
            static fn ($a, $b): int => ($b['staleness_score'] ?? 0) <=> ($a['staleness_score'] ?? 0)
        );

        $currentIndex = -1;
        foreach ($allPages as $index => $page) {
            if (isset($page['key']) && $page['key'] === $key) {
                $currentIndex = $index;
                break;
            }
        }

        if ($currentIndex > 0) {
            $prevPage = $allPages[$currentIndex - 1];
        }
        if ($currentIndex >= 0 && $currentIndex < count($allPages) - 1) {
            $nextPage = $allPages[$currentIndex + 1];
        }

        $stalenessScores = [];
        foreach ($allPages as $page) {
            if (isset($page['staleness_score']) && is_numeric($page['staleness_score'])) {
                $stalenessScores[] = $page['staleness_score'];
            }
        }

        if ($stalenessScores !== []) {
            $min = min($stalenessScores);
            $max = max($stalenessScores);
            $avg = array_sum($stalenessScores) / count($stalenessScores);
            $median = $this->calculateMedian($stalenessScores);

            // Fixed number of equal-width bins over [min, max].
            $binCount = 10;
            $binSize = ($max - $min) / $binCount;

            // When every score is identical the bin width is 0: everything
            // goes in the first bin instead of dividing by zero.
            $binOf = static function ($score) use ($min, $binSize, $binCount): int {
                if ($binSize <= 0) {
                    return 0;
                }
                return min($binCount - 1, (int) floor(($score - $min) / $binSize));
            };

            $bins = [];
            $binLabels = [];
            for ($i = 0; $i < $binCount; $i++) {
                $bins[$i] = 0;
                $binStart = $min + ($i * $binSize);
                $binLabels[$i] = round($binStart, 1) . ' - ' . round($binStart + $binSize, 1);
            }
            foreach ($stalenessScores as $score) {
                $bins[$binOf($score)]++;
            }

            // Score of the current page; 0 when it is not in the extracted list.
            $currentPageScore = 0;
            foreach ($allPages as $page) {
                if (isset($page['key'], $page['staleness_score'])
                    && $page['key'] === $key
                    && is_numeric($page['staleness_score'])
                ) {
                    $currentPageScore = $page['staleness_score'];
                    break;
                }
            }

            $stalenessDistribution = [
                'scores' => $stalenessScores,
                'min' => $min,
                'max' => $max,
                'avg' => $avg,
                'median' => $median,
                'bins' => $bins,
                'binLabels' => $binLabels,
                'currentPageScore' => $currentPageScore,
                'currentPageBin' => $binOf($currentPageScore),
                'totalPages' => count($stalenessScores),
            ];
        }

        // ---- Detailed comparison for the current page --------------------
        if ($currentIndex >= 0) {
            $page = $allPages[$currentIndex];

            // Media: drop duplicates and images that also appear on the other side.
            $mediaComparison = $page['media_comparison'] ?? null;
            if (is_array($mediaComparison) && $mediaComparison !== []) {
                $enOnlyImages = $listOf($mediaComparison['en_only'] ?? null);
                $frOnlyImages = $listOf($mediaComparison['fr_only'] ?? null);
                $commonImages = $listOf($mediaComparison['common'] ?? null);

                // Set (url => true) of every image URL used on one language side.
                $urlSet = static function (array $only, array $common, string $lang): array {
                    $set = [];
                    foreach ($only as $media) {
                        if (isset($media['src']) && is_string($media['src'])) {
                            $set[$media['src']] = true;
                        }
                    }
                    foreach ($common as $pair) {
                        if (isset($pair[$lang]['src']) && is_string($pair[$lang]['src'])) {
                            $set[$pair[$lang]['src']] = true;
                        }
                    }
                    return $set;
                };
                // Images kept once per URL, excluding URLs present on the other
                // side; images without a usable URL cannot be compared and are kept.
                $uniqueOnly = static function (array $images, array $otherSide): array {
                    $seen = [];
                    $unique = [];
                    foreach ($images as $media) {
                        $src = $media['src'] ?? null;
                        if (is_string($src)) {
                            if (isset($seen[$src]) || isset($otherSide[$src])) {
                                continue;
                            }
                            $seen[$src] = true;
                        }
                        $unique[] = $media;
                    }
                    return $unique;
                };

                $mediaComparison['en_only'] = $uniqueOnly($enOnlyImages, $urlSet($frOnlyImages, $commonImages, 'fr'));
                $mediaComparison['fr_only'] = $uniqueOnly($frOnlyImages, $urlSet($enOnlyImages, $commonImages, 'en'));
                // The counts deliberately keep the size of the lists before deduplication.
                $mediaComparison['en_only_count'] = count($enOnlyImages);
                $mediaComparison['fr_only_count'] = count($frOnlyImages);
            }

            // Links: sorted alphabetically by URL (English URL for common links).
            $linkComparison = $page['link_comparison'] ?? null;
            if ($linkComparison) {
                foreach (['en_only', 'fr_only'] as $side) {
                    if (isset($linkComparison[$side]) && is_array($linkComparison[$side])) {
                        usort(
                            $linkComparison[$side],
                            static fn ($a, $b): int => strcmp((string) ($a['href'] ?? ''), (string) ($b['href'] ?? ''))
                        );
                    }
                }
                if (isset($linkComparison['common']) && is_array($linkComparison['common'])) {
                    usort(
                        $linkComparison['common'],
                        static fn ($a, $b): int =>
                            strcmp((string) ($a['en']['href'] ?? ''), (string) ($b['en']['href'] ?? ''))
                    );
                }
            }

            // Sections: table-of-contents and navigation headings are not content.
            $excludedSections = [
                'Contents', 'Sommaire',
                'Personal tools', 'Namespaces', 'Views', 'Search', 'Site', 'Tools', 'In other projects',
            ];
            $isExcluded = static fn ($title): bool =>
                is_string($title) && in_array($title, $excludedSections, true);
            $isExcludedPair = static fn ($section): bool =>
                $isExcluded($section['en']['title'] ?? null) || $isExcluded($section['fr']['title'] ?? null);
            $isExcludedSingle = static fn ($section): bool => $isExcluded($section['title'] ?? null);

            $sectionComparison = $page['section_comparison'] ?? null;
            $removed = ['common' => 0, 'en_only' => 0, 'fr_only' => 0];
            if ($sectionComparison) {
                $filters = [
                    'common' => $isExcludedPair,
                    'en_only' => $isExcludedSingle,
                    'fr_only' => $isExcludedSingle,
                ];
                foreach ($filters as $group => $excluded) {
                    if (!isset($sectionComparison[$group]) || !is_array($sectionComparison[$group])) {
                        continue;
                    }
                    $kept = array_values(array_filter(
                        $sectionComparison[$group],
                        static fn ($section): bool => !$excluded($section)
                    ));
                    $removed[$group] = count($sectionComparison[$group]) - count($kept);
                    $sectionComparison[$group] = $kept;
                }
            }

            // Adjusted counts: a removed common section counts for both
            // languages, a removed single-language section only for its own.
            $enSectionCount = max(
                0,
                (int) ($enPage['sections'] ?? 0) - $removed['common'] - $removed['en_only']
            );
            // Without a French page there is no French count to adjust.
            $frSectionCount = $frPage
                ? max(0, (int) ($frPage['sections'] ?? 0) - $removed['common'] - $removed['fr_only'])
                : null;

            // Heading hierarchy errors per language: single-language sections
            // first, then that language's side of the common sections.
            $hierarchyErrors = ['en' => [], 'fr' => []];
            foreach (['en', 'fr'] as $lang) {
                $onlyKey = $lang . '_only';
                if (isset($sectionComparison[$onlyKey]) && is_array($sectionComparison[$onlyKey])) {
                    $hierarchyErrors[$lang] = $this->detectHeadingHierarchyErrors($sectionComparison[$onlyKey]);
                }
                if (isset($sectionComparison['common']) && is_array($sectionComparison['common'])) {
                    $commonSide = array_map(
                        static fn ($section) => $section[$lang],
                        $sectionComparison['common']
                    );
                    $hierarchyErrors[$lang] = array_merge(
                        $hierarchyErrors[$lang],
                        $this->detectHeadingHierarchyErrors($commonSide)
                    );
                }
            }

            $detailedComparison = [
                'section_comparison' => $sectionComparison,
                'aligned_sections' => $this->buildAlignedSectionList($sectionComparison),
                'link_comparison' => $linkComparison,
                'media_comparison' => $mediaComparison,
                'category_comparison' => $page['category_comparison'] ?? null,
                'grammar_suggestions' => $page['grammar_suggestions'] ?? null,
                'adjusted_en_section_count' => $enSectionCount,
                'adjusted_fr_section_count' => $frSectionCount,
                'en_hierarchy_errors' => $hierarchyErrors['en'],
                'fr_hierarchy_errors' => $hierarchyErrors['fr'],
            ];
            $mediaDiff = $page['media_diff'] ?? 0;
        }
    }

    // ---- Staleness score components (only when a French page exists) ------
    $scoreComponents = [];
    if ($frPage) {
        // Whole days between the two last-modified dates (English - French).
        // '!' resets the time fields and UTC avoids DST shifts, so the result
        // is an exact day count.
        $dateDiff = 0;
        $enModified = $enPage['last_modified'] ?? '';
        $frModified = $frPage['last_modified'] ?? '';
        if ($enModified && $frModified) {
            $utc = new \DateTimeZone('UTC');
            $enDate = \DateTimeImmutable::createFromFormat('!Y-m-d', $enModified, $utc);
            $frDate = \DateTimeImmutable::createFromFormat('!Y-m-d', $frModified, $utc);
            if ($enDate && $frDate) {
                $dateDiff = intdiv($enDate->getTimestamp() - $frDate->getTimestamp(), 60 * 60 * 24);
            }
        }

        // CSV values are strings; cast them so an empty column counts as 0
        // instead of raising a TypeError in the subtraction.
        $wordDiff = (int) ($enPage['word_count'] ?? 0) - (int) ($frPage['word_count'] ?? 0);
        $sectionDiff = (int) ($enPage['sections'] ?? 0) - (int) ($frPage['sections'] ?? 0);
        $linkDiff = (int) ($enPage['link_count'] ?? 0) - (int) ($frPage['link_count'] ?? 0);

        $scoreComponents = [
            'date' => [
                'value' => $dateDiff,
                'weight' => 0.2,
                'component' => abs($dateDiff) * 0.2,
                'description' => 'Différence de date (en jours)',
            ],
            'word' => [
                'value' => $wordDiff,
                'weight' => 0.5,
                'component' => (abs($wordDiff) / 100) * 0.5,
                'description' => 'Différence de nombre de mots',
            ],
            'section' => [
                'value' => $sectionDiff,
                'weight' => 0.15,
                'component' => abs($sectionDiff) * 0.15,
                'description' => 'Différence de nombre de sections',
            ],
            'link' => [
                'value' => $linkDiff,
                'weight' => 0.15,
                'component' => (abs($linkDiff) / 10) * 0.15,
                'description' => 'Différence de nombre de liens',
            ],
        ];

        // Media component, when both pages carry a media count.
        if (isset($enPage['media_count']) && isset($frPage['media_count'])) {
            $scoreComponents['media'] = [
                'value' => $mediaDiff,
                'weight' => 0.1,
                'component' => (abs($mediaDiff) / 5) * 0.1,
                'description' => 'Différence de nombre d\'images',
            ];
            // Displayed weights are rebalanced so that they total 1.0.
            // NOTE(review): the 'component' values of word and link above are
            // still computed with the previous weights (0.5 and 0.15), so they
            // no longer match the weights shown here. Confirm which set the
            // scoring script actually uses before aligning them.
            $scoreComponents['date']['weight'] = 0.2;
            $scoreComponents['word']['weight'] = 0.45;
            $scoreComponents['section']['weight'] = 0.15;
            $scoreComponents['link']['weight'] = 0.1;
        }
    }

    // Link to create the French page when it does not exist yet.
    $createFrUrl = null;
    if (!$frPage) {
        $createFrUrl = 'https://wiki.openstreetmap.org/wiki/FR:' . $key;
    }

    // ---- Plain-text section lists for the copy buttons ---------------------
    $enSections = '';
    $frSections = '';
    if ($detailedComparison && $detailedComparison['section_comparison']) {
        $sections = $detailedComparison['section_comparison'];
        // Wiki-style heading: "== Title ==" with one '=' per heading level.
        $heading = static function ($section, string $suffix = ''): string {
            $marker = str_repeat('=', max(0, (int) ($section['level'] ?? 0)));
            return $marker . ' ' . ($section['title'] ?? '') . ' ' . $marker . $suffix;
        };
        $formatSections = static function (string $lang, string $suffix) use ($sections, $heading, $listOf): string {
            $lines = [];
            foreach ($listOf($sections['common'] ?? null) as $section) {
                $lines[] = $heading($section[$lang] ?? []);
            }
            foreach ($listOf($sections[$lang . '_only'] ?? null) as $section) {
                $lines[] = $heading($section, $suffix);
            }
            return implode("\n", $lines);
        };

        $enSections = $formatSections('en', ' (EN only)');
        if ($frPage) {
            $frSections = $formatSections('fr', ' (FR only)');
        }
    }

    // ---- Plain-text link lists for the copy buttons -------------------------
    $enLinks = '';
    $frLinks = '';
    if ($detailedComparison && $detailedComparison['link_comparison']) {
        $links = $detailedComparison['link_comparison'];
        $linkLine = static fn ($link, string $suffix = ''): string =>
            ($link['text'] ?? '') . ' - ' . ($link['href'] ?? '') . $suffix;
        $formatLinks = static function (string $lang, string $suffix) use ($links, $linkLine, $listOf): string {
            $lines = [];
            foreach ($listOf($links['common'] ?? null) as $link) {
                $lines[] = $linkLine($link[$lang] ?? []);
            }
            foreach ($listOf($links[$lang . '_only'] ?? null) as $link) {
                $lines[] = $linkLine($link, $suffix);
            }
            return implode("\n", $lines);
        };

        $enLinks = $formatLinks('en', ' (EN only)');
        if ($frPage) {
            $frLinks = $formatLinks('fr', ' (FR only)');
        }
    }

    // Ensure page URLs are strings to prevent array-to-string conversion errors.
    if ($frPage && isset($frPage['url']) && is_array($frPage['url'])) {
        $frPage['url'] = json_encode($frPage['url']);
    }
    if (isset($enPage['url']) && is_array($enPage['url'])) {
        $enPage['url'] = json_encode($enPage['url']);
    }

    return $this->render('admin/wiki_compare.html.twig', [
        'key' => $key,
        'en_page' => $enPage,
        'fr_page' => $frPage,
        'score_components' => $scoreComponents,
        'create_fr_url' => $createFrUrl,
        'detailed_comparison' => $detailedComparison,
        'en_sections' => $enSections,
        'fr_sections' => $frSections,
        'en_links' => $enLinks,
        'fr_links' => $frLinks,
        'history_data' => $historyData,
        'prev_page' => $prevPage,
        'next_page' => $nextPage,
        'staleness_distribution' => $stalenessDistribution,
    ]);
}
2025-09-03 16:04:16 +02:00
/**
 * Extracts the main content from a wiki HTML page, dropping everything
 * outside the content element as well as scripts, styles and edit links.
 *
 * The content element is the element with id "mw-content-text" or, failing
 * that, the first element whose class contains "mw-content-ltr". When neither
 * is found (or the input is blank) the input is returned unchanged.
 *
 * @param string $html The full HTML content
 * @return string The extracted main content
 */
private function extractMainContent(string $html): string
{
    // DOMDocument::loadHTML() throws a ValueError on an empty string, and
    // there is nothing to extract from blank input anyway.
    if (trim($html) === '') {
        return $html;
    }

    // Collect parser warnings about malformed HTML instead of emitting them,
    // and restore the previous libxml setting afterwards: it is process-wide.
    $previousErrorMode = libxml_use_internal_errors(true);
    try {
        $dom = new \DOMDocument();
        $dom->loadHTML($html);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }

    $contentElement = $dom->getElementById('mw-content-text');

    if (!$contentElement) {
        $xpath = new \DOMXPath($dom);
        $elements = $xpath->query("//*[contains(@class, 'mw-content-ltr')]");
        if ($elements !== false && $elements->length > 0) {
            $contentElement = $elements->item(0);
        }
    }

    if (!$contentElement) {
        return $html;
    }

    $contentHtml = $dom->saveHTML($contentElement);
    if ($contentHtml === false) {
        return $html;
    }

    // Strip script and style elements, then edit-section links.
    // preg_replace() returns null on failure; keep the previous text then.
    $patterns = [
        '/<script\b[^>]*>(.*?)<\/script>/is',
        '/<style\b[^>]*>(.*?)<\/style>/is',
        '/<span class="mw-editsection">(.*?)<\/span>/is',
    ];
    foreach ($patterns as $pattern) {
        $contentHtml = preg_replace($pattern, '', $contentHtml) ?? $contentHtml;
    }

    return $contentHtml;
}
2025-09-05 11:37:19 +02:00
/**
 * Extracts an array from a large JSON file by key without loading the entire file into memory
 *
 * The file is streamed in chunks. The first object key named $key whose value is an array is
 * used; its elements are decoded one at a time and reading stops as soon as $maxItems elements
 * have been collected, so at most one element is buffered as text at any moment. The scan is
 * aware of JSON strings, so brackets, braces or the key name appearing inside string values do
 * not confuse it, and it works for pretty-printed as well as single-line (compact) files.
 *
 * @param string $filePath Path to the JSON file
 * @param string $key The key of the array to extract
 * @param int $maxItems Maximum number of items to extract (to prevent memory exhaustion)
 * @return array The extracted array (empty when the file, the key or the array cannot be read)
 */
private function extractJsonArrayByKey(string $filePath, string $key, int $maxItems = 100): array
{
    $result = [];

    // First, check if the file exists and is readable
    if (!is_readable($filePath)) {
        error_log("File is not readable: $filePath");
        return $result;
    }

    $fileSize = filesize($filePath);
    if ($fileSize === false || $fileSize === 0) {
        error_log("File is empty or size could not be determined: $filePath");
        return $result;
    }

    // Nothing to collect
    if ($maxItems <= 0) {
        return $result;
    }

    $handle = fopen($filePath, 'rb');
    if ($handle === false) {
        error_log("Could not open file: $filePath");
        return $result;
    }

    try {
        $keySeen = false;
        $searchFrom = 0;
        while (($valueOffset = $this->findJsonValueOffset($handle, $key, $searchFrom)) !== null) {
            $keySeen = true;
            $items = $this->readJsonArrayItems($handle, $valueOffset, $maxItems, $key);
            if ($items !== null) {
                return $items;
            }
            // This occurrence of the key does not hold an array: look for a later one
            $searchFrom = $valueOffset;
        }

        error_log(
            $keySeen
                ? "Could not find opening bracket for array '$key' in file: $filePath"
                : "Key '$key' not found in file: $filePath"
        );
    } finally {
        // Runs on every exit path, including the early return above
        fclose($handle);
    }

    return $result;
}

/**
 * Extracts a scalar value from a large JSON file by key without loading the entire file into memory
 *
 * The first object key named $key whose value is a scalar (string, number, boolean or null) is
 * used. The raw value token is decoded with json_decode(), so negative numbers, floats and
 * escaped characters inside strings are returned correctly.
 *
 * @param string $filePath Path to the JSON file
 * @param string $key The key of the scalar value to extract
 * @return mixed The extracted scalar value or null if not found
 */
private function extractJsonScalarByKey(string $filePath, string $key): mixed
{
    // First, check if the file exists and is readable
    if (!is_readable($filePath)) {
        error_log("File is not readable: $filePath");
        return null;
    }

    $handle = fopen($filePath, 'rb');
    if ($handle === false) {
        error_log("Could not open file: $filePath");
        return null;
    }

    try {
        $found = false;
        $searchFrom = 0;
        while (($valueOffset = $this->findJsonValueOffset($handle, $key, $searchFrom)) !== null) {
            $found = true;
            $token = $this->readJsonScalarToken($handle, $valueOffset);
            if ($token === null) {
                // Object, array or missing value: look for a later occurrence of the key
                $searchFrom = $valueOffset;
                continue;
            }

            try {
                $value = json_decode($token, true, 512, JSON_THROW_ON_ERROR);
            } catch (\JsonException $e) {
                error_log("Could not extract value for key '$key': " . $e->getMessage());
                return null;
            }

            if ($value === null) {
                error_log("Value for key '$key' is null or could not be extracted");
            }

            return $value;
        }

        error_log(
            $found
                ? "Could not extract a scalar value for key '$key' in file: $filePath"
                : "Key '$key' not found in file: $filePath"
        );

        return null;
    } finally {
        fclose($handle);
    }
}

/**
 * Scans a JSON stream for an object key named $key and returns the offset of its value.
 *
 * Only a string token that is immediately followed (ignoring whitespace) by a colon counts
 * as a key, so the same text used as a string value is skipped.
 *
 * @param resource $handle Open, seekable file handle
 * @param string $key Key name to look for (matched at any nesting depth)
 * @param int $fromOffset Byte offset to start scanning from; must lie outside any JSON string
 * @return int|null Byte offset just past the key's colon, or null when no such key follows
 */
private function findJsonValueOffset($handle, string $key, int $fromOffset = 0): ?int
{
    if (fseek($handle, $fromOffset) !== 0) {
        return null;
    }

    // The key may be stored verbatim or in its JSON-escaped form
    $keyForms = [$key];
    $encodedKey = json_encode($key);
    if (is_string($encodedKey)) {
        $keyForms[] = substr($encodedKey, 1, -1);
    }
    // Strings are only buffered up to this length: anything longer cannot be the key
    $tokenCap = max(array_map('strlen', $keyForms)) + 1;

    $inString = false;
    $escaped = false;
    $awaitingColon = false;
    $token = '';
    $chunkStart = $fromOffset;

    while (!feof($handle)) {
        $chunk = fread($handle, 65536);
        if ($chunk === false || $chunk === '') {
            break;
        }

        $length = strlen($chunk);
        $i = 0;
        while ($i < $length) {
            if ($inString) {
                if ($escaped) {
                    // The character following a backslash never terminates the string
                    if (strlen($token) < $tokenCap) {
                        $token .= $chunk[$i];
                    }
                    $escaped = false;
                    $i++;
                    continue;
                }

                // Jump straight to the next quote or backslash
                $span = strcspn($chunk, "\"\\", $i);
                if ($span > 0) {
                    $room = $tokenCap - strlen($token);
                    if ($room > 0) {
                        $token .= substr($chunk, $i, min($span, $room));
                    }
                    $i += $span;
                    continue;
                }

                if ($chunk[$i] === '\\') {
                    if (strlen($token) < $tokenCap) {
                        $token .= '\\';
                    }
                    $escaped = true;
                } else {
                    // Closing quote: remember whether this string could be the key
                    $inString = false;
                    $awaitingColon = in_array($token, $keyForms, true);
                }
                $i++;
                continue;
            }

            if ($awaitingColon) {
                $char = $chunk[$i];
                if (str_contains(" \t\r\n", $char)) {
                    $i++;
                    continue;
                }
                $awaitingColon = false;
                if ($char === ':') {
                    return $chunkStart + $i + 1;
                }
                // Not followed by a colon: the string was a value, keep scanning from here
            }

            // Outside strings only the next opening quote matters
            $quotePos = strpos($chunk, '"', $i);
            if ($quotePos === false) {
                break;
            }
            $inString = true;
            $token = '';
            $i = $quotePos + 1;
        }

        $chunkStart += $length;
    }

    return null;
}

/**
 * Reads up to $maxItems elements of the JSON array whose value starts at $offset.
 *
 * @param resource $handle Open, seekable file handle
 * @param int $offset Byte offset just past the colon of the key
 * @param int $maxItems Maximum number of elements to decode
 * @param string $key Key name, used in log messages only
 * @return array|null The decoded elements, or null when the value at $offset is not an array
 */
private function readJsonArrayItems($handle, int $offset, int $maxItems, string $key): ?array
{
    if (fseek($handle, $offset) !== 0) {
        return null;
    }

    $items = [];
    $decodeInto = static function (string $json) use (&$items, $key): void {
        try {
            $items[] = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            // Skip the malformed element but keep the ones that decode cleanly
            error_log("JSON parse error: " . $e->getMessage() . " for key '$key'");
        }
    };

    $started = false;   // true once the opening bracket has been consumed
    $depth = 0;         // nesting depth inside the current element
    $inString = false;
    $escaped = false;
    $item = '';         // raw JSON text of the element being collected

    while (!feof($handle)) {
        $chunk = fread($handle, 65536);
        if ($chunk === false || $chunk === '') {
            break;
        }

        $length = strlen($chunk);
        for ($i = 0; $i < $length; $i++) {
            $char = $chunk[$i];

            if (!$started) {
                if (str_contains(" \t\r\n", $char)) {
                    continue;
                }
                if ($char !== '[') {
                    return null;
                }
                $started = true;
                continue;
            }

            if ($inString) {
                // Brackets and commas inside strings are data, not structure
                $item .= $char;
                if ($escaped) {
                    $escaped = false;
                } elseif ($char === '\\') {
                    $escaped = true;
                } elseif ($char === '"') {
                    $inString = false;
                }
                continue;
            }

            if ($depth === 0) {
                if (str_contains(" \t\r\n", $char)) {
                    continue;
                }
                if ($char === ',' || $char === ']') {
                    // A pending scalar element ends at the separator
                    if ($item !== '') {
                        $decodeInto($item);
                        $item = '';
                    }
                    if ($char === ']' || count($items) >= $maxItems) {
                        return $items;
                    }
                    continue;
                }
            }

            $item .= $char;
            if ($char === '"') {
                $inString = true;
            } elseif ($char === '{' || $char === '[') {
                $depth++;
            } elseif (($char === '}' || $char === ']') && $depth > 0) {
                $depth--;
                if ($depth === 0) {
                    // A complete object/array element: decode it right away
                    $decodeInto($item);
                    $item = '';
                    if (count($items) >= $maxItems) {
                        return $items;
                    }
                }
            }
        }
    }

    if (!$started) {
        return null;
    }

    error_log("Unexpected end of file while reading array '$key'");

    return $items;
}

/**
 * Reads the raw JSON token of the scalar value starting at $offset.
 *
 * @param resource $handle Open, seekable file handle
 * @param int $offset Byte offset just past the colon of the key
 * @return string|null The raw token (strings keep their quotes), or null when the value is
 *                     an object, an array, missing, or cut off by the end of the file
 */
private function readJsonScalarToken($handle, int $offset): ?string
{
    if (fseek($handle, $offset) !== 0) {
        return null;
    }

    $token = '';
    $inString = false;
    $escaped = false;

    while (!feof($handle)) {
        $chunk = fread($handle, 8192);
        if ($chunk === false || $chunk === '') {
            break;
        }

        $length = strlen($chunk);
        for ($i = 0; $i < $length; $i++) {
            $char = $chunk[$i];

            if ($inString) {
                $token .= $char;
                if ($escaped) {
                    $escaped = false;
                } elseif ($char === '\\') {
                    $escaped = true;
                } elseif ($char === '"') {
                    return $token;
                }
                continue;
            }

            $isSpace = str_contains(" \t\r\n", $char);
            if ($token === '') {
                if ($isSpace) {
                    continue;
                }
                // Composite or missing values are not scalars
                if (str_contains('{[,}]', $char)) {
                    return null;
                }
                $token = $char;
                $inString = $char === '"';
                continue;
            }

            // A bare literal (number, true, false, null) ends at the next delimiter
            if ($isSpace || $char === ',' || $char === '}' || $char === ']') {
                return $token;
            }
            $token .= $char;
        }
    }

    // End of file: only a complete bare literal is usable
    return ($token !== '' && !$inString) ? $token : null;
}
/**
 * Legacy shortcut that reads the "specific_pages" array from a large JSON file.
 *
 * Kept for backward compatibility; it simply delegates to extractJsonArrayByKey()
 * with the key fixed to "specific_pages".
 *
 * @param string $filePath Path to the JSON file
 * @param int $maxPages Maximum number of pages to extract (to prevent memory exhaustion)
 * @return array The extracted specific_pages array
 */
private function extractSpecificPagesFromJson(string $filePath, int $maxPages = 100): array
{
    $specificPages = $this->extractJsonArrayByKey($filePath, 'specific_pages', $maxPages);

    return $specificPages;
}
2025-09-08 10:20:51 +02:00
/**
 * Computes the median of a list of numbers.
 *
 * The values are sorted in ascending order; for an odd number of values the middle one
 * is returned, for an even number the mean of the two middle ones. An empty list yields 0.
 *
 * @param array $array Array of numbers
 * @return float The median value
 */
private function calculateMedian(array $array): float
{
    $total = count($array);

    // Guard clause: no values, no median
    if ($total === 0) {
        return 0.0;
    }

    // sort() works on the local copy and reindexes it from 0
    sort($array);

    $upper = intdiv($total, 2);

    return $total % 2 === 0
        ? ($array[$upper - 1] + $array[$upper]) / 2
        : $array[$upper];
}
2025-09-01 18:28:23 +02:00
}