add missing wiki pages from taginfo fr
parent e056cfc8fa
commit dffb21b56e
8 changed files with 469 additions and 131 deletions
@@ -778,6 +778,7 @@ class WikiController extends AbstractController
         $scriptPath = $this->getParameter('kernel.project_dir') . '/wiki_compare/wiki_compare.py';
         $englishHtml = null;
         $frenchHtml = null;
+        $frenchCacheExists = false;

         if (file_exists($scriptPath)) {
             // Create a temporary Python script to fetch the page content
@@ -788,25 +789,52 @@ class WikiController extends AbstractController

 import sys
 import json
-from wiki_compare import fetch_wiki_page
+import hashlib
+from pathlib import Path
+from wiki_compare import fetch_wiki_page, HTML_CACHE_DIR

 # Get the key from command line arguments
 key = sys.argv[1]
 language = sys.argv[2]

-# Fetch the page
-page = fetch_wiki_page(key, language)
+# Check if we're just checking cache existence
+check_cache_only = len(sys.argv) > 3 and sys.argv[3] == 'check_cache'

-# Output the HTML content
-if page and 'html_content' in page:
-    print(page['html_content'])
+if check_cache_only and language == 'fr':
+    # For French pages, construct the URL to check cache
+    if key.startswith('http'):
+        url = key
+    else:
+        url = f"https://wiki.openstreetmap.org/wiki/FR:{key}"
+
+    # Create cache key
+    cache_key = hashlib.md5(url.encode()).hexdigest()
+    cache_file = Path(HTML_CACHE_DIR) / f"{cache_key}.html"
+
+    # Check if cache exists
+    if cache_file.exists():
+        print("CACHE_EXISTS")
+    else:
+        print("CACHE_MISSING")
 else:
-    print("")
+    # Normal fetch operation
+    page = fetch_wiki_page(key, language)
+
+    # Output the HTML content
+    if page and 'html_content' in page:
+        print(page['html_content'])
+    else:
+        print("")
 EOT;

             file_put_contents($tempScriptPath, $pythonCode);
             chmod($tempScriptPath, 0755);

+            // First check if French page exists in cache
+            $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} fr check_cache";
+            $cacheCheckResult = trim(shell_exec($command));
+            $frenchCacheExists = ($cacheCheckResult === "CACHE_EXISTS");
+
             // Fetch English page
             $command = "cd " . $this->getParameter('kernel.project_dir') . "/wiki_compare && python3 {$tempScriptPath} {$key} en";
             $englishHtml = shell_exec($command);
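For reference, the cache probe added above mirrors the naming scheme used by wiki_compare.py: the French page URL is hashed with MD5 and looked up as <hash>.html inside the script's HTML cache directory (HTML_CACHE_DIR). A minimal PHP sketch of the same check, not part of this commit and assuming a hypothetical $htmlCacheDir that points at the same directory as HTML_CACHE_DIR, would look like this:

    // Sketch only: same md5(url) file naming as wiki_compare.py.
    // $htmlCacheDir is a hypothetical parameter; it must match HTML_CACHE_DIR on the Python side.
    function frenchCacheExists(string $key, string $htmlCacheDir): bool
    {
        $url = (strpos($key, 'http') === 0)
            ? $key
            : "https://wiki.openstreetmap.org/wiki/FR:{$key}";

        // PHP's md5() returns the same hex digest as Python's hashlib.md5(...).hexdigest()
        $cacheFile = rtrim($htmlCacheDir, '/') . '/' . md5($url) . '.html';

        return is_file($cacheFile);
    }

Shelling out to the temporary Python script instead, as the commit does, keeps the hashing and cache-directory logic defined in a single place.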
@@ -834,7 +862,8 @@ EOT;
                 'english_url' => $englishUrl,
                 'french_edit_url' => $frenchEditUrl,
                 'english_html' => $englishHtml,
-                'french_html' => $frenchHtml
+                'french_html' => $frenchHtml,
+                'french_cache_exists' => $frenchCacheExists
             ]);
         }

@@ -1104,7 +1133,25 @@ EOT;
         if (file_exists($pagesUnavailableInEnglishFile)) {
             $pagesUnavailableInEnglishData = json_decode(file_get_contents($pagesUnavailableInEnglishFile), true);
             if (isset($pagesUnavailableInEnglishData['pages']) && is_array($pagesUnavailableInEnglishData['pages'])) {
-                $pagesUnavailableInEnglish = $pagesUnavailableInEnglishData['pages'];
+                // Deduplicate pages based on URL
+                $uniquePages = [];
+                $seenUrls = [];
+
+                foreach ($pagesUnavailableInEnglishData['pages'] as $page) {
+                    if (isset($page['url'])) {
+                        // Use URL as the key for deduplication
+                        $url = $page['url'];
+                        if (!isset($seenUrls[$url])) {
+                            $seenUrls[$url] = true;
+                            $uniquePages[] = $page;
+                        }
+                    } else {
+                        // If no URL, keep the page (shouldn't happen, but just in case)
+                        $uniquePages[] = $page;
+                    }
+                }
+
+                $pagesUnavailableInEnglish = $uniquePages;
             }
         }

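The deduplication loop above keeps the first page seen for each URL. An equivalent, more compact variant, shown only as a sketch of the same idea rather than what the commit does, uses the URL itself as the array key; pages without a 'url' field are still kept, but end up after the deduplicated ones instead of in their original position:

    // Sketch: first occurrence wins, keyed by URL.
    $byUrl = [];
    $withoutUrl = [];
    foreach ($pagesUnavailableInEnglishData['pages'] as $page) {
        if (!isset($page['url'])) {
            $withoutUrl[] = $page;          // keep pages without a URL, as above
        } elseif (!isset($byUrl[$page['url']])) {
            $byUrl[$page['url']] = $page;   // later duplicates are ignored
        }
    }
    $pagesUnavailableInEnglish = array_merge(array_values($byUrl), $withoutUrl);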
@@ -1112,10 +1159,10 @@ EOT;
         $specificPages = [];
         $outdatedPagesFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/outdated_pages.json';
         if (file_exists($outdatedPagesFile)) {
-            $outdatedPagesData = json_decode(file_get_contents($outdatedPagesFile), true);
-            if (isset($outdatedPagesData['specific_pages']) && is_array($outdatedPagesData['specific_pages'])) {
-                $specificPages = $outdatedPagesData['specific_pages'];
-            }
+            // Use a memory-efficient approach to extract only the specific_pages array
+            // without loading the entire file into memory
+            $maxPages = 100; // Limit the number of pages to prevent memory exhaustion
+            $specificPages = $this->extractSpecificPagesFromJson($outdatedPagesFile, $maxPages);
         }

         // Load newly created French pages
@@ -1137,6 +1184,16 @@ EOT;
                 $availableTranslations = $translationsData['translations'];
             }
         }

+        // Load keys without wiki pages
+        $keysWithoutWiki = [];
+        $keysWithoutWikiFile = $this->getParameter('kernel.project_dir') . '/wiki_compare/keys_without_wiki.json';
+        if (file_exists($keysWithoutWikiFile)) {
+            $keysWithoutWikiData = json_decode(file_get_contents($keysWithoutWikiFile), true);
+            if (is_array($keysWithoutWikiData)) {
+                $keysWithoutWiki = $keysWithoutWikiData;
+            }
+        }
+
         return $this->render('admin/wiki.html.twig', [
             'wiki_pages' => $wikiPages,
@@ -1147,7 +1204,8 @@ EOT;
             'newly_created_pages' => $newlyCreatedPages,
             'staleness_stats' => $stalenessStats,
             'wiki_pages_stats' => $wikiPagesStats,
-            'available_translations' => $availableTranslations
+            'available_translations' => $availableTranslations,
+            'keys_without_wiki' => $keysWithoutWiki
         ]);
     }

@@ -1791,4 +1849,124 @@ EOT;

         return $contentHtml;
     }
+
+    /**
+     * Extracts the specific_pages array from a large JSON file without loading the entire file into memory
+     *
+     * @param string $filePath Path to the JSON file
+     * @param int $maxPages Maximum number of pages to extract (to prevent memory exhaustion)
+     * @return array The extracted specific_pages array
+     */
+    private function extractSpecificPagesFromJson(string $filePath, int $maxPages = 100): array
+    {
+        $specificPages = [];
+
+        // For very large files, we'll use a more direct approach
+        // Instead of parsing the entire JSON structure, we'll extract just what we need
+
+        // First, check if the file exists and is readable
+        if (!is_readable($filePath)) {
+            return $specificPages;
+        }
+
+        // Get the file size
+        $fileSize = filesize($filePath);
+        if ($fileSize === false || $fileSize === 0) {
+            return $specificPages;
+        }
+
+        // For very large files, we'll use a more efficient approach
+        // We'll search for the "specific_pages" key directly
+        $handle = fopen($filePath, 'r');
+        if (!$handle) {
+            return $specificPages;
+        }
+
+        // Variables to track parsing state
+        $inSpecificPages = false;
+        $bracketCount = 0;
+        $buffer = '';
+        $pageCount = 0;
+        $lineCount = 0;
+
+        // Skip ahead to find the specific_pages key more quickly
+        // This is a simple optimization for this specific file structure
+        $found = false;
+        while (!$found && ($line = fgets($handle)) !== false) {
+            $lineCount++;
+            if (strpos($line, '"specific_pages"') !== false) {
+                $found = true;
+                $inSpecificPages = true;
+
+                // Find the opening bracket of the array
+                if (strpos($line, '[') !== false) {
+                    $bracketCount = 1;
+                    $buffer = '['; // Start the buffer with an opening bracket
+                } else {
+                    // If the opening bracket is on the next line
+                    $nextLine = fgets($handle);
+                    if ($nextLine !== false && strpos($nextLine, '[') !== false) {
+                        $bracketCount = 1;
+                        $buffer = '['; // Start the buffer with an opening bracket
+                    }
+                }
+                break;
+            }
+        }
+
+        // If we didn't find the specific_pages key, return empty array
+        if (!$found) {
+            fclose($handle);
+            return $specificPages;
+        }
+
+        // Now process the specific_pages array
+        while (($line = fgets($handle)) !== false) {
+            // Count opening and closing brackets to track array nesting
+            $openBrackets = substr_count($line, '[') + substr_count($line, '{');
+            $closeBrackets = substr_count($line, ']') + substr_count($line, '}');
+            $bracketCount += $openBrackets - $closeBrackets;
+
+            // Add the line to our buffer
+            $buffer .= $line;
+
+            // If we've reached the end of the array (bracketCount = 0)
+            if ($bracketCount === 0) {
+                // Parse the buffer as JSON
+                $parsedData = json_decode($buffer, true);
+                if (is_array($parsedData)) {
+                    // Limit the number of pages to prevent memory exhaustion
+                    $specificPages = array_slice($parsedData, 0, $maxPages);
+                } else {
+                    // If parsing fails, log the error but don't crash
+                    error_log('Failed to parse specific_pages JSON data in ' . $filePath);
+                }
+                break;
+            }
+
+            // Check if we've found a complete page object (when we see a closing brace followed by a comma)
+            if (preg_match('/\}\s*,\s*$/m', $line)) {
+                $pageCount++;
+                // If we've reached the maximum number of pages, stop processing
+                if ($pageCount >= $maxPages) {
+                    // Close the array properly
+                    $buffer = rtrim($buffer, ",\r\n") . ']';
+                    // Parse the buffer as JSON
+                    $parsedData = json_decode($buffer, true);
+                    if (is_array($parsedData)) {
+                        $specificPages = $parsedData;
+                    } else {
+                        // If parsing fails, log the error but don't crash
+                        error_log('Failed to parse specific_pages JSON data in ' . $filePath . ' after reaching max pages');
+                    }
+                    break;
+                }
+            }
+        }
+
+        // Close the file
+        fclose($handle);
+
+        return $specificPages;
+    }
 }
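The hand-rolled reader above goes through the file line by line, so it relies on the JSON being spread over multiple lines and on each page object ending with the '},' pattern it matches. If the project can take a dependency, a streaming parser gives the same bounded-memory behaviour without manual bracket counting. A sketch using halaxa/json-machine follows; the library is not part of this commit, and the ^1.0 Items API is an assumption:

    use JsonMachine\Items;

    // Sketch only: stream the specific_pages array and stop after $maxPages entries.
    $pages = [];
    foreach (Items::fromFile($filePath, ['pointer' => '/specific_pages']) as $page) {
        // Items yields stdClass objects by default; a decoder option can yield assoc arrays instead.
        $pages[] = (array) $page;
        if (count($pages) >= $maxPages) {
            break;
        }
    }

Either way, the caller shown in the earlier hunk never holds more than $maxPages (100) page entries in memory at once.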