diff --git a/benchmarks/SiteCheckerBench.php b/benchmarks/SiteCheckerBench.php new file mode 100644 index 0000000..d18fcab --- /dev/null +++ b/benchmarks/SiteCheckerBench.php @@ -0,0 +1,78 @@ +outputDir = sys_get_temp_dir() . '/yiipress-site-checker-bench-' . uniqid(); + mkdir($this->outputDir, 0o755, true); + + for ($i = 1; $i <= 100; $i++) { + $dir = $this->outputDir . '/page-' . $i; + mkdir($dir, 0o755, true); + $next = $i === 100 ? 1 : $i + 1; + file_put_contents( + $dir . '/index.html', + '

Page ' . $i . '

' + . 'Next' + . 'Home' + . '', + ); + } + + mkdir($this->outputDir . '/assets', 0o755, true); + file_put_contents($this->outputDir . '/assets/logo.svg', ''); + + $this->checker = new SiteChecker(); + } + + public function tearDown(): void + { + $this->removeDir($this->outputDir); + } + + #[Revs(20)] + #[Iterations(3)] + #[Warmup(1)] + public function benchInternalSiteCheck(): void + { + $this->checker->check($this->outputDir); + } + + private function removeDir(string $path): void + { + if (!is_dir($path)) { + return; + } + + $iterator = new \RecursiveIteratorIterator( + new \RecursiveDirectoryIterator($path, \FilesystemIterator::SKIP_DOTS), + \RecursiveIteratorIterator::CHILD_FIRST, + ); + foreach ($iterator as $item) { + if ($item->isDir()) { + rmdir($item->getPathname()); + } else { + unlink($item->getPathname()); + } + } + + rmdir($path); + } +} diff --git a/config/common/di/content-pipeline.php b/config/common/di/content-pipeline.php index 0a1ca6d..8b27815 100644 --- a/config/common/di/content-pipeline.php +++ b/config/common/di/content-pipeline.php @@ -5,6 +5,7 @@ use YiiPress\Build\TemplateResolver; use YiiPress\Build\ThemeRegistry; use YiiPress\Console\BuildCommand; +use YiiPress\Console\CheckCommand; use YiiPress\Console\CleanCommand; use YiiPress\Console\InitCommand; use YiiPress\Console\NewCommand; @@ -63,6 +64,11 @@ 'eventDispatcher' => Reference::to(EventDispatcherInterface::class), ], ], + CheckCommand::class => [ + '__construct()' => [ + 'rootPath' => $workingDirectory, + ], + ], CleanCommand::class => [ '__construct()' => [ 'rootPath' => $workingDirectory, diff --git a/config/console/commands.php b/config/console/commands.php index ba8f987..344f664 100644 --- a/config/console/commands.php +++ b/config/console/commands.php @@ -6,6 +6,7 @@ return [ 'build' => Console\BuildCommand::class, + 'check' => Console\CheckCommand::class, 'clean|clear' => Console\CleanCommand::class, 'init' => Console\InitCommand::class, 'import' => Console\ImportCommand::class, diff --git a/docs/commands.md b/docs/commands.md index ed8768d..ce37347 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -58,6 +58,21 @@ The command: With `--workers=N` (N > 1), entry rendering and writing is parallelized across N forked processes. With `--workers=auto`, YiiPress uses up to the detected worker count and lets page writers clamp back to sequential mode for smaller workloads. Feeds are generated after entry writing and can be split per collection across workers. Sitemap generation remains serial. +## `check` + +Checks generated HTML output for broken local links, missing `src` targets, and missing anchor fragments. + +``` +./yiipress check [--output-dir=output] [--external] +``` + +**Options:** + +- `--output-dir`, `-o` — path to the generated output directory (default: `output`). Absolute or relative to project root. +- `--external` — also validate external `http://` and `https://` links. This performs network requests, so it is opt-in. + +Run `build` first, then `check` against the generated output. Local checks are filesystem-only and validate links such as `./guide/`, `/assets/site.css`, and `#heading-id`. + ## `serve` Starts the preview server for local development. diff --git a/roadmap.md b/roadmap.md index f36c641..16b375c 100644 --- a/roadmap.md +++ b/roadmap.md @@ -57,6 +57,7 @@ - [x] Smaller static package by removing unused runtime extension dependencies - [x] Build diagnostics (warn on broken internal links, missing images, invalid front matter) - [x] `yiipress clean` command — clear build output and caches +- [x] `yiipress check` command — validate generated links and anchors - [x] Dry run mode for build — show what would be generated without writing files - [x] No-write build mode for render-vs-filesystem performance diagnostics - [x] `serve` overlay button to open the current markdown source in a configured editor diff --git a/src/Build/SiteCheckIssue.php b/src/Build/SiteCheckIssue.php new file mode 100644 index 0000000..f97bfd7 --- /dev/null +++ b/src/Build/SiteCheckIssue.php @@ -0,0 +1,14 @@ + + */ + public function check(string $outputDir, bool $checkExternal = false): array + { + $htmlFiles = $this->htmlFiles($outputDir); + $issues = []; + + foreach ($htmlFiles as $htmlFile) { + $html = (string) file_get_contents($htmlFile); + $anchors = $this->anchors($html); + + foreach ($this->links($html) as $target) { + $issue = $this->checkTarget($outputDir, $htmlFile, $anchors, $target, $checkExternal); + if ($issue !== null) { + $issues[] = $issue; + } + } + } + + return $issues; + } + + /** + * @return list + */ + private function htmlFiles(string $outputDir): array + { + if (!is_dir($outputDir)) { + return []; + } + + $files = []; + $iterator = new RecursiveIteratorIterator( + new RecursiveDirectoryIterator($outputDir, FilesystemIterator::SKIP_DOTS), + ); + + foreach ($iterator as $item) { + /** @var SplFileInfo $item */ + if ($item->isFile() && strtolower($item->getExtension()) === 'html') { + $files[] = $item->getPathname(); + } + } + + sort($files); + + return $files; + } + + /** + * @return list + */ + private function links(string $html): array + { + $lowerHtml = strtolower($html); + if (!str_contains($lowerHtml, 'href=') && !str_contains($lowerHtml, 'src=')) { + return []; + } + + preg_match_all('/\b(?:href|src)\s*=\s*(["\'])(.*?)\1/i', $html, $matches, PREG_SET_ORDER); + $links = []; + foreach ($matches as $match) { + $target = trim(html_entity_decode($match[2], ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5)); + if ($target !== '') { + $links[] = $target; + } + } + + return $links; + } + + /** + * @return array + */ + private function anchors(string $html): array + { + $anchors = []; + preg_match_all('/\b(?:id|name)\s*=\s*(["\'])(.*?)\1/i', $html, $matches, PREG_SET_ORDER); + foreach ($matches as $match) { + $anchor = html_entity_decode($match[2], ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5); + if ($anchor !== '') { + $anchors[$anchor] = true; + } + } + + return $anchors; + } + + /** + * @param array $currentAnchors + */ + private function checkTarget(string $outputDir, string $sourceFile, array $currentAnchors, string $target, bool $checkExternal): ?SiteCheckIssue + { + if ($this->shouldSkip($target)) { + return null; + } + + if ($this->isExternalHttpUrl($target)) { + if (!$checkExternal || $this->externalIsReachable($target)) { + return null; + } + + return new SiteCheckIssue($sourceFile, $target, 'external link is not reachable'); + } + + [$path, $fragment] = $this->splitTarget($target); + if ($path === '') { + if ($fragment === '' || isset($currentAnchors[$fragment])) { + return null; + } + + return new SiteCheckIssue($sourceFile, $target, 'fragment not found'); + } + + $targetFile = $this->resolveTargetFile($outputDir, $sourceFile, $path); + if ($targetFile === null || !is_file($targetFile)) { + return new SiteCheckIssue($sourceFile, $target, 'local target not found'); + } + + if ($fragment === '') { + return null; + } + + $targetHtml = (string) file_get_contents($targetFile); + if (isset($this->anchors($targetHtml)[$fragment])) { + return null; + } + + return new SiteCheckIssue($sourceFile, $target, 'fragment not found'); + } + + private function shouldSkip(string $target): bool + { + $lower = strtolower($target); + + return str_starts_with($lower, 'mailto:') + || str_starts_with($lower, 'tel:') + || str_starts_with($lower, 'javascript:') + || str_starts_with($lower, 'data:') + || str_starts_with($lower, 'urn:'); + } + + private function isExternalHttpUrl(string $target): bool + { + $lower = strtolower($target); + + return str_starts_with($lower, 'http://') + || str_starts_with($lower, 'https://') + || str_starts_with($lower, '//'); + } + + /** + * @return array{0: string, 1: string} + */ + private function splitTarget(string $target): array + { + $path = $target; + $fragment = ''; + + $fragmentPosition = strpos($path, '#'); + if ($fragmentPosition !== false) { + $fragment = rawurldecode(substr($path, $fragmentPosition + 1)); + $path = substr($path, 0, $fragmentPosition); + } + + $queryPosition = strpos($path, '?'); + if ($queryPosition !== false) { + $path = substr($path, 0, $queryPosition); + } + + return [rawurldecode(trim($path)), $fragment]; + } + + private function resolveTargetFile(string $outputDir, string $sourceFile, string $path): ?string + { + $base = str_starts_with($path, '/') + ? '' + : substr(dirname($sourceFile), strlen($outputDir) + 1); + $normalized = $this->normalizePath(($base !== '' ? $base . '/' : '') . $path); + + if ($normalized === null) { + return null; + } + + $targetPath = $outputDir . ($normalized !== '' ? '/' . $normalized : ''); + if (is_file($targetPath)) { + return $targetPath; + } + if (is_dir($targetPath)) { + return $targetPath . '/index.html'; + } + if (str_ends_with($path, '/') || !str_contains($this->lastPathSegment($path), '.')) { + return $targetPath . '/index.html'; + } + + return $targetPath; + } + + private function normalizePath(string $path): ?string + { + $parsedPath = parse_url($path, PHP_URL_PATH); + $path = is_string($parsedPath) ? $parsedPath : $path; + $parts = []; + + foreach (explode('/', $path) as $segment) { + if ($segment === '' || $segment === '.') { + continue; + } + if ($segment === '..') { + if ($parts === []) { + return null; + } + array_pop($parts); + continue; + } + $parts[] = $segment; + } + + return implode('/', $parts); + } + + private function lastPathSegment(string $path): string + { + $path = trim($path, '/'); + $slashPosition = strrpos($path, '/'); + + return $slashPosition === false ? $path : substr($path, $slashPosition + 1); + } + + private function externalIsReachable(string $url): bool + { + if (str_starts_with($url, '//')) { + $url = 'https:' . $url; + } + + if ($this->externalChecker !== null) { + return ($this->externalChecker)($url); + } + + $context = stream_context_create([ + 'http' => [ + 'method' => 'HEAD', + 'timeout' => 5, + 'ignore_errors' => true, + ], + ]); + $headers = @get_headers($url, true, $context); + + if ($this->headersAreSuccessful($headers)) { + return true; + } + + $context = stream_context_create([ + 'http' => [ + 'method' => 'GET', + 'timeout' => 5, + 'ignore_errors' => true, + ], + ]); + $headers = @get_headers($url, true, $context); + + return $this->headersAreSuccessful($headers); + } + + private function headersAreSuccessful(mixed $headers): bool + { + if (!is_array($headers) || $headers === []) { + return false; + } + + $statusLine = $headers[0] ?? ''; + if (!is_string($statusLine) || preg_match('/\s(\d{3})\s?/', $statusLine, $matches) !== 1) { + return false; + } + + $status = (int) $matches[1]; + + return $status >= 200 && $status < 400; + } +} diff --git a/src/Console/CheckCommand.php b/src/Console/CheckCommand.php new file mode 100644 index 0000000..b5758f3 --- /dev/null +++ b/src/Console/CheckCommand.php @@ -0,0 +1,98 @@ +addOption( + 'output-dir', + 'o', + InputOption::VALUE_REQUIRED, + 'Path to the generated output directory', + self::DEFAULT_OUTPUT_DIR, + ); + $this->addOption( + 'external', + null, + InputOption::VALUE_NONE, + 'Also validate external HTTP(S) links', + ); + } + + protected function execute(InputInterface $input, OutputInterface $output): int + { + /** @var string $outputDirOption */ + $outputDirOption = $input->getOption('output-dir'); + $outputDir = $this->resolvePath($outputDirOption, $this->rootPath); + + if (!is_dir($outputDir)) { + $output->writeln('Output directory not found: ' . OutputFormatter::escape($outputDir) . ''); + return ExitCode::DATAERR; + } + + $issues = (new SiteChecker())->check($outputDir, (bool) $input->getOption('external')); + if ($issues === []) { + $output->writeln('Site check passed.'); + return ExitCode::OK; + } + + foreach ($issues as $issue) { + $output->writeln(sprintf( + '%s: %s "%s"', + OutputFormatter::escape($this->relativePath($issue->filePath, $outputDir)), + OutputFormatter::escape($issue->message), + OutputFormatter::escape($issue->target), + )); + } + + $output->writeln('Site check failed: ' . count($issues) . ' issue(s).'); + + return ExitCode::DATAERR; + } + + private function resolvePath(string $path, string $rootPath): string + { + if (str_starts_with($path, '/')) { + return $path; + } + + return $rootPath . '/' . $path; + } + + private function relativePath(string $path, string $baseDir): string + { + $prefix = $baseDir . '/'; + + return str_starts_with($path, $prefix) ? substr($path, strlen($prefix)) : $path; + } +} diff --git a/tests/Unit/Build/SiteCheckerTest.php b/tests/Unit/Build/SiteCheckerTest.php new file mode 100644 index 0000000..12bbc30 --- /dev/null +++ b/tests/Unit/Build/SiteCheckerTest.php @@ -0,0 +1,115 @@ +outputDir = sys_get_temp_dir() . '/yiipress-site-checker-test-' . uniqid(); + mkdir($this->outputDir . '/blog', 0o755, true); + mkdir($this->outputDir . '/assets', 0o755, true); + } + + protected function tearDown(): void + { + $this->removeDir($this->outputDir); + } + + public function testPassesForExistingLocalTargetsAndFragments(): void + { + file_put_contents($this->outputDir . '/index.html', 'PostMail'); + file_put_contents($this->outputDir . '/blog/index.html', '

Post

'); + file_put_contents($this->outputDir . '/assets/logo.svg', ''); + + $issues = (new SiteChecker())->check($this->outputDir); + + assertSame([], $issues); + } + + public function testExtractsCaseInsensitiveLinkAttributes(): void + { + file_put_contents($this->outputDir . '/index.html', 'Post'); + file_put_contents($this->outputDir . '/blog/index.html', '

Post

'); + file_put_contents($this->outputDir . '/assets/logo.svg', ''); + + $issues = (new SiteChecker())->check($this->outputDir); + + assertSame([], $issues); + } + + public function testReportsMissingLocalTargetsAndFragments(): void + { + file_put_contents($this->outputDir . '/index.html', 'MissingBad fragment'); + file_put_contents($this->outputDir . '/blog/index.html', '

Post

'); + + $issues = (new SiteChecker())->check($this->outputDir); + + assertCount(2, $issues); + assertSame('local target not found', $issues[0]->message); + assertSame('./missing/', $issues[0]->target); + assertSame('fragment not found', $issues[1]->message); + assertSame('./blog/#missing', $issues[1]->target); + } + + public function testChecksExternalLinksOnlyWhenRequested(): void + { + file_put_contents($this->outputDir . '/index.html', 'Broken'); + $checker = new SiteChecker(static fn (string $url): bool => $url !== 'https://example.test/broken'); + + assertSame([], $checker->check($this->outputDir)); + + $issues = $checker->check($this->outputDir, checkExternal: true); + + assertCount(1, $issues); + assertSame('external link is not reachable', $issues[0]->message); + assertSame('https://example.test/broken', $issues[0]->target); + } + + public function testRejectsLinksEscapingOutputDirectory(): void + { + file_put_contents($this->outputDir . '/blog/index.html', 'Secret'); + + $issues = (new SiteChecker())->check($this->outputDir); + + assertCount(1, $issues); + assertSame('local target not found', $issues[0]->message); + assertSame('../../secret.html', $issues[0]->target); + } + + private function removeDir(string $path): void + { + if (!is_dir($path)) { + return; + } + + $iterator = new RecursiveIteratorIterator( + new RecursiveDirectoryIterator($path, FilesystemIterator::SKIP_DOTS), + RecursiveIteratorIterator::CHILD_FIRST, + ); + foreach ($iterator as $item) { + /** @var SplFileInfo $item */ + if ($item->isDir()) { + rmdir($item->getPathname()); + } else { + unlink($item->getPathname()); + } + } + + rmdir($path); + } +} diff --git a/tests/Unit/Console/CheckCommandTest.php b/tests/Unit/Console/CheckCommandTest.php new file mode 100644 index 0000000..265fefd --- /dev/null +++ b/tests/Unit/Console/CheckCommandTest.php @@ -0,0 +1,102 @@ +outputDir = sys_get_temp_dir() . '/yiipress-check-command-test-' . uniqid(); + mkdir($this->outputDir . '/docs', 0o755, true); + } + + protected function tearDown(): void + { + $this->removeDir($this->outputDir); + } + + public function testCheckPassesForValidGeneratedSite(): void + { + file_put_contents($this->outputDir . '/index.html', 'Docs'); + file_put_contents($this->outputDir . '/docs/index.html', '

Intro

'); + + $result = $this->runCheck(); + + assertSame(ExitCode::OK, $result['exitCode'], $result['output']); + assertStringContainsString('Site check passed.', $result['output']); + } + + public function testCheckFailsForBrokenGeneratedSiteLink(): void + { + file_put_contents($this->outputDir . '/index.html', 'Missing'); + + $result = $this->runCheck(); + + assertSame(ExitCode::DATAERR, $result['exitCode'], $result['output']); + assertStringContainsString('index.html: local target not found "./missing/"', $result['output']); + assertStringContainsString('Site check failed: 1 issue(s).', $result['output']); + } + + public function testCheckFailsForMissingOutputDirectory(): void + { + $this->removeDir($this->outputDir); + + $result = $this->runCheck(); + + assertSame(ExitCode::DATAERR, $result['exitCode'], $result['output']); + assertStringContainsString('Output directory not found', $result['output']); + } + + /** + * @return array{exitCode: int, output: string} + */ + private function runCheck(): array + { + $yii = dirname(__DIR__, 3) . '/yii'; + exec( + $yii . ' check' + . ' --output-dir=' . escapeshellarg($this->outputDir) + . ' 2>&1', + $output, + $exitCode, + ); + + return ['exitCode' => $exitCode, 'output' => implode("\n", $output)]; + } + + private function removeDir(string $path): void + { + if (!is_dir($path)) { + return; + } + + $iterator = new RecursiveIteratorIterator( + new RecursiveDirectoryIterator($path, FilesystemIterator::SKIP_DOTS), + RecursiveIteratorIterator::CHILD_FIRST, + ); + foreach ($iterator as $item) { + /** @var SplFileInfo $item */ + if ($item->isDir()) { + rmdir($item->getPathname()); + } else { + unlink($item->getPathname()); + } + } + + rmdir($path); + } +}