From c23aa14d7acc8b6cdb90ac92e1365fc8aae816b7 Mon Sep 17 00:00:00 2001 From: Alexander Makarov Date: Sat, 13 Jun 2026 13:25:55 +0300 Subject: [PATCH] Add Hugo content importer --- benchmarks/HugoImporterBench.php | 76 ++++ config/common/di/importer.php | 2 + docs/commands.md | 28 +- docs/importing-content.md | 12 + roadmap.md | 2 +- src/Import/Hugo/HugoContentImporter.php | 422 ++++++++++++++++++ tests/Unit/Console/ImportCommandTest.php | 16 + tests/Unit/Import/HugoContentImporterTest.php | 141 ++++++ 8 files changed, 697 insertions(+), 2 deletions(-) create mode 100644 benchmarks/HugoImporterBench.php create mode 100644 src/Import/Hugo/HugoContentImporter.php create mode 100644 tests/Unit/Import/HugoContentImporterTest.php diff --git a/benchmarks/HugoImporterBench.php b/benchmarks/HugoImporterBench.php new file mode 100644 index 0000000..b65b422 --- /dev/null +++ b/benchmarks/HugoImporterBench.php @@ -0,0 +1,76 @@ +sourceDir = sys_get_temp_dir() . '/yiipress-hugo-bench-source-' . uniqid(); + $this->targetDir = sys_get_temp_dir() . '/yiipress-hugo-bench-target-' . uniqid(); + mkdir($this->sourceDir . '/content/posts', 0o755, true); + mkdir($this->targetDir, 0o755, true); + + for ($i = 1; $i <= 100; $i++) { + file_put_contents( + $this->sourceDir . '/content/posts/post-' . $i . '.md', + "+++\ntitle = \"Post $i\"\ndate = \"2024-03-15T10:30:00Z\"\ntags = [\"php\", \"yii\"]\n+++\n\nBody $i.\n", + ); + } + + $this->importer = new HugoContentImporter(); + } + + public function tearDown(): void + { + $this->removeDir($this->sourceDir); + $this->removeDir($this->targetDir); + } + + #[Revs(10)] + #[Iterations(3)] + #[Warmup(1)] + public function benchImportPosts(): void + { + $this->removeDir($this->targetDir); + mkdir($this->targetDir, 0o755, true); + + $this->importer->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + } + + private function removeDir(string $path): void + { + if (!is_dir($path)) { + return; + } + + $iterator = new \RecursiveIteratorIterator( + new \RecursiveDirectoryIterator($path, \FilesystemIterator::SKIP_DOTS), + \RecursiveIteratorIterator::CHILD_FIRST, + ); + foreach ($iterator as $item) { + if ($item->isDir()) { + rmdir($item->getPathname()); + } else { + unlink($item->getPathname()); + } + } + + rmdir($path); + } +} diff --git a/config/common/di/importer.php b/config/common/di/importer.php index 477b2f3..761e99e 100644 --- a/config/common/di/importer.php +++ b/config/common/di/importer.php @@ -3,6 +3,7 @@ declare(strict_types=1); use YiiPress\Console\ImportCommand; +use YiiPress\Import\Hugo\HugoContentImporter; use YiiPress\Import\Telegram\TelegramContentImporter; $workingDirectory = getcwd() ?: dirname(__DIR__, 3); @@ -12,6 +13,7 @@ '__construct()' => [ 'rootPath' => $workingDirectory, 'importers' => [ + 'hugo' => new HugoContentImporter(), 'telegram' => new TelegramContentImporter(), ], ], diff --git a/docs/commands.md b/docs/commands.md index ed8768d..8ca019e 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -144,7 +144,7 @@ Imports content from external sources into a YiiPress collection. **Arguments:** -- `source` — source type to import from (required). Currently supported: `telegram`. +- `source` — source type to import from (required). Currently supported: `hugo`, `telegram`. **Common options:** @@ -191,6 +191,32 @@ Supports both single-chat exports (`result.json` with `messages` array) and full ./yiipress import telegram --directory=./telegram-data --content-dir=content ``` +### Hugo import + +Imports Markdown content from a Hugo site directory. The importer scans `content/posts/`, then `content/post/`, then `content/`, reads `.md` files recursively, and writes YiiPress markdown files into the target collection. + +**Importer options:** + +- `--directory` — path to the Hugo site directory (required). Absolute or relative to project root. + +The importer supports YAML (`---`) and simple TOML (`+++`) front matter and preserves: + +- `title` +- `date` +- `url` / `permalink` +- `draft` +- `tags` +- `categories` + +If `title` is missing, it is inferred from the first `# Heading` in the post body and then from the filename slug. If `date` is missing, filenames starting with `YYYY-MM-DD-` provide the date. + +**Examples:** + +```bash +./yiipress import hugo --directory=/path/to/hugo-site +./yiipress import hugo --directory=../old-hugo-site --collection=blog +``` + ### Adding custom importers Importers implement `YiiPress\Import\ContentImporterInterface` and are registered via [Yii3 DI](https://yiisoft.github.io/docs/guide/concept/di-container.html) in `config/common/di/importer.php`. Each importer declares its own options via the `options()` method. See [Importing content](importing-content.md) for details. diff --git a/docs/importing-content.md b/docs/importing-content.md index ae12c12..84e2c3d 100644 --- a/docs/importing-content.md +++ b/docs/importing-content.md @@ -60,6 +60,18 @@ Imports messages from a Telegram Desktop channel export (JSON format). See [commands.md](commands.md#yii-import) for usage details. +### HugoContentImporter + +Imports Markdown content from a Hugo site. + +**Options:** + +- `--directory` — Path to the Hugo site directory (required) + +The importer scans `content/posts/`, then `content/post/`, then `content/`, accepts `.md` files, supports YAML (`---`) and simple TOML (`+++`) front matter, preserves common fields (`title`, `date`, `url` / `permalink`, `draft`, `tags`, `categories`), and creates a default collection config when one does not exist. + +See [commands.md](commands.md#hugo-import) for usage details. + ## Writing a custom importer Create a class implementing `ContentImporterInterface`. Each importer declares its own options — a file-based importer might need a `directory`, while an API-based importer might need `url` and `api-key`. diff --git a/roadmap.md b/roadmap.md index f36c641..8a3567c 100644 --- a/roadmap.md +++ b/roadmap.md @@ -108,7 +108,7 @@ - [ ] WordPress - [ ] Jekyll -- [ ] Hugo +- [x] Hugo - [ ] Medium exported Markdown - [ ] Ghost - [x] Telegram export diff --git a/src/Import/Hugo/HugoContentImporter.php b/src/Import/Hugo/HugoContentImporter.php new file mode 100644 index 0000000..850bec3 --- /dev/null +++ b/src/Import/Hugo/HugoContentImporter.php @@ -0,0 +1,422 @@ +contentDirectory($sourceDirectory); + if ($contentDirectory === '') { + return new ImportResult( + totalMessages: 0, + importedCount: 0, + importedFiles: [], + skippedFiles: [], + warnings: ["content directory not found in $sourceDirectory"], + ); + } + + $collectionDir = $targetDirectory . '/' . $collection; + FileHelper::ensureDirectory($collectionDir, 0o755); + + $contentFiles = $this->contentFiles($contentDirectory); + $importedFiles = []; + $skippedFiles = []; + $warnings = []; + + foreach ($contentFiles as $contentFile) { + $post = $this->readContentFile($contentFile); + if ($post === null) { + $skippedFiles[] = $contentFile; + $warnings[] = 'Skipped unreadable Hugo content file: ' . basename($contentFile); + continue; + } + + [$date, $slug, $fields, $body] = $post; + $filename = ($date !== '' ? $date . '-' : '') . $slug . '.md'; + $targetPath = $collectionDir . '/' . $filename; + file_put_contents($targetPath, $this->buildMarkdownFile($date, $slug, $fields, $body)); + $importedFiles[] = $targetPath; + } + + $this->ensureCollectionConfig($collectionDir, $collection); + + return new ImportResult( + totalMessages: count($contentFiles), + importedCount: count($importedFiles), + importedFiles: $importedFiles, + skippedFiles: $skippedFiles, + warnings: $warnings, + ); + } + + public function name(): string + { + return 'hugo'; + } + + private function contentDirectory(string $sourceDirectory): string + { + foreach (['content/posts', 'content/post', 'content'] as $relativePath) { + $path = $sourceDirectory . '/' . $relativePath; + if (is_dir($path)) { + return $path; + } + } + + return ''; + } + + /** + * @return list + */ + private function contentFiles(string $contentDirectory): array + { + $files = []; + $iterator = new RecursiveIteratorIterator( + new RecursiveDirectoryIterator($contentDirectory, FilesystemIterator::SKIP_DOTS), + ); + + foreach ($iterator as $item) { + /** @var SplFileInfo $item */ + if ($item->isFile() && strtolower($item->getExtension()) === 'md') { + $files[] = $item->getPathname(); + } + } + + sort($files); + + return $files; + } + + /** + * @return array{0: string, 1: string, 2: array, 3: string}|null + */ + private function readContentFile(string $contentFile): ?array + { + $content = file_get_contents($contentFile); + if ($content === false) { + return null; + } + + [$fields, $body] = $this->splitFrontMatter($content); + $filename = pathinfo($contentFile, PATHINFO_FILENAME); + $date = $this->datePart($this->stringField($fields['date'] ?? null)); + if ($date === '' && preg_match('/^(\d{4}-\d{2}-\d{2})-(.+)$/', $filename, $matches) === 1) { + $date = $matches[1]; + $filename = $matches[2]; + } + + $slug = $this->stringField($fields['slug'] ?? null); + if ($slug === '') { + $slug = $filename; + } + $slug = $this->filesystemSlug($slug); + + return [$date, $slug, $fields, trim($body) . "\n"]; + } + + /** + * @return array{0: array, 1: string} + */ + private function splitFrontMatter(string $content): array + { + $content = str_replace("\r\n", "\n", $content); + if (str_starts_with($content, "---\n")) { + return $this->splitDelimitedFrontMatter($content, '---', $this->parseYaml(...)); + } + if (str_starts_with($content, "+++\n")) { + return $this->splitDelimitedFrontMatter($content, '+++', $this->parseToml(...)); + } + + return [[], $content]; + } + + /** + * @param callable(string): array $parser + * @return array{0: array, 1: string} + */ + private function splitDelimitedFrontMatter(string $content, string $delimiter, callable $parser): array + { + $endPosition = strpos($content, "\n" . $delimiter . "\n", 4); + if ($endPosition === false) { + return [[], $content]; + } + + $fields = $parser(substr($content, 4, $endPosition - 4)); + $body = substr($content, $endPosition + 5); + + return [$fields, $body]; + } + + /** + * @return array + */ + private function parseYaml(string $frontMatter): array + { + $data = yaml_parse($frontMatter); + if (!is_array($data)) { + return []; + } + + $fields = []; + foreach ($data as $key => $value) { + if (is_string($key)) { + $fields[$key] = $value; + } + } + + return $fields; + } + + /** + * @return array + */ + private function parseToml(string $frontMatter): array + { + $fields = []; + foreach (explode("\n", $frontMatter) as $line) { + $line = trim($line); + if ($line === '' || str_starts_with($line, '#') || !str_contains($line, '=')) { + continue; + } + + $equalsPosition = strpos($line, '='); + if ($equalsPosition === false) { + continue; + } + + $key = trim(substr($line, 0, $equalsPosition)); + if ($key === '') { + continue; + } + + $fields[$key] = $this->parseTomlValue(trim(substr($line, $equalsPosition + 1))); + } + + return $fields; + } + + private function parseTomlValue(string $value): mixed + { + if (str_starts_with($value, '[') && str_ends_with($value, ']')) { + $items = trim(substr($value, 1, -1)); + if ($items === '') { + return []; + } + + return array_map( + fn (string $item): string => $this->unquoteTomlString(trim($item)), + explode(',', $items), + ); + } + + if ($value === 'true') { + return true; + } + if ($value === 'false') { + return false; + } + + return $this->unquoteTomlString($value); + } + + private function unquoteTomlString(string $value): string + { + if ( + (str_starts_with($value, '"') && str_ends_with($value, '"')) + || (str_starts_with($value, "'") && str_ends_with($value, "'")) + ) { + return substr($value, 1, -1); + } + + return $value; + } + + /** + * @param array $fields + */ + private function buildMarkdownFile(string $date, string $slug, array $fields, string $body): string + { + $title = $this->stringField($fields['title'] ?? null); + if ($title === '') { + $title = $this->titleFromBody($body) ?: ucwords(str_replace(['-', '_'], ' ', $slug)); + } + + $frontMatter = "---\n"; + $frontMatter .= 'title: ' . $this->yamlEscape($title) . "\n"; + + $dateValue = $this->stringField($fields['date'] ?? null, $date); + if ($dateValue !== '') { + $frontMatter .= 'date: ' . $dateValue . "\n"; + } + + $permalink = $this->stringField($fields['url'] ?? null); + if ($permalink === '') { + $permalink = $this->stringField($fields['permalink'] ?? null); + } + if ($permalink !== '') { + $frontMatter .= 'permalink: ' . $this->yamlEscape($permalink) . "\n"; + } + + if (($fields['draft'] ?? false) === true) { + $frontMatter .= "draft: true\n"; + } + + $tags = $this->listField($fields['tags'] ?? []); + if ($tags !== []) { + $frontMatter .= "tags:\n"; + foreach ($tags as $tag) { + $frontMatter .= ' - ' . $this->yamlEscape($tag) . "\n"; + } + } + + $categories = $this->listField($fields['categories'] ?? []); + if ($categories !== []) { + $frontMatter .= "categories:\n"; + foreach ($categories as $category) { + $frontMatter .= ' - ' . $this->yamlEscape($category) . "\n"; + } + } + + return $frontMatter . "---\n\n" . $body; + } + + private function datePart(string $date): string + { + return preg_match('/^(\d{4}-\d{2}-\d{2})/', $date, $matches) === 1 ? $matches[1] : ''; + } + + private function titleFromBody(string $body): string + { + return preg_match('/^#\s+(.+)$/m', $body, $matches) === 1 ? trim($matches[1]) : ''; + } + + private function filesystemSlug(string $slug): string + { + $slug = str_replace(['/', '\\'], '-', trim($slug)); + $slug = (string) preg_replace('/[<>:"|?*\x00-\x1F]+/', '-', $slug); + $slug = trim($slug, ". \t\n\r\0\x0B-"); + + return $slug === '' ? 'post' : $slug; + } + + private function stringField(mixed $value, string $default = ''): string + { + if ($value === null) { + return $default; + } + + return trim((string) $value); + } + + /** + * @return list + */ + private function listField(mixed $value): array + { + if (is_array($value)) { + return array_values(array_filter(array_map(static fn (mixed $item): string => trim((string) $item), $value))); + } + + $value = trim((string) $value); + if ($value === '') { + return []; + } + + $items = str_ends_with($value, ',') || str_contains($value, ',') + ? explode(',', $value) + : (preg_split('/\s+/', $value) ?: []); + + return array_values(array_filter(array_map(static fn (string $item): string => trim($item), $items))); + } + + private function ensureCollectionConfig(string $collectionDir, string $collection): void + { + $configPath = $collectionDir . '/_collection.yaml'; + if (is_file($configPath)) { + return; + } + + $config = 'title: ' . ucfirst($collection) . "\n"; + $config .= "sort_by: date\n"; + $config .= "sort_order: desc\n"; + $config .= "entries_per_page: 10\n"; + $config .= "feed: true\n"; + + file_put_contents($configPath, $config); + } + + private function yamlEscape(string $value): string + { + if (preg_match('/[:#\[\]{}|>&*!,\'"%@`]/', $value) === 1) { + return '"' . addcslashes($value, '"\\') . '"'; + } + + return $value; + } +} diff --git a/tests/Unit/Console/ImportCommandTest.php b/tests/Unit/Console/ImportCommandTest.php index a53cdb1..32b831d 100644 --- a/tests/Unit/Console/ImportCommandTest.php +++ b/tests/Unit/Console/ImportCommandTest.php @@ -103,11 +103,27 @@ public function testImportsToCustomCollection(): void assertStringContainsString('Imported: 1', $result['output']); } + public function testImportsHugoSite(): void + { + mkdir($this->sourceDir . '/content/posts', 0o755, true); + file_put_contents( + $this->sourceDir . '/content/posts/hello-hugo.md', + "+++\ntitle = \"Hello Hugo\"\n+++\n\nBody.\n", + ); + + $result = $this->runImport('hugo', ['--directory' => $this->sourceDir]); + + assertSame(0, $result['exitCode'], $result['output']); + assertStringContainsString('Importing from hugo', $result['output']); + assertStringContainsString('Imported: 1', $result['output']); + } + public function testShowsAvailableImportersOnError(): void { $result = $this->runImport('wordpress', ['--directory' => $this->sourceDir]); assertSame(65, $result['exitCode']); + assertStringContainsString('hugo', $result['output']); assertStringContainsString('telegram', $result['output']); } diff --git a/tests/Unit/Import/HugoContentImporterTest.php b/tests/Unit/Import/HugoContentImporterTest.php new file mode 100644 index 0000000..6a4a473 --- /dev/null +++ b/tests/Unit/Import/HugoContentImporterTest.php @@ -0,0 +1,141 @@ +sourceDir = sys_get_temp_dir() . '/yiipress-hugo-source-' . uniqid(); + $this->targetDir = sys_get_temp_dir() . '/yiipress-hugo-target-' . uniqid(); + mkdir($this->sourceDir . '/content/posts', 0o755, true); + mkdir($this->targetDir, 0o755, true); + } + + protected function tearDown(): void + { + $this->removeDir($this->sourceDir); + $this->removeDir($this->targetDir); + } + + public function testImportsTomlFrontMatterPosts(): void + { + file_put_contents( + $this->sourceDir . '/content/posts/hello-hugo.md', + "+++\n" + . "title = \"Hello: Hugo\"\n" + . "date = \"2024-03-15T10:30:00Z\"\n" + . "tags = [\"php\", \"yii\"]\n" + . "categories = [\"docs\", \"guides\"]\n" + . "url = \"/custom/hello/\"\n" + . "draft = true\n" + . "+++\n\n" + . "Body text.\n", + ); + + $result = (new HugoContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(1, $result->importedCount()); + assertSame(1, $result->totalMessages()); + assertSame([], $result->warnings()); + + $targetFile = $this->targetDir . '/blog/2024-03-15-hello-hugo.md'; + $content = file_get_contents($targetFile); + $this->assertNotFalse($content); + assertStringContainsString('title: "Hello: Hugo"', $content); + assertStringContainsString('date: 2024-03-15T10:30:00Z', $content); + assertStringContainsString('permalink: /custom/hello/', $content); + assertStringContainsString("draft: true\n", $content); + assertStringContainsString("tags:\n - php\n - yii\n", $content); + assertStringContainsString("categories:\n - docs\n - guides\n", $content); + assertStringContainsString("Body text.\n", $content); + $this->assertFileExists($this->targetDir . '/blog/_collection.yaml'); + } + + public function testImportsYamlFrontMatterPostsAndDerivesDateFromFilename(): void + { + file_put_contents( + $this->sourceDir . '/content/posts/2024-04-01-heading-title.md', + "---\n" + . "tags: php yii\n" + . "---\n\n" + . "# Heading Title\n\n" + . "Body.\n", + ); + + $result = (new HugoContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(1, $result->importedCount()); + $content = file_get_contents($this->targetDir . '/blog/2024-04-01-heading-title.md'); + $this->assertNotFalse($content); + assertStringContainsString('title: Heading Title', $content); + assertStringContainsString('date: 2024-04-01', $content); + assertStringContainsString("tags:\n - php\n - yii\n", $content); + } + + public function testWarnsWhenContentDirectoryIsMissing(): void + { + $this->removeDir($this->sourceDir . '/content'); + + $result = (new HugoContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(0, $result->importedCount()); + assertCount(1, $result->warnings()); + assertStringContainsString('content directory not found', $result->warnings()[0]); + } + + public function testNormalizesFrontMatterSlugForFilesystem(): void + { + file_put_contents( + $this->sourceDir . '/content/posts/unsafe.md', + "---\n" + . "title: Unsafe\n" + . "slug: ../../outside\n" + . "---\n\n" + . "Body.\n", + ); + + $result = (new HugoContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(1, $result->importedCount()); + $this->assertFileExists($this->targetDir . '/blog/outside.md'); + $this->assertFileDoesNotExist($this->targetDir . '/outside.md'); + } + + private function removeDir(string $path): void + { + if (!is_dir($path)) { + return; + } + + $iterator = new RecursiveIteratorIterator( + new RecursiveDirectoryIterator($path, FilesystemIterator::SKIP_DOTS), + RecursiveIteratorIterator::CHILD_FIRST, + ); + foreach ($iterator as $item) { + /** @var SplFileInfo $item */ + if ($item->isDir()) { + rmdir($item->getPathname()); + } else { + unlink($item->getPathname()); + } + } + rmdir($path); + } +}