/**
 * Recursively deletes a directory together with everything inside it.
 *
 * Does nothing when $path is not an existing directory. Entries are visited
 * children-first so that every directory is already empty when it is removed.
 * Symbolic links are deleted as links: their targets are never touched.
 *
 * @param string $path Directory to delete.
 */
private function removeDir(string $path): void
{
    if (!is_dir($path)) {
        return;
    }

    $iterator = new \RecursiveIteratorIterator(
        new \RecursiveDirectoryIterator($path, \FilesystemIterator::SKIP_DOTS),
        \RecursiveIteratorIterator::CHILD_FIRST,
    );
    foreach ($iterator as $item) {
        // isDir() follows symlinks, so a link to a directory must be excluded
        // explicitly: rmdir() on the link would fail and leave $path non-empty.
        if ($item->isDir() && !$item->isLink()) {
            rmdir($item->getPathname());
        } else {
            unlink($item->getPathname());
        }
    }

    rmdir($path);
}
$workingDirectory, 'importers' => [ + 'medium' => new MediumContentImporter(), 'telegram' => new TelegramContentImporter(), ], ], diff --git a/docs/commands.md b/docs/commands.md index ed8768d..606cdf7 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -144,7 +144,7 @@ Imports content from external sources into a YiiPress collection. **Arguments:** -- `source` — source type to import from (required). Currently supported: `telegram`. +- `source` — source type to import from (required). Currently supported: `medium`, `telegram`. **Common options:** @@ -191,6 +191,25 @@ Supports both single-chat exports (`result.json` with `messages` array) and full ./yiipress import telegram --directory=./telegram-data --content-dir=content ``` +### Medium Markdown import + +Imports Markdown files from a Medium Markdown export directory. The importer scans `posts/` first when it exists, otherwise it scans the provided directory recursively. + +**Importer options:** + +- `--directory` — path to the Medium Markdown export directory (required). Absolute or relative to project root. + +The importer reads `.md` files and converts YAML front matter plus body content into YiiPress entries. It preserves `title`, `date` / `published_at`, `canonical_url` / `url` as `origin`, `draft`, `published`, `tags`, and `categories`. When metadata is missing, the title is inferred from the first `# Heading` or filename, and the date can be inferred from filenames starting with `YYYY-MM-DD-`. + +Duplicate output filenames get numeric suffixes so earlier files are not overwritten. + +**Examples:** + +```bash +./yiipress import medium --directory=/path/to/medium-markdown-export +./yiipress import medium --directory=./medium --collection=blog +``` + ### Adding custom importers Importers implement `YiiPress\Import\ContentImporterInterface` and are registered via [Yii3 DI](https://yiisoft.github.io/docs/guide/concept/di-container.html) in `config/common/di/importer.php`. 
Each importer declares its own options via the `options()` method. See [Importing content](importing-content.md) for details. diff --git a/docs/importing-content.md b/docs/importing-content.md index ae12c12..44c33fc 100644 --- a/docs/importing-content.md +++ b/docs/importing-content.md @@ -60,6 +60,18 @@ Imports messages from a Telegram Desktop channel export (JSON format). See [commands.md](commands.md#yii-import) for usage details. +### MediumContentImporter + +Imports Markdown files from a Medium Markdown export directory. + +**Options:** + +- `--directory` — Path to the Medium Markdown export directory (required) + +The importer scans `posts/` first when present, otherwise the provided directory, and converts `.md` files into the selected YiiPress collection. It supports YAML front matter, preserves common metadata (`title`, date, origin URL, draft status, tags, and categories), infers missing titles/dates from headings and filenames, and avoids overwriting duplicate output filenames. + +See [commands.md](commands.md#medium-markdown-import) for usage details. + ## Writing a custom importer Create a class implementing `ContentImporterInterface`. Each importer declares its own options — a file-based importer might need a `directory`, while an API-based importer might need `url` and `api-key`. diff --git a/roadmap.md b/roadmap.md index f36c641..a0aebee 100644 --- a/roadmap.md +++ b/roadmap.md @@ -109,6 +109,6 @@ - [ ] WordPress - [ ] Jekyll - [ ] Hugo -- [ ] Medium exported Markdown +- [x] Medium exported Markdown - [ ] Ghost - [x] Telegram export diff --git a/src/Import/Medium/MediumContentImporter.php b/src/Import/Medium/MediumContentImporter.php new file mode 100644 index 0000000..487bab8 --- /dev/null +++ b/src/Import/Medium/MediumContentImporter.php @@ -0,0 +1,379 @@ +contentDirectory($sourceDirectory); + $contentFiles = $this->contentFiles($contentDirectory); + + $collectionDir = $targetDirectory . '/' . 
$collection; + FileHelper::ensureDirectory($collectionDir, 0o755); + + $importedFiles = []; + $skippedFiles = []; + $warnings = []; + $usedPaths = []; + + foreach ($contentFiles as $contentFile) { + $entry = $this->readContentFile($contentFile); + if ($entry === null) { + $skippedFiles[] = $contentFile; + $warnings[] = 'Skipped unreadable Medium Markdown file: ' . basename($contentFile); + continue; + } + + $path = $this->uniquePath($collectionDir, $this->filename($entry), $usedPaths); + file_put_contents($path, $this->buildMarkdownFile($entry)); + $importedFiles[] = $path; + } + + $this->ensureCollectionConfig($collectionDir, $collection); + + return new ImportResult( + totalMessages: count($contentFiles), + importedCount: count($importedFiles), + importedFiles: $importedFiles, + skippedFiles: $skippedFiles, + warnings: $warnings, + ); + } + + public function name(): string + { + return 'medium'; + } + + private function contentDirectory(string $sourceDirectory): string + { + $postsDirectory = $sourceDirectory . '/posts'; + + return is_dir($postsDirectory) ? 
/**
 * Lists the Markdown files found anywhere below the given directory.
 *
 * The directory tree is walked recursively. An entry is kept when it is a
 * regular file whose extension is "md", compared case-insensitively. The
 * resulting path names are ordered with sort() before being returned.
 *
 * @return list<string>
 */
private function contentFiles(string $contentDirectory): array
{
    $tree = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($contentDirectory, FilesystemIterator::SKIP_DOTS),
    );

    $markdownPaths = [];
    foreach ($tree as $node) {
        /** @var SplFileInfo $node */
        if (!$node->isFile()) {
            continue;
        }

        if (mb_strtolower($node->getExtension()) !== 'md') {
            continue;
        }

        $markdownPaths[] = $node->getPathname();
    }

    sort($markdownPaths);

    return $markdownPaths;
}
"\n", + ]; + } + + /** + * @return array{0: array, 1: string} + */ + private function splitFrontMatter(string $content): array + { + $content = str_replace("\r\n", "\n", $content); + if (!str_starts_with($content, "---\n")) { + return [[], $content]; + } + + $endPosition = strpos($content, "\n---\n", 4); + if ($endPosition === false) { + return [[], $content]; + } + + $data = yaml_parse(substr($content, 4, $endPosition - 4)); + if (!is_array($data)) { + return [[], substr($content, $endPosition + 5)]; + } + + $fields = []; + foreach ($data as $key => $value) { + if (is_string($key)) { + $fields[$key] = $value; + } + } + + return [$fields, substr($content, $endPosition + 5)]; + } + + /** + * @param array{ + * title: string, + * slug: string, + * date: string, + * origin: string, + * draft: bool, + * tags: list, + * categories: list, + * body: string + * } $entry + */ + private function filename(array $entry): string + { + return ($entry['date'] !== '' ? $entry['date'] . '-' : '') . $entry['slug'] . '.md'; + } + + /** + * @param array $usedPaths + */ + private function uniquePath(string $directory, string $filename, array &$usedPaths): string + { + $path = $directory . '/' . $filename; + if (!isset($usedPaths[$path]) && !file_exists($path)) { + $usedPaths[$path] = true; + return $path; + } + + $base = pathinfo($filename, PATHINFO_FILENAME); + $extension = pathinfo($filename, PATHINFO_EXTENSION); + $suffix = 2; + do { + $path = $directory . '/' . $base . '-' . $suffix . ($extension !== '' ? '.' . $extension : ''); + $suffix++; + } while (isset($usedPaths[$path]) || file_exists($path)); + + $usedPaths[$path] = true; + + return $path; + } + + /** + * @param array{ + * title: string, + * slug: string, + * date: string, + * origin: string, + * draft: bool, + * tags: list, + * categories: list, + * body: string + * } $entry + */ + private function buildMarkdownFile(array $entry): string + { + $frontMatter = "---\n"; + $frontMatter .= 'title: ' . 
/**
 * Normalises a front matter value into a list of non-empty, trimmed strings.
 *
 * - An array contributes its scalar items; nested arrays and other non-scalar
 *   items are ignored.
 * - A scalar is cast to string and split on commas when it contains one,
 *   otherwise on runs of whitespace.
 * - Anything else (for example null) yields an empty list.
 *
 * Items are only discarded when they are the empty string after trimming, so
 * a value such as "0" is kept.
 *
 * @return list<string>
 */
private function listField(mixed $value): array
{
    if (is_array($value)) {
        // Casting a nested array to string would produce the literal "Array".
        $items = array_filter($value, is_scalar(...));
    } else {
        $text = is_scalar($value) ? trim((string) $value) : '';
        if ($text === '') {
            return [];
        }

        $items = str_contains($text, ',')
            ? explode(',', $text)
            : (preg_split('/\s+/', $text) ?: []);
    }

    $items = array_map(static fn (mixed $item): string => trim((string) $item), $items);

    // Explicit comparison: the default array_filter() callback would also drop "0".
    return array_values(array_filter($items, static fn (string $item): bool => $item !== ''));
}
/**
 * Renders a string as a single-line YAML scalar.
 *
 * Carriage returns and line feeds are replaced with spaces. The value is
 * emitted unquoted only when it is unambiguous as a plain scalar: it starts
 * with a letter, has no surrounding whitespace, contains none of the YAML
 * indicator characters checked below, and is not a word that YAML readers
 * commonly resolve to a boolean or null. Every other value — including the
 * empty string, numbers, and text starting with "-" or "?" — is wrapped in
 * double quotes with `"` and `\` backslash-escaped, so it is read back as the
 * same string.
 */
private function yamlEscape(string $value): string
{
    $value = str_replace(["\r", "\n"], ' ', $value);

    // preg_match() returns false on invalid UTF-8; such values are quoted too.
    $isPlainSafe = preg_match('/^\p{L}[^:#\[\]{}|>&*!,\'"%@`]*\z/u', $value) === 1
        && $value === trim($value)
        && !in_array(
            strtolower($value),
            ['true', 'false', 'null', 'yes', 'no', 'on', 'off', 'y', 'n'],
            true,
        );

    if ($isPlainSafe) {
        return $value;
    }

    return '"' . addcslashes($value, '"\\') . '"';
}
'/posts/hello-medium.md', + "---\ntitle: Hello Medium\n---\n\nBody.\n", + ); + + $result = $this->runImport('medium', ['--directory' => $this->sourceDir]); + + assertSame(0, $result['exitCode'], $result['output']); + assertStringContainsString('Importing from medium', $result['output']); + assertStringContainsString('Imported: 1', $result['output']); + } + public function testShowsAvailableImportersOnError(): void { $result = $this->runImport('wordpress', ['--directory' => $this->sourceDir]); assertSame(65, $result['exitCode']); + assertStringContainsString('medium', $result['output']); assertStringContainsString('telegram', $result['output']); } diff --git a/tests/Unit/Import/MediumContentImporterTest.php b/tests/Unit/Import/MediumContentImporterTest.php new file mode 100644 index 0000000..f7d0218 --- /dev/null +++ b/tests/Unit/Import/MediumContentImporterTest.php @@ -0,0 +1,133 @@ +sourceDir = sys_get_temp_dir() . '/yiipress-medium-source-' . uniqid(); + $this->targetDir = sys_get_temp_dir() . '/yiipress-medium-target-' . uniqid(); + mkdir($this->sourceDir . '/posts', 0o755, true); + mkdir($this->targetDir, 0o755, true); + } + + protected function tearDown(): void + { + $this->removeDir($this->sourceDir); + $this->removeDir($this->targetDir); + } + + public function testImportsMarkdownWithYamlFrontMatter(): void + { + file_put_contents( + $this->sourceDir . '/posts/hello-medium.md', + "---\n" + . "title: \"Hello: Medium\"\n" + . "date: 2024-03-15T10:30:00Z\n" + . "canonical_url: https://medium.com/@author/hello-medium\n" + . "tags: php, yii\n" + . "categories:\n" + . " - docs\n" + . "draft: true\n" + . "---\n\n" + . "Body text.\n", + ); + + $result = (new MediumContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(1, $result->importedCount()); + assertSame(1, $result->totalMessages()); + assertSame([], $result->warnings()); + + $content = file_get_contents($this->targetDir . 
'/blog/2024-03-15-hello-medium.md'); + $this->assertNotFalse($content); + assertStringContainsString('title: "Hello: Medium"', $content); + assertStringContainsString('date: 2024-03-15', $content); + assertStringContainsString('origin: "https://medium.com/@author/hello-medium"', $content); + assertStringContainsString("draft: true\n", $content); + assertStringContainsString("tags:\n - php\n - yii\n", $content); + assertStringContainsString("categories:\n - docs\n", $content); + assertStringContainsString("Body text.\n", $content); + $this->assertFileExists($this->targetDir . '/blog/_collection.yaml'); + } + + public function testInfersTitleAndDateFromMarkdown(): void + { + file_put_contents( + $this->sourceDir . '/posts/2024-04-01-heading-title.md', + "# Heading Title\n\nBody.\n", + ); + + $result = (new MediumContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(1, $result->importedCount()); + $content = file_get_contents($this->targetDir . '/blog/2024-04-01-heading-title.md'); + $this->assertNotFalse($content); + assertStringContainsString('title: Heading Title', $content); + assertStringContainsString('date: 2024-04-01', $content); + } + + public function testDoesNotOverwriteDuplicateSlugs(): void + { + file_put_contents($this->sourceDir . '/posts/2024-05-01-duplicate.md', "# First\n"); + file_put_contents( + $this->sourceDir . '/posts/duplicate-copy.md', + "---\ndate: 2024-05-01\nslug: duplicate\n---\n# Second\n", + ); + + $result = (new MediumContentImporter())->import(['directory' => $this->sourceDir], $this->targetDir, 'blog'); + + assertSame(2, $result->importedCount()); + $this->assertFileExists($this->targetDir . '/blog/2024-05-01-duplicate.md'); + $this->assertFileExists($this->targetDir . 
/**
 * Recursively deletes a temporary test directory and everything inside it.
 *
 * Does nothing when $path is not an existing directory. Entries are visited
 * children-first so that every directory is already empty when it is removed.
 * Symbolic links are deleted as links: their targets are never touched.
 *
 * @param string $path Directory to delete.
 */
private function removeDir(string $path): void
{
    if (!is_dir($path)) {
        return;
    }

    $iterator = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($path, FilesystemIterator::SKIP_DOTS),
        RecursiveIteratorIterator::CHILD_FIRST,
    );
    foreach ($iterator as $item) {
        /** @var SplFileInfo $item */
        // isDir() follows symlinks, so a link to a directory must be excluded
        // explicitly: rmdir() on the link would fail and leave $path non-empty.
        if ($item->isDir() && !$item->isLink()) {
            rmdir($item->getPathname());
        } else {
            unlink($item->getPathname());
        }
    }

    rmdir($path);
}