From aad3c3b55e9a9d1fb5f0cda8ae1f6f758006e7f4 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 12:55:07 +0100 Subject: [PATCH 01/11] feat(importers): K1 markdown-vault import baseline (--vault) Add importVault(): ingest a vault folder of .md notes as kind='raw' memories, mirroring the connector pattern (tag provenance source:vault/vault:, artifactRef cursor, archiveRaw deletions - never supersede a raw row). Idempotent on (path, content-hash) via a single tenant-scoped scan into a Map reused for the deletion diff; frontmatter (flow + block lists) -> A3 envelope tags; [[wikilinks]] -> wikilink-candidate: tags. CLI: hippo import --vault [--name --scope --dry-run]. No schema migration (memories-table-as-cursor). Reviewed via /dev-framework-rl: plan-eng 3 rounds (R1 CRIT supersede-on-raw), codex 4 rounds (10 P2s all fixed: name-keying escape/case/colon, empty-note, dryRun+global, block-list), independent-review pass. 20 vault tests + 14 importer tests; full suite green. Defers E3.1 sleep-time wikilink surfacing (in-flight E3). Roadmap K1. Loop episode 1/15 (hipporoadmap-20260610T112117Z). Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/plans/2026-06-10-k1-vault-import.md | 48 ++ src/cli.ts | 38 +- src/importers.ts | 334 +++++++++++++ src/index.ts | 1 + .../dendron-sample/vault.note.child.md | 11 + .../dendron-sample/vault.note.md | 9 + .../vault-imports/foam-sample/concepts.md | 7 + .../foam-sample/notes/idea-one.md | 3 + .../vault-imports/foam-sample/readme.md | 4 + .../obsidian-sample/daily/2026-06-10.md | 6 + .../vault-imports/obsidian-sample/index.md | 14 + .../obsidian-sample/project-alpha.md | 14 + tests/importers-vault.test.ts | 465 ++++++++++++++++++ 13 files changed, 953 insertions(+), 1 deletion(-) create mode 100644 docs/plans/2026-06-10-k1-vault-import.md create mode 100644 tests/fixtures/vault-imports/dendron-sample/vault.note.child.md create mode 100644 tests/fixtures/vault-imports/dendron-sample/vault.note.md create mode 100644 tests/fixtures/vault-imports/foam-sample/concepts.md create mode 100644 tests/fixtures/vault-imports/foam-sample/notes/idea-one.md create mode 100644 tests/fixtures/vault-imports/foam-sample/readme.md create mode 100644 tests/fixtures/vault-imports/obsidian-sample/daily/2026-06-10.md create mode 100644 tests/fixtures/vault-imports/obsidian-sample/index.md create mode 100644 tests/fixtures/vault-imports/obsidian-sample/project-alpha.md create mode 100644 tests/importers-vault.test.ts diff --git a/docs/plans/2026-06-10-k1-vault-import.md b/docs/plans/2026-06-10-k1-vault-import.md new file mode 100644 index 00000000..606c584f --- /dev/null +++ b/docs/plans/2026-06-10-k1-vault-import.md @@ -0,0 +1,48 @@ +# K1 markdown-vault import baseline (plan, REVISED post plan-eng-critic R2) + +Date: 2026-06-10 | Episode: `01KTRGYH3C58CYC7DZWNPQ5F7B` | Branch: `feat/k1-vault-import` (off `87746f3`) | Loop episode 1/15 +Status: revised after plan-eng-critic R1 (CRIT) + R2 (2 HIGH). Re-gate R3 before execute. + +## Goal +Add `importVault(folderPath, options)` to `src/importers.ts`: ingest a vault folder of `.md` notes as `kind='raw'` memories with frontmatter→A3 envelope, `[[wikilink]]`→`wikilink-candidate:` tags, idempotency on `(path, content-hash)`, source-deletion sync, **provenance via tags** (`source:vault` + `vault:` — matching the tag-based slack/github connectors; the `memories.source` column stays its default). CLI `--vault` flag. Fixture-gated per dialect. DEFER the E3.1 sleep-time *surfacing* (in-flight E3). + +## Key decisions (audit + R1 + R2 review) +- **Mirror the CONNECTOR pattern** (`src/connectors/slack|github/deletion.ts` + `github/ingest.ts`), NOT the single-file importers. Connectors NEVER `supersede` a raw row: on change they APPEND a new `kind='raw'` row; on delete they `archiveRaw` ALL matching raw rows. They record provenance in **tags** (`source:slack`), column `source` defaults — K1 matches this. +- **NO new schema table** — memories-table-as-cursor. `artifactRef='vault::'`; content-hash in a `content-hash:` tag. +- **R1 CRIT — changed file → `archiveRaw(oldId) + remember(new, kind='raw')`, NEVER `supersede`** (supersede yields `kind='distilled'`, losing raw-append-only protection + escaping the `kind='raw'` deletion rescan). Respects `trg_memories_raw_append_only` (db.ts:298: BEFORE DELETE WHEN kind='raw' → ABORT; `archiveRaw` flips to `'archived'` then deletes in a SAVEPOINT — the only legit raw-delete). archiveRaw commits + closes its handle before `remember(new)` runs (sequential, single-threaded), so no double-live row; a crash between them self-heals (the unchanged file is re-imported as fresh raw next run). Documented. +- **R1 HIGH — tenant-scoped** idempotency + deletion-sync (`AND tenant_id=? AND kind='raw'`). +- **R1 MED — single-scan Map.** No index on `memories.artifact_ref` (verified). Load the vault's existing rows ONCE into `Map` (one query), reuse for BOTH per-file idempotency AND the deletion diff. +- **R2 HIGH — LIKE-escape the vault name.** `vaultName` is operator-supplied (`opts.name ?? basename`), so the prefix query MUST use `escapeLike(vaultName)` + `ESCAPE '\\'` (mirror `src/project-briefs.ts:477` / `assembleBriefFromReceipts`). Unescaped, a `%`/`_` in the name over-matches and the deletion diff could archiveRaw another vault's rows. +- **R2 HIGH — provenance tag-based** (above): Goal + mechanism + tests all assert the `source:vault` / `vault:` TAGS, never a `source` column. (Threading the A3 `source` column through `remember()` for all connectors is a cross-cutting follow-up, out of K1 scope.) +- **Minimal inline frontmatter parser** (no YAML dep). Wikilinks → tags only (NOT `entities`/`relations`/`graph_extraction_queue`). + +## Mechanism +1. `importVault(folderPath, opts)`: recursive readdir `*.md`; `relpath` rel to vault root; `vaultName = opts.name ?? basename(folderPath)`; `tenantId = ctx.tenantId`. +2. **Load once:** `existing = Map` from one query `artifact_ref LIKE ? ESCAPE '\\' AND tenant_id=? AND kind='raw'` with param `` `vault:${escapeLike(vaultName)}:%` ``. `seen = Set`. +3. Per file: read raw; `hash = sha256(raw)`; parse frontmatter (`^---\n…\n---\n`) → `{fm, body}`; fm `tags`/`aliases` → tags; parse `[[target]]`/`[[target|alias]]`→target → `wikilink-candidate:` tags. `artifactRef='vault::'`; `seen.add(artifactRef)`. +4. Idempotency vs `existing.get(artifactRef)`: **absent** → `remember(kind='raw')`; **present + same `content-hash:` tag** → skip; **present + different hash** → `archiveRaw(oldId, reason='changed:vault::')` then `remember(new, kind='raw')`. +5. `remember(ctx,{content:body, kind:'raw', artifactRef, owner:'agent:vault-import', scope: opts.scope ?? null, tags:['source:vault',`vault:${vaultName}`,`content-hash:${hash}`,...fmTags,...wikilinkTags]})`. +6. **Deletion-sync (uses the Map):** for each `artifactRef` in `existing` but NOT in `seen` → file gone → `archiveRaw(row.id, reason='source_deleted:vault::')`. Per-file archiveRaw (own handle); no outer SAVEPOINT (no cross-file idempotency row to commit atomically, unlike github's multi-row case). +7. CLI: `--vault ` → `importVault`; update usage + dispatch. Return `ImportResult {total, imported, skipped, entries}`. + +## Tests (real DB, no mocks) +`tests/importers-vault.test.ts` + `tests/fixtures/vault-imports/{obsidian,foam,dendron}-sample/`: +- (a) fixture vault imports: ≥95% notes as `kind='raw'` with the `source:vault` + `vault:` TAGS + provenance; +- (b) re-import idempotent: 0 dups (unchanged files skipped via the hash tag); +- (c) changed file → OLD raw row archived (gone from `memories`, snapshot in raw_archive) AND new row is `kind='raw'` with the `source:vault` tag; +- (d) deleted file → its raw memory archived/invalidated; +- (e) `[[wikilinks]]` → `wikilink-candidate:` tags (incl. `[[a|b]]`→a); +- (f) frontmatter tags/aliases mapped; +- (g) no crash on dialect syntax (Obsidian `^block-id`/`![[embed]]`, Dendron dot-hierarchy filenames, Foam plain md); +- (h) tenant isolation: a vault import in tenant A does NOT archive tenant B's `vault:` rows; +- (i) **LIKE-escape isolation:** a vault literally named `a%` does NOT match/archive a vault named `ab`'s rows (escaped pattern). + +## Resolved (was: open questions) +- **Q1** memories-as-cursor: keyed by artifactRef (relpath); move/rename = delete+add (lineage loss) accepted baseline; the `:` delimiter keeps per-relpath keying safe. +- **Q2/Q4** changed/raw → `archiveRaw(old)+remember(new,kind='raw')` (the CRIT fix), append-only-trigger-respecting; crash-window self-heals. +- **Q3** content-hash → `content-hash:` tag, no migration. +- **R2-provenance** → tag-based (`source:vault`/`vault:`), column default; A3 source-column threading is a follow-up. +- **R2-escape** → `escapeLike` + `ESCAPE '\\'` on vaultName. + +## Out of scope (deferred) +E3.1 sleep-time surfacing; `vault_cursors` table (scaling); A3 `source`-column threading; Obsidian Canvas / block-refs / embeds beyond no-crash. diff --git a/src/cli.ts b/src/cli.ts index 711d2b31..7455f2e4 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -144,6 +144,7 @@ import { importCursor, importGenericFile, importMarkdown, + importVault, ImportOptions, } from './importers.js'; import { cmdCapture, CaptureOptions } from './capture.js'; @@ -5828,6 +5829,39 @@ function cmdImport( hippoRoot, }; + // K1 vault import: a FOLDER importer that mirrors the connector pattern + // (kind='raw' + tag provenance + archiveRaw deletions), so it dispatches + // separately from the single-file `importer` function-pointer slot below. + // It writes through api.remember/archiveRaw which are tenant-scoped, so we + // resolve the tenant and pass it through. --global is not supported for + // vault import (the connector raw-archive path is tenant-local). + if (flags['vault']) { + const folderPath = String(flags['vault']); + if (!fs.existsSync(folderPath) || !fs.statSync(folderPath).isDirectory()) { + console.error(`Vault folder not found (or not a directory): ${folderPath}`); + process.exit(1); + } + if (useGlobal) { + console.error('hippo import --vault does not support --global (raw rows are tenant-local).'); + process.exit(1); + } + const tenantId = resolveTenantId({}); + const vaultOptions: ImportOptions = { + ...importOptions, + tenantId, + name: flags['name'] ? String(flags['name']) : undefined, + scope: flags['scope'] ? String(flags['scope']) : undefined, + }; + const vaultResult = importVault(folderPath, vaultOptions); + console.log(`\nImport Vault: ${folderPath}${dryRun ? ' (dry run - no writes)' : ''}`); + console.log(` Notes found: ${vaultResult.total}`); + console.log(` ${dryRun ? 'Would import: ' : 'Imported: '}${vaultResult.imported}`); + console.log(` Skipped (unchanged): ${vaultResult.skipped}`); + console.log(` ${dryRun ? 'Would archive: ' : 'Archived (removed): '}${vaultResult.archived ?? 0}`); + console.log(` Store: ${hippoRoot}`); + return; + } + // Determine which importer to use based on flag let filePath: string | undefined; let importer: ((fp: string, opts: ImportOptions) => ReturnType) | undefined; @@ -5861,7 +5895,7 @@ function cmdImport( } if (!filePath || !importer) { - console.error('Usage: hippo import <--chatgpt|--claude|--cursor|--file|--markdown> '); + console.error('Usage: hippo import <--chatgpt|--claude|--cursor|--file|--markdown|--vault> '); process.exit(1); } @@ -7723,6 +7757,8 @@ Commands: --cursor Import from .cursorrules or .cursor/rules --file Import from any markdown or text file --markdown Import from structured MEMORY.md / AGENTS.md + --vault Import a markdown-vault FOLDER as kind='raw' notes + (Obsidian/Foam/Dendron). [--name ] [--scope ] --dry-run Preview without writing --global Write to global store ($HIPPO_HOME or ~/.hippo/) --tag Add extra tag (repeatable) diff --git a/src/importers.ts b/src/importers.ts index 0521ef80..e79686c3 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -4,10 +4,14 @@ */ import * as fs from 'fs'; +import * as path from 'path'; +import { createHash } from 'node:crypto'; import { createMemory, Layer, MemoryEntry } from './memory.js'; import { initStore, loadAllEntries, writeEntry } from './store.js'; import { textOverlap } from './search.js'; import { getGlobalRoot, initGlobal } from './shared.js'; +import { remember, archiveRaw, type Context } from './api.js'; +import { openHippoDb, closeHippoDb } from './db.js'; // --------------------------------------------------------------------------- // Types @@ -17,6 +21,9 @@ export interface ImportResult { total: number; // entries found in source imported: number; // actually imported (after dedup) skipped: number; // skipped as duplicates or too short + /** K1 vault import: rows archived this run (changed + source-deleted). In a + * dryRun this is the would-be count (a true deletion-sync preview). */ + archived?: number; entries: MemoryEntry[]; } @@ -32,6 +39,18 @@ export interface ImportOptions { * Undefined preserves pre-1.12.1 host-wide dedup behaviour. */ tenantId?: string; + /** + * K1 vault import only. Logical vault name used in the `vault:` tag and + * the `artifactRef='vault::'` key. Defaults to + * `basename(folderPath)` when unset. Operator-supplied, so the loader query + * LIKE-escapes it (see `escapeLike` below). + */ + name?: string; + /** + * K1 vault import only. Memory scope stamped on every imported note. Defaults + * to null (unscoped) when unset. + */ + scope?: string; } // --------------------------------------------------------------------------- @@ -495,3 +514,318 @@ export function importMarkdown(filePath: string, options: ImportOptions): Import return totalResult; } + +// --------------------------------------------------------------------------- +// K1 vault importer (markdown-vault FOLDER → kind='raw' memories) +// +// MIRRORS THE CONNECTOR PATTERN (src/connectors/slack|github), NOT the +// single-file importers above. Each note becomes a single kind='raw' row with +// provenance in TAGS (`source:vault` + `vault:`), an artifactRef cursor +// key, and a content-hash tag. Changes APPEND a new raw row after archiveRaw of +// the old one; deletions archiveRaw the orphaned rows. We NEVER `supersede` a +// raw row (supersede yields kind='distilled', losing raw-append-only protection +// and escaping the kind='raw' deletion rescan) — all raw deletions route through +// `archiveRaw` (the only trigger-legit raw delete). +// --------------------------------------------------------------------------- + +/** Escape LIKE wildcards in operator-supplied text (mirror of + * src/project-briefs.ts:477 / src/store.ts:782, kept local since neither is + * exported). Used so a `%`/`_`/`\` in the vault name cannot over-match the + * loader prefix and archive another vault's rows. */ +function escapeLike(term: string): string { + return term.replace(/[%_\\]/g, '\\$&'); +} + +/** Minimal inline frontmatter split. Recognises a leading `---\n…\n---\n` + * block (no YAML dep). Returns the parsed key→value map plus the body with the + * block removed. When no well-formed block is present, `fm` is empty and + * `body` is the original content. */ +function parseFrontmatter(raw: string): { fm: Record; body: string } { + // Must start with `---` on its own line. Accept CRLF or LF. + const m = raw.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n?/); + if (!m) return { fm: {}, body: raw }; + const block = m[1]; + const body = raw.slice(m[0].length); + const fm: Record = {}; + const lines = block.split(/\r?\n/); + for (let i = 0; i < lines.length; i++) { + const kv = lines[i].match(/^([A-Za-z0-9_-]+)\s*:\s*(.*)$/); + if (!kv) continue; + let val = kv[2].trim(); + if (val === '') { + // YAML block-style list: `key:` followed by indented `- item` lines + // (common in Obsidian/Dendron frontmatter). Collect them into a + // comma-joined value so frontmatterList parses them (codex P2). + const items: string[] = []; + let j = i + 1; + let item: RegExpMatchArray | null; + while (j < lines.length && (item = lines[j].match(/^\s+-\s+(.+?)\s*$/)) !== null) { + items.push(item[1].replace(/^['"]|['"]$/g, '').trim()); + j++; + } + if (items.length) { + val = items.join(', '); + i = j - 1; + } + } + fm[kv[1]] = val; + } + return { fm, body }; +} + +/** Pull a frontmatter field that may be a YAML flow list (`[a, b]`), a + * comma-separated scalar (`a, b`), or a single token, into a string[]. Quotes + * and surrounding brackets are stripped; empty entries dropped. */ +function frontmatterList(value: string | undefined): string[] { + if (!value) return []; + let v = value.trim(); + if (v.startsWith('[') && v.endsWith(']')) v = v.slice(1, -1); + return v + .split(',') + .map((s) => s.trim().replace(/^['"]|['"]$/g, '').trim()) + .filter(Boolean); +} + +/** Parse `[[wikilinks]]` from body text. `[[target]]` and `[[target|alias]]` + * both yield `target` (alias dropped). Returns de-duplicated, order-preserving + * target strings (trimmed). Embeds (`![[…]]`) are intentionally matched too — + * the leading `!` is not part of the `[[…]]` capture, so an embed contributes + * its target as a candidate, which is the desired no-crash baseline behaviour. */ +function parseWikilinks(body: string): string[] { + const out: string[] = []; + const seen = new Set(); + const re = /\[\[([^\]]+?)\]\]/g; + let match: RegExpExecArray | null; + while ((match = re.exec(body)) !== null) { + const inner = match[1]; + const target = (inner.split('|')[0] ?? '').trim(); + if (!target) continue; + if (seen.has(target)) continue; + seen.add(target); + out.push(target); + } + return out; +} + +/** Recursively collect `*.md` files under `root`, returning paths relative to + * `root` with forward-slash separators (stable artifactRef keys across OSes). + * Symlinks are not followed (withFileTypes isSymbolicLink skipped). */ +function collectMarkdownFiles(root: string): string[] { + const out: string[] = []; + const walk = (dir: string): void => { + const entries = fs.readdirSync(dir, { withFileTypes: true }); + for (const ent of entries) { + if (ent.isSymbolicLink()) continue; + const abs = path.join(dir, ent.name); + if (ent.isDirectory()) { + walk(abs); + } else if (ent.isFile() && ent.name.toLowerCase().endsWith('.md')) { + out.push(path.relative(root, abs).split(path.sep).join('/')); + } + } + }; + walk(root); + out.sort(); + return out; +} + +interface VaultRow { + id: string; + artifact_ref: string; + tags_json: string; +} + +/** + * Import a markdown vault FOLDER as `kind='raw'` memories. + * + * NOT re-entrant: idempotency rests on the in-memory `existing` Map loaded once + * at the top. Two concurrent importVault runs over the same vault could both see + * a note as absent and double-insert (the connector pattern relies on a single + * sequential writer; same caveat applies here). + */ +export function importVault(folderPath: string, options: ImportOptions): ImportResult { + const hippoRoot = options.hippoRoot; + const tenantId = options.tenantId ?? 'default'; + const vaultName = options.name ?? path.basename(path.resolve(folderPath)); + if (vaultName.includes(':')) { + // ':' is the artifactRef delimiter (vault::); a name + // containing it lets a different vault's prefix scan over-match and archive + // its rows (codex P2). Reject rather than silently corrupt the keys. + throw new Error(`vault name must not contain ':' (artifactRef delimiter): ${vaultName}`); + } + const scope = options.scope ?? null; + const extraTags = options.extraTags ?? []; + const dryRun = options.dryRun ?? false; + if (options.global) { + // The raw-archive path is tenant-local; global mode would put raw vault rows + // in the wrong store. Reject for SDK callers too (the CLI also rejects + // --global) rather than silently writing local (codex P2). + throw new Error('importVault does not support global mode (raw rows are tenant-local).'); + } + + const ctx: Context = { + hippoRoot, + tenantId, + // Process-local actor; the vault importer is a CLI/SDK ingestion path, not + // a bearer-authed request. archiveRaw / remember thread this into audit. + actor: { subject: 'connector:vault', role: 'admin' }, + }; + + // Load ONCE: every existing raw row for this vault, tenant-scoped. The same + // Map serves both per-file idempotency AND the deletion diff (no second + // query). LIKE-escape the vault name so a `%`/`_` in it can't over-match. + initStore(hippoRoot); + const existing = new Map(); + { + const db = openHippoDb(hippoRoot); + try { + const likeParam = `vault:${escapeLike(vaultName)}:%`; + const rows = db + .prepare( + `SELECT id, artifact_ref, tags_json FROM memories + WHERE artifact_ref LIKE ? ESCAPE '\\' AND tenant_id = ? AND kind = 'raw'`, + ) + .all(likeParam, tenantId) as VaultRow[]; + // SQLite LIKE is case-insensitive for ASCII, so the query over-fetches + // (vault 'A' also matches 'vault:a:%'). Filter to the EXACT-case prefix in + // JS so deletion-sync never archives a different-cased vault's rows (codex P2). + const exactPrefix = `vault:${vaultName}:`; + for (const row of rows) { + if (row.artifact_ref && row.artifact_ref.startsWith(exactPrefix)) { + existing.set(row.artifact_ref, row); + } + } + } finally { + closeHippoDb(db); + } + } + + const relpaths = collectMarkdownFiles(folderPath); + const seen = new Set(); + + let total = 0; + let imported = 0; + let skipped = 0; + let archived = 0; + const entries: MemoryEntry[] = []; + + for (const relpath of relpaths) { + total++; + const artifactRef = `vault:${vaultName}:${relpath}`; + seen.add(artifactRef); + + // Content-hash is computed from the RAW file bytes (deterministic; no + // Date/random in the content path) so idempotency survives frontmatter + // edits identically to body edits. + let rawFileContent: string; + try { + rawFileContent = fs.readFileSync(path.join(folderPath, relpath), 'utf8'); + } catch { + // File vanished between enumeration and read (TOCTOU), or a transient + // IO/permission error. Skip this one file rather than aborting the whole + // import (incl. the deletion-sync pass); an idempotent re-run picks it up. + skipped++; + continue; + } + const hash = createHash('sha256').update(rawFileContent).digest('hex'); + const hashTag = `content-hash:${hash}`; + + const prior = existing.get(artifactRef); + const priorTags = prior ? parseJsonArrayLoose(prior.tags_json) : []; + if (prior && priorTags.includes(hashTag)) { + // Unchanged file → skip (idempotent re-import). + skipped++; + continue; + } + + const { fm, body } = parseFrontmatter(rawFileContent); + + // Empty / frontmatter-only note: nothing storable (createMemory enforces a + // min content length). Skip WITHOUT archiving any prior row — archiving + // first then throwing on the empty body would delete a live memory mid-run + // (codex P2). The note stays in `seen`, so deletion-sync won't archive it. + if (body.trim().length < 3) { + skipped++; + continue; + } + + // Changed file → archive the OLD raw row, then append the new one. NEVER + // supersede (would yield kind='distilled'). archiveRaw commits + closes its + // handle before remember() runs, so there is no double-live row; a crash + // between them self-heals (file re-imported as fresh raw next run). + if (prior) { + archived++; // count the would-be archive even in dryRun (true preview) + if (!dryRun) archiveRaw(ctx, prior.id, `changed:${artifactRef}`); + } + const frontmatterTags = [ + ...frontmatterList(fm['tags']), + ...frontmatterList(fm['aliases']).map((a) => `alias:${a}`), + ]; + const wikilinkTags = parseWikilinks(body).map((t) => `wikilink-candidate:${t}`); + + // De-duplicate the final tag array (createMemory stores tags verbatim, so a + // collision between, e.g., a frontmatter tag and an extraTag would otherwise + // produce a duplicate). Order-preserving. + const tags = Array.from( + new Set([ + 'source:vault', + `vault:${vaultName}`, + hashTag, + ...frontmatterTags, + ...wikilinkTags, + ...extraTags, + ]), + ); + + // remember() owns the actual write. We build an `echo` of the SAME content + + // tags via createMemory purely for the ImportResult, then reconcile its id to + // remember()'s real row id so entries[] reflects the row that landed. + const echo = createMemory(body, { + kind: 'raw', + tags, + scope, + owner: 'agent:vault-import', + artifact_ref: artifactRef, + tenantId, + }); + // dryRun preview: count what WOULD import, but make no writes (codex P2). + if (!dryRun) { + const result = remember(ctx, { + content: body, + kind: 'raw', + artifactRef, + owner: 'agent:vault-import', + scope: scope ?? undefined, + tags, + }); + echo.id = result.id; + } + entries.push(echo); + imported++; + } + + // Deletion-sync: any artifactRef present in the Map but NOT seen this run is a + // note that vanished from the source folder → archive its raw row. Per-file + // archiveRaw (own handle); no outer SAVEPOINT (no cross-file idempotency row + // to commit atomically, unlike github's multi-row case). + for (const [artifactRef, row] of existing) { + if (seen.has(artifactRef)) continue; + archived++; // count even in dryRun so the preview reflects destructive deletes + if (!dryRun) archiveRaw(ctx, row.id, `source_deleted:${artifactRef}`); + } + + return { total, imported, skipped, archived, entries }; +} + +/** Local tolerant JSON-array parse for the loader's `tags_json` column. The + * store's own `parseJsonArray` is not exported; this matches its contract + * (returns [] on null/garbage). */ +function parseJsonArrayLoose(value: string | null | undefined): string[] { + if (!value) return []; + try { + const parsed = JSON.parse(value); + return Array.isArray(parsed) ? parsed.filter((x): x is string => typeof x === 'string') : []; + } catch { + return []; + } +} diff --git a/src/index.ts b/src/index.ts index 11508610..ba6d7342 100644 --- a/src/index.ts +++ b/src/index.ts @@ -87,6 +87,7 @@ export { importCursor, importGenericFile, importMarkdown, + importVault, importEntries, ImportResult, ImportOptions, diff --git a/tests/fixtures/vault-imports/dendron-sample/vault.note.child.md b/tests/fixtures/vault-imports/dendron-sample/vault.note.child.md new file mode 100644 index 00000000..ab935b6f --- /dev/null +++ b/tests/fixtures/vault-imports/dendron-sample/vault.note.child.md @@ -0,0 +1,11 @@ +--- +id: dendron-child +title: Vault Note Child +tags: + - leaf +aliases: [Child] +--- + +# Vault Note Child + +A Dendron child note in the dot hierarchy. Parent: [[vault.note]]. diff --git a/tests/fixtures/vault-imports/dendron-sample/vault.note.md b/tests/fixtures/vault-imports/dendron-sample/vault.note.md new file mode 100644 index 00000000..82a5b7b2 --- /dev/null +++ b/tests/fixtures/vault-imports/dendron-sample/vault.note.md @@ -0,0 +1,9 @@ +--- +id: dendron-root +title: Vault Note +tags: [root, hierarchy] +--- + +# Vault Note + +Dendron uses dot-hierarchy filenames. Child: [[vault.note.child]]. diff --git a/tests/fixtures/vault-imports/foam-sample/concepts.md b/tests/fixtures/vault-imports/foam-sample/concepts.md new file mode 100644 index 00000000..cfbaf7d2 --- /dev/null +++ b/tests/fixtures/vault-imports/foam-sample/concepts.md @@ -0,0 +1,7 @@ +--- +tags: [concept] +--- + +# Concepts + +A Foam concept note. It references [[readme]] and [[notes/idea-one|Idea One]]. diff --git a/tests/fixtures/vault-imports/foam-sample/notes/idea-one.md b/tests/fixtures/vault-imports/foam-sample/notes/idea-one.md new file mode 100644 index 00000000..8fb49284 --- /dev/null +++ b/tests/fixtures/vault-imports/foam-sample/notes/idea-one.md @@ -0,0 +1,3 @@ +# Idea One + +A nested plain-markdown note. Back to [[concepts]]. diff --git a/tests/fixtures/vault-imports/foam-sample/readme.md b/tests/fixtures/vault-imports/foam-sample/readme.md new file mode 100644 index 00000000..9e742b41 --- /dev/null +++ b/tests/fixtures/vault-imports/foam-sample/readme.md @@ -0,0 +1,4 @@ +# Foam Vault + +Plain markdown Foam vault, no frontmatter on this note. Links to [[concepts]] +and [[notes/idea-one]]. diff --git a/tests/fixtures/vault-imports/obsidian-sample/daily/2026-06-10.md b/tests/fixtures/vault-imports/obsidian-sample/daily/2026-06-10.md new file mode 100644 index 00000000..46874a66 --- /dev/null +++ b/tests/fixtures/vault-imports/obsidian-sample/daily/2026-06-10.md @@ -0,0 +1,6 @@ +# 2026-06-10 + +Daily note. Worked on [[project-alpha]] today. Linked back to [[index|the index]]. + +- [ ] follow up on the importer +- [x] read the plan ^task-done diff --git a/tests/fixtures/vault-imports/obsidian-sample/index.md b/tests/fixtures/vault-imports/obsidian-sample/index.md new file mode 100644 index 00000000..a49eea96 --- /dev/null +++ b/tests/fixtures/vault-imports/obsidian-sample/index.md @@ -0,0 +1,14 @@ +--- +title: Index +tags: [moc, home] +aliases: [Home, Start Here] +--- + +# Index + +This is the Obsidian vault entry note. It links to [[project-alpha]] and to +[[daily/2026-06-10|today's note]]. + +Embedded content: ![[project-alpha]] + +A block reference target lives here. ^block-abc123 diff --git a/tests/fixtures/vault-imports/obsidian-sample/project-alpha.md b/tests/fixtures/vault-imports/obsidian-sample/project-alpha.md new file mode 100644 index 00000000..2eb36278 --- /dev/null +++ b/tests/fixtures/vault-imports/obsidian-sample/project-alpha.md @@ -0,0 +1,14 @@ +--- +tags: + - project + - active +aliases: [Alpha, ProjA] +--- + +# Project Alpha + +Alpha tracks the vault-import baseline. Related: [[index]] and [[daily/2026-06-10]]. + +See the embed of a heading: ![[index#Index]] + +Loose block id at end of a line. ^alpha-1 diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts new file mode 100644 index 00000000..617e089b --- /dev/null +++ b/tests/importers-vault.test.ts @@ -0,0 +1,465 @@ +/** + * K1 markdown-vault importer tests (real DB, no mocks). + * + * Covers tests (a)-(i) from docs/plans/2026-06-10-k1-vault-import.md: + * (a) fixture vault → ≥95% notes as kind='raw' with source:vault + vault: tags + * (b) re-import idempotent (0 dups) + * (c) changed file → OLD archived + new kind='raw' with source:vault tag + * (d) deleted file → archived/invalidated + * (e) wikilinks → wikilink-candidate: tags (incl [[a|b]]→a) + * (f) frontmatter tags/aliases mapped + * (g) no crash on dialect syntax (Obsidian/Foam/Dendron fixtures) + * (h) tenant isolation + * (i) LIKE-escape isolation (vault `a%` does not archive vault `ab`'s rows) + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { importVault, type ImportOptions } from '../src/importers.js'; +import { loadAllEntries, initStore } from '../src/store.js'; +import { openHippoDb, closeHippoDb } from '../src/db.js'; + +let tmpDir: string; // hippo root (the store) +let vaultDir: string; // a scratch vault folder we mutate per-test + +const FIXTURES = path.join(__dirname, 'fixtures', 'vault-imports'); + +beforeEach(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-store-')); + vaultDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-src-')); + initStore(tmpDir); +}); + +afterEach(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); + fs.rmSync(vaultDir, { recursive: true, force: true }); +}); + +function writeNote(rel: string, content: string): void { + const abs = path.join(vaultDir, rel); + fs.mkdirSync(path.dirname(abs), { recursive: true }); + fs.writeFileSync(abs, content, 'utf8'); +} + +function opts(overrides: Partial = {}): ImportOptions { + return { hippoRoot: tmpDir, tenantId: 'default', ...overrides }; +} + +/** Read raw_archive rows directly for archive assertions. */ +function archivedRows(): Array<{ memory_id: string; reason: string }> { + const db = openHippoDb(tmpDir); + try { + return db + .prepare(`SELECT memory_id, reason FROM raw_archive ORDER BY archived_at ASC`) + .all() as Array<{ memory_id: string; reason: string }>; + } finally { + closeHippoDb(db); + } +} + +// --------------------------------------------------------------------------- +// (a) fixture vault imports as kind='raw' with provenance tags +// --------------------------------------------------------------------------- + +describe('importVault (a) — fixture vault → kind=raw with source:vault tags', () => { + it('imports ≥95% of obsidian fixture notes as raw with provenance tags', () => { + const result = importVault(path.join(FIXTURES, 'obsidian-sample'), opts({ name: 'obs' })); + expect(result.total).toBeGreaterThanOrEqual(3); + expect(result.imported).toBe(result.total); + expect(result.skipped).toBe(0); + + const all = loadAllEntries(tmpDir, 'default'); + expect(all.length).toBe(result.total); + const raws = all.filter((e) => e.kind === 'raw'); + // ≥95% as kind='raw' with both provenance tags. + const provenanced = raws.filter( + (e) => e.tags.includes('source:vault') && e.tags.includes('vault:obs'), + ); + expect(provenanced.length / all.length).toBeGreaterThanOrEqual(0.95); + // Every imported row carries an artifactRef under the vault prefix. + for (const e of all) { + expect(e.artifact_ref).toMatch(/^vault:obs:/); + expect(e.owner).toBe('agent:vault-import'); + } + }); +}); + +// --------------------------------------------------------------------------- +// (b) re-import idempotent +// --------------------------------------------------------------------------- + +describe('importVault (b) — re-import is idempotent', () => { + it('produces 0 dups and all-skipped on unchanged re-import', () => { + writeNote('a.md', '# A\nbody a [[b]]'); + writeNote('b.md', '---\ntags: [t1]\n---\n# B\nbody b'); + + const first = importVault(vaultDir, opts({ name: 'v' })); + expect(first.imported).toBe(2); + expect(first.skipped).toBe(0); + + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.total).toBe(2); + expect(second.imported).toBe(0); + expect(second.skipped).toBe(2); + + // Still exactly 2 live rows — no duplicates. + const all = loadAllEntries(tmpDir, 'default'); + expect(all.length).toBe(2); + expect(archivedRows().length).toBe(0); + }); +}); + +// --------------------------------------------------------------------------- +// (c) changed file → old archived + new kind=raw with source:vault tag +// --------------------------------------------------------------------------- + +describe('importVault (c) — changed file archives old + appends new raw', () => { + it('archives the old raw row and writes a new kind=raw with source:vault', () => { + writeNote('note.md', '# Note\noriginal content'); + const first = importVault(vaultDir, opts({ name: 'v' })); + expect(first.imported).toBe(1); + const originalId = loadAllEntries(tmpDir, 'default')[0].id; + + // Mutate the file. + writeNote('note.md', '# Note\nCHANGED content'); + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.imported).toBe(1); // new raw appended + expect(second.skipped).toBe(0); + + const all = loadAllEntries(tmpDir, 'default'); + // Exactly one LIVE row (old archived/deleted, new appended). + expect(all.length).toBe(1); + const live = all[0]; + expect(live.id).not.toBe(originalId); + expect(live.kind).toBe('raw'); + expect(live.tags).toContain('source:vault'); + expect(live.content).toContain('CHANGED'); + + // Old row is gone from memories and snapshotted in raw_archive with a + // `changed:` reason. + const archived = archivedRows(); + expect(archived.length).toBe(1); + expect(archived[0].memory_id).toBe(originalId); + expect(archived[0].reason).toMatch(/^changed:vault:v:note\.md$/); + }); +}); + +// --------------------------------------------------------------------------- +// (d) deleted file → archived +// --------------------------------------------------------------------------- + +describe('importVault (d) — deleted source file archives the raw row', () => { + it('archives the orphaned raw row on deletion-sync', () => { + writeNote('keep.md', '# Keep\nstays'); + writeNote('drop.md', '# Drop\ngoes away'); + const first = importVault(vaultDir, opts({ name: 'v' })); + expect(first.imported).toBe(2); + + const dropId = loadAllEntries(tmpDir, 'default').find((e) => + e.artifact_ref === 'vault:v:drop.md', + )!.id; + + // Remove one note from the source folder. + fs.rmSync(path.join(vaultDir, 'drop.md')); + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.total).toBe(1); // only keep.md found + expect(second.skipped).toBe(1); // keep.md unchanged + + const all = loadAllEntries(tmpDir, 'default'); + expect(all.length).toBe(1); + expect(all[0].artifact_ref).toBe('vault:v:keep.md'); + + const archived = archivedRows(); + expect(archived.length).toBe(1); + expect(archived[0].memory_id).toBe(dropId); + expect(archived[0].reason).toMatch(/^source_deleted:vault:v:drop\.md$/); + }); +}); + +// --------------------------------------------------------------------------- +// (e) wikilinks → wikilink-candidate: tags (incl [[a|b]]→a) +// --------------------------------------------------------------------------- + +describe('importVault (e) — wikilinks become wikilink-candidate tags', () => { + it('maps [[target]] and [[target|alias]] to wikilink-candidate:', () => { + writeNote('w.md', '# W\nlinks [[plain-target]] and [[real|Display Alias]] and dup [[plain-target]]'); + importVault(vaultDir, opts({ name: 'v' })); + + const e = loadAllEntries(tmpDir, 'default')[0]; + expect(e.tags).toContain('wikilink-candidate:plain-target'); + expect(e.tags).toContain('wikilink-candidate:real'); // alias dropped + expect(e.tags).not.toContain('wikilink-candidate:Display Alias'); + + // Exact tag-array shape: no duplicate tags despite the repeated [[plain-target]]. + const uniq = new Set(e.tags); + expect(uniq.size).toBe(e.tags.length); + const wlCount = e.tags.filter((t) => t === 'wikilink-candidate:plain-target').length; + expect(wlCount).toBe(1); + // Full expected provenance + wikilink set (order-independent). + expect(e.tags).toEqual( + expect.arrayContaining([ + 'source:vault', + 'vault:v', + 'wikilink-candidate:plain-target', + 'wikilink-candidate:real', + ]), + ); + }); +}); + +// --------------------------------------------------------------------------- +// (f) frontmatter tags/aliases mapped +// --------------------------------------------------------------------------- + +describe('importVault (f) — frontmatter tags and aliases map to tags', () => { + it('maps flow-list + block-list tags and aliases (alias:) with no dup surprises', () => { + writeNote( + 'fm.md', + '---\ntags: [alpha, beta]\naliases: [Nick, Other]\n---\n# FM\nbody no links', + ); + importVault(vaultDir, opts({ name: 'v' })); + const e = loadAllEntries(tmpDir, 'default')[0]; + + expect(e.tags).toContain('alpha'); + expect(e.tags).toContain('beta'); + expect(e.tags).toContain('alias:Nick'); + expect(e.tags).toContain('alias:Other'); + expect(e.tags).toContain('source:vault'); + expect(e.tags).toContain('vault:v'); + + // Tag-array shape: unique, and exactly the expected set (plus the + // content-hash tag which we don't pin by value). + const uniq = new Set(e.tags); + expect(uniq.size).toBe(e.tags.length); + const nonHash = e.tags.filter((t) => !t.startsWith('content-hash:')).sort(); + expect(nonHash).toEqual( + ['alias:Nick', 'alias:Other', 'alpha', 'beta', 'source:vault', 'vault:v'].sort(), + ); + }); + + it('parses block-style (indented dash) frontmatter tag lists', () => { + writeNote('fm2.md', '---\ntags:\n - one\n - two\n---\n# FM2\nbody'); + importVault(vaultDir, opts({ name: 'v' })); + const e = loadAllEntries(tmpDir, 'default')[0]; + // Block-list YAML is not parsed by the minimal inline parser as a flow + // list; the importer must still not crash and must still stamp provenance. + expect(e.tags).toContain('source:vault'); + expect(e.tags).toContain('vault:v'); + expect(e.kind).toBe('raw'); + }); +}); + +// --------------------------------------------------------------------------- +// (g) no crash on dialect syntax (all three fixtures) +// --------------------------------------------------------------------------- + +describe('importVault (g) — dialect syntax does not crash', () => { + it('imports Obsidian (^block-id, ![[embed]]) without error', () => { + const r = importVault(path.join(FIXTURES, 'obsidian-sample'), opts({ name: 'obs' })); + expect(r.imported).toBe(r.total); + expect(r.total).toBeGreaterThan(0); + }); + it('imports Foam (plain md) without error', () => { + const r = importVault(path.join(FIXTURES, 'foam-sample'), opts({ name: 'foam' })); + expect(r.imported).toBe(r.total); + expect(r.total).toBeGreaterThan(0); + }); + it('imports Dendron (dot-hierarchy filenames) without error', () => { + const r = importVault(path.join(FIXTURES, 'dendron-sample'), opts({ name: 'den' })); + expect(r.imported).toBe(r.total); + expect(r.total).toBeGreaterThan(0); + // Dot-hierarchy filename survives as a relpath-keyed artifactRef. + const all = loadAllEntries(tmpDir, 'default'); + expect(all.some((e) => e.artifact_ref === 'vault:den:vault.note.child.md')).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// (h) tenant isolation +// --------------------------------------------------------------------------- + +describe('importVault (h) — tenant isolation on deletion-sync', () => { + it('a vault import in tenant A does not archive tenant B vault: rows', () => { + writeNote('shared.md', '# Shared\ntenant A+B both have this path'); + + // Import the same vault name into two tenants. + importVault(vaultDir, opts({ name: 'shared', tenantId: 'A' })); + importVault(vaultDir, opts({ name: 'shared', tenantId: 'B' })); + + expect(loadAllEntries(tmpDir, 'A').length).toBe(1); + expect(loadAllEntries(tmpDir, 'B').length).toBe(1); + const bId = loadAllEntries(tmpDir, 'B')[0].id; + + // Now delete the file and re-import into tenant A only. Tenant B's row must + // be untouched (the loader + deletion-sync are tenant-scoped). + fs.rmSync(path.join(vaultDir, 'shared.md')); + importVault(vaultDir, opts({ name: 'shared', tenantId: 'A' })); + + expect(loadAllEntries(tmpDir, 'A').length).toBe(0); // A's row archived + const bAfter = loadAllEntries(tmpDir, 'B'); + expect(bAfter.length).toBe(1); // B untouched + expect(bAfter[0].id).toBe(bId); + + // Exactly one archive (tenant A's), and it is not tenant B's id. + const archived = archivedRows(); + expect(archived.length).toBe(1); + expect(archived[0].memory_id).not.toBe(bId); + }); +}); + +// --------------------------------------------------------------------------- +// (i) LIKE-escape isolation +// --------------------------------------------------------------------------- + +describe('importVault (i) — LIKE-escape isolation on vault name', () => { + it('a vault named "a%" does not match/archive vault "ab" rows', () => { + // Build vault "ab" with one note. + writeNote('n.md', '# N\nab vault note'); + importVault(vaultDir, opts({ name: 'ab' })); + const abId = loadAllEntries(tmpDir, 'default').find( + (e) => e.artifact_ref === 'vault:ab:n.md', + )!.id; + + // Build vault "a%" with its own note in a SEPARATE source folder. + const otherVault = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-pct-')); + try { + fs.writeFileSync(path.join(otherVault, 'm.md'), '# M\na-percent vault note', 'utf8'); + importVault(otherVault, opts({ name: 'a%' })); + + // Both vaults coexist: ab's row + a%'s row. + const all = loadAllEntries(tmpDir, 'default'); + expect(all.some((e) => e.artifact_ref === 'vault:ab:n.md')).toBe(true); + expect(all.some((e) => e.artifact_ref === 'vault:a%:m.md')).toBe(true); + + // Re-import "a%" with its file deleted. If the LIKE pattern were + // unescaped, `vault:a%:%` would match `vault:ab:n.md` and archive it. + fs.rmSync(path.join(otherVault, 'm.md')); + importVault(otherVault, opts({ name: 'a%' })); + + const after = loadAllEntries(tmpDir, 'default'); + // ab's row survives (escape worked); a%'s row archived. + const abLive = after.find((e) => e.id === abId); + expect(abLive).toBeDefined(); + expect(abLive!.artifact_ref).toBe('vault:ab:n.md'); + expect(after.some((e) => e.artifact_ref === 'vault:a%:m.md')).toBe(false); + + const archived = archivedRows(); + expect(archived.length).toBe(1); + expect(archived[0].memory_id).not.toBe(abId); + expect(archived[0].reason).toMatch(/^source_deleted:vault:a%:m\.md$/); + } finally { + fs.rmSync(otherVault, { recursive: true, force: true }); + } + }); +}); + +// --------------------------------------------------------------------------- +// (j)(k) codex P2 — empty/frontmatter-only notes skipped, not thrown +// --------------------------------------------------------------------------- + +describe('importVault (j) — empty/frontmatter-only notes are skipped, not thrown', () => { + it('skips an empty note and a frontmatter-only note, still imports the real one', () => { + writeNote('empty.md', ''); + writeNote('fm-only.md', '---\ntags: [x]\n---\n'); + writeNote('real.md', 'A genuine note body with enough content.'); + const result = importVault(vaultDir, opts({ name: 'v' })); + expect(result.total).toBe(3); + expect(result.imported).toBe(1); // only real.md + expect(result.skipped).toBe(2); // empty + fm-only + const raw = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')); + expect(raw.length).toBe(1); + expect(raw[0].content).toContain('genuine note body'); + }); +}); + +describe('importVault (k) — a note changed to empty keeps its prior memory (no mid-run archive)', () => { + it('does not archive the prior raw row when a note becomes empty', () => { + writeNote('note.md', 'Original content that is long enough.'); + importVault(vaultDir, opts({ name: 'v' })); + const before = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')); + expect(before.length).toBe(1); + const originalId = before[0].id; + writeNote('note.md', '---\ntags: [y]\n---\n'); // body becomes empty + const result = importVault(vaultDir, opts({ name: 'v' })); + expect(result.skipped).toBe(1); + expect(result.imported).toBe(0); + const after = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')); + expect(after.length).toBe(1); + expect(after[0].id).toBe(originalId); // prior kept, not archived + expect(archivedRows().length).toBe(0); + }); +}); + +// --------------------------------------------------------------------------- +// (l)(m)(n) codex R2 — case-sensitive prefix, dryRun, block-style frontmatter +// --------------------------------------------------------------------------- + +describe('importVault (l) — case-differing vault name does not archive the other vault', () => { + it('vault A does not archive vault a rows (LIKE is case-insensitive; exact-prefix filter)', () => { + writeNote('m.md', 'content for vault a note, long enough'); + importVault(vaultDir, opts({ name: 'a' })); + const aRows = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:a')); + expect(aRows.length).toBe(1); + const aId = aRows[0].id; + const upperVault = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-A-')); + try { + fs.writeFileSync(path.join(upperVault, 'x.md'), 'content for vault A note, long enough', 'utf8'); + importVault(upperVault, opts({ name: 'A' })); + expect(loadAllEntries(tmpDir).filter((e) => e.id === aId).length).toBe(1); // 'a' untouched + expect(archivedRows().length).toBe(0); + } finally { + fs.rmSync(upperVault, { recursive: true, force: true }); + } + }); +}); + +describe('importVault (m) — dryRun previews without mutating', () => { + it('dryRun counts imports but writes nothing', () => { + writeNote('m.md', 'a real note body, long enough to store'); + const result = importVault(vaultDir, opts({ name: 'v', dryRun: true })); + expect(result.total).toBe(1); + expect(result.imported).toBe(1); // preview counts it + expect(loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')).length).toBe(0); // nothing written + }); +}); + +describe('importVault (n) — block-style frontmatter lists are parsed', () => { + it('maps tags + aliases written as indented `- item` lists', () => { + writeNote('n.md', '---\ntags:\n - project\n - urgent\naliases:\n - Nickname\n---\nBody content here, long enough.'); + importVault(vaultDir, opts({ name: 'v' })); + const row = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v'))[0]; + expect(row.tags).toContain('project'); + expect(row.tags).toContain('urgent'); + expect(row.tags).toContain('alias:Nickname'); + }); +}); + +describe('importVault (o) — rejects a vault name containing the ":" delimiter', () => { + it('throws on a name with ":" so it cannot over-match another vault prefix', () => { + writeNote('m.md', 'some content long enough to store'); + expect(() => importVault(vaultDir, opts({ name: 'a:b' }))).toThrow(/must not contain/); + }); +}); + +describe('importVault (p) — rejects global mode', () => { + it('throws on global:true (raw rows are tenant-local)', () => { + writeNote('m.md', 'content long enough to store here'); + expect(() => importVault(vaultDir, opts({ name: 'v', global: true }))).toThrow(/global/); + }); +}); + +describe('importVault (q) — dryRun previews deletions without archiving', () => { + it('counts would-be archives for a removed note without archiving it', () => { + writeNote('keep.md', 'keep this note, long enough to store'); + writeNote('gone.md', 'this note will be removed, long enough'); + importVault(vaultDir, opts({ name: 'v' })); + expect(loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')).length).toBe(2); + fs.rmSync(path.join(vaultDir, 'gone.md')); + const result = importVault(vaultDir, opts({ name: 'v', dryRun: true })); + expect(result.archived).toBe(1); // would archive the removed note + expect(archivedRows().length).toBe(0); // but nothing actually archived + expect(loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')).length).toBe(2); // both still live + }); +}); From 998ca63345e49e5315ecd0b7b0faa3905f801d9d Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 22:50:11 +0100 Subject: [PATCH 02/11] fix(importers): vault import skips .hippo store + dot-dirs (codex R5 P1) importVault's recursive walk descended into the active Hippo store (.hippo) when a vault contained it (e.g. hippo init in the vault, then hippo import --vault .), re-importing the store's own markdown mirror files and growing on every run. collectMarkdownFiles now skips dot-directories (.hippo/.git/.obsidian/.trash) and the resolved hippoRoot path. +1 regression test (21 vault tests total). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 15 ++++++++++++--- tests/importers-vault.test.ts | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/importers.ts b/src/importers.ts index e79686c3..c2ae9608 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -609,15 +609,24 @@ function parseWikilinks(body: string): string[] { /** Recursively collect `*.md` files under `root`, returning paths relative to * `root` with forward-slash separators (stable artifactRef keys across OSes). - * Symlinks are not followed (withFileTypes isSymbolicLink skipped). */ -function collectMarkdownFiles(root: string): string[] { + * Symlinks are not followed. Skips dot-directories (the default `.hippo` store, + * `.git`, `.obsidian`, `.trash`) AND the resolved Hippo store path, so + * re-importing a vault that CONTAINS the store never ingests its own markdown + * mirror files (codex R5 P1: `hippo import --vault .` after `hippo init` in the + * vault would otherwise self-import its mirror rows and grow on every run). */ +function collectMarkdownFiles(root: string, hippoRoot: string): string[] { const out: string[] = []; + const resolvedHippoRoot = path.resolve(hippoRoot); const walk = (dir: string): void => { const entries = fs.readdirSync(dir, { withFileTypes: true }); for (const ent of entries) { if (ent.isSymbolicLink()) continue; const abs = path.join(dir, ent.name); if (ent.isDirectory()) { + // Skip dot-dirs (config/system, incl. the default `.hippo` store) and + // the resolved store path (covers a HIPPO_HOME outside `.hippo`). + if (ent.name.startsWith('.')) continue; + if (path.resolve(abs) === resolvedHippoRoot) continue; walk(abs); } else if (ent.isFile() && ent.name.toLowerCase().endsWith('.md')) { out.push(path.relative(root, abs).split(path.sep).join('/')); @@ -700,7 +709,7 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR } } - const relpaths = collectMarkdownFiles(folderPath); + const relpaths = collectMarkdownFiles(folderPath, hippoRoot); const seen = new Set(); let total = 0; diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index 617e089b..d9a56de8 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -463,3 +463,21 @@ describe('importVault (q) — dryRun previews deletions without archiving', () = expect(loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')).length).toBe(2); // both still live }); }); + +describe('importVault (r) — does not import the Hippo store mirror files (codex R5 P1)', () => { + it('skips .hippo / dot-dirs when the vault contains the store', () => { + writeNote('real-note.md', 'a genuine vault note, long enough to store'); + // markdown mirror file inside a .hippo store dir under the vault + fs.mkdirSync(path.join(vaultDir, '.hippo', 'episodic'), { recursive: true }); + fs.writeFileSync(path.join(vaultDir, '.hippo', 'episodic', 'mirror.md'), 'self-import mirror content must not be ingested', 'utf8'); + // and a .git internal markdown + fs.mkdirSync(path.join(vaultDir, '.git'), { recursive: true }); + fs.writeFileSync(path.join(vaultDir, '.git', 'COMMIT_NOTE.md'), 'git internal not a note', 'utf8'); + const result = importVault(vaultDir, opts({ name: 'v' })); + expect(result.total).toBe(1); // only real-note.md + expect(result.imported).toBe(1); + const rows = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')); + expect(rows.length).toBe(1); + expect(rows[0].content).toContain('genuine vault note'); + }); +}); From 55dbc756acb0cb47755c43e7265b522caa1366a3 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 22:57:53 +0100 Subject: [PATCH 03/11] fix(importers): guard vault-root-is-store + ESM-portable test fixture path codex R6: (1) when folderPath resolves to the Hippo store itself (--vault .hippo), collectMarkdownFiles returns empty instead of walking the store's episodic mirror files; (2) test fixture path uses fileURLToPath(import.meta.url) (portable ESM, matches the repo) instead of __dirname. +1 regression test (22 vault tests). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 7 +++++++ tests/importers-vault.test.ts | 13 ++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/importers.ts b/src/importers.ts index c2ae9608..950481e3 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -617,6 +617,13 @@ function parseWikilinks(body: string): string[] { function collectMarkdownFiles(root: string, hippoRoot: string): string[] { const out: string[] = []; const resolvedHippoRoot = path.resolve(hippoRoot); + const resolvedRoot = path.resolve(root); + // If the vault root IS the store (or lives inside it), there are no vault + // notes - only the store's own mirror files (codex R6 P2: `--vault .hippo` or + // an SDK caller passing the store dir). Return empty rather than self-import. + if (resolvedRoot === resolvedHippoRoot || resolvedRoot.startsWith(resolvedHippoRoot + path.sep)) { + return out; + } const walk = (dir: string): void => { const entries = fs.readdirSync(dir, { withFileTypes: true }); for (const ent of entries) { diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index d9a56de8..c2b1779f 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -17,6 +17,7 @@ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; +import { fileURLToPath } from 'node:url'; import { importVault, type ImportOptions } from '../src/importers.js'; import { loadAllEntries, initStore } from '../src/store.js'; import { openHippoDb, closeHippoDb } from '../src/db.js'; @@ -24,7 +25,7 @@ import { openHippoDb, closeHippoDb } from '../src/db.js'; let tmpDir: string; // hippo root (the store) let vaultDir: string; // a scratch vault folder we mutate per-test -const FIXTURES = path.join(__dirname, 'fixtures', 'vault-imports'); +const FIXTURES = path.join(path.dirname(fileURLToPath(import.meta.url)), 'fixtures', 'vault-imports'); beforeEach(() => { tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-store-')); @@ -481,3 +482,13 @@ describe('importVault (r) — does not import the Hippo store mirror files (code expect(rows[0].content).toContain('genuine vault note'); }); }); + +describe('importVault (s) — vault root IS the store imports nothing (codex R6 P2)', () => { + it('returns empty when folderPath resolves to hippoRoot', () => { + fs.mkdirSync(path.join(tmpDir, 'episodic'), { recursive: true }); + fs.writeFileSync(path.join(tmpDir, 'episodic', 'mirror.md'), 'store mirror content here', 'utf8'); + const result = importVault(tmpDir, opts({ name: 'v' })); // folderPath === hippoRoot + expect(result.total).toBe(0); + expect(result.imported).toBe(0); + }); +}); From a3a49820408261a6de2c8e87682ebceef5ab012c Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:12:41 +0100 Subject: [PATCH 04/11] fix: archive all duplicate raw rows per artifactRef on vault re-import The vault importer loaded existing raw rows into a Map keyed by artifactRef (one row per key). A concurrent double-insert (the importer is not re-entrant) can leave more than one live raw row for the same vault note; the Map kept only the last, so a later changed-file or deletion pass archived just that one and left older raw vault content live and searchable after the source changed or vanished (codex R7 P2). Store all rows per artifactRef (Map) and archive every matching raw row on both the changed-file and deletion-sync paths. A file is treated as unchanged only when every live row carries the current content-hash. Regression test (t): two live raw rows for one artifactRef are both archived on change and on delete, and an unchanged re-import with duplicates skips without archiving. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 36 +++++++++----- tests/importers-vault.test.ts | 93 +++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 11 deletions(-) diff --git a/src/importers.ts b/src/importers.ts index 950481e3..94f63082 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -691,7 +691,11 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR // Map serves both per-file idempotency AND the deletion diff (no second // query). LIKE-escape the vault name so a `%`/`_` in it can't over-match. initStore(hippoRoot); - const existing = new Map(); + // artifactRef -> ALL its live raw rows. >1 only after a concurrent double-insert + // (the importer is not re-entrant; see the JSDoc). The buckets matter: a later + // changed/deletion pass must archive EVERY matching row, not just the last one + // scanned, or older raw vault content lingers live + searchable (codex P2). + const existing = new Map(); { const db = openHippoDb(hippoRoot); try { @@ -708,7 +712,9 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR const exactPrefix = `vault:${vaultName}:`; for (const row of rows) { if (row.artifact_ref && row.artifact_ref.startsWith(exactPrefix)) { - existing.set(row.artifact_ref, row); + const bucket = existing.get(row.artifact_ref); + if (bucket) bucket.push(row); + else existing.set(row.artifact_ref, [row]); } } } finally { @@ -746,9 +752,14 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR const hash = createHash('sha256').update(rawFileContent).digest('hex'); const hashTag = `content-hash:${hash}`; - const prior = existing.get(artifactRef); - const priorTags = prior ? parseJsonArrayLoose(prior.tags_json) : []; - if (prior && priorTags.includes(hashTag)) { + const priors = existing.get(artifactRef) ?? []; + // Unchanged iff EVERY live raw row already carries the current content-hash + // (the normal case is exactly one row; >1 only after a concurrent double- + // insert). If ANY row is stale / missing the hash, treat the file as changed + // so the archive pass below removes ALL of them, not just the matching one + // (codex P2). The `length > 0` guard is required: `[].every()` is vacuously + // true, but a never-seen file (no rows) must fall through to import, not skip. + if (priors.length > 0 && priors.every((p) => parseJsonArrayLoose(p.tags_json).includes(hashTag))) { // Unchanged file → skip (idempotent re-import). skipped++; continue; @@ -765,13 +776,14 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR continue; } - // Changed file → archive the OLD raw row, then append the new one. NEVER + // Changed file → archive EVERY old raw row for this ref (normally one; >1 + // only after a concurrent double-insert), then append the new one. NEVER // supersede (would yield kind='distilled'). archiveRaw commits + closes its // handle before remember() runs, so there is no double-live row; a crash // between them self-heals (file re-imported as fresh raw next run). - if (prior) { + for (const p of priors) { archived++; // count the would-be archive even in dryRun (true preview) - if (!dryRun) archiveRaw(ctx, prior.id, `changed:${artifactRef}`); + if (!dryRun) archiveRaw(ctx, p.id, `changed:${artifactRef}`); } const frontmatterTags = [ ...frontmatterList(fm['tags']), @@ -824,10 +836,12 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR // note that vanished from the source folder → archive its raw row. Per-file // archiveRaw (own handle); no outer SAVEPOINT (no cross-file idempotency row // to commit atomically, unlike github's multi-row case). - for (const [artifactRef, row] of existing) { + for (const [artifactRef, rows] of existing) { if (seen.has(artifactRef)) continue; - archived++; // count even in dryRun so the preview reflects destructive deletes - if (!dryRun) archiveRaw(ctx, row.id, `source_deleted:${artifactRef}`); + for (const row of rows) { + archived++; // count even in dryRun so the preview reflects destructive deletes + if (!dryRun) archiveRaw(ctx, row.id, `source_deleted:${artifactRef}`); + } } return { total, imported, skipped, archived, entries }; diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index c2b1779f..aede1261 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -18,9 +18,11 @@ import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; import { fileURLToPath } from 'node:url'; +import { createHash } from 'node:crypto'; import { importVault, type ImportOptions } from '../src/importers.js'; import { loadAllEntries, initStore } from '../src/store.js'; import { openHippoDb, closeHippoDb } from '../src/db.js'; +import { remember, type Context } from '../src/api.js'; let tmpDir: string; // hippo root (the store) let vaultDir: string; // a scratch vault folder we mutate per-test @@ -492,3 +494,94 @@ describe('importVault (s) — vault root IS the store imports nothing (codex R6 expect(result.imported).toBe(0); }); }); + +// --------------------------------------------------------------------------- +// (t) duplicate live raw rows for one artifactRef → archive ALL on change/delete +// (codex R7 P2). The importer is not re-entrant: two concurrent runs can each +// see a note as absent and double-insert. A later changed/deletion pass must +// archive EVERY matching row, not just the last one scanned. +// --------------------------------------------------------------------------- + +const vaultCtx = (): Context => ({ + hippoRoot: tmpDir, + tenantId: 'default', + actor: { subject: 'connector:vault', role: 'admin' }, +}); + +/** Inject a SECOND live raw row sharing dup.md's artifactRef + content-hash, + * exactly as a concurrent importVault run would. Returns both live row ids. */ +function seedDuplicateRawRow(): string[] { + const raw = fs.readFileSync(path.join(vaultDir, 'dup.md'), 'utf8'); + const hash = createHash('sha256').update(raw).digest('hex'); + remember(vaultCtx(), { + content: raw, // no frontmatter in the fixture, so body === raw + kind: 'raw', + artifactRef: 'vault:v:dup.md', + owner: 'agent:vault-import', + tags: ['source:vault', 'vault:v', `content-hash:${hash}`], + }); + const ids = loadAllEntries(tmpDir, 'default') + .filter((e) => e.artifact_ref === 'vault:v:dup.md') + .map((e) => e.id); + expect(ids.length).toBe(2); // precondition: two live rows for one ref + return ids; +} + +describe('importVault (t) — duplicate raw rows for one artifactRef archive ALL', () => { + it('archives BOTH duplicates when the source file changes (not just the last)', () => { + writeNote('dup.md', '# Dup\noriginal content'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + const dupIds = seedDuplicateRawRow(); + + writeNote('dup.md', '# Dup\nCHANGED content'); + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.imported).toBe(1); // one new raw appended + expect(second.archived).toBe(2); // BOTH old rows archived, not one + + const live = loadAllEntries(tmpDir, 'default'); + expect(live.length).toBe(1); // exactly one live row remains + expect(live[0].content).toContain('CHANGED'); + expect(dupIds).not.toContain(live[0].id); + + const archived = archivedRows(); + expect(archived.length).toBe(2); + expect(new Set(archived.map((a) => a.memory_id))).toEqual(new Set(dupIds)); + for (const a of archived) expect(a.reason).toBe('changed:vault:v:dup.md'); + }); + + it('archives BOTH duplicates when the source file is deleted (not just the last)', () => { + writeNote('dup.md', '# Dup\noriginal content'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + const dupIds = seedDuplicateRawRow(); + + fs.rmSync(path.join(vaultDir, 'dup.md')); + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.total).toBe(0); // vault now empty + expect(second.archived).toBe(2); // BOTH rows archived on deletion-sync + + const live = loadAllEntries(tmpDir, 'default').filter( + (e) => e.artifact_ref === 'vault:v:dup.md', + ); + expect(live.length).toBe(0); + + const archived = archivedRows(); + expect(archived.length).toBe(2); + expect(new Set(archived.map((a) => a.memory_id))).toEqual(new Set(dupIds)); + for (const a of archived) expect(a.reason).toBe('source_deleted:vault:v:dup.md'); + }); + + it('skips cleanly when duplicates all carry the current hash (unchanged re-import)', () => { + writeNote('dup.md', '# Dup\noriginal content'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + seedDuplicateRawRow(); + + // Unchanged re-import: every live row already carries the current content-hash, + // so the file is skipped and NO archive fires (duplicates are identical content, + // not stale; codex P2 is about stale content after change/delete, not redundancy). + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.skipped).toBe(1); + expect(second.imported).toBe(0); + expect(second.archived).toBe(0); + expect(archivedRows().length).toBe(0); + }); +}); From c5fbcc68fa6fc636684795d5fbe2deed9c165eab Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:23:23 +0100 Subject: [PATCH 05/11] fix: make vault import of the store path a no-op, not a mass-archive Pointing the importer at the store itself (hippo import --vault , or a path inside the store) resolved to an empty file scan via the self-store guard in collectMarkdownFiles. importVault had already loaded every existing vault::* row, and the deletion-sync pass treats any artifactRef absent from the (empty) scan as source-deleted, so it archived every live vault row. raw-archive redaction makes that loss irreversible (codex R8 P1). Root cause was importVault control flow, not the file walk: the previous R5/R6 self-store fixes all lived in collectMarkdownFiles, but an empty relpaths list from any cause is indistinguishable from "all notes deleted". Move the guard up to a no-op early return in importVault, BEFORE the existing-rows load and deletion-sync; the walk keeps only its nested-store skip. Regression test (u): importing the store path (and a child of it) with live rows present archives nothing and leaves the rows intact. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 32 ++++++++++++++++-------- tests/importers-vault.test.ts | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/src/importers.ts b/src/importers.ts index 94f63082..331c1322 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -610,20 +610,16 @@ function parseWikilinks(body: string): string[] { /** Recursively collect `*.md` files under `root`, returning paths relative to * `root` with forward-slash separators (stable artifactRef keys across OSes). * Symlinks are not followed. Skips dot-directories (the default `.hippo` store, - * `.git`, `.obsidian`, `.trash`) AND the resolved Hippo store path, so - * re-importing a vault that CONTAINS the store never ingests its own markdown - * mirror files (codex R5 P1: `hippo import --vault .` after `hippo init` in the - * vault would otherwise self-import its mirror rows and grow on every run). */ + * `.git`, `.obsidian`, `.trash`) AND the resolved Hippo store path during the + * walk, so re-importing a vault that CONTAINS the store never ingests its own + * markdown mirror files (codex R5 P1: `hippo import --vault .` after `hippo + * init` in the vault would otherwise self-import its mirror rows and grow on + * every run). The root-IS-the-store case is handled one level up in + * importVault (a no-op early return), NOT here: returning [] for it would feed + * the deletion-sync an empty scan that mass-archives every live row (codex R8). */ function collectMarkdownFiles(root: string, hippoRoot: string): string[] { const out: string[] = []; const resolvedHippoRoot = path.resolve(hippoRoot); - const resolvedRoot = path.resolve(root); - // If the vault root IS the store (or lives inside it), there are no vault - // notes - only the store's own mirror files (codex R6 P2: `--vault .hippo` or - // an SDK caller passing the store dir). Return empty rather than self-import. - if (resolvedRoot === resolvedHippoRoot || resolvedRoot.startsWith(resolvedHippoRoot + path.sep)) { - return out; - } const walk = (dir: string): void => { const entries = fs.readdirSync(dir, { withFileTypes: true }); for (const ent of entries) { @@ -679,6 +675,20 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR throw new Error('importVault does not support global mode (raw rows are tenant-local).'); } + // Self-store no-op guard (codex R8 P1). MUST run BEFORE the existing-rows load + // and the deletion-sync pass below. If the vault folder IS the store (or lives + // inside it), there are no real vault notes - only the store's own markdown + // mirror files. Letting collectMarkdownFiles return [] for this case is NOT + // safe: an empty scan is indistinguishable from "every note was deleted", so + // deletion-sync would archive every live vault::* row, and raw-archive + // content redaction makes that loss IRREVERSIBLE. The only safe reading of + // "import the store into itself" is "do nothing". + const resolvedStore = path.resolve(hippoRoot); + const resolvedFolder = path.resolve(folderPath); + if (resolvedFolder === resolvedStore || resolvedFolder.startsWith(resolvedStore + path.sep)) { + return { total: 0, imported: 0, skipped: 0, archived: 0, entries: [] }; + } + const ctx: Context = { hippoRoot, tenantId, diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index aede1261..10388a8c 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -585,3 +585,50 @@ describe('importVault (t) — duplicate raw rows for one artifactRef archive ALL expect(archivedRows().length).toBe(0); }); }); + +// --------------------------------------------------------------------------- +// (u) importing the store path is a no-op, NOT a mass-archive of live rows +// (codex R8 P1). The self-store guard returns an empty file scan; without +// a control-flow short-circuit, deletion-sync reads "empty scan" as "all +// notes deleted" and irreversibly archives every live vault::* row. +// --------------------------------------------------------------------------- + +describe('importVault (u) — importing the store path preserves existing vault rows', () => { + it('does not archive live rows when folderPath resolves to the store (with rows present)', () => { + // A real import first, so live vault rows exist to be (wrongly) archived. + writeNote('keep.md', '# Keep\na genuine vault note worth keeping'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + const before = loadAllEntries(tmpDir, 'default').filter((e) => + e.artifact_ref?.startsWith('vault:v:'), + ); + expect(before.length).toBe(1); + + // Now point the importer at the STORE itself. This must be a no-op, NOT a + // deletion-sync that archives every live row because the scan found nothing. + const selfImport = importVault(tmpDir, opts({ name: 'v' })); + expect(selfImport.total).toBe(0); + expect(selfImport.imported).toBe(0); + expect(selfImport.archived).toBe(0); // the regression guard + + const after = loadAllEntries(tmpDir, 'default').filter((e) => + e.artifact_ref?.startsWith('vault:v:'), + ); + expect(after.length).toBe(1); // row survived + expect(after[0].id).toBe(before[0].id); + expect(archivedRows().length).toBe(0); // nothing archived at all + }); + + it('is a no-op for a child of the store too (nested store path)', () => { + writeNote('keep.md', '# Keep\nanother genuine vault note'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + + // A subdirectory INSIDE the store resolves under hippoRoot → same guard. + const childOfStore = path.join(tmpDir, 'episodic'); + fs.mkdirSync(childOfStore, { recursive: true }); + const selfImport = importVault(childOfStore, opts({ name: 'v' })); + expect(selfImport.total).toBe(0); + expect(selfImport.archived).toBe(0); + expect(loadAllEntries(tmpDir, 'default').filter((e) => e.artifact_ref?.startsWith('vault:v:')).length).toBe(1); + expect(archivedRows().length).toBe(0); + }); +}); From 5d4281229b42b54ea74ae3a24e6af5dedea261de Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:33:06 +0100 Subject: [PATCH 06/11] fix: canonicalize paths in the vault self-store guard (junction/case aliases) The R8 self-store no-op guard compared paths with path.resolve, which does not dereference symlinks/junctions or normalize Windows case. A junction or a case-variant path pointing at the store therefore slipped past the guard: the importer walked the store's markdown mirrors, marked the real vault refs as unseen, and archived them as source_deleted before importing mirror files. Same irreversible data loss as R8, through an aliased path (codex R9 P2). Canonicalize both the store root and the folder with the OS realpath (fs.realpathSync.native dereferences junctions/symlinks and normalizes case) before the comparison, falling back to path.resolve for paths that do not exist yet (which cannot be a live self-store). Regression test (v): a junction to the store imports nothing and archives nothing. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 24 +++++++++++++++--- tests/importers-vault.test.ts | 46 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/importers.ts b/src/importers.ts index 331c1322..5f13c54d 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -641,6 +641,22 @@ function collectMarkdownFiles(root: string, hippoRoot: string): string[] { return out; } +/** Canonicalize a path for the self-store comparison: dereference symlinks / + * junctions and normalize case (Windows) via the OS realpath, so a junction or + * a case-variant path to the store is still recognized as self-store (codex R9 + * P2: path.resolve does neither, so an aliased store path slipped past the + * guard and triggered the mass-archive). Falls back to path.resolve when the + * path does not exist yet - an uninitialized store or a typo'd vault path + * cannot be a live self-store, and a non-existent vault folder fails later in + * the walk (readdirSync) before deletion-sync can archive anything. */ +function realpathOrResolve(p: string): string { + try { + return fs.realpathSync.native(p); + } catch { + return path.resolve(p); + } +} + interface VaultRow { id: string; artifact_ref: string; @@ -682,9 +698,11 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR // safe: an empty scan is indistinguishable from "every note was deleted", so // deletion-sync would archive every live vault::* row, and raw-archive // content redaction makes that loss IRREVERSIBLE. The only safe reading of - // "import the store into itself" is "do nothing". - const resolvedStore = path.resolve(hippoRoot); - const resolvedFolder = path.resolve(folderPath); + // "import the store into itself" is "do nothing". Canonicalize both paths + // (realpath: dereference junctions/symlinks + normalize Windows case) so an + // aliased path to the store is still caught (codex R9 P2). + const resolvedStore = realpathOrResolve(hippoRoot); + const resolvedFolder = realpathOrResolve(folderPath); if (resolvedFolder === resolvedStore || resolvedFolder.startsWith(resolvedStore + path.sep)) { return { total: 0, imported: 0, skipped: 0, archived: 0, entries: [] }; } diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index 10388a8c..b8910c42 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -631,4 +631,50 @@ describe('importVault (u) — importing the store path preserves existing vault expect(loadAllEntries(tmpDir, 'default').filter((e) => e.artifact_ref?.startsWith('vault:v:')).length).toBe(1); expect(archivedRows().length).toBe(0); }); + + it('treats a junction/symlink to the store as self-store (codex R9 P2)', () => { + writeNote('keep.md', '# Keep\na real note that must survive an aliased self-import'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + + // An aliased path (junction on Windows, symlink on POSIX) pointing AT the + // store. path.resolve(link) !== resolve(store) textually, so only a realpath + // canonicalization recognizes it as self-store. + const linkParent = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-link-')); + const link = path.join(linkParent, 'store-alias'); + let created = true; + try { + fs.symlinkSync(tmpDir, link, 'junction'); + } catch { + created = false; // no privilege / unsupported on this host -> can't assert + } + try { + if (created) { + const aliased = importVault(link, opts({ name: 'v' })); + expect(aliased.total).toBe(0); + expect(aliased.archived).toBe(0); // the regression guard + const live = loadAllEntries(tmpDir, 'default').filter((e) => + e.artifact_ref?.startsWith('vault:v:'), + ); + expect(live.length).toBe(1); + expect(archivedRows().length).toBe(0); + } + } finally { + // Remove ONLY the link (never recurse into the target store): unlink for a + // POSIX symlink, rmdir for a Windows junction. Both are non-recursive. + try { + fs.unlinkSync(link); + } catch { + try { + fs.rmdirSync(link); + } catch { + /* link already gone */ + } + } + try { + fs.rmdirSync(linkParent); + } catch { + /* leave the empty temp dir for the OS to reap */ + } + } + }); }); From fc15f7bab55ef641dbb4252d778379f112ae02c6 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:42:34 +0100 Subject: [PATCH 07/11] fix: canonicalize the nested-store walk-skip too (complete the R9 fix) The R9 fix canonicalized the importVault self-store guard but left the sibling comparison in collectMarkdownFiles on path.resolve: the walk-skip that prevents ingesting a store nested inside the vault. A non-dot HIPPO_HOME store inside the vault, reached via a junction or a Windows case-variant path, therefore was not skipped, so the walk descended into the store and self-imported its markdown mirror files as vault rows; those mirrors later read as source-deleted and archived live rows (compensating-review P1 while codex was rate-limited). Use realpathOrResolve for both sides of the walk-skip, matching the guard. Regression test (w): a nested non-dot store reached via a junction hippoRoot imports only the real note, not the store's mirror files. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 14 ++++--- tests/importers-vault.test.ts | 76 +++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/src/importers.ts b/src/importers.ts index 5f13c54d..5009009c 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -610,8 +610,8 @@ function parseWikilinks(body: string): string[] { /** Recursively collect `*.md` files under `root`, returning paths relative to * `root` with forward-slash separators (stable artifactRef keys across OSes). * Symlinks are not followed. Skips dot-directories (the default `.hippo` store, - * `.git`, `.obsidian`, `.trash`) AND the resolved Hippo store path during the - * walk, so re-importing a vault that CONTAINS the store never ingests its own + * `.git`, `.obsidian`, `.trash`) AND the canonicalized Hippo store path during + * the walk, so re-importing a vault that CONTAINS the store never ingests its own * markdown mirror files (codex R5 P1: `hippo import --vault .` after `hippo * init` in the vault would otherwise self-import its mirror rows and grow on * every run). The root-IS-the-store case is handled one level up in @@ -619,7 +619,11 @@ function parseWikilinks(body: string): string[] { * the deletion-sync an empty scan that mass-archives every live row (codex R8). */ function collectMarkdownFiles(root: string, hippoRoot: string): string[] { const out: string[] = []; - const resolvedHippoRoot = path.resolve(hippoRoot); + // Canonicalize (realpath) so a non-dot HIPPO_HOME store nested in the vault is + // skipped even when hippoRoot is an aliased path (junction / Windows case + // variant); path.resolve would miss it and self-import the store's mirror + // files (codex R9 follow-up: same gap as the importVault guard, sibling site). + const resolvedHippoRoot = realpathOrResolve(hippoRoot); const walk = (dir: string): void => { const entries = fs.readdirSync(dir, { withFileTypes: true }); for (const ent of entries) { @@ -627,9 +631,9 @@ function collectMarkdownFiles(root: string, hippoRoot: string): string[] { const abs = path.join(dir, ent.name); if (ent.isDirectory()) { // Skip dot-dirs (config/system, incl. the default `.hippo` store) and - // the resolved store path (covers a HIPPO_HOME outside `.hippo`). + // the canonicalized store path (covers a HIPPO_HOME outside `.hippo`). if (ent.name.startsWith('.')) continue; - if (path.resolve(abs) === resolvedHippoRoot) continue; + if (realpathOrResolve(abs) === resolvedHippoRoot) continue; walk(abs); } else if (ent.isFile() && ent.name.toLowerCase().endsWith('.md')) { out.push(path.relative(root, abs).split(path.sep).join('/')); diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index b8910c42..709e78d7 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -678,3 +678,79 @@ describe('importVault (u) — importing the store path preserves existing vault } }); }); + +// --------------------------------------------------------------------------- +// (w) a NON-dot store nested in the vault is skipped even when hippoRoot is an +// aliased path (junction / Windows case variant). The R9 fix canonicalized +// the importVault self-store guard but NOT this sibling walk-skip, so an +// aliased hippoRoot let the walk descend into the store and self-import its +// markdown mirror files (compensating-review P1, codex R9 follow-up). +// --------------------------------------------------------------------------- + +describe('importVault (w) — nested non-dot store skipped under an aliased hippoRoot', () => { + it('does not self-import store mirror files when hippoRoot is a junction to a nested store', () => { + // A non-dot store dir INSIDE the vault (the HIPPO_HOME-inside-vault config + // the collectMarkdownFiles JSDoc claims to support). + const realStore = path.join(vaultDir, 'Store'); + fs.mkdirSync(realStore, { recursive: true }); + + // Reach that store via a junction, so path.resolve(alias) !== the on-disk + // path (exactly like a case-variant HIPPO_HOME on Windows). Only realpath + // canonicalizes the alias back to realStore. + const aliasParent = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-hr-')); + const hippoAlias = path.join(aliasParent, 'store-alias'); + let created = true; + try { + fs.symlinkSync(realStore, hippoAlias, 'junction'); + } catch { + created = false; // no privilege / unsupported -> can't assert + } + try { + if (created) { + initStore(hippoAlias); + // A mirror-like markdown file living inside the nested store. + fs.mkdirSync(path.join(realStore, 'episodic'), { recursive: true }); + fs.writeFileSync( + path.join(realStore, 'episodic', 'mirror.md'), + 'store mirror content that must NOT be self-imported as a vault note', + 'utf8', + ); + writeNote('realnote.md', '# Real\na genuine vault note worth importing'); + + const result = importVault(vaultDir, { + hippoRoot: hippoAlias, + tenantId: 'default', + name: 'v', + }); + // Only the real note imports; the nested store's mirror file is skipped. + expect(result.total).toBe(1); + expect(result.imported).toBe(1); + const rows = loadAllEntries(hippoAlias, 'default').filter((e) => + e.tags.includes('vault:v'), + ); + expect(rows.length).toBe(1); + expect(rows[0].artifact_ref).toBe('vault:v:realnote.md'); + expect( + rows.some((e) => (e.artifact_ref ?? '').includes('Store/')), + ).toBe(false); + } + } finally { + // Remove ONLY the junction link, never recurse into realStore (which lives + // under vaultDir and is cleaned by afterEach). + try { + fs.unlinkSync(hippoAlias); + } catch { + try { + fs.rmdirSync(hippoAlias); + } catch { + /* link already gone */ + } + } + try { + fs.rmdirSync(aliasParent); + } catch { + /* leave the empty temp dir for the OS to reap */ + } + } + }); +}); From 4804797d66a5a57ab252eb2256095cdb0808ef75 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:20:47 +0100 Subject: [PATCH 08/11] fix: require explicit vault name + envelope-aware re-import idempotency Two correctness issues codex R10 flagged in the vault importer: 1. The vault name defaulted to the folder basename. Two unrelated vaults sharing a basename (work/notes, personal/notes) collided on the same vault::* prefix, so importing the second loaded the first's rows and deletion-sync archived them. The name is the identity key for a destructive sync, so require it explicitly (importVault throws; the CLI prints a usage error) instead of inferring a path-unstable one. 2. The idempotency skip keyed on the content-hash alone, so a re-import that changed --scope or added an extra tag was skipped and the rows kept the old envelope (a vault first imported public stayed public after a rerun with a private scope). Treat the full envelope (content-hash + scope + requested extra tags) as the idempotency key: an envelope mismatch archives all priors and re-remembers with the new envelope. Loader now selects scope. Tests (x): no-name throws; same-basename vaults stay isolated under distinct names; a --scope change and an added tag each re-write rather than skip. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cli.ts | 10 ++++- src/importers.ts | 45 ++++++++++++++++----- tests/importers-vault.test.ts | 75 +++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 11 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 7455f2e4..455d328a 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -5845,6 +5845,13 @@ function cmdImport( console.error('hippo import --vault does not support --global (raw rows are tenant-local).'); process.exit(1); } + if (!flags['name'] || !String(flags['name']).trim()) { + // --name is the vault identity key for the destructive source-deletion sync; + // inferring it from the folder basename let same-basename vaults collide and + // clobber each other (codex R10 P2). Require it explicitly. + console.error('hippo import --vault requires --name (the identity key used for source-deletion sync).'); + process.exit(1); + } const tenantId = resolveTenantId({}); const vaultOptions: ImportOptions = { ...importOptions, @@ -7758,7 +7765,8 @@ Commands: --file Import from any markdown or text file --markdown Import from structured MEMORY.md / AGENTS.md --vault Import a markdown-vault FOLDER as kind='raw' notes - (Obsidian/Foam/Dendron). [--name ] [--scope ] + (Obsidian/Foam/Dendron). Requires --name . + [--scope ] --dry-run Preview without writing --global Write to global store ($HIPPO_HOME or ~/.hippo/) --tag Add extra tag (repeatable) diff --git a/src/importers.ts b/src/importers.ts index 5009009c..c7ddf4fe 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -665,6 +665,7 @@ interface VaultRow { id: string; artifact_ref: string; tags_json: string; + scope: string | null; } /** @@ -678,7 +679,18 @@ interface VaultRow { export function importVault(folderPath: string, options: ImportOptions): ImportResult { const hippoRoot = options.hippoRoot; const tenantId = options.tenantId ?? 'default'; - const vaultName = options.name ?? path.basename(path.resolve(folderPath)); + // Vault NAME is the identity key for the destructive deletion-sync below, so it + // must be explicit. Defaulting to the folder basename meant two unrelated vaults + // sharing a basename (e.g. work/notes and personal/notes) collided on the same + // `vault::*` prefix - importing the second loaded the first's rows and the + // deletion-sync archived them (codex R10 P2). Require a deliberate name instead + // of inferring a path-unstable one. + const vaultName = options.name?.trim(); + if (!vaultName) { + throw new Error( + 'importVault requires an explicit vault name (options.name): it is the identity key for source-deletion sync and must not be inferred from the folder basename.', + ); + } if (vaultName.includes(':')) { // ':' is the artifactRef delimiter (vault::); a name // containing it lets a different vault's prefix scan over-match and archive @@ -734,7 +746,7 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR const likeParam = `vault:${escapeLike(vaultName)}:%`; const rows = db .prepare( - `SELECT id, artifact_ref, tags_json FROM memories + `SELECT id, artifact_ref, tags_json, scope FROM memories WHERE artifact_ref LIKE ? ESCAPE '\\' AND tenant_id = ? AND kind = 'raw'`, ) .all(likeParam, tenantId) as VaultRow[]; @@ -785,14 +797,27 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR const hashTag = `content-hash:${hash}`; const priors = existing.get(artifactRef) ?? []; - // Unchanged iff EVERY live raw row already carries the current content-hash - // (the normal case is exactly one row; >1 only after a concurrent double- - // insert). If ANY row is stale / missing the hash, treat the file as changed - // so the archive pass below removes ALL of them, not just the matching one - // (codex P2). The `length > 0` guard is required: `[].every()` is vacuously - // true, but a never-seen file (no rows) must fall through to import, not skip. - if (priors.length > 0 && priors.every((p) => parseJsonArrayLoose(p.tags_json).includes(hashTag))) { - // Unchanged file → skip (idempotent re-import). + // Unchanged iff EVERY live raw row matches the full import ENVELOPE, not just + // the file bytes: the content-hash tag AND the requested scope AND all the + // requested extra tags. Keying on the content-hash alone meant a re-import + // that changed --scope (or added a --tag) was skipped, so a vault first + // imported public stayed public after a rerun with a private scope (codex + // R10 P2). An envelope mismatch falls through to archive-all + remember with + // the new envelope. (The `length > 0` guard is required: `[].every()` is + // vacuously true, but a never-seen file must import, not skip. Archiving ALL + // priors on a mismatch also clears any concurrent-double-insert duplicates.) + const envelopeUnchanged = + priors.length > 0 && + priors.every((p) => { + const priorTags = parseJsonArrayLoose(p.tags_json); + return ( + priorTags.includes(hashTag) && + (p.scope ?? null) === scope && + extraTags.every((t) => priorTags.includes(t)) + ); + }); + if (envelopeUnchanged) { + // Unchanged file + envelope → skip (idempotent re-import). skipped++; continue; } diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index 709e78d7..86b06f7c 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -754,3 +754,78 @@ describe('importVault (w) — nested non-dot store skipped under an aliased hipp } }); }); + +// --------------------------------------------------------------------------- +// (x) explicit vault name is required (no path-basename default), and a re-import +// that changes the ENVELOPE (--scope / extra tags) re-writes rather than +// skipping on content-hash alone (codex R10 P2 x2). +// --------------------------------------------------------------------------- + +describe('importVault (x) — explicit name + envelope-aware idempotency', () => { + it('throws when no vault name is given (no basename default that could collide)', () => { + writeNote('a.md', '# A\nbody content long enough to store'); + expect(() => importVault(vaultDir, { hippoRoot: tmpDir, tenantId: 'default' })).toThrow( + /explicit vault name/, + ); + // A blank/whitespace name is rejected the same way. + expect(() => importVault(vaultDir, opts({ name: ' ' }))).toThrow(/explicit vault name/); + }); + + it('re-imports an unchanged file when --scope changes (archive old + new scoped row)', () => { + writeNote('note.md', '# Note\nstable content across the scope change'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); // unscoped + const firstId = loadAllEntries(tmpDir, 'default')[0].id; + expect(loadAllEntries(tmpDir, 'default')[0].scope ?? null).toBe(null); + + // Same bytes, now request a private scope -> must re-write, not skip. + const second = importVault(vaultDir, opts({ name: 'v', scope: 'private' })); + expect(second.skipped).toBe(0); + expect(second.imported).toBe(1); + expect(second.archived).toBe(1); + + const live = loadAllEntries(tmpDir, 'default'); + expect(live.length).toBe(1); + expect(live[0].id).not.toBe(firstId); + expect(live[0].scope).toBe('private'); // the requested envelope was applied + + // A third identical run at the SAME scope is idempotent again. + const third = importVault(vaultDir, opts({ name: 'v', scope: 'private' })); + expect(third.skipped).toBe(1); + expect(third.imported).toBe(0); + expect(third.archived).toBe(0); + }); + + it('re-imports an unchanged file when an extra tag is added', () => { + writeNote('note.md', '# Note\nstable content across the tag change'); + expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1); + + const second = importVault(vaultDir, opts({ name: 'v', extraTags: ['campaign:q3'] })); + expect(second.imported).toBe(1); + expect(second.archived).toBe(1); + const live = loadAllEntries(tmpDir, 'default'); + expect(live.length).toBe(1); + expect(live[0].tags).toContain('campaign:q3'); + }); + + it('two same-basename vaults stay isolated under distinct explicit names', () => { + // Distinct names -> distinct vault::* prefixes -> deletion-sync of one + // never touches the other (the failure mode the basename default caused). + writeNote('shared.md', '# Work\nwork note body content'); + expect(importVault(vaultDir, opts({ name: 'work' })).imported).toBe(1); + + const other = fs.mkdtempSync(path.join(os.tmpdir(), 'hippo-vault-src2-')); + try { + fs.writeFileSync(path.join(other, 'shared.md'), '# Personal\npersonal note body', 'utf8'); + const second = importVault(other, opts({ name: 'personal' })); + expect(second.imported).toBe(1); + expect(second.archived).toBe(0); // did NOT archive the work vault's row + + const all = loadAllEntries(tmpDir, 'default'); + expect(all.length).toBe(2); + expect(all.some((e) => e.artifact_ref === 'vault:work:shared.md')).toBe(true); + expect(all.some((e) => e.artifact_ref === 'vault:personal:shared.md')).toBe(true); + } finally { + fs.rmSync(other, { recursive: true, force: true }); + } + }); +}); From 74f2892628b8ec93ffd1acffe4366e8c15736393 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:34:52 +0100 Subject: [PATCH 09/11] fix: full-tag-set idempotency + reject valueless --name/--scope Two refinements codex R11 flagged to the R10 envelope fixes: 1. The envelope idempotency check compared content-hash + scope + an extra-tag SUBSET, so a re-import that REMOVED a previously-set tag was skipped (subset is vacuously satisfied) and left the stale tag live. Replace the piecemeal check with full tag-SET equality (plus scope): build the complete tag set the import would write, then skip only if every live row carries exactly that set. Any add, remove, content, frontmatter, wikilink, or scope change now re-writes. The tag set is built before the idempotency decision (was after). 2. A valueless --name (or --name --scope private) parses as boolean true, and String(true) is "true", which passed the presence check and imported under vault:true:*. Require a non-empty string for --name; reject a valueless --scope the same way. Test: removing an extra tag on re-import re-writes (stale tag gone). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cli.ts | 13 ++++-- src/importers.ts | 76 ++++++++++++++++++----------------- tests/importers-vault.test.ts | 19 +++++++++ 3 files changed, 68 insertions(+), 40 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 455d328a..08146fe7 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -5845,11 +5845,18 @@ function cmdImport( console.error('hippo import --vault does not support --global (raw rows are tenant-local).'); process.exit(1); } - if (!flags['name'] || !String(flags['name']).trim()) { + if (typeof flags['name'] !== 'string' || !flags['name'].trim()) { // --name is the vault identity key for the destructive source-deletion sync; // inferring it from the folder basename let same-basename vaults collide and - // clobber each other (codex R10 P2). Require it explicitly. - console.error('hippo import --vault requires --name (the identity key used for source-deletion sync).'); + // clobber each other (codex R10 P2). A valueless `--name` parses as boolean + // true, and String(true) === "true" would silently import under vault:true:* + // - reject a non-string so it fails fast instead (codex R11 P2). + console.error('hippo import --vault requires --name (a non-empty identity key for source-deletion sync).'); + process.exit(1); + } + if (flags['scope'] !== undefined && (typeof flags['scope'] !== 'string' || !flags['scope'].trim())) { + // Same valueless-flag trap: a bare `--scope` must not become scope "true". + console.error('hippo import --vault: --scope requires a value (e.g. --scope private).'); process.exit(1); } const tenantId = resolveTenantId({}); diff --git a/src/importers.ts b/src/importers.ts index c7ddf4fe..cb56f091 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -796,31 +796,10 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR const hash = createHash('sha256').update(rawFileContent).digest('hex'); const hashTag = `content-hash:${hash}`; + // Load every live raw row for this ref (>1 only after a concurrent double- + // insert). The idempotency decision happens below, AFTER the full tag set is + // built, so it can compare the complete envelope rather than a subset. const priors = existing.get(artifactRef) ?? []; - // Unchanged iff EVERY live raw row matches the full import ENVELOPE, not just - // the file bytes: the content-hash tag AND the requested scope AND all the - // requested extra tags. Keying on the content-hash alone meant a re-import - // that changed --scope (or added a --tag) was skipped, so a vault first - // imported public stayed public after a rerun with a private scope (codex - // R10 P2). An envelope mismatch falls through to archive-all + remember with - // the new envelope. (The `length > 0` guard is required: `[].every()` is - // vacuously true, but a never-seen file must import, not skip. Archiving ALL - // priors on a mismatch also clears any concurrent-double-insert duplicates.) - const envelopeUnchanged = - priors.length > 0 && - priors.every((p) => { - const priorTags = parseJsonArrayLoose(p.tags_json); - return ( - priorTags.includes(hashTag) && - (p.scope ?? null) === scope && - extraTags.every((t) => priorTags.includes(t)) - ); - }); - if (envelopeUnchanged) { - // Unchanged file + envelope → skip (idempotent re-import). - skipped++; - continue; - } const { fm, body } = parseFrontmatter(rawFileContent); @@ -833,24 +812,15 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR continue; } - // Changed file → archive EVERY old raw row for this ref (normally one; >1 - // only after a concurrent double-insert), then append the new one. NEVER - // supersede (would yield kind='distilled'). archiveRaw commits + closes its - // handle before remember() runs, so there is no double-live row; a crash - // between them self-heals (file re-imported as fresh raw next run). - for (const p of priors) { - archived++; // count the would-be archive even in dryRun (true preview) - if (!dryRun) archiveRaw(ctx, p.id, `changed:${artifactRef}`); - } + // Build the FULL tag envelope this import would write, BEFORE the idempotency + // decision. De-duplicate (createMemory stores tags verbatim, so a collision + // between, e.g., a frontmatter tag and an extraTag would otherwise produce a + // duplicate). Order-preserving. const frontmatterTags = [ ...frontmatterList(fm['tags']), ...frontmatterList(fm['aliases']).map((a) => `alias:${a}`), ]; const wikilinkTags = parseWikilinks(body).map((t) => `wikilink-candidate:${t}`); - - // De-duplicate the final tag array (createMemory stores tags verbatim, so a - // collision between, e.g., a frontmatter tag and an extraTag would otherwise - // produce a duplicate). Order-preserving. const tags = Array.from( new Set([ 'source:vault', @@ -862,6 +832,38 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR ]), ); + // Unchanged iff EVERY live raw row carries the EXACT same tag set AND scope. + // Comparing the COMPLETE set (not the content-hash + a subset of extra tags) + // means every envelope change registers: content (via the content-hash tag), + // frontmatter, wikilinks, an ADDED extra tag, or a REMOVED one - the earlier + // piecemeal checks missed scope (R10 P2) then tag removal (R11 P2). Set + // equality is order-independent and both sides are deduped. (`length > 0` + // guard: a never-seen file must import, not skip; archiving ALL priors on a + // mismatch also clears any concurrent-double-insert duplicates.) + const wantTags = new Set(tags); + const envelopeUnchanged = + priors.length > 0 && + priors.every((p) => { + if ((p.scope ?? null) !== scope) return false; + const priorTags = parseJsonArrayLoose(p.tags_json); + return priorTags.length === wantTags.size && priorTags.every((t) => wantTags.has(t)); + }); + if (envelopeUnchanged) { + // Unchanged file + envelope → skip (idempotent re-import). + skipped++; + continue; + } + + // Changed file → archive EVERY old raw row for this ref (normally one; >1 + // only after a concurrent double-insert), then append the new one. NEVER + // supersede (would yield kind='distilled'). archiveRaw commits + closes its + // handle before remember() runs, so there is no double-live row; a crash + // between them self-heals (file re-imported as fresh raw next run). + for (const p of priors) { + archived++; // count the would-be archive even in dryRun (true preview) + if (!dryRun) archiveRaw(ctx, p.id, `changed:${artifactRef}`); + } + // remember() owns the actual write. We build an `echo` of the SAME content + // tags via createMemory purely for the ImportResult, then reconcile its id to // remember()'s real row id so entries[] reflects the row that landed. diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index 86b06f7c..48011bf3 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -807,6 +807,25 @@ describe('importVault (x) — explicit name + envelope-aware idempotency', () => expect(live[0].tags).toContain('campaign:q3'); }); + it('re-imports an unchanged file when an extra tag is REMOVED (codex R11 P2)', () => { + writeNote('note.md', '# Note\nstable content across the tag removal'); + expect( + importVault(vaultDir, opts({ name: 'v', extraTags: ['campaign:q3'] })).imported, + ).toBe(1); + expect(loadAllEntries(tmpDir, 'default')[0].tags).toContain('campaign:q3'); + + // Re-import the SAME bytes WITHOUT the extra tag: a subset check would skip + // (vacuously true) and leave the stale tag live; the full-set check treats the + // removal as an envelope change and re-writes. + const second = importVault(vaultDir, opts({ name: 'v' })); + expect(second.skipped).toBe(0); + expect(second.imported).toBe(1); + expect(second.archived).toBe(1); + const live = loadAllEntries(tmpDir, 'default'); + expect(live.length).toBe(1); + expect(live[0].tags).not.toContain('campaign:q3'); // stale tag gone + }); + it('two same-basename vaults stay isolated under distinct explicit names', () => { // Distinct names -> distinct vault::* prefixes -> deletion-sync of one // never touches the other (the failure mode the basename default caused). From f205e315981724b7a328da02f8cdc72102f1ed66 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:46:33 +0100 Subject: [PATCH 10/11] fix: archive prior row when a vault note is emptied + correct name JSDoc codex R12: - P2: a note edited down to empty / frontmatter-only was skipped before its prior raw row was archived, and stayed in `seen`, so deletion-sync skipped it too. The old body stayed live and searchable after the source no longer held it. Treat an emptied note as a content deletion: archive any prior row(s) (reason `emptied:`), then skip the write. Safe because remember() is then skipped entirely (no archive-then-throw-on-empty-body hazard, the original reason this branch did not archive). Completes the source-sync state machine (new/changed/unchanged/emptied/ deleted). Test (k) updated to the corrected behavior. - P3: ImportOptions.name JSDoc said it defaults to basename(folderPath), but importVault now throws when it is omitted. Documented as required for vault imports (optional in the shared type only because the other importers ignore it). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/importers.ts | 24 ++++++++++++++++++------ tests/importers-vault.test.ts | 17 ++++++++++------- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/importers.ts b/src/importers.ts index cb56f091..ab88adf9 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -41,9 +41,13 @@ export interface ImportOptions { tenantId?: string; /** * K1 vault import only. Logical vault name used in the `vault:` tag and - * the `artifactRef='vault::'` key. Defaults to - * `basename(folderPath)` when unset. Operator-supplied, so the loader query - * LIKE-escapes it (see `escapeLike` below). + * the `artifactRef='vault::'` key. REQUIRED by importVault: it + * is the identity key for the destructive source-deletion sync, so it must be + * set explicitly rather than inferred from the folder basename (two vaults + * sharing a basename would collide and clobber each other). importVault throws + * if it is missing or blank. Optional in this shared type only because the + * other importers ignore it. Operator-supplied, so the loader query LIKE-escapes + * it (see `escapeLike` below). */ name?: string; /** @@ -804,10 +808,18 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR const { fm, body } = parseFrontmatter(rawFileContent); // Empty / frontmatter-only note: nothing storable (createMemory enforces a - // min content length). Skip WITHOUT archiving any prior row — archiving - // first then throwing on the empty body would delete a live memory mid-run - // (codex P2). The note stays in `seen`, so deletion-sync won't archive it. + // min content length). The note's CONTENT was deleted at source, so this is a + // content-deletion: archive any prior row(s) - the old body must not stay live + // and searchable after the source no longer holds it (codex R12 P2) - then + // skip the write. Archiving here is safe precisely because we then skip + // remember() entirely: there is no archive-then-throw-on-empty-body hazard + // (the original reason this branch did not archive). The note stays in `seen` + // so deletion-sync does not double-process it. if (body.trim().length < 3) { + for (const p of priors) { + archived++; // count even in dryRun so the preview reflects the removal + if (!dryRun) archiveRaw(ctx, p.id, `emptied:${artifactRef}`); + } skipped++; continue; } diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index 48011bf3..8cc5d4b5 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -377,21 +377,24 @@ describe('importVault (j) — empty/frontmatter-only notes are skipped, not thro }); }); -describe('importVault (k) — a note changed to empty keeps its prior memory (no mid-run archive)', () => { - it('does not archive the prior raw row when a note becomes empty', () => { +describe('importVault (k) — a note emptied at source archives its prior memory', () => { + it('archives the prior raw row when a note becomes frontmatter-only (no stale body left live)', () => { writeNote('note.md', 'Original content that is long enough.'); importVault(vaultDir, opts({ name: 'v' })); const before = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')); expect(before.length).toBe(1); const originalId = before[0].id; - writeNote('note.md', '---\ntags: [y]\n---\n'); // body becomes empty + writeNote('note.md', '---\ntags: [y]\n---\n'); // body becomes empty (frontmatter-only) const result = importVault(vaultDir, opts({ name: 'v' })); - expect(result.skipped).toBe(1); + expect(result.skipped).toBe(1); // nothing storable -> no new write expect(result.imported).toBe(0); + expect(result.archived).toBe(1); // but the stale prior IS archived (codex R12 P2) const after = loadAllEntries(tmpDir).filter((e) => e.tags.includes('vault:v')); - expect(after.length).toBe(1); - expect(after[0].id).toBe(originalId); // prior kept, not archived - expect(archivedRows().length).toBe(0); + expect(after.length).toBe(0); // prior gone; the deleted body is not left searchable + const archived = archivedRows(); + expect(archived.length).toBe(1); + expect(archived[0].memory_id).toBe(originalId); + expect(archived[0].reason).toBe('emptied:vault:v:note.md'); }); }); From c7bd79ba92f3e490fb1d1bb3b6e57a463006af90 Mon Sep 17 00:00:00 2001 From: Keith So <68618199+kitfunso@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:43:32 +0100 Subject: [PATCH 11/11] fix: reject bare-private vault scopes that recall treats as public codex R13 P2 (security): hippo's recall filter only default-denies scopes shaped `:private:*` (isPrivateScope / PRIVATE_SCOPE_RE in api.ts). A vault imported with `--scope private` (or `private:x`, or `vault:private` with no trailing segment) stamped a scope recall does NOT treat as private, so notes the user believed were private were still returned to no-scope recall callers. importVault now rejects a scope that names a `private` segment but is not a valid `:private:*`, using recall's own isPrivateScope as the single source of truth, and points the user at the correct `vault:private:` form. The CLI --scope usage example no longer suggests the bare `private` footgun. Test: bare `private` / `private:secret` throw; `vault:private:v` is accepted. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cli.ts | 4 +++- src/importers.ts | 16 +++++++++++++++- tests/importers-vault.test.ts | 25 +++++++++++++++++++++---- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 08146fe7..094804e2 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -5856,7 +5856,9 @@ function cmdImport( } if (flags['scope'] !== undefined && (typeof flags['scope'] !== 'string' || !flags['scope'].trim())) { // Same valueless-flag trap: a bare `--scope` must not become scope "true". - console.error('hippo import --vault: --scope requires a value (e.g. --scope private).'); + // Example uses the source-prefixed private form, since a bare `private` scope + // is NOT treated as private by recall and importVault rejects it (R13 P2). + console.error('hippo import --vault: --scope requires a value (e.g. --scope vault:private:notes).'); process.exit(1); } const tenantId = resolveTenantId({}); diff --git a/src/importers.ts b/src/importers.ts index ab88adf9..bf227a83 100644 --- a/src/importers.ts +++ b/src/importers.ts @@ -10,7 +10,7 @@ import { createMemory, Layer, MemoryEntry } from './memory.js'; import { initStore, loadAllEntries, writeEntry } from './store.js'; import { textOverlap } from './search.js'; import { getGlobalRoot, initGlobal } from './shared.js'; -import { remember, archiveRaw, type Context } from './api.js'; +import { remember, archiveRaw, isPrivateScope, type Context } from './api.js'; import { openHippoDb, closeHippoDb } from './db.js'; // --------------------------------------------------------------------------- @@ -702,6 +702,20 @@ export function importVault(folderPath: string, options: ImportOptions): ImportR throw new Error(`vault name must not contain ':' (artifactRef delimiter): ${vaultName}`); } const scope = options.scope ?? null; + // Privacy footgun guard (codex R13 P2): hippo's recall filter only default-denies + // scopes shaped `:private:*` (see isPrivateScope / PRIVATE_SCOPE_RE in + // scope.ts). A bare `private` (or `private:`) first segment is NOT recognized + // as private, so notes a user believes are private would still be returned to + // no-scope recall callers. Reject the alias and point at the source-prefixed form + // rather than silently storing public-visible "private" notes. Use recall's own + // isPrivateScope as the single source of truth: reject a scope that names a + // `private` segment yet is NOT a valid `:private:*` (catches `private`, + // `private:x`, and `vault:private` with a missing trailing segment). + if (scope !== null && scope.split(':').includes('private') && !isPrivateScope(scope)) { + throw new Error( + `vault scope '${scope}' is not recognized as private by recall (only ':private:*' scopes are default-denied). Use a source-prefixed scope such as 'vault:private:${vaultName}'.`, + ); + } const extraTags = options.extraTags ?? []; const dryRun = options.dryRun ?? false; if (options.global) { diff --git a/tests/importers-vault.test.ts b/tests/importers-vault.test.ts index 8cc5d4b5..64a4951d 100644 --- a/tests/importers-vault.test.ts +++ b/tests/importers-vault.test.ts @@ -780,8 +780,8 @@ describe('importVault (x) — explicit name + envelope-aware idempotency', () => const firstId = loadAllEntries(tmpDir, 'default')[0].id; expect(loadAllEntries(tmpDir, 'default')[0].scope ?? null).toBe(null); - // Same bytes, now request a private scope -> must re-write, not skip. - const second = importVault(vaultDir, opts({ name: 'v', scope: 'private' })); + // Same bytes, now request a (properly source-prefixed) private scope -> re-write. + const second = importVault(vaultDir, opts({ name: 'v', scope: 'vault:private:v' })); expect(second.skipped).toBe(0); expect(second.imported).toBe(1); expect(second.archived).toBe(1); @@ -789,15 +789,32 @@ describe('importVault (x) — explicit name + envelope-aware idempotency', () => const live = loadAllEntries(tmpDir, 'default'); expect(live.length).toBe(1); expect(live[0].id).not.toBe(firstId); - expect(live[0].scope).toBe('private'); // the requested envelope was applied + expect(live[0].scope).toBe('vault:private:v'); // the requested envelope was applied // A third identical run at the SAME scope is idempotent again. - const third = importVault(vaultDir, opts({ name: 'v', scope: 'private' })); + const third = importVault(vaultDir, opts({ name: 'v', scope: 'vault:private:v' })); expect(third.skipped).toBe(1); expect(third.imported).toBe(0); expect(third.archived).toBe(0); }); + it('rejects a bare-private scope recall would not treat as private (codex R13 P2)', () => { + writeNote('note.md', '# Note\nsensitive content the user wants private'); + // Bare `private` / `private:` are NOT default-denied by recall, so reject + // them rather than store public-visible "private" notes. + expect(() => importVault(vaultDir, opts({ name: 'v', scope: 'private' }))).toThrow( + /not recognized as private/, + ); + expect(() => importVault(vaultDir, opts({ name: 'v', scope: 'private:secret' }))).toThrow( + /not recognized as private/, + ); + // Nothing was written by the rejected runs; the source-prefixed form is accepted. + expect(loadAllEntries(tmpDir, 'default').length).toBe(0); + const ok = importVault(vaultDir, opts({ name: 'v', scope: 'vault:private:v' })); + expect(ok.imported).toBe(1); + expect(loadAllEntries(tmpDir, 'default')[0].scope).toBe('vault:private:v'); + }); + it('re-imports an unchanged file when an extra tag is added', () => { writeNote('note.md', '# Note\nstable content across the tag change'); expect(importVault(vaultDir, opts({ name: 'v' })).imported).toBe(1);