From bcfc97d4122c3f0dfaa551ca03f15820b2a351e5 Mon Sep 17 00:00:00 2001 From: "bonk-ai[bot]" <269762587+bonk-ai[bot]@users.noreply.github.com> Date: Sat, 9 May 2026 21:12:15 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20v1=20finalize=20Track=20C=20=E2=80=94?= =?UTF-8?q?=20debt=20sweep=20(7=20ACs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .erpaval/INDEX.md | 1 + .erpaval/debt.md | 12 +- .../no-spec-coordinate-leakage-into-source.md | 74 +++++++ ...cip-references-and-embedder-fingerprint.md | 106 ++++++++++ packages/cli/README.md | 68 ++++++ packages/cli/src/commands/analyze.ts | 9 +- packages/cli/src/commands/query.test.ts | 5 + packages/cli/src/commands/query.ts | 48 +++-- packages/cli/src/index.ts | 5 + packages/core-types/src/edges.test.ts | 16 +- packages/core-types/src/edges.ts | 4 +- packages/core-types/src/graph-hash.ts | 24 +++ packages/embedder/src/factory.test.ts | 111 ++++++++++ packages/embedder/src/factory.ts | 95 +++++++++ packages/embedder/src/fingerprint.test.ts | 54 +++++ packages/embedder/src/fingerprint.ts | 64 ++++++ packages/embedder/src/index.ts | 6 + packages/ingestion/README.md | 69 ++++++ .../src/pipeline/phases/content-cache.test.ts | 200 ++++++++++++++++++ .../src/pipeline/phases/content-cache.ts | 135 +++++++++++- .../src/pipeline/phases/embeddings.ts | 4 + .../src/pipeline/phases/scip-index.ts | 71 ++++++- packages/mcp/README.md | 62 ++++++ packages/mcp/src/error-envelope.ts | 3 +- packages/mcp/src/tools/query.ts | 71 ++++--- packages/scanners/README.md | 80 +++++++ packages/scip-ingest/src/derive.ts | 46 ++-- packages/storage/src/column-encode.test.ts | 17 +- packages/storage/src/column-encode.ts | 38 +++- packages/storage/src/duckdb-adapter.ts | 31 ++- .../storage/src/graph-hash-parity.test.ts | 123 ++++++++++- packages/storage/src/graphdb-adapter.ts | 18 +- packages/storage/src/graphdb-schema.test.ts | 6 +- packages/storage/src/graphdb-schema.ts | 2 + packages/storage/src/interface.ts | 11 + packages/storage/src/schema-ddl.ts | 25 ++- 36 files changed, 1602 insertions(+), 112 deletions(-) create mode 100644 .erpaval/solutions/best-practices/no-spec-coordinate-leakage-into-source.md create mode 100644 docs/adr/0014-scip-references-and-embedder-fingerprint.md create mode 100644 packages/cli/README.md create mode 100644 packages/embedder/src/factory.test.ts create mode 100644 packages/embedder/src/factory.ts create mode 100644 packages/embedder/src/fingerprint.test.ts create mode 100644 packages/embedder/src/fingerprint.ts create mode 100644 packages/ingestion/README.md create mode 100644 packages/mcp/README.md create mode 100644 packages/scanners/README.md diff --git a/.erpaval/INDEX.md b/.erpaval/INDEX.md index 0b8ad4e1..ffb4128b 100644 --- a/.erpaval/INDEX.md +++ b/.erpaval/INDEX.md @@ -31,6 +31,7 @@ development sessions. Solutions are reusable; specs are per-feature. - [Replace raw-SQL escape hatches with typed finders on the storage interface](solutions/architecture-patterns/typed-finders-replace-raw-sql-in-consumers.md) — 108 raw-SQL sites collapse into 15 named finders. Adapters internalize dialect; consumers stay backend-agnostic. Liskov-clean parity harness via public-method rebuilder. - [Parallel Act subagents on a shared git tree — interleaving + cherry-pick discipline](solutions/best-practices/parallel-act-subagents-with-shared-git-tree.md) — verify branch state, spawn on non-overlapping packages, watch for stale dist + phantom test counts, watch the test-fixup tail. - [Squash-merge masks pre-existing repo-wide debt](solutions/best-practices/squash-merge-masks-pre-existing-debt.md) — first action on a fresh branch from main is `mise run check` BEFORE starting work; lint rules / transitive deps / cross-package test assertions drift across squash boundaries even when per-commit gating was green inside the prior PR. +- [No spec-coordinate leakage into source](solutions/best-practices/no-spec-coordinate-leakage-into-source.md) — ERPAVal `AC-*`, `M-*`, `W-*`, `CL-*` prefixes belong in commits, PR bodies, ADR refs sections — NOT in JSDoc, inline comments, CLI flag help, MCP tool descriptions, or test names. Sweep `rg -n "AC-[A-Z]-[0-9]" packages/` before every PR-open; LLM clients pick up the leakage and start citing it back. ## Specs diff --git a/.erpaval/debt.md b/.erpaval/debt.md index d3bc2ceb..edd01525 100644 --- a/.erpaval/debt.md +++ b/.erpaval/debt.md @@ -288,11 +288,13 @@ architecture pages + mermaid rendering. `sarif`, `scip-ingest`, `search` — all have code-level docs but no README. -2. **`.gitmodules` thiserror pin comment.** The sweep reconciled - `packages/gym/corpus/rust/README.md` and `corpus/repos/README.md` on - `thiserror@2.0.17`. `.gitmodules` line 19 still says - `pin: v2.0.0 tag`. One-line fix — the subagent's write was denied, - deferred to the user. +2. **`.gitmodules` thiserror pin comment.** **Status: CLOSED-STALE** — + `git show HEAD:.gitmodules` returns "fatal: path .gitmodules does + not exist in HEAD"; the file was removed when `packages/gym` moved + to `opencodehub-testbed` (commit 378f79f). The submodule set lives + in the testbed repo now; any thiserror-pin reconciliation belongs + over there, not here. Closed as stale by AC-C-7 (Track C, v1 + finalize, 2026-05-09). 3. **Dead eval-harness fallback.** `packages/eval/src/opencodehub_eval/ test_parametrized.py:167-175` has tool-still-unregistered fallback diff --git a/.erpaval/solutions/best-practices/no-spec-coordinate-leakage-into-source.md b/.erpaval/solutions/best-practices/no-spec-coordinate-leakage-into-source.md new file mode 100644 index 00000000..d60b5c7a --- /dev/null +++ b/.erpaval/solutions/best-practices/no-spec-coordinate-leakage-into-source.md @@ -0,0 +1,74 @@ +--- +name: ERPAVal spec coordinates (CL-*, AC-*, M-*, W-*) MUST NOT leak into production source or comments +description: Specifier prefixes from EARS specs and the ERPAVal classifier vocabulary are session-local bookkeeping; production code, comments, JSDoc, and CLI/MCP option descriptions must not reference them +type: feedback +--- + +ERPAVal specs use a structured vocabulary — `AC-A-1`, `AC-C-3`, `M3-1`, +`W-A-2`, `E-C-3`, `CL-VALIDATE`, `S-A-2`, `architecture-revised.md +§AC-A-7` — to coordinate work across the orchestrator and Act +subagents. These prefixes are useful inside ERPAVal artifacts: +`.erpaval/specs/`, `.erpaval/sessions//`, ADR validation tables, +commit messages, PR bodies. They are NOT useful in production source. + +Observed leakage on Track C cleanup (2026-05-09): the orchestrator and +multiple Act subagents seeded `AC-C-3:`, `AC-C-2:`, `AC-A-1:`, `AC-A-6c:`, +`AC-A-9:` into JSDoc, inline comments, MCP tool option descriptions +(visible to every MCP client), and CLI flag help (visible to every +`codehub query --help` user). Counts after Wave C.1 + my Wave C.2 first +pass: ~45 source references to AC-A-* (legacy from Track A — already +on main via PR #71), 14 source references to AC-C-* introduced this +session before sweep. + +**Why:** session-local coordinates rot. Six months after the AC graduates +into a release, the spec packet is in `.erpaval/sessions/session-/` +which is gitignored — readers of the source can't follow the citation. +The MCP option description "Bypass the embedder fingerprint check +(AC-C-3)." leaks ERPAVal vocabulary into the MCP tool surface, which +LLM clients then pick up and start citing back; the leakage compounds. + +**How to apply:** + +- **Source comments / JSDoc:** name the underlying invariant, behavior, + or contract. "Refuse when the persisted embedder modelId differs from + the current one" is forever; "AC-C-3 refusal" is until the AC merges + and then forgets itself. +- **Variable names, function names, type fields:** never carry the prefix. + `forceBackendMismatch` (good) not `acC3ForceBackendMismatch` (never). +- **CLI help / MCP descriptions / tool descriptions:** describe the + user-visible contract. The user does not know what an AC is. Strip. +- **ADR text:** ADRs MAY cite AC-* coordinates because the ADR is the + permanent home of the decision rationale and links to the spec packet. + But cite once, in a "References" section, not inline throughout the + decision body. +- **Commit messages and PR descriptions:** AC citations are great here. + Reviewers grep for them; release-please may include them in the + changelog. +- **Test names and fixture names:** prefer the behavior under test + ("graphHash parity: medium-with-empty-keywords ([] vs absent)") over + the AC ("AC-C-2: graphHash..."). The behavior survives renames; AC + numbers don't. +- **Sweep before commit.** Run `rg -n "AC-[A-Z]-[0-9]" packages/` + against your branch before PR-open. Anything that hits is a + candidate for rephrase. If the comment NEEDS to cite the AC, use a + short reference at the end like "(AC-C-5)" rather than leading with + it. +- **The test fakes are the trap.** When a Wave subagent edits a test + fake, it tends to add `// AC-XXX: stubs ...` because it's writing + the comment WITH the AC packet open in front of it. Sweep test files + the same way as source files. + +**Why it's worth a hook:** the leakage is mechanical and silent. A +PostToolUse hook on Edit/Write that scans the diff for `^[\\s*/]*AC-[A-Z]-[0-9]+` +in `packages/**` (excluding `.erpaval/`, `.md` ADRs, and commit-message +files) and either blocks the write or appends a stderr advisory would +catch every recurrence at the source. Until that hook exists, the +discipline is on the orchestrator + reviewer. + +**Carry-forward debt:** Track A merged with extensive `AC-A-*` +references throughout `packages/storage/`, `packages/mcp/`, and +`packages/cli/`. They are on main and any Track-after-A branch picks +them up. A standalone `chore(repo): scrub spec coordinates from +source` cleanup PR is the right venue — not Track C, not Track D. +That PR can ship in its own session because the cleanup is mechanical +and reviewable in one window. diff --git a/docs/adr/0014-scip-references-and-embedder-fingerprint.md b/docs/adr/0014-scip-references-and-embedder-fingerprint.md new file mode 100644 index 00000000..7c6c2d27 --- /dev/null +++ b/docs/adr/0014-scip-references-and-embedder-fingerprint.md @@ -0,0 +1,106 @@ +# ADR 0014 — SCIP REFERENCES + TYPE_OF emission and embedder-fingerprint refusal + +**Status**: Accepted +**Date**: 2026-05-09 +**Supersedes**: none +**Superseded by**: none + +## Context + +Two unrelated holes in v1.0 finalize, both routing through a shared one-time graphHash content delta. They land in a single ADR per spec.md§Q7 because the fixture-regeneration cost is paid once. + +### Hole A — Embedder rebuild-on-switch silent corruption (AC-C-3) + +The `embeddings` table on disk is populated by ONE specific embedder at index time. The currently-shipped store_meta schema (`packages/storage/src/schema-ddl.ts:172-183`) records `schema_version, last_commit, indexed_at, node_count, edge_count, stats_json, cache_hit_ratio, cache_size_bytes, last_compaction` — but NOT which embedder produced the vectors. + +Failure mode: an operator runs `codehub analyze` with the local ONNX `gte-modernbert-base/fp32` embedder, then later runs `codehub query` with `CODEHUB_EMBEDDING_SAGEMAKER_ENDPOINT` set to a SageMaker `gte-modernbert-base` deployment. Both embedders publish 768-dim vectors, so the dim guard at `bindParam(stmt, idx, vectorBuffer)` does not fire. The vector subspaces are NOT identical — different fp32/quantization, different post-processing pipelines, different L2-normalisation cutoffs — so cosine-similarity retrieval silently misranks. + +There is no test suite that catches this; there is no error envelope at the query path. + +### Hole B — SCIP REFERENCES + TYPE_OF unwired (AC-C-5) + +`packages/scip-ingest/src/derive.ts` already correctly: +- Emits CALLS edges via `deriveEdges` for function-like SCIP occurrences (`derive.ts:128-152`). +- Collects `is_implementation → IMPLEMENTS` and `is_type_definition → TYPE_OF` rows into `derived.relations` via `collectRels` (`derive.ts:184-199`). + +But: +- `packages/ingestion/src/pipeline/phases/scip-index.ts:245-252` consumes `derived.edges` (CALLS) and ignores `derived.relations` entirely. IMPLEMENTS and TYPE_OF reach the graph zero times even though the data is parsed. +- `derive.ts:136` filters non-call SCIP occurrences out of `deriveEdges` with `if (!isFunctionLike(occ.symbol)) continue;` — so non-call references (an identifier reading a class field, importing a type, accessing a constant) never produce REFERENCES edges either. +- `RelationType` at `packages/core-types/src/edges.ts:3-27` lists `REFERENCES` (position 21) but does NOT list `TYPE_OF`. The append-only rule at `edges.ts:29-32` requires new relation types to be appended at the end — `TYPE_OF` lands at position 25. + +The combined effect: every existing OCH index understates the call/reference graph by ~3 edge classes, and the `RelationType` union is missing one of the two heritage relations SCIP exposes. + +## Decision + +### A — Persist embedder modelId; refuse mismatched queries + +1. Add `embedder_model_id TEXT` column to `store_meta` (DuckDB) and the matching `STRING` field to the `StoreMeta` graph-db NODE TABLE. +2. `Store.setMeta(meta)` writes the currently-active embedder's `Embedder.modelId`. +3. `Store.getMeta()` returns the persisted value via the new `StoreMeta.embedderModelId?: string` field. +4. At query time (cli `runQuery`, MCP `runQuery`), read `meta.embedderModelId`, compare to `embedder.modelId`: + - Equal → proceed. + - Persisted is `undefined` (pre-AC-C-3 store) → proceed; the operator is trusted to know what they indexed. + - Mismatch + force flag set → proceed. + - Mismatch + no force flag → refuse. CLI prints to stderr and `process.exit(2)` per E-C-3. MCP returns a `EMBEDDER_MISMATCH` envelope via `toolError` per the same hint string. +5. Frozen remediation hint string lives in `packages/embedder/src/fingerprint.ts` as `EMBEDDER_MISMATCH_HINT`. Both surfaces import it so the message can never drift. +6. CLI `--force-backend-mismatch` flag and MCP `force_backend_mismatch` tool input give the operator an override path. Default `false`. + +The `assertEmbedderCompatible(persistedModelId, currentModelId, force)` helper lives in `@opencodehub/embedder` so cli + mcp share one comparator. + +### B — Emit IMPLEMENTS, TYPE_OF, REFERENCES from SCIP + +1. Append `TYPE_OF` at position 25 of `RelationType` and `RELATION_TYPES` per the append-only rule. The schema-shape stays append-stable; `graphHash` for content that does NOT include TYPE_OF stays byte-identical. +2. Widen `derive.ts:136` to also emit a `REFERENCES` `DerivedEdge` for non-call SCIP occurrences whose symbol has a DEFINITION elsewhere AND is not an `SCIP_ROLE_IMPORT`-only occurrence. +3. Add a sibling `emitRelations` call in `scip-index.ts` that consumes `derived.relations` and writes IMPLEMENTS + TYPE_OF graph edges using the same caller→callee join shape as `emitEdges`. Both joins use `buildSymbolDefIndex` for callee resolution, per the `scip-callee-definition-site` lesson; both add `+1` at the SCIP→OCH boundary, per the `scip-0-indexed-vs-graph-1-indexed` lesson. +4. Regenerate `packages/ingestion/src/pipeline/incremental-determinism.test.ts` fixtures one time. Document this in the commit message as the expected one-time content delta. +5. Extend `packages/storage/src/graph-hash-parity.test.ts` with a fixture variant exercising IMPLEMENTS + TYPE_OF + REFERENCES; assert cross-adapter parity holds. + +## Consequences + +### graphHash impact + +- **Hole A (embedder fingerprint)** is graphHash-NEUTRAL. `graphHash` (`packages/core-types/src/graph-hash.ts:44-69`) hashes ONLY `(nodes, edges)` — `store_meta` is not part of the input. Adding a `store_meta` column does not change any per-commit hash. +- **Hole B (SCIP edges)** is graphHash-CONTENT-DELTA. The first index run after merge produces additional edges (REFERENCES + IMPLEMENTS + TYPE_OF) that did not previously exist. Every existing OCH index will yield a different graphHash on next `codehub analyze`. This is documented as a v1.0 minor bump (schema-shape preserved via append-only; only content changes). + +### Cross-track sequencing + +This ADR is shared with AC-C-3 (Hole A) and AC-C-5 (Hole B). They land in the same Track C PR; the fixture regen runs once for both. + +### Migration cost + +For Hole A, existing stores have `embedder_model_id IS NULL`. On next `Store.open` an `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` runs cheaply; the `setMeta` call after the next `codehub analyze` populates the value. Until then, the query-time refusal sees `meta.embedderModelId === undefined` and passes — no false-positive refusals on existing stores. + +For Hole B, every existing store needs a `codehub analyze --force` to pick up the new edges. That is the same posture as every prior schema-content delta in OCH (e.g. the M3+M6 IGraphStore split landed under the same minor-bump rule). + +### Forward compatibility + +`EMBEDDER_MISMATCH` is added to `ErrorCode` at `packages/mcp/src/error-envelope.ts`. Existing clients that do not enumerate the union ignore it; existing clients that DO enumerate the union pick up the new code on rebuild. No on-wire breaking change. + +## Alternatives considered + +### Hole A +- **Embed dim alone, not modelId.** Dim collisions (768 = 768) are exactly the failure mode this ADR addresses. Rejected. +- **Hash the first vector against a known canary string.** More complex, more storage, indistinguishable from "different post-processing pipelines" cases that produce identical canaries by accident. Rejected. +- **Force re-index on EVERY embedder env-var change.** Too aggressive for SageMaker→ONNX fallbacks during dev. The override flag exists for that case. + +### Hole B +- **Insert TYPE_OF mid-union next to IMPLEMENTS.** Violates W-A-2 + the `edges.ts:29-32` append-only comment. Would break every existing graphHash on every existing OCH index, even for content with no IMPLEMENTS / TYPE_OF / REFERENCES. Rejected. +- **Split AC-C-5 into a sibling PR after Track C.** Considered in `pr-split-analysis.md` Option (b). Rejected because the fixture-regeneration cost would be paid twice (once for the v1.0 finalize hash bump that ships SCIP REFERENCES, once for the next ADR adding TYPE_OF). Bundling them is cheaper. + +## Validation + +- `mise run check` exits 0 on the Track C branch. +- `pnpm --filter @opencodehub/storage test` parity green (DuckDb leg + skip-clean GraphDb leg on dev box without `@ladybugdb/core` binding). +- `pnpm --filter @opencodehub/embedder test` covers `assertEmbedderCompatible` 5 cases. +- `pnpm --filter @opencodehub/scip-ingest test` covers REFERENCES emission + IMPLEMENTS/TYPE_OF collectRels. +- `pnpm --filter @opencodehub/ingestion test` regenerates incremental-determinism fixtures. +- ROADMAP constraint U1 (graphHash byte-identity per commit) holds for all content that does not exercise the new edge kinds; for content that does, the new fixture variant proves cross-adapter parity. + +## References + +- `.erpaval/specs/006-v1-finalize/spec.md§AC-C-3, §AC-C-5, §E-C-3, §E-C-4, §W-A-2` +- `.erpaval/sessions/session-33f24f/research-detectsecrets-scip.yaml` (SCIP role enum + Relationship message) +- `.erpaval/solutions/architecture-patterns/scip-callee-definition-site.md` +- `.erpaval/solutions/conventions/scip-0-indexed-vs-graph-1-indexed.md` +- `docs/adr/0011-graph-db-backend.md` (M3+M6 IGraphStore precedent) +- `docs/adr/0013-m7-default-flip-and-abstraction.md` (M7 LadybugDB default flip) diff --git a/packages/cli/README.md b/packages/cli/README.md new file mode 100644 index 00000000..35250835 --- /dev/null +++ b/packages/cli/README.md @@ -0,0 +1,68 @@ +# @opencodehub/cli + +The `codehub` command-line front end. Every subcommand lazy-loads its +implementation so `codehub --help` stays fast — no DuckDB binding, no +pipeline, no MCP SDK is initialised until the matching action runs +(`packages/cli/src/index.ts:1-13`). + +## Surface + +```bash +codehub [options] +``` + +- The CLI binary is the only OpenCodeHub-distributed UX. There is no + daemon, no hosted service, and no second transport — agents talk to + OpenCodeHub through the stdio MCP server launched by `codehub mcp`. +- Errors print as `codehub: ` and set exit code 1 + (`packages/cli/src/index.ts:860-864`). + +## Commands + +Registered in `packages/cli/src/index.ts:14-737`. The table groups the 25 +top-level subcommands by phase of the workflow. + +| Command | Purpose | +| ------------------ | -------------------------------------------------------------------------- | +| `init` | Bootstrap a repo: `.claude/`, `.mcp.json`, `.gitignore`, policy seed | +| `setup` | Write MCP config for editors, fetch embedder weights, install SCIP tools | +| `analyze` | Run the 31-phase ingestion pipeline against a repo | +| `index` | Register an existing `.codehub/` folder without re-analysing | +| `status` | Show registry metadata + index freshness for a repo | +| `list` | Enumerate every repo registered on this machine | +| `clean` | Delete one or all registered indexes | +| `mcp` | Launch the stdio MCP server | +| `query` | Hybrid BM25 + vector search against a repo's graph | +| `context` | 360-degree view of a symbol — callers, callees, flows | +| `impact` | Blast-radius traversal up/down/both with risk tier | +| `detect-changes` | Map an uncommitted or committed diff onto affected symbols + processes | +| `verdict` | 5-tier PR decision (`auto_merge`/`single_review`/.../`block`) | +| `scan` | Run Priority-1 scanners and ingest findings into the graph | +| `ingest-sarif` | Ingest an external SARIF 2.1.0 log into the graph | +| `pack` | Single-file LLM snapshot via repomix (AST-compressed) | +| `code-pack` | Deterministic 9-item BOM under `.codehub/packs//` | +| `wiki` | Emit a Markdown wiki tree (deterministic, optionally LLM-narrated) | +| `bench` | Run the acceptance gate suite and render a pass/fail dashboard | +| `doctor` | Probe the local environment and print actionable hints | +| `ci-init` | Emit GitHub Actions / GitLab CI workflow scaffolds | +| `augment` | Fast-path BM25 enrichment for editor PreToolUse hooks | +| `sql` | Read-only SQL against the graph store with a 5 s timeout | +| `group ` | Cross-repo groups: `create`, `list`, `delete`, `status`, `query`, `sync` | + +## Design + +- **Lazy loading** — each `.action()` does `await import(...)` so cold + startup is bounded by Commander, not DuckDB or the parse pool + (`packages/cli/src/index.ts:78-81`). +- **No stateful daemon** — `analyze` runs to completion and exits; + `mcp` is the only long-running process. +- **Registry on disk** — `~/.codehub/registry.json` enumerates indexed + repos; per-repo state lives under `/.codehub/` + (`packages/cli/src/registry.ts`). +- **Env-toggle defaults** — `OCH_NATIVE_PARSER`, `CODEHUB_STORE`, + `CODEHUB_BEDROCK_DISABLED` flip behaviour without touching flags. +- **`mcp` is launched, never embedded** — agents that need the MCP + surface spawn `codehub mcp` over stdio (`packages/cli/src/commands/mcp.ts`). + +See ADR 0013 for the storage-backend toggle and the root README's +"MCP tool surface" section for the agent-facing tool inventory. diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index faa32549..f54c4cfa 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -731,13 +731,20 @@ function boolField(r: Record, col: string): boolean | undefined } function stringArrayField(r: Record, col: string): readonly string[] | undefined { + // Preserve `[]` distinct from absent. The DuckDB TEXT[] binder returns + // a 0-length JS array for an empty SQL array literal and `null` for + // SQL NULL; mirror the storage adapter's `setStringArrayField` and + // return the array verbatim so a Community / Route node written as + // `{keywords: []}` (or `{responseKeys: []}`) survives the carry-forward + // load with its empty array intact — required so canonical-JSON / + // graphHash byte-identity holds across the incremental re-index. const v = r[col]; if (!Array.isArray(v)) return undefined; const out: string[] = []; for (const item of v) { if (typeof item === "string") out.push(item); } - return out.length > 0 ? out : undefined; + return out; } function parseJsonStringArrayField( diff --git a/packages/cli/src/commands/query.test.ts b/packages/cli/src/commands/query.test.ts index 6605ead5..dba412e0 100644 --- a/packages/cli/src/commands/query.test.ts +++ b/packages/cli/src/commands/query.test.ts @@ -111,6 +111,11 @@ function makeFakeStore(opts: FakeStoreOptions = {}): FakeStoreHandle { } return out; }, + // The query path reads `getMeta()` to compare the persisted embedder + // modelId against the currently-active embedder. Returning `undefined` + // makes every fake store look like a legacy / pre-tag store, so the + // compatibility check passes without any test having to set a model id. + getMeta: async () => undefined, }; // The temporal-tier surface the query path touches is just diff --git a/packages/cli/src/commands/query.ts b/packages/cli/src/commands/query.ts index a2aeb46d..4f6a866f 100644 --- a/packages/cli/src/commands/query.ts +++ b/packages/cli/src/commands/query.ts @@ -27,7 +27,11 @@ import { readFile } from "node:fs/promises"; import { isAbsolute, resolve } from "node:path"; -import type { Embedder } from "@opencodehub/embedder"; +import { + assertEmbedderCompatible, + type Embedder, + openDefaultEmbedder, +} from "@opencodehub/embedder"; import { bm25Search, DEFAULT_RRF_TOP_K, @@ -93,6 +97,13 @@ export interface QueryOptions { * queries that should land on Community nodes. */ readonly granularity?: "symbol" | "file" | "community"; + /** + * `--force-backend-mismatch` — bypass the embedder fingerprint refusal. + * Lets a query proceed against an `embeddings` table that was populated + * by a different embedder than the one currently active. The vectors + * may be stale; results may misrank. Default `false`. + */ + readonly forceBackendMismatch?: boolean; } /** @@ -113,19 +124,6 @@ interface QueryRow { readonly signatureSummary?: string; } -/** - * Default production factory — lazy-imports `@opencodehub/embedder` so the - * ONNX runtime native binding only loads when the command actually needs - * it. Priority mirrors the MCP tool: HTTP env vars first, ONNX weights - * second, graceful `tryOpenEmbedder` fallback on any failure. - */ -async function defaultOpenEmbedder(): Promise { - const mod = await import("@opencodehub/embedder"); - const httpEmbedder = mod.tryOpenHttpEmbedder(); - if (httpEmbedder !== null) return httpEmbedder; - return mod.openOnnxEmbedder(); -} - export async function runQuery( text: string, opts: QueryOptions = {}, @@ -134,7 +132,9 @@ export async function runQuery( const limit = opts.limit ?? 10; const rerankTopK = opts.rerankTopK ?? DEFAULT_RRF_TOP_K; const openStore = hooks.openStore ?? openStoreForCommand; - const openEmbedder = hooks.openEmbedder ?? defaultOpenEmbedder; + // Shared HTTP-priority + ONNX-fallback factory. ONNX binding only loads + // on the fallback branch, so plain (non-dynamic) import is fine here. + const openEmbedder = hooks.openEmbedder ?? (() => openDefaultEmbedder()); const { store, repoPath } = await openStore(opts); const graph = store.graph; try { @@ -151,6 +151,24 @@ export async function runQuery( const embedder = await tryOpenEmbedder(openEmbedder, "[cli:query]"); if (embedder !== null) { try { + // Refuse the hybrid path when the persisted embedder modelId + // differs from the current one. Same-dim vectors from different + // embedders silently corrupt ranking. `--force-backend-mismatch` + // lets the operator override; legacy stores have + // `embedderModelId === undefined` and the check passes. + const meta = await store.graph.getMeta(); + const compat = assertEmbedderCompatible( + meta?.embedderModelId, + embedder.modelId, + opts.forceBackendMismatch === true, + ); + if (!compat.ok) { + process.stderr.write( + `Embedder mismatch: store was indexed with '${compat.persistedModelId}', ` + + `current embedder is '${compat.currentModelId}'.\n${compat.hint}\n`, + ); + process.exit(2); + } const fused = await hybridSearch( graph, { diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 38089dd7..90c2c0c5 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -382,6 +382,10 @@ program "--granularity ", "Restrict ANN to one hierarchical tier: symbol (default), file, or community", ) + .option( + "--force-backend-mismatch", + "Bypass the embedder fingerprint check. Lets a query run when the persisted embedder model_id differs from the current one. Vectors may be stale.", + ) .action(async (text: string, opts: Record) => { const mod = await import("./commands/query.js"); const granularity = parseQueryGranularity(opts["granularity"]); @@ -398,6 +402,7 @@ program zoom: opts["zoom"] === true, ...(typeof opts["fanout"] === "number" ? { fanout: opts["fanout"] } : {}), ...(granularity !== undefined ? { granularity } : {}), + forceBackendMismatch: opts["forceBackendMismatch"] === true, }); }); diff --git a/packages/core-types/src/edges.test.ts b/packages/core-types/src/edges.test.ts index a3d213c9..b615da65 100644 --- a/packages/core-types/src/edges.test.ts +++ b/packages/core-types/src/edges.test.ts @@ -5,17 +5,22 @@ import { KnowledgeGraph } from "./graph.js"; import { makeNodeId } from "./id.js"; import type { GraphNode } from "./nodes.js"; -test("RELATION_TYPES: length is 24 after COCHANGES extraction (21 MVP + 3 new)", () => { - assert.equal(RELATION_TYPES.length, 24); +test("RELATION_TYPES: length is 25 after TYPE_OF append", () => { + assert.equal(RELATION_TYPES.length, 25); }); -test("RELATION_TYPES: contains the three remaining v1.0 additions (append-only)", () => { - for (const t of ["FOUND_IN", "DEPENDS_ON", "OWNED_BY"] as const) { +test("RELATION_TYPES: contains the v1.0 additions in append order", () => { + for (const t of ["FOUND_IN", "DEPENDS_ON", "OWNED_BY", "TYPE_OF"] as const) { assert.ok(RELATION_TYPES.includes(t), `RELATION_TYPES missing ${t}`); } const firstNewIdx = RELATION_TYPES.indexOf("FOUND_IN"); assert.equal(RELATION_TYPES[firstNewIdx - 1], "REFERENCES"); - assert.deepEqual(RELATION_TYPES.slice(firstNewIdx), ["FOUND_IN", "DEPENDS_ON", "OWNED_BY"]); + assert.deepEqual(RELATION_TYPES.slice(firstNewIdx), [ + "FOUND_IN", + "DEPENDS_ON", + "OWNED_BY", + "TYPE_OF", + ]); }); test("RELATION_TYPES: COCHANGES is NOT a relation type (moved to cochanges table)", () => { @@ -48,6 +53,7 @@ test("type-level exhaustiveness: every RelationType appears in RELATION_TYPES", FOUND_IN: true, DEPENDS_ON: true, OWNED_BY: true, + TYPE_OF: true, }; for (const t of RELATION_TYPES) { assert.equal(flags[t], true, `RELATION_TYPES contains ${t} but type-check map does not`); diff --git a/packages/core-types/src/edges.ts b/packages/core-types/src/edges.ts index ddcefe44..f23289d9 100644 --- a/packages/core-types/src/edges.ts +++ b/packages/core-types/src/edges.ts @@ -24,7 +24,8 @@ export type RelationType = | "REFERENCES" | "FOUND_IN" | "DEPENDS_ON" - | "OWNED_BY"; + | "OWNED_BY" + | "TYPE_OF"; // Insertion order is load-bearing: graphHash serializes edges ordered by // (from, type, to, step) but the RELATION_TYPES runtime array is referenced by @@ -55,6 +56,7 @@ export const RELATION_TYPES: readonly RelationType[] = [ "FOUND_IN", "DEPENDS_ON", "OWNED_BY", + "TYPE_OF", ] as const; /** diff --git a/packages/core-types/src/graph-hash.ts b/packages/core-types/src/graph-hash.ts index 182577ab..a09d17e8 100644 --- a/packages/core-types/src/graph-hash.ts +++ b/packages/core-types/src/graph-hash.ts @@ -16,6 +16,30 @@ import { writeCanonicalJson } from "./hash.js"; * inputs small enough for the all-in-memory path to succeed. The determinism * tests in `graph-hash.test.ts` cover cross-permutation stability; the * upstream tests must not change hex output for fixture-sized graphs. + * + * **Empty-collection contract:** `canonicalJson` (in `./hash.ts`) + * treats an empty array `[]` and an empty object `{}` as DISTINCT from an + * absent / `undefined` field. A node written as `{keywords: []}` emits + * `{"keywords":[]}` in the canonical JSON projection, while the same node + * with the `keywords` key absent emits no key at all — the two + * canonical-JSON byte streams differ, so their SHA-256 graph hashes + * differ. Storage adapters preserve this distinction at the writer + + * reader boundary: see + * `packages/storage/src/column-encode.ts:stringArrayOrNull`, + * `packages/storage/src/duckdb-adapter.ts:setStringArrayField`, + * `packages/storage/src/graphdb-adapter.ts:setStringArrayFieldGd`, and + * `packages/cli/src/commands/analyze.ts:stringArrayField`. The contract + * is exercised end-to-end by the + * `graphHash parity: medium-with-empty-keywords` fixture in + * `packages/storage/src/graph-hash-parity.test.ts`, which asserts both + * (a) cross-adapter parity for `{keywords: []}` and (b) the resulting + * hash differs from the equivalent fixture without the `keywords` key. + * + * The same `[]`-vs-absent semantics apply to `responseKeys` on RouteNode. + * Empty `Record` (`languageStats: {}`) goes through a + * separate sentinel path — see `coerceLanguageStats` in + * `column-encode.ts` — because that column is JSON-encoded TEXT, not a + * native typed array. */ export function graphHash(graph: KnowledgeGraph): string { const hasher = createHash("sha256"); diff --git a/packages/embedder/src/factory.test.ts b/packages/embedder/src/factory.test.ts new file mode 100644 index 00000000..6147bb86 --- /dev/null +++ b/packages/embedder/src/factory.test.ts @@ -0,0 +1,111 @@ +/** + * Tests for {@link openDefaultEmbedder} — the shared HTTP-priority + + * ONNX-fallback factory used by the CLI and MCP query call sites. + * + * Branches covered: + * 1. HTTP env vars set → returns the HTTP embedder (sentinel). + * 2. HTTP env vars absent + `allowOnnxFallback: true` (default) → returns + * the ONNX embedder (sentinel). + * 3. HTTP env vars absent + `allowOnnxFallback: false` → throws + * {@link EmbedderNotSetupError}; ONNX path is never invoked. + * 4. HTTP env vars absent + ONNX setup fails → propagates the underlying + * error (no swallowing, no wrapping). + * + * Dependency injection (the second `deps` arg) keeps the test pure: no + * tempdirs, no env-var manipulation, no real ONNX session. + */ + +import { equal, ok, rejects, strictEqual } from "node:assert/strict"; +import { describe, it } from "node:test"; + +import { openDefaultEmbedder } from "./factory.js"; +import { type Embedder, EmbedderNotSetupError } from "./types.js"; + +/** Build a sentinel Embedder whose identity we can assert against. */ +function makeSentinelEmbedder(modelId: string): Embedder { + return { + dim: 768, + modelId, + embed: async () => new Float32Array(768), + embedBatch: async (texts) => texts.map(() => new Float32Array(768)), + close: async () => {}, + }; +} + +describe("openDefaultEmbedder", () => { + it("returns the HTTP embedder when env vars are set", async () => { + const httpSentinel = makeSentinelEmbedder("remote/http"); + const result = await openDefaultEmbedder( + {}, + { + tryOpenHttp: () => httpSentinel, + openOnnx: async () => { + throw new Error("openOnnx must not be called when HTTP is configured"); + }, + }, + ); + strictEqual(result, httpSentinel, "factory should return the HTTP embedder reference"); + equal(result.modelId, "remote/http"); + }); + + it("falls back to ONNX when no HTTP env vars and allowOnnxFallback defaults to true", async () => { + const onnxSentinel = makeSentinelEmbedder("gte-modernbert-base/fp32"); + const result = await openDefaultEmbedder( + {}, + { + tryOpenHttp: () => null, + openOnnx: async () => onnxSentinel, + }, + ); + strictEqual(result, onnxSentinel, "factory should return the ONNX embedder reference"); + equal(result.modelId, "gte-modernbert-base/fp32"); + }); + + it("throws EmbedderNotSetupError when HTTP env vars absent and allowOnnxFallback=false", async () => { + let onnxCalled = false; + await rejects( + openDefaultEmbedder( + { allowOnnxFallback: false }, + { + tryOpenHttp: () => null, + openOnnx: async () => { + onnxCalled = true; + return makeSentinelEmbedder("should-not-be-reached"); + }, + }, + ), + (err: unknown) => { + ok(err instanceof EmbedderNotSetupError, "expected EmbedderNotSetupError"); + equal(err.code, "EMBEDDER_NOT_SETUP"); + ok( + err.message.includes("allowOnnxFallback") || + err.message.includes("CODEHUB_EMBEDDING_URL"), + "message should mention the option or the env var", + ); + return true; + }, + ); + equal(onnxCalled, false, "openOnnx must not be invoked when fallback is disabled"); + }); + + it("propagates the underlying error when ONNX setup fails", async () => { + const onnxFailure = new EmbedderNotSetupError( + "Run `codehub setup --embeddings` to install gte-modernbert-base", + ); + await rejects( + openDefaultEmbedder( + {}, + { + tryOpenHttp: () => null, + openOnnx: async () => { + throw onnxFailure; + }, + }, + ), + (err: unknown) => { + strictEqual(err, onnxFailure, "factory should re-throw the original error"); + return true; + }, + ); + }); +}); diff --git a/packages/embedder/src/factory.ts b/packages/embedder/src/factory.ts new file mode 100644 index 00000000..174f60f8 --- /dev/null +++ b/packages/embedder/src/factory.ts @@ -0,0 +1,95 @@ +/** + * `openDefaultEmbedder` — the shared HTTP-priority + ONNX-fallback factory + * used by `@opencodehub/cli` and `@opencodehub/mcp` query call sites. + * + * Selection precedence: + * 1. {@link tryOpenHttpEmbedder} reads SageMaker / OpenAI-HTTP env vars + * first and returns a remote-backed embedder when configured. + * 2. Otherwise — and only when `allowOnnxFallback === true` (the default) — + * fall back to {@link openOnnxEmbedder}, which loads gte-modernbert-base + * weights from disk (the lazy-load side effect). + * 3. With `allowOnnxFallback: false` and no HTTP/SageMaker env, throw + * {@link EmbedderNotSetupError} — the ONNX binding is never loaded. + * + * The fuller variant in `packages/ingestion/src/pipeline/phases/embeddings.ts` + * intentionally stays separate: ingestion needs an offline flag, an explicit + * ONNX variant + modelDir config, a weight canary, and a Piscina pool. None + * of those apply to the query-time path. + */ + +import { openHttpEmbedder, readHttpEmbedderConfigFromEnv } from "./http-embedder.js"; +import { openOnnxEmbedder as defaultOpenOnnxEmbedder } from "./onnx-embedder.js"; +import { openSagemakerEmbedder, readSagemakerEmbedderConfigFromEnv } from "./sagemaker-embedder.js"; +import { type Embedder, EmbedderNotSetupError } from "./types.js"; + +/** + * Inline copy of {@link tryOpenHttpEmbedder} from `./index.ts` — kept + * separate to avoid a circular import between `index.ts` (which re-exports + * the factory) and the factory module. Behavior matches the public + * `tryOpenHttpEmbedder` exactly: SageMaker env first, then HTTP env, then + * `null`. The factory does not honor the `offline` flag — query-time + * call-sites do not run in offline mode. + */ +function tryOpenHttpEmbedderFromEnv(): Embedder | Promise | null { + const sagemakerCfg = readSagemakerEmbedderConfigFromEnv(); + if (sagemakerCfg !== null) { + return openSagemakerEmbedder(sagemakerCfg); + } + const httpCfg = readHttpEmbedderConfigFromEnv(); + if (httpCfg === null) return null; + return openHttpEmbedder(httpCfg); +} + +/** + * Options for {@link openDefaultEmbedder}. + */ +export interface OpenDefaultEmbedderOptions { + /** + * When `true` (default) — fall back to the local ONNX embedder if no + * HTTP / SageMaker env vars are configured. When `false` — throw + * {@link EmbedderNotSetupError} instead. Use `false` for fully-remote + * deployments that should never load ONNX weights. + */ + readonly allowOnnxFallback?: boolean; +} + +/** + * Internal injection seam used only by the unit test. The production + * call-sites do not need to provide overrides. + */ +export interface OpenDefaultEmbedderDeps { + readonly tryOpenHttp?: () => Embedder | Promise | null; + readonly openOnnx?: typeof defaultOpenOnnxEmbedder; +} + +/** + * HTTP-priority + ONNX-fallback embedder factory. + * + * @param opts.allowOnnxFallback default `true` — set `false` to refuse the + * ONNX path and throw {@link EmbedderNotSetupError} when no remote + * embedder env vars are set. + */ +export async function openDefaultEmbedder( + opts: OpenDefaultEmbedderOptions = {}, + deps: OpenDefaultEmbedderDeps = {}, +): Promise { + const tryOpenHttp = deps.tryOpenHttp ?? tryOpenHttpEmbedderFromEnv; + const openOnnx = deps.openOnnx ?? defaultOpenOnnxEmbedder; + const allowOnnxFallback = opts.allowOnnxFallback ?? true; + + // tryOpenHttp returns Embedder | Promise | null. `await` on a + // non-Promise value just resolves to that value, so this normalizes both + // shapes into a single branch. + const httpEmbedder = await tryOpenHttp(); + if (httpEmbedder !== null) return httpEmbedder; + + if (!allowOnnxFallback) { + throw new EmbedderNotSetupError( + "No remote embedder is configured (set CODEHUB_EMBEDDING_URL or " + + "CODEHUB_EMBEDDING_SAGEMAKER_ENDPOINT) and `allowOnnxFallback` is " + + "disabled. Either configure a remote endpoint or pass " + + "`allowOnnxFallback: true`.", + ); + } + return openOnnx(); +} diff --git a/packages/embedder/src/fingerprint.test.ts b/packages/embedder/src/fingerprint.test.ts new file mode 100644 index 00000000..b31bceab --- /dev/null +++ b/packages/embedder/src/fingerprint.test.ts @@ -0,0 +1,54 @@ +/** + * Tests for `assertEmbedderCompatible`. + */ + +import { equal, ok } from "node:assert/strict"; +import { describe, test } from "node:test"; +import { assertEmbedderCompatible, EMBEDDER_MISMATCH_HINT } from "./fingerprint.js"; + +describe("assertEmbedderCompatible", () => { + test("ok when persisted is undefined (legacy store, never tagged)", () => { + const result = assertEmbedderCompatible(undefined, "gte-modernbert-base/fp32", false); + ok(result.ok); + }); + + test("ok when persisted equals current", () => { + const result = assertEmbedderCompatible( + "gte-modernbert-base/fp32", + "gte-modernbert-base/fp32", + false, + ); + ok(result.ok); + }); + + test("ok when persisted differs from current but force is true", () => { + const result = assertEmbedderCompatible( + "gte-modernbert-base/fp32", + "sagemaker:gte-modernbert-base@my-endpoint", + true, + ); + ok(result.ok); + }); + + test("not ok when persisted differs from current and force is false", () => { + const result = assertEmbedderCompatible( + "gte-modernbert-base/fp32", + "sagemaker:gte-modernbert-base@my-endpoint", + false, + ); + ok(!result.ok); + if (!result.ok) { + equal(result.persistedModelId, "gte-modernbert-base/fp32"); + equal(result.currentModelId, "sagemaker:gte-modernbert-base@my-endpoint"); + equal(result.hint, EMBEDDER_MISMATCH_HINT); + } + }); + + test("hint is the stable remediation string", () => { + equal( + EMBEDDER_MISMATCH_HINT, + "Re-run 'codehub analyze --force' or pass --force-backend-mismatch to " + + "query with potentially stale vectors.", + ); + }); +}); diff --git a/packages/embedder/src/fingerprint.ts b/packages/embedder/src/fingerprint.ts new file mode 100644 index 00000000..91d932b8 --- /dev/null +++ b/packages/embedder/src/fingerprint.ts @@ -0,0 +1,64 @@ +/** + * Embedder-fingerprint compatibility check. + * + * The `embeddings` table on disk was populated by ONE specific embedder + * — usually identified by its {@link Embedder.modelId} (e.g. + * `gte-modernbert-base/fp32`, `sagemaker:gte-modernbert-base@`). + * If the operator switches the active embedder between index runs (ONNX + * → SageMaker, fp32 → int8) the dim might still match by coincidence + * (768 = 768) but the vector subspace is different — hybrid search + * silently corrupts ranking with no error. + * + * `assertEmbedderCompatible` makes the mismatch loud: + * - PASS → the persisted modelId equals the current modelId, OR the + * persisted modelId is unset (legacy store, never tagged). + * - PASS → mismatch but the caller passed `force: true` (the operator + * knows the vectors might be stale and accepts the risk). + * - FAIL → mismatch + no force — return an envelope with a remediation + * hint. The caller (cli/query, mcp/query) decides how to + * surface: cli exits 2, MCP returns a structured error. + */ + +/** Stable remediation hint surfaced on every embedder-mismatch refusal. */ +export const EMBEDDER_MISMATCH_HINT: string = + "Re-run 'codehub analyze --force' or pass --force-backend-mismatch to " + + "query with potentially stale vectors."; + +export type EmbedderCompatibilityResult = + | { readonly ok: true } + | { + readonly ok: false; + readonly persistedModelId: string; + readonly currentModelId: string; + readonly hint: string; + }; + +/** + * Compare the embedder modelId persisted in `store_meta.embedder_model_id` + * against the modelId of the embedder the caller just opened. + * + * @param persistedModelId - `StoreMeta.embedderModelId` from the store — + * pass `undefined` for legacy stores that never recorded it (the + * compatibility check passes; the open-time backfill attributes the + * row to the current embedder with a one-shot stderr warning). + * @param currentModelId - {@link Embedder.modelId} of the embedder the + * caller opened for this query. + * @param force - When `true`, force the check to pass even on mismatch. + * Set by the cli `--force-backend-mismatch` flag and the equivalent + * `force_backend_mismatch` MCP tool option. + */ +export function assertEmbedderCompatible( + persistedModelId: string | undefined, + currentModelId: string, + force: boolean, +): EmbedderCompatibilityResult { + if (force) return { ok: true }; + if (persistedModelId === undefined) return { ok: true }; + if (persistedModelId === currentModelId) return { ok: true }; + return { + ok: false, + persistedModelId, + currentModelId, + hint: EMBEDDER_MISMATCH_HINT, + }; +} diff --git a/packages/embedder/src/index.ts b/packages/embedder/src/index.ts index b80e8de7..bc198a61 100644 --- a/packages/embedder/src/index.ts +++ b/packages/embedder/src/index.ts @@ -34,6 +34,12 @@ import { } from "./sagemaker-embedder.js"; import type { Embedder, EmbedderConfig } from "./types.js"; +export { type OpenDefaultEmbedderOptions, openDefaultEmbedder } from "./factory.js"; +export { + assertEmbedderCompatible, + EMBEDDER_MISMATCH_HINT, + type EmbedderCompatibilityResult, +} from "./fingerprint.js"; export { type HttpEmbedderConfig, openHttpEmbedder, diff --git a/packages/ingestion/README.md b/packages/ingestion/README.md new file mode 100644 index 00000000..65ec1d2a --- /dev/null +++ b/packages/ingestion/README.md @@ -0,0 +1,69 @@ +# @opencodehub/ingestion + +The indexing pipeline. Walks a repo, extracts symbols and edges via +tree-sitter (WASM by default, native opt-in), then runs a 30-phase DAG +that emits the graph and supporting artifacts under `/.codehub/`. + +## Surface + +```ts +import { runIngestion, DEFAULT_PHASES } from "@opencodehub/ingestion/pipeline"; + +await runIngestion({ + repoRoot: "/path/to/repo", + phases: DEFAULT_PHASES, + // ...embeddings, summaries, scan, sbom toggles +}); +``` + +- The pipeline runs serially in topological order — determinism is + worth more than the parallelism win at MVP scale + (`packages/ingestion/src/pipeline/phases/default-set.ts:14-17`). +- The runner validates the DAG (missing dependencies, cycles) on every + invocation (`packages/ingestion/src/pipeline/runner.ts`). +- Parse runtime defaults to `web-tree-sitter` (WASM); set + `OCH_NATIVE_PARSER=1` to opt into native on Node 22 (root `CLAUDE.md`, + Parse runtime section). + +## Phases + +The 30-phase ordering, sourced from +`packages/ingestion/src/pipeline/phases/default-set.ts:55-135`. Phases +group by what they read from the repo or graph. + +| Group | Phases | +| ------------------ | ----------------------------------------------------------------------------------------------------- | +| Discovery | `scan`, `profile`, `repo-node`, `structure`, `markdown` | +| Parse + scope | `parse`, `incremental-scope`, `complexity` | +| Heuristic graph | `routes`, `openapi`, `tools`, `orm`, `cross-file`, `accesses` | +| SCIP overlay | `scip-index`, `confidence-demote`, `mro` | +| Clustering | `communities`, `dead-code`, `processes`, `fetches` | +| Temporal | `temporal`, `cochange`, `ownership` | +| Supply chain | `dependencies`, `sbom` | +| Annotation | `annotate`, `risk-snapshot` | +| Optional emitters | `summarize`, `embeddings` | + +## Design + +- **Single canonical ordering** — the runner consumes `DEFAULT_PHASES` + as the source of truth. Adding a phase is one import + one array + entry; the DAG validator does the rest. +- **Heuristic first, SCIP overlay second** — `parse` and friends emit + confidence-0.5 edges; `scip-index` upgrades them to 1.0 and + `confidence-demote` drops the unconfirmed survivors to 0.2 with a + `+scip-unconfirmed` reason suffix + (`packages/ingestion/src/pipeline/phases/default-set.ts:90-95`). +- **Dual parser runtime** — WASM is the default for cross-platform + determinism; the native N-API addon is opt-in for Node 22 dev boxes. + The `complexity` phase still requires native and degrades with a + one-shot stderr warning otherwise (root `CLAUDE.md`). +- **Silent toggles** — `summarize`, `embeddings`, `sbom`, and the + scanner phase are no-ops unless their option is on, so a default + `analyze` writes only the deterministic graph. +- **Phase outputs are typed** — each phase declares an output type + consumed by `ctx.phaseOutputs[]`, surfacing dependency drift at + compile time (`packages/ingestion/src/pipeline/types.ts`). + +See ADR 0013 for the storage backend the pipeline writes into and the +root README's "Embedding backends" section for the optional +`embeddings` phase. diff --git a/packages/ingestion/src/pipeline/phases/content-cache.test.ts b/packages/ingestion/src/pipeline/phases/content-cache.test.ts index df4a57f6..22f8b85c 100644 --- a/packages/ingestion/src/pipeline/phases/content-cache.test.ts +++ b/packages/ingestion/src/pipeline/phases/content-cache.test.ts @@ -10,6 +10,8 @@ import { cacheFilePath, computeCacheSize, deriveCacheKey, + evictIfOverCap, + parseHumanSizeBytes, readCacheEntry, writeCacheEntry, } from "./content-cache.js"; @@ -177,3 +179,201 @@ describe("content-cache", () => { assert.ok(bytes > 0); }); }); + +describe("parseHumanSizeBytes", () => { + it("parses binary units (GiB, MiB, KiB)", () => { + assert.equal(parseHumanSizeBytes("1GiB"), 1024 ** 3); + assert.equal(parseHumanSizeBytes("2MiB"), 2 * 1024 ** 2); + assert.equal(parseHumanSizeBytes("4KiB"), 4 * 1024); + }); + + it("parses decimal units (GB, MB, KB) distinct from binary", () => { + assert.equal(parseHumanSizeBytes("1GB"), 1_000_000_000); + assert.equal(parseHumanSizeBytes("500MB"), 500_000_000); + assert.equal(parseHumanSizeBytes("1KB"), 1_000); + }); + + it("parses bare bytes and the explicit B unit", () => { + assert.equal(parseHumanSizeBytes("1024"), 1024); + assert.equal(parseHumanSizeBytes("1024B"), 1024); + }); + + it("treats 0 and malformed input as 0", () => { + assert.equal(parseHumanSizeBytes("0"), 0); + assert.equal(parseHumanSizeBytes(""), 0); + assert.equal(parseHumanSizeBytes("abc"), 0); + assert.equal(parseHumanSizeBytes("-5MB"), 0); + }); + + it("clamps negative numeric input to 0 and floors fractional bytes", () => { + assert.equal(parseHumanSizeBytes(-1), 0); + assert.equal(parseHumanSizeBytes(123.7), 123); + assert.equal(parseHumanSizeBytes("0.5KiB"), 512); + }); +}); + +describe("evictIfOverCap", () => { + let cacheDir: string; + + beforeEach(async () => { + cacheDir = await mkdtemp(path.join(tmpdir(), "och-evict-")); + }); + + afterEach(async () => { + await rm(cacheDir, { recursive: true, force: true }); + }); + + /** + * Build N fake cache files of exactly `byteSize` bytes each, with + * monotonically increasing mtimes (oldest = index 0). Files land in + * the standard shard layout so {@link evictIfOverCap}'s walker finds + * them. Returns the absolute paths in oldest-first order. + */ + async function seedEntries(n: number, byteSize: number): Promise { + const buf = Buffer.alloc(byteSize, "x"); + const paths: string[] = []; + const baseMs = 1_700_000_000_000; // arbitrary stable epoch + for (let i = 0; i < n; i++) { + // Distinct contentSha → distinct shard prefixes spread across buckets. + const sha = `${i.toString(16).padStart(2, "0")}${"a".repeat(62)}`; + const shard = sha.slice(0, 2); + const filename = `${sha}-${"b".repeat(6)}-1.0.0.json`; + const dir = path.join(cacheDir, shard); + await fs.mkdir(dir, { recursive: true }); + const filePath = path.join(dir, filename); + await fs.writeFile(filePath, buf); + // Force a deterministic mtime — older index → older mtime. + const t = (baseMs + i * 1000) / 1000; // utimes takes seconds + await fs.utimes(filePath, t, t); + paths.push(filePath); + } + return paths; + } + + async function existing(paths: readonly string[]): Promise { + return Promise.all( + paths.map((p) => + fs + .access(p) + .then(() => true) + .catch(() => false), + ), + ); + } + + it("evicts oldest entries until total ≤ 0.9 × cap (12 × 100 KiB under 1 MiB)", async () => { + const ENTRY_SIZE = 100 * 1024; // 102_400 + const CAP = 1 << 20; // 1 MiB + const paths = await seedEntries(12, ENTRY_SIZE); + // Sanity: pre-eviction we are over cap. + const before = await computeCacheSize(cacheDir); + assert.equal(before.fileCount, 12); + assert.equal(before.bytes, 12 * ENTRY_SIZE); + + await evictIfOverCap(cacheDir, CAP); + + // 0.9 × 1 MiB = 943_718. Max kept entries = floor(943_718 / 102_400) = 9. + const present = await existing(paths); + const keptCount = present.filter(Boolean).length; + assert.equal(keptCount, 9); + // Oldest 3 (indices 0..2) deleted; youngest 9 (indices 3..11) survive. + for (let i = 0; i < 3; i++) { + assert.equal(present[i], false, `expected oldest entry ${i} to be evicted`); + } + for (let i = 3; i < 12; i++) { + assert.equal(present[i], true, `expected youngest entry ${i} to survive`); + } + const after = await computeCacheSize(cacheDir); + assert.ok(after.bytes <= Math.floor(0.9 * CAP), "post-eviction total must be ≤ 0.9 × cap"); + }); + + it("is idempotent — second call under cap does nothing", async () => { + const ENTRY_SIZE = 100 * 1024; + const CAP = 1 << 20; + const paths = await seedEntries(12, ENTRY_SIZE); + await evictIfOverCap(cacheDir, CAP); + const firstPass = await computeCacheSize(cacheDir); + + await evictIfOverCap(cacheDir, CAP); + const secondPass = await computeCacheSize(cacheDir); + + assert.equal(secondPass.fileCount, firstPass.fileCount); + assert.equal(secondPass.bytes, firstPass.bytes); + // Still the same 9 youngest entries. + const present = await existing(paths); + assert.equal(present.filter(Boolean).length, 9); + }); + + it("manual delete then re-evict does not delete more (still under cap)", async () => { + const ENTRY_SIZE = 100 * 1024; + const CAP = 1 << 20; + const paths = await seedEntries(12, ENTRY_SIZE); + await evictIfOverCap(cacheDir, CAP); + // Manually delete one survivor (index 11 = newest). + await fs.unlink(paths[11] as string); + const before = await computeCacheSize(cacheDir); + assert.equal(before.fileCount, 8); + + await evictIfOverCap(cacheDir, CAP); + + const after = await computeCacheSize(cacheDir); + assert.equal(after.fileCount, 8, "no further eviction expected when under cap"); + assert.equal(after.bytes, before.bytes); + }); + + it("cap = 0 short-circuits — no entries removed", async () => { + const ENTRY_SIZE = 100 * 1024; + const paths = await seedEntries(12, ENTRY_SIZE); + await evictIfOverCap(cacheDir, 0); + const present = await existing(paths); + assert.equal(present.filter(Boolean).length, 12); + }); + + it("missing cache dir is a silent no-op", async () => { + const ghost = path.join(cacheDir, "does", "not", "exist"); + await assert.doesNotReject(() => evictIfOverCap(ghost, 1 << 20)); + }); + + it("writeCacheEntry triggers LRU sweep when CODEHUB_PARSE_CACHE_MAX_BYTES is exceeded", async () => { + const prev = process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"]; + // Tiny cap — every write past the first should trigger eviction. + process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"] = "1KiB"; + try { + // Pre-seed many large entries directly (bypass writeCacheEntry's own evict). + const paths = await seedEntries(8, 1024); + // Now do a writeCacheEntry — its post-write hook should sweep. + const key = deriveCacheKey(SAMPLE_SHA, GRAMMAR_SHA, PIPELINE_VERSION); + await writeCacheEntry(cacheDir, key, sampleEntry()); + // Cap is 1024; target is 921. The freshly-written entry is youngest, + // so after sweep at least the oldest seeded entries must be gone. + const present = await existing(paths); + assert.ok( + present.filter(Boolean).length < 8, + "expected at least some seeded entries to be evicted", + ); + // Freshly-written entry must still be present (it is the newest). + const back = await readCacheEntry(cacheDir, key); + assert.ok(back, "newly-written entry must survive its own eviction sweep"); + } finally { + if (prev === undefined) delete process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"]; + else process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"] = prev; + } + }); + + it("CODEHUB_PARSE_CACHE_MAX_BYTES=0 disables sweep on writeCacheEntry", async () => { + const prev = process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"]; + process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"] = "0"; + try { + // Seed 8 KiB of fake entries, then write a real one. Disabled sweep + // means everything stays. + const paths = await seedEntries(8, 1024); + const key = deriveCacheKey(SAMPLE_SHA, GRAMMAR_SHA, PIPELINE_VERSION); + await writeCacheEntry(cacheDir, key, sampleEntry()); + const present = await existing(paths); + assert.equal(present.filter(Boolean).length, 8, "all seeded entries must survive"); + } finally { + if (prev === undefined) delete process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"]; + else process.env["CODEHUB_PARSE_CACHE_MAX_BYTES"] = prev; + } + }); +}); diff --git a/packages/ingestion/src/pipeline/phases/content-cache.ts b/packages/ingestion/src/pipeline/phases/content-cache.ts index 43e7ed1e..619c3e49 100644 --- a/packages/ingestion/src/pipeline/phases/content-cache.ts +++ b/packages/ingestion/src/pipeline/phases/content-cache.ts @@ -129,8 +129,10 @@ export function deriveCacheKey( * * The grammarSha/pipelineVersion suffix ensures two simultaneous entries * for the same content (e.g. before/after a grammar bump) cannot clobber - * each other — older entries simply become unreachable and are cleaned up - * lazily by a future eviction pass. + * each other — older entries simply become unreachable and are reclaimed + * by the LRU sweep in {@link evictIfOverCap}, which runs after every + * `writeCacheEntry` when `CODEHUB_PARSE_CACHE_MAX_BYTES` (default `1GiB`) + * is non-zero. */ export function cacheFilePath(cacheDir: string, key: CacheKey): string { const shard = key.contentSha.slice(0, SHARD_PREFIX_LEN); @@ -171,9 +173,22 @@ export async function readCacheEntry(cacheDir: string, key: CacheKey): Promise 0) { + try { + await evictIfOverCap(cacheDir, cap); + } catch { + // Cache-eviction failure is never fatal; caller still got their write. + } + } +} + +/** + * Parse a human-readable size string (e.g. `"1GiB"`, `"500MB"`, `"0"`) into + * bytes. Numeric inputs pass through clamped to non-negative. Unknown + * units, malformed input, or negative numbers all yield `0` (which the + * eviction code treats as "disabled"). Both decimal (KB/MB/GB/TB) and + * binary (KiB/MiB/GiB/TiB) prefixes are supported. + */ +export function parseHumanSizeBytes(input: string | number): number { + if (typeof input === "number") return Number.isFinite(input) ? Math.max(0, Math.floor(input)) : 0; + const m = /^\s*(\d+(?:\.\d+)?)\s*([KMGT]i?B?|B)?\s*$/i.exec(input); + if (!m) return 0; + const n = Number.parseFloat(m[1] ?? "0"); + if (!Number.isFinite(n) || n < 0) return 0; + const unit = (m[2] ?? "").toUpperCase(); + const mult: Record = { + "": 1, + B: 1, + KB: 1_000, + KIB: 1024, + MB: 1_000_000, + MIB: 1024 ** 2, + GB: 1_000_000_000, + GIB: 1024 ** 3, + TB: 1_000_000_000_000, + TIB: 1024 ** 4, + }; + return Math.floor(n * (mult[unit] ?? 1)); +} + +/** + * LRU-evict cache entries until total on-disk bytes ≤ `0.9 × capBytes`. + * + * Walks the same shard layout as {@link computeCacheSize}: each top-level + * directory under `cacheDir` is treated as a shard, and every regular + * file inside it is a candidate. Entries are sorted by mtime ascending, + * then unlinked in oldest-first order until the running total reaches + * the 90 % water-mark — the headroom prevents thrash where each new + * write evicts exactly one older entry. + * + * Behavior: + * - `capBytes <= 0` short-circuits (eviction disabled). + * - Missing `cacheDir` is a no-op. + * - Per-file errors during stat or unlink are swallowed (skipped). + * - Total under cap → no work done. + * + * Cache layout reminder: `//--.json`. + */ +export async function evictIfOverCap(cacheDir: string, capBytes: number): Promise { + if (capBytes <= 0) return; + + interface Candidate { + readonly path: string; + readonly size: number; + readonly mtimeMs: number; + } + const candidates: Candidate[] = []; + let total = 0; + + let shards: import("node:fs").Dirent[]; + try { + shards = await fs.readdir(cacheDir, { withFileTypes: true }); + } catch { + return; // Missing cache dir → nothing to evict. + } + // Deterministic shard order matches computeCacheSize. + shards.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0)); + for (const shard of shards) { + if (!shard.isDirectory()) continue; + const shardPath = path.join(cacheDir, shard.name); + let entries: import("node:fs").Dirent[]; + try { + entries = await fs.readdir(shardPath, { withFileTypes: true }); + } catch { + continue; + } + entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0)); + for (const e of entries) { + if (!e.isFile()) continue; + const entryPath = path.join(shardPath, e.name); + try { + const s = await fs.stat(entryPath); + candidates.push({ path: entryPath, size: s.size, mtimeMs: s.mtimeMs }); + total += s.size; + } catch { + // File vanished mid-traversal; skip. + } + } + } + + if (total <= capBytes) return; + + const target = Math.floor(0.9 * capBytes); + // Oldest first → LRU eviction order. + candidates.sort((a, b) => a.mtimeMs - b.mtimeMs); + for (const c of candidates) { + if (total <= target) break; + try { + await fs.unlink(c.path); + total -= c.size; + } catch { + // Concurrent unlink, EACCES, etc. — keep going; over-cap is recoverable. + } + } } /** diff --git a/packages/ingestion/src/pipeline/phases/embeddings.ts b/packages/ingestion/src/pipeline/phases/embeddings.ts index 3dae73fe..f8ceefb6 100644 --- a/packages/ingestion/src/pipeline/phases/embeddings.ts +++ b/packages/ingestion/src/pipeline/phases/embeddings.ts @@ -513,6 +513,10 @@ async function runEmbeddings(ctx: PipelineContext): Promise let embedder: Embedder; try { + // Intentionally NOT using `openDefaultEmbedder` from `@opencodehub/embedder`: + // ingestion needs the offline flag, an explicit ONNX variant + modelDir, + // a weight canary, and an OnnxEmbedderPool — none of which apply at query + // time. Keep the two paths separate. const httpEmbedder = await tryOpenHttpEmbedder({ offline: ctx.options.offline === true }); if (httpEmbedder !== null) { embedder = httpEmbedder; diff --git a/packages/ingestion/src/pipeline/phases/scip-index.ts b/packages/ingestion/src/pipeline/phases/scip-index.ts index df7f57a9..5ba04165 100644 --- a/packages/ingestion/src/pipeline/phases/scip-index.ts +++ b/packages/ingestion/src/pipeline/phases/scip-index.ts @@ -34,6 +34,7 @@ import { join } from "node:path"; import type { GraphNode, NodeId } from "@opencodehub/core-types"; import type { DerivedEdge, + DerivedRelation, IndexerKind, IndexerResult, ScipIndexerName, @@ -242,7 +243,7 @@ async function runScipIndex( result.version || index.tool.version || "unknown", ); - const { added, upgraded } = emitEdges( + const { added: edgeAdded, upgraded: edgeUpgraded } = emitEdges( ctx, nodesByFile, derived.edges, @@ -250,6 +251,16 @@ async function runScipIndex( reason, existingEdgeKeys, ); + const { added: relAdded, upgraded: relUpgraded } = emitRelations( + ctx, + nodesByFile, + derived.relations, + symbolDef, + reason, + existingEdgeKeys, + ); + const added = edgeAdded + relAdded; + const upgraded = edgeUpgraded + relUpgraded; totalAdded += added; totalUpgraded += upgraded; perLang.push({ @@ -475,6 +486,9 @@ function emitEdges( // disambiguated even when multiple in-repo symbols share a display // name. Symbols without a DEFINITION occurrence are external // (stdlib / vendored / absent typings) and their edges are dropped. + // Each edge carries its own `kind` (CALLS or REFERENCES) so this loop + // routes both function-call and read-side reference fanout through the + // same caller→callee join shape. for (const e of edges) { const fromId = findEnclosingNodeId(nodesByFile, e.document, e.callLine + 1); if (!fromId) continue; @@ -484,13 +498,64 @@ function emitEdges( if (!toId) continue; if (fromId === toId) continue; - const key = edgeKey(fromId, "CALLS", toId); + const key = edgeKey(fromId, e.kind, toId); + const priorExists = existingKeys.has(key); + + ctx.graph.addEdge({ + from: fromId, + to: toId, + type: e.kind, + confidence: SCIP_CONFIDENCE, + reason, + }); + + existingKeys.add(key); + if (priorExists) upgraded += 1; + else added += 1; + } + return { added, upgraded }; +} + +/** + * Emit IMPLEMENTS / TYPE_OF graph edges from `derived.relations`. + * + * `collectRels` in `@opencodehub/scip-ingest/derive.ts` translates the + * SCIP `Relationship` message (`is_implementation`, `is_type_definition`) + * into structural relations between two SCIP symbols. Both ends need to + * resolve to OCH nodes via `symbolDef` — a relation whose source or + * target has no DEFINITION anywhere in the index is dropped (the + * relation lives entirely outside the indexed corpus). Otherwise the + * lookup uses the same `+1` boundary translation as `emitEdges` because + * SCIP `range.startLine` is 0-indexed and OCH graph nodes are 1-indexed. + */ +function emitRelations( + ctx: PipelineContext, + nodesByFile: NodesByFile, + relations: readonly DerivedRelation[], + symbolDef: ReadonlyMap, + reason: string, + existingKeys: Set, +): { added: number; upgraded: number } { + let added = 0; + let upgraded = 0; + for (const r of relations) { + const fromDef = symbolDef.get(r.from); + if (!fromDef) continue; + const toDef = symbolDef.get(r.to); + if (!toDef) continue; + const fromId = findEnclosingNodeId(nodesByFile, fromDef.file, fromDef.line + 1); + if (!fromId) continue; + const toId = findEnclosingNodeId(nodesByFile, toDef.file, toDef.line + 1); + if (!toId) continue; + if (fromId === toId) continue; + + const key = edgeKey(fromId, r.kind, toId); const priorExists = existingKeys.has(key); ctx.graph.addEdge({ from: fromId, to: toId, - type: "CALLS", + type: r.kind, confidence: SCIP_CONFIDENCE, reason, }); diff --git a/packages/mcp/README.md b/packages/mcp/README.md new file mode 100644 index 00000000..a0a82847 --- /dev/null +++ b/packages/mcp/README.md @@ -0,0 +1,62 @@ +# @opencodehub/mcp + +Model Context Protocol server for OpenCodeHub. Wraps the analysis + +storage layer and exposes it to coding agents over stdio. + +## Surface + +```bash +codehub mcp # spawn the stdio server +``` + +- Transport is stdio only — no HTTP, no SSE, no daemon + (`packages/cli/src/commands/mcp.ts`). +- `list_repos` is the discovery entry point. Per-repo tools accept an + optional `repo` (registry name) or `repo_uri` alias (Sourcegraph-style + URI like `github.com/org/repo`, `local:` for unpublished repos); + with one repo registered both are optional. +- When ≥ 2 repos are registered and neither is supplied, the tool + returns an `AMBIGUOUS_REPO` error envelope with `choices[]` (capped at + 10) so the caller can retry deterministically (see root `CLAUDE.md`). +- Every response carries a `next_steps` array and a + `_meta.codehub/staleness` entry when the index may be behind HEAD + (`packages/mcp/src/staleness.ts`). + +## Tools + +29 tools registered in `packages/mcp/src/server.ts:151-179`. Implementation +files live under `packages/mcp/src/tools/.ts`. + +| Group | Tools | +| ----------- | ---------------------------------------------------------------------------------------------------------- | +| Discovery | `list_repos`, `query`, `context`, `route_map`, `tool_map` | +| Impact | `impact`, `api_impact`, `detect_changes`, `shape_check`, `rename` | +| Snapshot | `pack_codebase`, `project_profile`, `dependencies`, `owners`, `risk_trends` | +| Findings | `scan`, `verdict`, `list_findings`, `list_findings_delta`, `license_audit` | +| Dead code | `list_dead_code`, `remove_dead_code` | +| Group | `group_list`, `group_query`, `group_status`, `group_contracts`, `group_cross_repo_links`, `group_sync` | +| Raw query | `sql` | + +## Design + +- **Single source of truth** — registration order in `server.ts` IS the + surface. `tool_map` introspects the live server so agents can list + tools without out-of-band documentation + (`packages/mcp/src/tools/tool-map.ts`). +- **Structured errors over prose** — every error returns + `structuredContent.error = { error_code, jsonrpc_code, ... }` so a + caller can branch on `error_code` instead of regex-matching + (`packages/mcp/src/error-envelope.ts`). +- **Repo resolution is centralised** — `repoResolver` and the + AMBIGUOUS_REPO envelope are wired through every per-repo tool so + ambiguity is reported once, consistently + (`packages/mcp/src/repo-resolver.ts`). +- **Connection pooling** — the graph store is held in a per-process + pool to amortise DuckDB cold starts across many tool calls + (`packages/mcp/src/connection-pool.ts`). +- **Lazy analysis** — heavy work (scan, code-pack, verdict) shells out + via `analysis-bridge` rather than running in the MCP process so a + hung scanner cannot stall the server (`packages/mcp/src/analysis-bridge.ts`). + +See ADR 0012 for the `repo_uri`-as-typed-attribute rationale and the +root `CLAUDE.md` for the AMBIGUOUS_REPO retry contract. diff --git a/packages/mcp/src/error-envelope.ts b/packages/mcp/src/error-envelope.ts index f506f001..a5fc2a4c 100644 --- a/packages/mcp/src/error-envelope.ts +++ b/packages/mcp/src/error-envelope.ts @@ -24,7 +24,8 @@ export type ErrorCode = | "RATE_LIMITED" | "INTERNAL" | "NO_INDEX" - | "AMBIGUOUS_REPO"; + | "AMBIGUOUS_REPO" + | "EMBEDDER_MISMATCH"; /** Structured shape carried under `structuredContent.error`. */ export interface ErrorDetail { diff --git a/packages/mcp/src/tools/query.ts b/packages/mcp/src/tools/query.ts index 56b74cd1..a894acb4 100644 --- a/packages/mcp/src/tools/query.ts +++ b/packages/mcp/src/tools/query.ts @@ -36,7 +36,11 @@ import { isAbsolute, resolve as resolvePath } from "node:path"; import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import { createNodeFs, type FsAbstraction } from "@opencodehub/analysis"; import type { GraphNode } from "@opencodehub/core-types"; -import type { Embedder } from "@opencodehub/embedder"; +import { + assertEmbedderCompatible, + type Embedder, + openDefaultEmbedder, +} from "@opencodehub/embedder"; import type { FusedHit, SymbolHit } from "@opencodehub/search"; import { bm25Search, @@ -46,7 +50,7 @@ import { } from "@opencodehub/search"; import type { IGraphStore, ITemporalStore, SymbolSummaryRow } from "@opencodehub/storage"; import { z } from "zod"; -import { toolErrorFromUnknown } from "../error-envelope.js"; +import { toolError, toolErrorFromUnknown } from "../error-envelope.js"; import { withNextSteps } from "../next-step-hints.js"; import { stalenessFromMeta } from "../staleness.js"; import { @@ -131,6 +135,12 @@ const QueryInput = { .max(50) .optional() .describe("How many files to shortlist at the coarse step when `mode=zoom`. Default 10."), + force_backend_mismatch: z + .boolean() + .optional() + .describe( + "Bypass the embedder fingerprint check. Lets the query proceed against an `embeddings` table populated by a different embedder than the one currently active. Vectors may be stale; results may misrank. Default false.", + ), }; /** Row shape returned to the MCP client. Stable across BM25-only + hybrid. */ @@ -430,31 +440,6 @@ function fusedAsRanked( return fused.map((f) => ({ nodeId: f.nodeId, score: f.score, sources: f.sources })); } -/** - * Default production factory — lazy-imports `@opencodehub/embedder` so the - * ONNX runtime native binding only loads when the tool actually needs it. - * Tests replace this via `ctx.openEmbedder` so they don't have to stage - * gte-modernbert-base weight files on disk. - * - * Priority: - * 1. If `CODEHUB_EMBEDDING_URL` + `CODEHUB_EMBEDDING_MODEL` are set, route - * through the HTTP embedder. The query tool runs inside the MCP stdio - * server which never runs in offline mode, so the HTTP path is - * available whenever the env is configured. - * 2. Otherwise, open the local ONNX embedder. The existing graceful - * fallback (`EMBEDDER_NOT_SETUP` → BM25-only) continues to apply. - * - * Any dim mismatch between the remote model and the stored vectors surfaces - * as an error on the first `embed()` call, which `tryOpenEmbedder` catches - * and degrades to BM25-only with a clear warning. - */ -async function defaultOpenEmbedder(): Promise { - const mod = await import("@opencodehub/embedder"); - const httpEmbedder = mod.tryOpenHttpEmbedder(); - if (httpEmbedder !== null) return httpEmbedder; - return mod.openOnnxEmbedder(); -} - /** * Walk PROCESS_STEP edges backwards from each top-K hit to find containing * Process nodes, then walk PROCESS_STEP edges forward from each matched @@ -621,13 +606,22 @@ interface QueryArgs { readonly mode?: "flat" | "zoom"; /** Coarse file-tier fanout when mode=zoom. */ readonly zoom_fanout?: number; + /** + * Bypass the embedder fingerprint refusal. When `true`, the query + * proceeds against an `embeddings` table populated by a different + * embedder than the one currently active. Vectors may be stale; + * results may misrank. Default `false`. + */ + readonly force_backend_mismatch?: boolean; } export async function runQuery(ctx: ToolContext, args: QueryArgs): Promise { const limit = args.limit ?? 10; const maxSymbols = args.max_symbols ?? DEFAULT_MAX_SYMBOLS; const includeContent = args.include_content === true; - const openEmbedder = ctx.openEmbedder ?? defaultOpenEmbedder; + // Shared HTTP-priority + ONNX-fallback factory. ONNX binding only loads + // on the fallback branch, so plain (non-dynamic) import is fine here. + const openEmbedder = ctx.openEmbedder ?? (() => openDefaultEmbedder()); const fsFactory = ctx.fsFactory ?? createNodeFs; // `searchText` is what goes to BM25 + the embedder. When `task_context` // or `goal` are present, they get prefixed so the ranker sees the broader @@ -654,6 +648,24 @@ export async function runQuery(ctx: ToolContext, args: QueryArgs): Promise(openEmbedder, "[mcp:query]"); if (embedder) { try { + // Refuse when the persisted embedder modelId differs from + // the current one. Same-dim vectors from different embedders + // silently corrupt ranking. `force_backend_mismatch` lets + // the caller override. + const meta = await graph.getMeta(); + const compat = assertEmbedderCompatible( + meta?.embedderModelId, + embedder.modelId, + args.force_backend_mismatch === true, + ); + if (!compat.ok) { + return toolError( + "EMBEDDER_MISMATCH", + `Embedder mismatch: store was indexed with '${compat.persistedModelId}', ` + + `current embedder is '${compat.currentModelId}'.`, + compat.hint, + ); + } const fused = await hybridSearch( graph, { @@ -840,6 +852,9 @@ export function registerQueryTool(server: McpServer, ctx: ToolContext): void { ...(args.granularity !== undefined ? { granularity: args.granularity } : {}), ...(args.mode !== undefined ? { mode: args.mode } : {}), ...(args.zoom_fanout !== undefined ? { zoom_fanout: args.zoom_fanout } : {}), + ...(args.force_backend_mismatch !== undefined + ? { force_backend_mismatch: args.force_backend_mismatch } + : {}), }; return fromToolResult(await runQuery(ctx, typed)); }, diff --git a/packages/scanners/README.md b/packages/scanners/README.md new file mode 100644 index 00000000..5dd5f789 --- /dev/null +++ b/packages/scanners/README.md @@ -0,0 +1,80 @@ +# @opencodehub/scanners + +Subprocess wrappers for the open-source scanners that back +`codehub scan`. Every scanner runs as an external process — nothing is +linked or vendored — and returns SARIF for ingestion into the graph. + +## Surface + +```ts +import { ALL_SPECS, P1_SPECS, P2_SPECS, filterSpecsByProfile } from "@opencodehub/scanners"; + +const profile = { languages: ["python"], iacTypes: ["docker"], apiContracts: [] }; +const enabled = filterSpecsByProfile(P1_SPECS, profile); +``` + +- Catalog lookup: `findSpec(id)` returns the `ScannerSpec` for an id + across P1 + P2 (`packages/scanners/src/catalog.ts:336-338`). +- Profile gating: `filterSpecsByProfile` enforces the per-priority + rules below (`packages/scanners/src/catalog.ts:396-417`). +- Missing-binary policy: license-incompatible scanners (hadolint, + tflint) emit empty SARIF and a warning rather than crashing + (`packages/scanners/src/catalog.ts:155-194`). + +## Scanners + +20 scanners total — 12 Priority-1 (default) + 8 Priority-2 (profile-gated). +Source of truth: `packages/scanners/src/catalog.ts:12-302`. P1 ordering is +fixed in `P1_SPECS` (lines 305-318); P2 ordering in `P2_SPECS` (lines 321-330). + +### Priority-1 (default set) + +| Id | Languages / scope | SARIF native | License | +| ------------------------ | ------------------------------- | ------------ | ----------------- | +| `semgrep` | all | yes | LGPL-2.1 (binary) | +| `betterleaks` | all (secrets) | yes | MIT | +| `osv-scanner` | all (deps) | yes | Apache-2.0 | +| `bandit` | python | yes | Apache-2.0 | +| `detect-secrets` | all (Yelp keyword + basic-auth) | no | Apache-2.0 | +| `biome` | typescript / javascript / tsx | yes | MIT | +| `pip-audit` | python | no | Apache-2.0 | +| `npm-audit` | typescript / javascript | no | Artistic-2.0 bin | +| `ruff` | python | yes | MIT | +| `grype` | all (image / SBOM) | yes | Apache-2.0 | +| `checkov-docker-compose` | docker-compose | yes | Apache-2.0 | +| `vulture` | python (dead code) | no | MIT | + +### Priority-2 (profile-gated) + +| Id | Gate | License | +| ---------- | ----------------------------------------------------------- | ---------------------------- | +| `trivy` | iac contains docker / terraform / cfn / k8s / docker-compose | Apache-2.0 | +| `checkov` | iac contains terraform / cfn / k8s / docker | Apache-2.0 | +| `hadolint` | iac contains docker | GPL-3.0 — external bin only | +| `tflint` | iac contains terraform | MPL-2.0 + BUSL — external bin | +| `spectral` | apiContracts contains openapi | Apache-2.0 | +| `radon` | languages contains python | MIT | +| `ty` | languages contains python (beta) | MIT | +| `clamav` | opt-in only | GPL-2.0 — external bin only | + +## Design + +- **External processes only** — every wrapper spawns the OS binary; no + scanner code is linked or vendored. This keeps copyleft (`GPL-3.0` in + hadolint, `MPL-2.0 + BUSL-1.1` in tflint) at arm's length + (`packages/scanners/src/catalog.ts:1-8`). +- **Profile-driven gating** — `filterSpecsByProfile` reads + `ProjectProfile.{languages, iacTypes, apiContracts}` and prunes the + catalog before launch, so scans don't waste time on irrelevant tools. +- **SHA256-pinned versions** — every spec carries a `version` and an + `installCmd`; CI installs the exact version listed. +- **`detect-secrets` is the 20th scanner** — added to catch keyword and + basic-auth secret shapes that betterleaks structurally cannot see + (`packages/scanners/src/catalog.ts:64-82`). +- **`optIn` and `beta` flags** — `clamav` is opt-in (off by profile); + `ty` is marked beta. Both are excluded from the default + `filterSpecsByProfile` output unless asked for explicitly. + +See `packages/sarif/README.md` for the SARIF normaliser the wrappers +feed into, and the root README's "Supply-chain posture" section for the +license-tier rationale. diff --git a/packages/scip-ingest/src/derive.ts b/packages/scip-ingest/src/derive.ts index 494423be..980ea368 100644 --- a/packages/scip-ingest/src/derive.ts +++ b/packages/scip-ingest/src/derive.ts @@ -9,7 +9,7 @@ */ import type { ScipDocument, ScipIndex, ScipOccurrence, ScipRange } from "./parse.js"; -import { SCIP_ROLE_DEFINITION } from "./parse.js"; +import { SCIP_ROLE_DEFINITION, SCIP_ROLE_IMPORT } from "./parse.js"; export interface DerivedEdge { readonly caller: string; @@ -127,24 +127,40 @@ export function deriveEdges(doc: ScipDocument): DerivedEdge[] { const edges: DerivedEdge[] = []; for (const occ of doc.occurrences) { if (occ.symbolRoles & SCIP_ROLE_DEFINITION) continue; + // Pure import-bridge occurrences (named import line, re-export bind) + // never carry call/reference semantics — drop before classification. + if (occ.symbolRoles & SCIP_ROLE_IMPORT) continue; if (!isReferenceable(occ.symbol)) continue; - // The call graph is function-to-function. REFERENCES across types - // (e.g. `builtins/float#`) are handled by the downstream structural - // tier and would otherwise distort blast-radius rankings with noise - // from stdlib types. See POC `scip_graph_poc/ingest.py` for the - // same contract. - if (!isFunctionLike(occ.symbol)) continue; const caller = innermostEnclosing(defs, occ.range, (s) => s.startsWith("local ")); if (!caller || caller === occ.symbol) continue; if (!isFunctionLike(caller)) continue; - edges.push({ - caller, - callee: occ.symbol, - document: doc.relativePath, - callLine: occ.range.startLine, - callChar: occ.range.startChar, - kind: "CALLS", - }); + // Function-like occurrences become CALLS edges (the historical scip + // POC contract). All other referenceable occurrences with a + // function-like enclosing scope become REFERENCES edges — type + // mentions, identifier reads, decorator targets — so blast-radius + // walks see the full read-side fanout from each defining function. + // The downstream emit pass drops REFERENCES whose callee has no + // DEFINITION anywhere in the index (stdlib / vendored / typings), + // matching the existing CALLS contract. + if (isFunctionLike(occ.symbol)) { + edges.push({ + caller, + callee: occ.symbol, + document: doc.relativePath, + callLine: occ.range.startLine, + callChar: occ.range.startChar, + kind: "CALLS", + }); + } else { + edges.push({ + caller, + callee: occ.symbol, + document: doc.relativePath, + callLine: occ.range.startLine, + callChar: occ.range.startChar, + kind: "REFERENCES", + }); + } } return edges; } diff --git a/packages/storage/src/column-encode.test.ts b/packages/storage/src/column-encode.test.ts index 0cc55d9e..46a8c231 100644 --- a/packages/storage/src/column-encode.test.ts +++ b/packages/storage/src/column-encode.test.ts @@ -85,17 +85,24 @@ test("booleanOrNull: booleans pass through; everything else → null", () => { // stringArrayOrNull // --------------------------------------------------------------------------- -test("stringArrayOrNull: arrays of strings pass through (Track C-2: empty → null)", () => { +test("stringArrayOrNull: preserves [] vs absent for round-trip symmetry", () => { assert.deepEqual(stringArrayOrNull(["a", "b"]), ["a", "b"]); - // Track C-2 caveat — empty array collapses to null. - assert.equal(stringArrayOrNull([]), null); + // Explicit empty array survives the writer side as a typed 0-length + // array (NOT null) so the native TEXT[] / STRING[] column can + // distinguish `keywords: []` from absent. The symmetric reader is in + // duckdb-adapter.ts:setStringArrayField + analyze.ts:stringArrayField. + assert.deepEqual(stringArrayOrNull([]), []); assert.equal(stringArrayOrNull("a"), null); assert.equal(stringArrayOrNull(null), null); assert.equal(stringArrayOrNull(undefined), null); // Non-string elements are filtered silently; mixed arrays keep the strings. assert.deepEqual(stringArrayOrNull(["a", 1, null, "b"]), ["a", "b"]); - // Filtering out everything yields null. - assert.equal(stringArrayOrNull([1, null, undefined]), null); + // Filtering out every element yields an empty array — NOT null. The + // input was an array (= author intent: collection), just one whose + // elements all violated the contract. The reader will rebuild this as + // `[]` rather than dropping the field entirely; that's intentional — + // it preserves the array-vs-absent signal even after element coercion. + assert.deepEqual(stringArrayOrNull([1, null, undefined]), []); }); // --------------------------------------------------------------------------- diff --git a/packages/storage/src/column-encode.ts b/packages/storage/src/column-encode.ts index 21004234..fd0c4f9c 100644 --- a/packages/storage/src/column-encode.ts +++ b/packages/storage/src/column-encode.ts @@ -31,8 +31,15 @@ * (the write side is exported here; the read side stays in each adapter * because it's symmetric with the per-adapter row decoder). * - * **`stringArrayOrNull` round-trip note** — the current `[] → null` behavior - * is preserved by this commit. Track C-2 fixes the asymmetry separately. + * **`stringArrayOrNull` round-trip note** — `[]` and + * `undefined` are now distinct on the wire. Empty arrays pass through to + * the native TEXT[] / STRING[] binder as 0-length array literals (NOT + * NULL); only non-array inputs collapse to NULL. Symmetric readers + * (`setStringArrayField` in duckdb-adapter.ts, `setStringArrayFieldGd` in + * graphdb-adapter.ts, `stringArrayField` in analyze.ts) re-attach the + * empty array as the field value instead of dropping the key. Net effect: + * `{keywords: []}` round-trips byte-identically to itself instead of + * collapsing to `{}` (canonical-JSON / graphHash distinction preserved). * * **`frameworks_json` unification note (AC-A-2)** — before the hoist, the * DuckDB adapter wrote the v2.0 polymorphic shape via `frameworksJsonOrNull` @@ -300,13 +307,26 @@ export function booleanOrNull(v: unknown): boolean | null { } /** - * Coerce to a `readonly string[]` or `null`. Non-arrays and arrays that - * yield zero strings both collapse to `null`. Non-string elements are - * filtered silently. + * Coerce to a `readonly string[]` or `null`. * - * **Round-trip caveat (Track C-2):** an explicitly-empty `string[]` input - * returns `null`, which loses the "explicit empty" signal. This commit - * preserves the legacy behavior; Track C-2 fixes the asymmetry separately. + * - Non-array inputs (`undefined`, `null`, wrong type) → `null` (= column + * stays SQL NULL, reader drops the field, canonical-JSON omits the key). + * - Array inputs round-trip as a typed array — including `[]` (0-length). + * Non-string elements are filtered silently. + * + * **Preserve `[]` distinct from absent.** Both + * backends store this column as a native array type (DuckDB `TEXT[]`, + * graph-db `STRING[]`); both binders distinguish a 0-length array literal + * from SQL NULL natively. Returning `[]` on an empty-array input lets the + * "explicit empty" signal survive through the binder, the on-disk row, + * and the read-back step. The symmetric reader change in + * `duckdb-adapter.ts:setStringArrayField`, + * `graphdb-adapter.ts:setStringArrayFieldGd`, and + * `analyze.ts:stringArrayField` re-attaches `[]` instead of dropping the + * field when the read-back array has length zero. Combined, this preserves + * the canonical-JSON shape difference between `{keywords: []}` and `{}` + * (graphHash content-shape change — see graph-hash-parity test fixture + * `medium-with-empty-keywords`). */ export function stringArrayOrNull(v: unknown): readonly string[] | null { if (!Array.isArray(v)) return null; @@ -314,7 +334,7 @@ export function stringArrayOrNull(v: unknown): readonly string[] | null { for (const item of v) { if (typeof item === "string") out.push(item); } - return out.length > 0 ? out : null; + return out; } /** diff --git a/packages/storage/src/duckdb-adapter.ts b/packages/storage/src/duckdb-adapter.ts index 431200cf..4c78e0bf 100644 --- a/packages/storage/src/duckdb-adapter.ts +++ b/packages/storage/src/duckdb-adapter.ts @@ -37,7 +37,9 @@ import { DuckDBInstance, type DuckDBPreparedStatement, FLOAT, + LIST, listValue, + VARCHAR, } from "@duckdb/node-api"; import { type CodeRelation, @@ -127,6 +129,7 @@ const ALL_RELATION_TYPES: readonly string[] = [ "FOUND_IN", "DEPENDS_ON", "OWNED_BY", + "TYPE_OF", ]; const DEFAULT_COCHANGE_LOOKUP_LIMIT = 10; @@ -1912,7 +1915,8 @@ export class DuckDbStore implements IGraphStore, ITemporalStore { const c = this.requireConn(); const reader = await c.runAndReadAll( `SELECT schema_version, last_commit, indexed_at, node_count, edge_count, - stats_json, cache_hit_ratio, cache_size_bytes, last_compaction + stats_json, cache_hit_ratio, cache_size_bytes, last_compaction, + embedder_model_id FROM store_meta WHERE id = 1`, ); const rows = reader.getRowObjects(); @@ -1926,6 +1930,7 @@ export class DuckDbStore implements IGraphStore, ITemporalStore { const cacheHitRatio = row["cache_hit_ratio"]; const cacheSizeBytes = row["cache_size_bytes"]; const lastCompaction = row["last_compaction"]; + const embedderModelId = row["embedder_model_id"]; return { schemaVersion: String(row["schema_version"]), ...(lastCommit !== null && lastCommit !== undefined @@ -1944,6 +1949,9 @@ export class DuckDbStore implements IGraphStore, ITemporalStore { ...(lastCompaction !== null && lastCompaction !== undefined ? { lastCompaction: String(lastCompaction) } : {}), + ...(embedderModelId !== null && embedderModelId !== undefined + ? { embedderModelId: String(embedderModelId) } + : {}), }; } @@ -1956,8 +1964,9 @@ export class DuckDbStore implements IGraphStore, ITemporalStore { const stmt = await c.prepare( `INSERT INTO store_meta ( id, schema_version, last_commit, indexed_at, node_count, edge_count, - stats_json, cache_hit_ratio, cache_size_bytes, last_compaction - ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + stats_json, cache_hit_ratio, cache_size_bytes, last_compaction, + embedder_model_id + ) VALUES (1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ); try { bindParam(stmt, 1, meta.schemaVersion); @@ -1969,6 +1978,7 @@ export class DuckDbStore implements IGraphStore, ITemporalStore { bindParam(stmt, 7, meta.cacheHitRatio ?? null); bindParam(stmt, 8, meta.cacheSizeBytes ?? null); bindParam(stmt, 9, meta.lastCompaction ?? null); + bindParam(stmt, 10, meta.embedderModelId ?? null); await stmt.run(); } finally { stmt.destroySync(); @@ -2069,7 +2079,13 @@ function bindParam( if (Array.isArray(value)) { // DuckDB TEXT[] → bind as a list of varchar values. Use bindList (VARIABLE // length), not bindArray (FIXED length) — `TEXT[]` in the DDL is a LIST. - stmt.bindList(index, listValue([...(value as readonly string[])])); + // + // Pass the explicit `LIST(VARCHAR)` type so an empty array (`[]`, + // written intentionally to preserve the `keywords: []` vs absent + // distinction) binds as `LIST` rather than `LIST`. + // Without the type hint DuckDB rejects empty lists with + // "Cannot create lists with item type of ANY". + stmt.bindList(index, listValue([...(value as readonly string[])]), LIST(VARCHAR)); return; } switch (typeof value) { @@ -2295,12 +2311,17 @@ function setBooleanField(out: Record, key: string, v: unknown): } function setStringArrayField(out: Record, key: string, v: unknown): void { + // Preserve `[]` distinct from absent. The DuckDB TEXT[] binder returns + // a 0-length JS array for an empty SQL array literal and `null` for SQL + // NULL. Re-attach the array verbatim so a node written as + // `{keywords: []}` round-trips with `keywords: []` (not coalesced away) + // — required for canonical-JSON / graphHash byte-identity. if (!Array.isArray(v)) return; const arr: string[] = []; for (const item of v) { if (typeof item === "string") arr.push(item); } - if (arr.length > 0) out[key] = arr; + out[key] = arr; } function setJsonArrayField(out: Record, key: string, v: unknown): void { diff --git a/packages/storage/src/graph-hash-parity.test.ts b/packages/storage/src/graph-hash-parity.test.ts index 4bf27426..b06a73b4 100644 --- a/packages/storage/src/graph-hash-parity.test.ts +++ b/packages/storage/src/graph-hash-parity.test.ts @@ -44,6 +44,7 @@ import { tmpdir } from "node:os"; import { join } from "node:path"; import { test } from "node:test"; import { + graphHash, KnowledgeGraph, makeNodeId, type NodeId, @@ -306,6 +307,102 @@ function buildLargeFixture(): KnowledgeGraph { return g; } +/** + * Empty-collection fixture: medium graph plus a Community node carrying + * an explicitly-empty `keywords: []` and a Route node carrying an + * explicitly-empty `responseKeys: []`. Asserts: + * + * 1. (parity) The DuckDb and GraphDb hashes match each other and + * the original fixture hash — i.e. `[]` round-trips + * byte-identically across both backends through the + * native TEXT[] / STRING[] columns. + * 2. (difference) The hash of this fixture differs from the hash of + * the equivalent fixture without the `keywords` / + * `responseKeys` keys — i.e. `[]` is not silently + * equivalent to absent. That assertion runs in the + * accompanying "absent-keys" test below. + * + * This is the graphHash content-shape change tripwire: writer + reader + * on both adapters must preserve the `[]` vs `undefined` distinction + * or one of the two assertions will fail. + */ +function buildMediumWithEmptyKeywordsFixture(): KnowledgeGraph { + const g = new KnowledgeGraph(); + const file = makeNodeId("File", "src/api.ts", "api.ts"); + g.addNode({ id: file, kind: "File", name: "api.ts", filePath: "src/api.ts" }); + + // Community node with explicit empty `keywords`. The two ownership-drift + // / truck-factor fields are intentionally absent so canonical-JSON only + // has to carry `keywords: []` as the load-bearing distinguisher. + const communityId = makeNodeId("Community", "", "auth-community"); + g.addNode({ + id: communityId, + kind: "Community", + name: "auth-community", + filePath: "", + inferredLabel: "auth", + symbolCount: 0, + cohesion: 1.0, + keywords: [], + }); + + // Route node with explicit empty `responseKeys`. + const routeId = makeNodeId("Route", "src/api.ts", "GET /health"); + g.addNode({ + id: routeId, + kind: "Route", + name: "GET /health", + filePath: "src/api.ts", + url: "/health", + method: "GET", + responseKeys: [], + }); + + g.addEdge({ from: file, to: routeId, type: "DEFINES", confidence: 1.0 }); + return g; +} + +/** + * Companion fixture for the empty-collection difference assertion. + * Identical to {@link buildMediumWithEmptyKeywordsFixture} except both + * `keywords` and `responseKeys` are absent (not `[]`). The accompanying + * test below asserts the resulting `graphHash` differs from the + * empty-array variant — proving the writer + readers preserve the + * `[]`-vs-absent distinction end-to-end (rather than silently coalescing + * both to absent). + */ +function buildMediumWithoutKeywordsFixture(): KnowledgeGraph { + const g = new KnowledgeGraph(); + const file = makeNodeId("File", "src/api.ts", "api.ts"); + g.addNode({ id: file, kind: "File", name: "api.ts", filePath: "src/api.ts" }); + + const communityId = makeNodeId("Community", "", "auth-community"); + g.addNode({ + id: communityId, + kind: "Community", + name: "auth-community", + filePath: "", + inferredLabel: "auth", + symbolCount: 0, + cohesion: 1.0, + // keywords intentionally absent. + }); + + const routeId = makeNodeId("Route", "src/api.ts", "GET /health"); + g.addNode({ + id: routeId, + kind: "Route", + name: "GET /health", + filePath: "src/api.ts", + url: "/health", + method: "GET", + // responseKeys intentionally absent. + }); + + g.addEdge({ from: file, to: routeId, type: "DEFINES", confidence: 1.0 }); + return g; +} + /** * AC-M6-1 fixture: a RepoNode exercising every field — populated + * explicit-null variants of `originUrl` / `defaultBranch` / `group`, and @@ -416,7 +513,7 @@ test("graphHash parity: medium fixture (mixed node kinds + OWNED_BY edges)", asy await runParity({ name: "medium", fixture: buildMediumFixture() }); }); -test("graphHash parity: large fixture (≥500 nodes, 24-edge-kind sweep)", async () => { +test("graphHash parity: large fixture (≥500 nodes, 25-edge-kind sweep)", async () => { await runParity({ name: "large", fixture: buildLargeFixture() }); }); @@ -427,3 +524,27 @@ test("graphHash parity: repo fixture (RepoNode with all attributes populated)", test("graphHash parity: repo fixture with explicit-null origin / branch / group", async () => { await runParity({ name: "repo-null", fixture: buildRepoNullFixture() }); }); + +test("graphHash parity: medium-with-empty-keywords ([] vs absent)", async () => { + await runParity({ + name: "medium-with-empty-keywords", + fixture: buildMediumWithEmptyKeywordsFixture(), + }); +}); + +test("graphHash({keywords: []}) differs from graphHash({} — keywords absent)", async () => { + // Difference assertion — proves the writer + readers actually preserve + // the `[]`-vs-absent distinction. If a future regression silently + // coalesces `[]` back to absent, this test fires before the + // medium-with-empty-keywords parity test would (parity could mask the + // bug if BOTH adapters dropped `[]` symmetrically). + const withEmpty = graphHash(buildMediumWithEmptyKeywordsFixture()); + const without = graphHash(buildMediumWithoutKeywordsFixture()); + if (withEmpty === without) { + throw new Error( + "Regression: graphHash treats `keywords: []` and absent `keywords` as equivalent. " + + "Check `stringArrayOrNull` in column-encode.ts and the symmetric readers in " + + "duckdb-adapter.ts / graphdb-adapter.ts / analyze.ts.", + ); + } +}); diff --git a/packages/storage/src/graphdb-adapter.ts b/packages/storage/src/graphdb-adapter.ts index 2fe22b75..50d7537e 100644 --- a/packages/storage/src/graphdb-adapter.ts +++ b/packages/storage/src/graphdb-adapter.ts @@ -1330,7 +1330,8 @@ export class GraphDbStore implements IGraphStore { `m.last_commit AS last_commit, m.indexed_at AS indexed_at, ` + `m.node_count AS node_count, m.edge_count AS edge_count, ` + `m.stats_json AS stats_json, m.cache_hit_ratio AS cache_hit_ratio, ` + - `m.cache_size_bytes AS cache_size_bytes, m.last_compaction AS last_compaction ` + + `m.cache_size_bytes AS cache_size_bytes, m.last_compaction AS last_compaction, ` + + `m.embedder_model_id AS embedder_model_id ` + `LIMIT 1`, ); const first = rows[0]; @@ -1345,6 +1346,7 @@ export class GraphDbStore implements IGraphStore { const cacheHitRatio = row["cache_hit_ratio"]; const cacheSizeBytes = row["cache_size_bytes"]; const lastCompaction = row["last_compaction"]; + const embedderModelId = row["embedder_model_id"]; return { schemaVersion: String(row["schema_version"]), ...(lastCommit !== null && lastCommit !== undefined @@ -1363,6 +1365,9 @@ export class GraphDbStore implements IGraphStore { ...(lastCompaction !== null && lastCompaction !== undefined ? { lastCompaction: String(lastCompaction) } : {}), + ...(embedderModelId !== null && embedderModelId !== undefined + ? { embedderModelId: String(embedderModelId) } + : {}), }; } @@ -1375,7 +1380,8 @@ export class GraphDbStore implements IGraphStore { `MERGE (m:StoreMeta {id: 1}) ` + `SET m.schema_version = $p1, m.last_commit = $p2, m.indexed_at = $p3, ` + `m.node_count = $p4, m.edge_count = $p5, m.stats_json = $p6, ` + - `m.cache_hit_ratio = $p7, m.cache_size_bytes = $p8, m.last_compaction = $p9`, + `m.cache_hit_ratio = $p7, m.cache_size_bytes = $p8, m.last_compaction = $p9, ` + + `m.embedder_model_id = $p10`, [ meta.schemaVersion, meta.lastCommit ?? null, @@ -1386,6 +1392,7 @@ export class GraphDbStore implements IGraphStore { meta.cacheHitRatio ?? null, meta.cacheSizeBytes ?? null, meta.lastCompaction ?? null, + meta.embedderModelId ?? null, ], ); } @@ -1749,10 +1756,15 @@ function setBooleanFieldGd(out: Record, key: string, v: unknown } function setStringArrayFieldGd(out: Record, key: string, v: unknown): void { + // Preserve `[]` distinct from absent. The graph-db STRING[] binder + // returns a 0-length JS array for an empty array literal and `null` + // for an absent column — matching DuckDB's TEXT[] semantics. Re-attach + // the array verbatim so canonical-JSON / graphHash parity holds across + // backends for `{keywords: []}` round-trips. if (!Array.isArray(v)) return; const arr: string[] = []; for (const item of v) if (typeof item === "string") arr.push(item); - if (arr.length > 0) out[key] = arr; + out[key] = arr; } function setJsonArrayFieldGd(out: Record, key: string, v: unknown): void { diff --git a/packages/storage/src/graphdb-schema.test.ts b/packages/storage/src/graphdb-schema.test.ts index c604be28..3455b9bd 100644 --- a/packages/storage/src/graphdb-schema.test.ts +++ b/packages/storage/src/graphdb-schema.test.ts @@ -7,7 +7,7 @@ import { generateSchemaDdl, getAllRelationTypes } from "./graphdb-schema.js"; // code over the spec text — the DDL must cover every kind the v1.1 DuckDB // schema knows. If a kind is added to `ALL_RELATION_TYPES` upstream, bump // this constant alongside the new entry in `graphdb-schema.ts`. -const EXPECTED_RELATION_COUNT = 24; +const EXPECTED_RELATION_COUNT = 25; // Banned-literal probes are built at runtime so this test file does not // itself trip `scripts/check-banned-strings.sh`. Each entry is a list of @@ -111,9 +111,9 @@ test("generateSchemaDdl rejects invalid embedding dimensions", () => { test("getAllRelationTypes returns every OCH edge kind in canonical order", () => { const kinds = getAllRelationTypes(); assert.equal(kinds.length, EXPECTED_RELATION_COUNT); - // Spot-check ordering invariants: first kind is CONTAINS, last is OWNED_BY. + // Spot-check ordering invariants: first kind is CONTAINS, last is TYPE_OF. assert.equal(kinds[0], "CONTAINS"); - assert.equal(kinds[kinds.length - 1], "OWNED_BY"); + assert.equal(kinds[kinds.length - 1], "TYPE_OF"); }); test("statements are semicolon-terminated", () => { diff --git a/packages/storage/src/graphdb-schema.ts b/packages/storage/src/graphdb-schema.ts index 02921e53..b07d87d8 100644 --- a/packages/storage/src/graphdb-schema.ts +++ b/packages/storage/src/graphdb-schema.ts @@ -67,6 +67,7 @@ const RELATION_KINDS: readonly string[] = [ "FOUND_IN", "DEPENDS_ON", "OWNED_BY", + "TYPE_OF", ]; /** @@ -198,6 +199,7 @@ export function generateSchemaDdl(opts: GraphDbSchemaOptions = {}): string { cache_hit_ratio DOUBLE, cache_size_bytes INT64, last_compaction STRING, + embedder_model_id STRING, PRIMARY KEY (id) )`); diff --git a/packages/storage/src/interface.ts b/packages/storage/src/interface.ts index 2c5dd2df..abe4b624 100644 --- a/packages/storage/src/interface.ts +++ b/packages/storage/src/interface.ts @@ -921,4 +921,15 @@ export interface StoreMeta { readonly cacheSizeBytes?: number; /** ISO-8601 timestamp of the last parse-cache compaction pass. */ readonly lastCompaction?: string; + /** + * Embedder model identifier used to populate the `embeddings` table + * during the most recent index run. Populated from + * {@link Embedder.modelId}. The query path compares this to the + * currently-active embedder's modelId; a mismatch returns exit 2 with + * a remediation hint unless `--force-backend-mismatch` is set. + * Optional so legacy stores keep round-tripping; the open-time + * backfill attributes pre-existing NULL rows to the currently-active + * embedder with a one-shot stderr warning. + */ + readonly embedderModelId?: string; } diff --git a/packages/storage/src/schema-ddl.ts b/packages/storage/src/schema-ddl.ts index 9bd1958d..b7212cff 100644 --- a/packages/storage/src/schema-ddl.ts +++ b/packages/storage/src/schema-ddl.ts @@ -170,17 +170,22 @@ export function generateSchemaDDL(opts: SchemaOptions): readonly string[] { `CREATE INDEX IF NOT EXISTS idx_embeddings_granularity ON embeddings (granularity)`, `CREATE TABLE IF NOT EXISTS store_meta ( - id INTEGER PRIMARY KEY, - schema_version TEXT NOT NULL, - last_commit TEXT, - indexed_at TEXT NOT NULL, - node_count INTEGER NOT NULL, - edge_count INTEGER NOT NULL, - stats_json TEXT, - cache_hit_ratio DOUBLE, - cache_size_bytes BIGINT, - last_compaction TEXT + id INTEGER PRIMARY KEY, + schema_version TEXT NOT NULL, + last_commit TEXT, + indexed_at TEXT NOT NULL, + node_count INTEGER NOT NULL, + edge_count INTEGER NOT NULL, + stats_json TEXT, + cache_hit_ratio DOUBLE, + cache_size_bytes BIGINT, + last_compaction TEXT, + embedder_model_id TEXT )`, + // Older stores without the embedder fingerprint column get it + // here; pre-existing rows stay NULL so the open-time backfill can + // attribute them to the currently-active embedder with a one-shot warning. + `ALTER TABLE store_meta ADD COLUMN IF NOT EXISTS embedder_model_id TEXT`, // File-level co-change table. Separate from `relations` because the signal // is statistical (not deterministic), file-granular, and rewrites on every