diff --git a/.erpaval/INDEX.md b/.erpaval/INDEX.md new file mode 100644 index 00000000..b09ae944 --- /dev/null +++ b/.erpaval/INDEX.md @@ -0,0 +1,14 @@ +# OpenCodeHub — ERPAVal durable knowledge index + +Compound-extracted lessons and EARS specs from prior autonomous +development sessions. Solutions are reusable; specs are per-feature. + +## Solutions (architecture patterns + conventions) + +- [SCIP replaces LSP for code-graph oracle edges](solutions/architecture-patterns/scip-replaces-lsp.md) — one-shot indexers beat stateful LSP clients for compiler-grade graph edges. +- [Repomix --compress is output-side only](solutions/architecture-patterns/repomix-is-output-side.md) — don't substitute it for a tree-sitter chunker; use it for repo snapshots. +- [Hand-roll a minimal protobuf reader for fixed schemas](solutions/conventions/scip-protobuf-hand-rolled-reader.md) — ~130 LOC beats pulling in buf+codegen when the schema is small and stable. + +## Specs + +- [001-scip-replaces-lsp](specs/001-scip-replaces-lsp/spec.md) — rip-and-replace LSP with SCIP for TS/Py/Go/Rust/Java. Task map: [tasks.md](specs/001-scip-replaces-lsp/tasks.md). diff --git a/.erpaval/debt.md b/.erpaval/debt.md new file mode 100644 index 00000000..ada16bcf --- /dev/null +++ b/.erpaval/debt.md @@ -0,0 +1,218 @@ +# OpenCodeHub — Wave-plan tech-debt tracker + +**Status**: Working document. Gitignored via `.gitignore: .erpaval/`. + +This file catalogs every wave/stream code reference that was scrubbed from +the codebase on 2026-04-23 during the clean-room audit. The references were +originally left behind as "TODO when Wave X lands" style hints, and they +encoded actual product state — features deferred, scanner tiers, eval +baselines, rollout priority, etc. The scrub removed the wave labels but +some of the underlying work is still incomplete. + +Treat every line here as a candidate backlog ticket. 
For each: figure out +whether the thing was actually shipped (and the comment was stale), or +whether it's still open (and deserves an issue). + +## Legend + +- **W1** — W1-CORE (initial MVP shape) +- **W2** — second wave (language coverage, caching, scanner tiers, detectors) +- **W3** — third wave (analysis tools: risk_trends, verdict variants) +- **W4** — fourth wave (bench, doctor, gates) +- **W5** — fifth wave (new tools, eval matrix expansion) + +Stream letters appeared on W1 artifacts (Stream E = caching, Stream J = +multi-repo groups, Stream T = suppressions, etc). They're a second axis +orthogonal to the W-code. + +## Catalog + +### packages/cli — wave hints + +- `packages/cli/src/commands/analyze.ts:170` — "Cache-health stats + (W2-E.4): the parse-cache hit ratio and on-disk" size telemetry. Ships + the stats; was flagged as W2-E.4 work. **Action:** confirm stats are + actually populated; add a test if not. +- `packages/cli/src/commands/bench.test.ts:2` and + `packages/cli/src/commands/doctor.test.ts:2` — "Unit tests for codehub + bench — W4-G.3" / "doctor — W4-G.3". Both command+test exist; W4-G.3 + is delivered. **Action:** no debt, just a stale label. + +### packages/embedder — W2-A.2 (embedder weights downloader) + +All 5 files reference "W2-A.2" as the code path that installs ONNX +weights via `codehub setup --embeddings`. + +- `packages/embedder/src/index.ts:7` +- `packages/embedder/src/paths.ts:12` +- `packages/embedder/src/model-pins.ts:4` +- `packages/embedder/src/model-pins.ts:40` — `"once from the upstream repo"` +- `packages/embedder/src/model-pins.test.ts:4` +- `packages/embedder/src/onnx-embedder.ts:11` + +**Action:** `codehub setup --embeddings` ships in `packages/cli/src/commands/setup.ts` +— feature is done. Labels are stale. No debt. 
+ +### packages/eval — MVP + W2-C.* language fixtures + W5-3 new-tool matrix + +- `packages/eval/baselines/opencodehub-v1.json:60` — "14 language fixtures + (MVP 7 + W2-C.2/3/4 additions: c, cpp, ruby, kotlin, swift, php, + dart)." +- `packages/eval/baselines/opencodehub-v1.json:63` — "risk_trends and + verdict map to tools still in flight (W3-F.1 / W3-F.2). Cases pass via + the isError branch with a structured error envelope until the server + registers the tools." +- `packages/eval/src/opencodehub_eval/agent.py:177` — "W5-3 new tools" + section delimiter +- `packages/eval/src/opencodehub_eval/agent.py:185` — "are still in + flight (W3-F.1 / W3-F.2)" +- `packages/eval/src/opencodehub_eval/bench.py:243` — "new = 63 (W5-3 + new-tool matrix)" +- `packages/eval/src/opencodehub_eval/bench.py:269` — "hard-coded 98 (the + W2-C.5 core target)" +- `packages/eval/src/opencodehub_eval/tests/conftest.py:31` — "14 + language fixtures (7 MVP + 7 W2-C.2/3/4 additions)" +- `packages/eval/src/opencodehub_eval/tests/test_parametrized.py:8,10` — + "W2-C.5 deliverable", "W5-3 coverage for the nine tools" +- `packages/eval/src/opencodehub_eval/tests/test_parametrized.py:167-175` + — risk_trends / verdict (W3-F.1/W3-F.2) tool-still-unregistered + fallback logic +- `packages/eval/src/opencodehub_eval/tests/test_parametrized.py:257` — + "W5-3 expansion" in the parametrize helper + +**Real debt here:** + +1. **W3-F.1 / W3-F.2 (risk_trends + verdict):** eval acknowledged these + as unregistered tools with fallback paths. Search `packages/mcp/src/tools/` + — if both tools exist and are registered, the fallback branches in + `test_parametrized.py:167-175` become dead code that can be removed. + If one is missing, that's a product gap. +2. **W2-C.5 core target = 98.** If the eval baseline now passes a + different target, update the hard-coded fallback in + `bench.py:269`. 
+ +### packages/ingestion — language registry (W2-C.1) + content cache (Stream E / W2-E.*) + +- `packages/ingestion/src/parse/grammar-registry.test.ts:52-53` — loads + "W2-C.1 grammars" (7 additional: c, cpp, ruby, kotlin, swift, php, + dart) +- `packages/ingestion/src/parse/grammar-registry.ts:198` — "W2-C.* + languages whose grammar package is not installed" +- `packages/ingestion/src/parse/language-detector.ts:26` — "W2-C.1 + additions" +- `packages/ingestion/src/pipeline/phases/content-cache.ts:2` — + "Content-addressed parse cache (Stream E, W2-E.1)" +- `packages/ingestion/src/pipeline/phases/content-cache.ts:133` — + "lazily by a future eviction pass (W2-E.4)" +- `packages/ingestion/src/pipeline/phases/content-cache.ts:193` — + "meta-sidecar cache-stats path (W2-E.4)" + +**Real debt:** + +1. **W2-E.4 eviction pass.** content-cache.ts:133 says eviction is + deferred to "a future eviction pass." Search for any actual eviction + code — if none exists, this is a real backlog item (parse cache will + grow unbounded). 
+ +### packages/ingestion — profile detectors + providers (wave-labelled) + +- `packages/ingestion/src/pipeline/phases/default-set.ts:20` — "scanner + phases (W2-I4)" +- `packages/ingestion/src/pipeline/phases/dependencies.ts` — probably + has W-code mentions; verify +- `packages/ingestion/src/pipeline/phases/incremental-helper.ts` — W-code + mention; verify +- `packages/ingestion/src/pipeline/phases/incremental-scope.ts` and + `incremental-scope.test.ts` — W-code mentions; verify +- `packages/ingestion/src/pipeline/phases/openapi.ts` — verify +- `packages/ingestion/src/pipeline/phases/parse.test.ts`, `parse.ts` — + verify +- `packages/ingestion/src/pipeline/phases/processes.ts` — verify +- `packages/ingestion/src/pipeline/phases/profile.ts` — verify +- `packages/ingestion/src/pipeline/phases/sbom.test.ts`, `sbom.ts` — + verify +- `packages/ingestion/src/pipeline/profile-detectors/frameworks.ts`, + `languages.ts`, `manifests.ts` — verify +- `packages/ingestion/src/pipeline/types.ts` — verify +- `packages/ingestion/src/providers/http-detect.ts` — verify +- `packages/ingestion/src/providers/registry.test.ts`, `registry.ts` — + verify + +**Action:** most are likely stale labels. Spot-check any that contain +"TODO", "FIXME", or "in flight" — those are real debt. + +### packages/mcp — prompts + tools (W-code markers) + +- `packages/mcp/src/prompts/prompts.test.ts` — verify +- `packages/mcp/src/tools/annotations.test.ts` — verify +- `packages/mcp/src/tools/context.ts` — verify +- `packages/mcp/src/tools/dependencies.ts` — verify +- `packages/mcp/src/tools/group-query.ts` — verify +- `packages/mcp/src/tools/license-audit.ts` — verify + +### packages/sarif — schema-validation W-code marker + +- `packages/sarif/src/schema-validation.test.ts` — verify + +### packages/scanners — P1/P2 tiers + W2-I4 + +- `packages/scanners/src/catalog.ts:107` — "W2-I4: Priority-2 scanners. 
+ These ship alongside P1 but are opt-in via" (exact quote) +- `packages/scanners/src/wrappers/osv-scanner.ts` — W-code mention +- `packages/scanners/src/wrappers/p2-wrappers.test.ts` — W-code mention +- `packages/scanners/src/wrappers/trivy.ts` — W-code mention + +**Product fact to preserve:** the P1/P2 split is a real user-facing +feature. Keep "Priority-1" and "Priority-2" as product terminology +(they're documented in scanners/package.json description). Only drop +the W2-I4 label. + +### Vendor README — the literal "(to be created in W2-B.2)" smoking gun + +- `vendor/stack-graphs-python/README.md:39` — "That evaluator consumes + the vendored `.tsg` as (to be created in W2-B.2)." + +**Action:** the evaluator DOES exist at +`packages/ingestion/src/providers/resolution/stack-graphs/`. Rephrase +the README to point at the real path instead of a wave code. No debt; +just a stale pointer. + +### Root / infra + +- `pnpm-workspace.yaml` — W-code mention (probably a comment) +- `scripts/acceptance.sh` — W-code mentions +- `scripts/smoke-mcp.sh` — W-code mentions + +### Commit subject line (about to be scrubbed via Option A nuke) + +- `645c08e "Stream J: Multi-repo retrieval & group queries"` — this was + a real release. Stream J = multi-repo groups. Confirmed shipped. +- `f08c87f "Initial commit: OpenCodeHub MVP + v1.0 roadmap"` — body + says "Apache-2.0 clean-room reimplementation of the GitNexus product + surface". Nuke. +- Several mid-history commits reference "gitnexus" by name in parity + / cleanroom-gym notes. 
+ +## Stream names seen in history (for reference) + +| Stream | What it shipped | +|--------|-----------------| +| Stream E | Content-addressed parse cache (`content-cache.ts`, meta sidecar) | +| Stream J | Multi-repo groups (group-query/group-status/group-sync MCP tools) | +| Stream T | SARIF suppressions (`packages/sarif/src/suppressions.ts`) | + +## Revisit workflow + +When you come back to these: + +```bash +# Re-list any wave codes that survived into future commits +git grep -nE 'W[0-9]+[-.][A-Z0-9]+|\bStream [A-Z]\b' + +# Re-list any gitnexus references +git grep -ni 'gitnexus' +``` + +Banned-strings CI check (`scripts/check-banned-strings.sh`) now blocks +wave codes and `gitnexus` from re-entering the tree — so any future +appearance is a regression, not drift. diff --git a/.erpaval/solutions/architecture-patterns/repomix-is-output-side.md b/.erpaval/solutions/architecture-patterns/repomix-is-output-side.md new file mode 100644 index 00000000..6c0f924c --- /dev/null +++ b/.erpaval/solutions/architecture-patterns/repomix-is-output-side.md @@ -0,0 +1,49 @@ +--- +title: Repomix --compress is output-side only, not an input-side chunker +tags: [repomix, embedder, chunker, tree-sitter, llm] +first_applied: 2026-04-26 +repos: [open-code-hub] +--- + +## The pattern + +Repomix (https://github.com/yamadashy/repomix) is tempting as a +replacement for a tree-sitter-based chunker in an embedding pipeline — +it ships `--compress` with ~70% token reduction and supports 16 +languages. **Do not use it that way.** Scope it to output-side surfaces +(LLM-context packing, snapshot generation). + +## Why + +1. **Per-file, not per-symbol.** `--compress` stitches signatures + + class headers + imports into a single text blob per file joined by + `⋮----`. It discards `startLine / endLine / symbolName / nodeType`. + A graph-extraction pipeline that turns parse captures into + Function/Method/Class nodes + CALLS/IMPORTS/EXTENDS edges cannot be + fed from this output. +2. 
**Tokenizer mismatch.** Token counts use `o200k_base` (GPT-4o). If + your embedder is anything else (BERT, modernbert, e5, voyage-code), + your budget math won't line up. +3. **Determinism gap.** No grammar-sha is exposed, so content-addressed + cache keys `(sha256, grammarSha, pipelineVersion)` lose their + grammar component. +4. **Coverage gaps.** tsx folds into typescript; kotlin is absent. + +## Where repomix actually shines + +- `codehub pack` CLI command — single-file snapshot for agents who want + to drop the whole repo into their context window. +- An MCP `pack_codebase` tool that re-exports the repomix invocation so + agents can produce their own snapshots without knowing the CLI. + +## Quick sanity check before substituting repomix for anything + +Before planning to delete a chunker / parser in favor of repomix, ask: + +- Do downstream consumers need per-symbol boundaries? +- Do they need startLine / endLine on every chunk? +- Do they key caches off grammar shas? +- Are tsx / kotlin / any other first-class language supported? + +Any **yes** means keep your existing chunker; use repomix only for the +output-side feature. diff --git a/.erpaval/solutions/architecture-patterns/scip-replaces-lsp.md b/.erpaval/solutions/architecture-patterns/scip-replaces-lsp.md new file mode 100644 index 00000000..319016ab --- /dev/null +++ b/.erpaval/solutions/architecture-patterns/scip-replaces-lsp.md @@ -0,0 +1,80 @@ +--- +title: SCIP replaces LSP for code-graph oracle edges +tags: [scip, lsp, ingestion, graph, indexer] +first_applied: 2026-04-26 +repos: [open-code-hub] +--- + +## The pattern + +When a code-intelligence system needs compiler-grade call / reference / +heritage edges across many languages, prefer **SCIP** indexers (one-shot +artifact producers) over **LSP** servers (stateful JSON-RPC subprocesses). + +SCIP indexers exist for TypeScript, Python, Go, Rust (via +`rust-analyzer scip`), and Java. Each emits a single `.scip` protobuf +file per run. 
A symbol string encodes +`<scheme> <manager> <package-name> <version> +<descriptor>` which is +globally unique — cross-repo references work by construction. + +## The shape + +``` +source tree ─► per-lang SCIP indexer (×5) ─► .opencodehub/scip/<lang>.scip + │ + ▼ + parseScipIndex(Uint8Array) -> ScipIndex + │ + ▼ + deriveIndex(index) -> {symbols, edges} + │ + ▼ + materialize(edges) -> {node_metrics, + reach_forward, + reach_backward, + scc} + │ + ▼ + CodeRelation(confidence=1.0, + reason="scip:<indexer>@<version>") +``` + +## Why this beats the LSP approach + +- **No daemon.** SCIP produces an artifact; no stdio JSON-RPC, no + request correlation, no warm-up, no timeout tuning. +- **Dependency surface shrinks.** No pyright / tsserver / gopls / + rust-analyzer binaries in node_modules. +- **Cross-repo for free.** SCIP symbol strings are globally unique; + merging two `.scip` files is just `concat documents[] + concat + external_symbols[]` at the protobuf level. +- **Incremental caching is trivial.** One mtime check per language; no + need to track per-symbol queries. + +## The contract boundary worth preserving + +The `confidence=1.0` + `reason startsWith "<prefix>:"` contract that +downstream consumers (`confidence-demote`, `summarize`, +`mcp/confidence`, `cli/analyze` auto-cap) hinge on is load-bearing. +When migrating from LSP to SCIP, keep the same confidence ceiling and +switch only the reason-prefix list and the phase-name that produces +the edges. Downstream code changes are then one-line (new constant). + +## Lingering gotchas + +- **scip-java / rust-analyzer run build scripts** — gate behind an + explicit `allowBuildScripts=true` opt-in for untrusted workspaces. +- **Relationship edges (IMPLEMENTS) are in SymbolInformation, not in + Occurrence** — a minimal protobuf reader that only decodes + Occurrence will not surface them. When we need real IMPLEMENTS + semantics, extend the parser to decode `SymbolInformation.relationships`.
+- **SCIP range encoding has two shapes** — 4-int + `[startLine, startChar, endLine, endChar]` OR 3-int + `[line, startChar, endChar]` when start/end share a line. Normalize + at decode time. + +## When NOT to use this + +- Small toy projects where tree-sitter heuristic edges are good enough. +- Languages without a SCIP indexer (C#, C, C++, Ruby, Kotlin, Swift, + PHP, Dart — as of 2026-04-26). Keep tree-sitter for those. diff --git a/.erpaval/solutions/conventions/scip-protobuf-hand-rolled-reader.md b/.erpaval/solutions/conventions/scip-protobuf-hand-rolled-reader.md new file mode 100644 index 00000000..232b6169 --- /dev/null +++ b/.erpaval/solutions/conventions/scip-protobuf-hand-rolled-reader.md @@ -0,0 +1,44 @@ +--- +title: Hand-roll a minimal protobuf reader for fixed schemas +tags: [protobuf, scip, typescript, dependency-minimization] +first_applied: 2026-04-26 +repos: [open-code-hub] +--- + +## The pattern + +When you only need to decode a small, fixed protobuf schema (say 5 +messages and 10 fields), **a 130-LOC hand-rolled reader beats pulling +in `@bufbuild/protobuf` + codegen + runtime**. + +We decoded SCIP's Index / Metadata / ToolInfo / Document / Occurrence / +SymbolInformation in `packages/scip-ingest/src/proto-reader.ts` (130 +LOC) + `parse.ts` (255 LOC). Total: 385 LOC, zero runtime deps. + +## What you need + +- A `ProtoReader` that exposes `readVarint()`, `readString()`, + `readSubMessage()`, `skip(wireType)`, and a `forEachField(visit)` + iterator. +- Four wire types: varint (0), fixed64 (1), length-delimited (2), + fixed32 (5). SCIP uses only varint + length-delimited + packed + varints inside length-delimited. +- Per-message decode functions that switch on field number and + consume-or-skip each one. + +## Gotchas + +- Varints are little-endian base-128. Use + `result += (byte & 0x7f) * 2 ** shift` with `shift += 7`. Don't + bitwise-OR into a JS `number` past 2^31. 
+- Length-delimited fields can contain packed repeated ints; dispatch + on `wireType === LENGTH_DELIMITED` per-repeated-field to cover both + `[tag, len, vals...]` and unpacked `[tag, val][tag, val]` forms. +- Unknown fields: call `skip(wireType)` and move on. Protobuf tolerates + schema drift. + +## When this pattern is wrong + +- The schema has 100+ messages (e.g. Google Cloud APIs). Use buf + codegen. +- You need to encode, not just decode. Use buf runtime. +- The schema changes weekly. Let the codegen carry the maintenance. diff --git a/.erpaval/specs/001-scip-replaces-lsp/spec.md b/.erpaval/specs/001-scip-replaces-lsp/spec.md new file mode 100644 index 00000000..dec86714 --- /dev/null +++ b/.erpaval/specs/001-scip-replaces-lsp/spec.md @@ -0,0 +1,273 @@ +# EARS spec — SCIP replaces LSP + +Branch: `feat/scip-replaces-lsp` +Session: `session-f8a300bc` +Date: 2026-04-26 + +## Context + +OpenCodeHub today relies on four long-running LSP clients (pyright, +typescript-language-server, gopls, rust-analyzer) managed by +`@opencodehub/lsp-oracle` to upgrade tree-sitter heuristic edges with +compiler-grade references. The LSP layer totals ~10.6k LOC across +`packages/lsp-oracle` and the four per-language ingestion phases, plus +~2.5k LOC of gym harness, corpus fixtures, CI workflow, and +documentation that assumes LSP framing. + +We are replacing this entire layer with SCIP (https://scip-code.org). +SCIP indexers run once per language at index time and emit a single +`.scip` protobuf file per language. We then load those files into the +existing DuckDB graph store, preserving the +`confidence=1.0 + reason-prefix` oracle-edge contract that downstream +`confidence-demote`, `summarize`, `mcp/confidence`, and +`cli/analyze` consumers depend on. + +Tree-sitter grammars stay for parsing (scan/parse/structure/accesses/ +cross-file/mro/complexity/markdown phases are untouched). 
+ +**Repomix decision (revised after AST deep-dive, see +`research-repomix-ast.yaml`).** Repomix `--compress` emits a single +compressed text blob per file (signatures + class headers + imports +joined with a `⋮----` separator), NOT a symbol-level chunker. It +discards the `startLine/endLine/symbol-name/nodeType` metadata that +our parse phase turns into Function/Method/Class nodes and +CALLS/IMPORTS/EXTENDS/IMPLEMENTS/DEFINES edges. It also uses a +GPT-4o tokenizer that does not match our `gte-modernbert-base` +embedder budget, and omits tsx + kotlin from compress coverage. + +Verdict: **keep the tree-sitter chunker for per-symbol embeddings and +graph extraction.** Repomix is repositioned as an *output-side* +feature — a `codehub pack` command and an MCP-side `pack_codebase` +re-export — not a replacement for the input-side chunker. The +simplification ratio claim (~10x reduction) does not hold for the +chunker; the real reduction comes from ripping LSP (≥10.6k LOC). + +## Architecture + +``` ++-- Source tree +| +| Existing (unchanged) +| - scan -> profile -> structure -> markdown -> parse -> ... +| ... -> accesses (tree-sitter heuristic tier) +| ++-- NEW: scip-index phase +| +-- detects languages +| +-- invokes per-language SCIP indexers in parallel +| | scip-typescript / scip-python / scip-go +| | rust-analyzer scip / scip-java +| +-- writes .opencodehub/scip/<lang>.scip files +| +-- loads every .scip via @bufbuild/protobuf (vendored scip_pb.ts) +| +-- emits CodeRelation edges with +| reason = "scip:<indexer>@<version>" +| confidence = 1.0 +| ++-- confidence-demote (rewired to SCIP_PROVENANCE_PREFIXES) ++-- summarize / mcp-confidence / cli-analyze (rewired) +``` + +## Actors + +- `analyze` CLI user (local dev or CI) +- `mcp` agent consumer (Claude Code, Cursor, etc.)
+- `gym` replay evaluator + +## EARS requirements + +### Ubiquitous (U) + +- **U-1.** OpenCodeHub MUST produce compiler-grade call, reference, and + implementation edges for TypeScript, JavaScript, Python, Go, Rust, and + Java source files without running any language server. +- **U-2.** OpenCodeHub MUST persist oracle-edge provenance as a + `reason` prefix of the form `scip:<indexer>@<version>` (e.g. + `scip:scip-typescript@0.4.0`). +- **U-3.** `confidence-demote` MUST treat any relation whose `reason` + begins with a member of `SCIP_PROVENANCE_PREFIXES` AND whose + `confidence == 1.0` as an oracle edge. +- **U-4.** Tree-sitter parsing (scan/parse/structure/accesses/cross-file/ + mro/complexity/markdown phases) MUST remain functionally unchanged. + +### Event-driven (E) + +- **E-1.** WHEN `codehub analyze` runs AND the repo contains source + files in {TS, JS, Python, Go, Rust, Java}, THE SYSTEM SHALL execute + the corresponding SCIP indexer(s) and write + `.opencodehub/scip/<lang>.scip`. +- **E-2.** WHEN a `.scip` file is produced, THE SYSTEM SHALL decode it + using `@bufbuild/protobuf` + the vendored `scip_pb.ts` bindings and + derive `caller -> callee` edges from occurrence-containment. +- **E-3.** WHEN the SCIP phase produces an edge that duplicates a + tree-sitter heuristic edge by `(from, type, to, step)`, THE graph + store SHALL retain the higher-confidence SCIP edge (dedup already in + `KnowledgeGraph.addEdge`). +- **E-4.** WHEN `codehub analyze --offline` is set AND a required SCIP + indexer binary is missing, THE SYSTEM SHALL SKIP that language with a + single-line warning (parity with today's LSP skip path) — no network + install attempts. + +### State-driven (S) + +- **S-1.** WHILE a SCIP indexer is running, THE SYSTEM SHALL stream + progress events to the same `ProgressEvent` bus used by every other + phase.
+- **S-2.** WHILE a language's SCIP index already exists on disk AND + mtime > newest source-file mtime, THE SYSTEM SHALL skip re-indexing + unless `--force-reindex` is passed. + +### Optional feature (O) + +- **O-1.** WHERE `scip-java` is installed, THE SYSTEM SHALL emit Java + SCIP edges. WHERE scip-java is not installed, THE SYSTEM SHALL skip + Java indexing silently. +- **O-2.** WHERE `repomix` is on PATH, the embedder/search pipeline MAY + use `repomix --style json` output for chunking instead of the current + tree-sitter-derived chunker. **Superseded** by the revised Repomix decision in Context and by AC-4-3: the tree-sitter chunker stays and repomix is output-side only. + +### Unwanted behaviour (UB) + +- **UB-1.** The SCIP phase MUST NOT spawn long-running language-server + subprocesses, MUST NOT open LSP JSON-RPC over stdio, and MUST NOT + depend on `@opencodehub/lsp-oracle`. +- **UB-2.** The SCIP phase MUST NOT execute arbitrary build scripts on + an untrusted workspace by default — for `scip-java` and + `rust-analyzer scip` the analyze CLI SHALL require + `--allow-build-scripts` unless already set in `codehub.config`. +- **UB-3.** Deleting `@opencodehub/lsp-oracle` MUST NOT leak broken + imports elsewhere; `pnpm -r build` SHALL pass. + +### Performance (P) + +- **P-1.** For the repo's own TypeScript source tree (~120k LOC), + end-to-end `analyze` wall time SHOULD NOT regress more than 20% vs + today's tree-sitter-only path (today excludes LSP by default — we + compare against the baseline tree-sitter-only measurement in + `packages/gym/baselines/performance.json`). + +## Acceptance criteria (AC) + +Parallel-safe markers: `[P]`. Dependencies as `Dependencies: AC-X-Y`. + +### Wave 1 — Foundation + +- **AC-1-1** `packages/scip-ingest` package exists with + `@bufbuild/protobuf` + vendored `scip_pb.ts`, a `parseScipIndex(buf)` + that returns `{documents, external_symbols}`, and a + `deriveCallEdges(documents)` that returns the caller/callee/doc/call_line + tuples (port of `ingest.py`). `pnpm -r build` passes. Test proves + ingest of the POC `calcpkg.scip` yields the known edge set.
[P] +- **AC-1-2** `packages/scip-ingest/src/materialize.ts` computes + `reach_forward`, `reach_backward`, `scc`, `node_metrics` (port of + `materialize.py`) using `graphology` (already a workspace dep) and + inserts derived tables into DuckDB via `@opencodehub/storage`. Test + proves blast-radius ranking on the POC graph. Dependencies: AC-1-1. +- **AC-1-3** `SCIP_PROVENANCE_PREFIXES` added to + `packages/core-types/src/lsp-provenance.ts` (kept filename; rename + export to `PROVENANCE_PREFIXES` and ship both `SCIP_*` + legacy + `LSP_*` as aliases for a clean rip). Tests updated. [P] + +### Wave 2 — Per-language indexer runners + +- **AC-2-1** `packages/scip-ingest/src/runners/typescript.ts` detects + `tsconfig.json` / `package.json`, shells `scip-typescript index + --output <path>`, returns path + tool version. [P, after AC-1-1] +- **AC-2-2** `packages/scip-ingest/src/runners/python.ts` detects + `pyproject.toml` / `setup.py` / `requirements.txt`, shells + `scip-python index . --project-name=<name> --output <path>`. [P] +- **AC-2-3** `packages/scip-ingest/src/runners/go.ts` detects + `go.mod`, shells `scip-go --output <path>`. [P] +- **AC-2-4** `packages/scip-ingest/src/runners/rust.ts` detects + `Cargo.toml`, shells + `rust-analyzer scip --output <path> --exclude-vendored-libraries`. [P] +- **AC-2-5** `packages/scip-ingest/src/runners/java.ts` detects + `pom.xml` / `build.gradle*` / `build.sbt`, shells + `scip-java index --output <path>`. [P] +- **AC-2-6** `packages/scip-ingest/src/runners/index.ts` exposes a + uniform `runIndexer(lang, projectRoot, outDir, opts) -> {scipPath, tool, version}` + signature with a `Promise.all`-friendly fan-out. Dependencies: AC-2-1 + through AC-2-5. + +### Wave 3 — Pipeline rewire + rip + +- **AC-3-1** New ingestion phase + `packages/ingestion/src/pipeline/phases/scip-index.ts`. Runs after + `accesses`, before `confidence-demote`.
Fans out runners by detected + languages (from `profile` phase output), loads each `.scip`, emits + CodeRelation edges with `reason: "scip:<indexer>@<version>"` and + `confidence: 1.0`. Emits skip events for missing indexers. Tests + cover happy path + skip. Dependencies: AC-1-2, AC-2-6. +- **AC-3-2** Delete `packages/lsp-oracle` entirely. Remove workspace + dep from `packages/ingestion/package.json` and + `packages/gym/package.json`. Remove tsconfig references. Regenerate + `pnpm-lock.yaml`. Dependencies: AC-3-1. +- **AC-3-3** Delete + `packages/ingestion/src/pipeline/phases/lsp-{python,typescript,go,rust}.ts` + and their tests. Remove from `default-set.ts`. Dependencies: AC-3-1. +- **AC-3-4** Rewire `confidence-demote.ts`: swap `LSP_*_PHASE_NAME` + deps for `SCIP_INDEX_PHASE_NAME`; accept SCIP_PROVENANCE_PREFIXES. + Rename `+lsp-unconfirmed` suffix to `+scip-unconfirmed`. Update + tests. Dependencies: AC-3-1, AC-1-3. +- **AC-3-5** Rewire `summarize.ts` (trust filter, line 55 + 401) and + `packages/mcp/src/tools/confidence.ts` (`hasLspProvenance`) to + SCIP_PROVENANCE_PREFIXES. Update tests. Dependencies: AC-1-3. +- **AC-3-6** Rewire `packages/cli/src/commands/analyze.ts` + + `packages/cli/src/index.ts`: rename `lspConfirmedCallableCount` to + `scipConfirmedCallableCount`; switch provenance check. Update help + text. Dependencies: AC-3-5. +- **AC-3-7** Delete `scripts/validate-lsp-oracle.ts`, + `scripts/spike-typescript-oracle.py`, + `packages/lsp-oracle/reference/` (deleted with package). Remove + `mise.toml` tasks `test:lsp-integration` + `validate:lsp-oracle`. + Update remaining task descriptions. [P after AC-3-2] +- **AC-3-8** Rewrite `packages/gym/src/lsp-factory.ts` to + `scip-factory.ts` mapping `(lang) -> scip runner`. Update + `runner.ts` / `cli.ts` / `index.ts` / `runner.test.ts` to consume + the new factory. Regenerate corpus baselines. Update + `packages/gym/README.md` and per-language corpus READMEs.
[P after AC-3-1] +- **AC-3-9** Rewrite `.github/workflows/gym.yml` to cache scip-* + binaries and run a SCIP-indexer matrix in place of the LSP matrix. + Update `docs/adr/0003-ci-toolchain-pins.md` (or supersede with a new + ADR). Update `plugins/opencodehub/skills/opencodehub-guide/SKILL.md` + and `opencodehub-impact-analysis/SKILL.md`. Update `OBJECTIVES.md`, + `SPECS.md`. Dependencies: AC-3-8. + +### Wave 4 — Repomix as output-side pack feature + +- **AC-4-1** New `codehub pack` CLI command in `packages/cli`: shells + `npx repomix@<version> --style xml --compress --output <path>` scoped to + the current repo, prints the output path + directory-token summary. + Dependencies: AC-3-9. +- **AC-4-2** `packages/mcp` re-exposes `pack_codebase` as an MCP tool + (delegating to repomix) for agents that want a single-blob repo + snapshot. Same output, different transport. Dependencies: AC-4-1. +- **AC-4-3** `packages/embedder` chunker is UNCHANGED — tree-sitter + per-symbol chunks remain the input-side path. Document the decision + (ADR 0004) with the blocker list from `research-repomix-ast.yaml`. + Dependencies: none. + +### Wave 5 — Validation + compound + +- **AC-5-1** `pnpm run check` (lint + typecheck + test + banned-strings) + passes end-to-end. Dependencies: AC-4-2. +- **AC-5-2** `codehub analyze` on the POC `sample/calcpkg` Python + project produces the same node_metrics as the POC DuckDB pipeline + (golden-file comparison, allowing only version-string drift in + `reason`). Dependencies: AC-5-1. +- **AC-5-3** Lessons extracted to `.erpaval/solutions/` per + compound.md. Dependencies: AC-5-2. + +## Non-goals + +- Cross-repo SCIP merging (single-repo in this pass). +- New MCP tools (other than the `pack_codebase` re-export in AC-4-2). +- Removing tree-sitter grammars (stay for parse/structure/accesses). +- Shipping a hosted SCIP-indexer-as-a-service. +- C#/C/C++/Ruby/Kotlin/Swift/PHP/Dart SCIP coverage — those remain + tree-sitter-only (SCIP indexers don't exist yet for all of them).
+ +## Rollback plan + +Keep a tagged commit `pre-scip-rip` on `main` before the feature +branch merges. Rollback = `git revert` of the merge commit; the +`.opencodehub/scip/` cache dir is discarded without affecting the +tree-sitter baseline. diff --git a/.erpaval/specs/001-scip-replaces-lsp/tasks.md b/.erpaval/specs/001-scip-replaces-lsp/tasks.md new file mode 100644 index 00000000..a5e76da0 --- /dev/null +++ b/.erpaval/specs/001-scip-replaces-lsp/tasks.md @@ -0,0 +1,62 @@ +# Task map — SCIP replaces LSP + +Task IDs map 1:1 to AC IDs in spec.md. Brackets `[P]` = parallel-safe +with any other `[P]` task in the same wave once dependencies are met. + +## Wave 1 — Foundation (parallel after intake) + +- T-AC-1-1 `scip-ingest` package + proto bindings + parseScipIndex + + deriveCallEdges [P] +- T-AC-1-2 `scip-ingest` materialize (port materialize.py) + Dependencies: T-AC-1-1 +- T-AC-1-3 `SCIP_PROVENANCE_PREFIXES` in core-types [P] + +## Wave 2 — Runners (parallel after AC-1-1) + +- T-AC-2-1 typescript runner [P] +- T-AC-2-2 python runner [P] +- T-AC-2-3 go runner [P] +- T-AC-2-4 rust runner [P] +- T-AC-2-5 java runner [P] +- T-AC-2-6 uniform runner factory. Dependencies: T-AC-2-{1..5} + +## Wave 3 — Rewire + rip (mostly sequential) + +- T-AC-3-1 `scip-index` ingestion phase. Dependencies: T-AC-1-2, + T-AC-2-6 +- T-AC-3-2 Delete `packages/lsp-oracle` + workspace deps + lock regen. + Dependencies: T-AC-3-1 +- T-AC-3-3 Delete per-lang LSP phases + default-set. Dependencies: + T-AC-3-1 +- T-AC-3-4 Rewire confidence-demote. Dependencies: T-AC-3-1, T-AC-1-3 +- T-AC-3-5 Rewire summarize + mcp/confidence. Dependencies: T-AC-1-3 +- T-AC-3-6 Rewire cli/analyze. Dependencies: T-AC-3-5 +- T-AC-3-7 Delete validate-lsp-oracle + spike-typescript-oracle + + mise tasks [P after T-AC-3-2] +- T-AC-3-8 Gym scip-factory + corpus/baseline regen. [P after T-AC-3-1] +- T-AC-3-9 CI gym.yml + docs + skills + SPECS/OBJECTIVES. 
+ Dependencies: T-AC-3-8 + +## Wave 4 — Repomix as output-side feature (after Wave 3) + +Repomix is re-scoped from chunker-replacement to output-side pack +feature. See `research-repomix-ast.yaml` — per-file compressed blobs +cannot carry symbol-level metadata, so the tree-sitter chunker stays. + +- T-AC-4-1 `codehub pack` CLI command. Dependencies: T-AC-3-9 +- T-AC-4-2 MCP `pack_codebase` tool. Dependencies: T-AC-4-1 +- T-AC-4-3 ADR 0004 "Repomix is output-side only". Dependencies: none + +## Wave 5 — Validate + Compound + +- T-AC-5-1 pnpm run check. Dependencies: T-AC-4-2 +- T-AC-5-2 end-to-end on POC sample. Dependencies: T-AC-5-1 +- T-AC-5-3 compound lessons. Dependencies: T-AC-5-2 + +## Agent assignment & model policy + +- Wave 1, 2: `general-purpose` sonnet; independent per runner. +- Wave 3 (rewire + rip): `general-purpose` sonnet; rip tasks need the + orchestrator's hand to regenerate lockfile. +- Wave 4: `general-purpose` sonnet. +- Wave 5 validate: `opus`; compound: orchestrator. diff --git a/.github/workflows/gym.yml b/.github/workflows/gym.yml index 7adaf775..2cc57acc 100644 --- a/.github/workflows/gym.yml +++ b/.github/workflows/gym.yml @@ -1,13 +1,16 @@ name: Gym -# Differential LSP oracle gym: replays the reference-graph corpus through -# real LSPs (pyright, typescript-language-server, gopls, rust-analyzer) and -# gates regression against the baseline manifest at -# packages/gym/baselines/manifest.jsonl. +# SCIP-indexer gym: replays the reference-graph corpus through each +# language's native SCIP indexer (scip-python, scip-typescript, scip-go, +# rust-analyzer --scip, scip-java) and gates regression against the +# baseline manifest at packages/gym/baselines/manifest.jsonl. # # One job per language (matrix) plus a monorepo job that exercises the -# in-tree electron-ws-python fixture. LSP server binaries are cached per -# language + OS + lockfile hash. +# in-tree electron-ws-python fixture. 
Indexer binaries are cached per +# language + OS + version key. +# +# See docs/adr/0005-scip-replaces-lsp.md for the migration rationale +# and docs/adr/0006-scip-indexer-pins.md for the version pin table. on: push: @@ -22,11 +25,20 @@ concurrency: permissions: contents: read +env: + # Pinned SCIP-indexer versions. Bumping requires regenerating + # packages/gym/baselines/manifest.jsonl via `mise run gym:baseline`. + SCIP_TYPESCRIPT_VERSION: "0.4.0" + SCIP_PYTHON_VERSION: "0.6.6" + SCIP_GO_VERSION: "v0.2.3" + SCIP_JAVA_VERSION: "0.12.3" + RUST_ANALYZER_CHANNEL: "stable" + jobs: gym-matrix: name: gym (${{ matrix.language }}) runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 25 strategy: fail-fast: false matrix: @@ -45,10 +57,6 @@ jobs: uses: jdx/mise-action@v4 - name: Install node-gyp on PATH - # pnpm v10 gates lifecycle scripts; even with the - # `pnpm.onlyBuiltDependencies` allowlist, any grammar that still runs - # `node-gyp rebuild` directly needs the binary reachable. Install it - # globally before `pnpm install` runs. See docs/adr/0003-ci-toolchain-pins.md. run: npm install -g node-gyp@11 - name: Cache pnpm store @@ -59,38 +67,57 @@ jobs: restore-keys: | pnpm-store-${{ runner.os }}- + # ---- Per-language SCIP indexer installs -------------------------- + + - name: Install scip-python + if: matrix.language == 'python' + run: npm install -g @sourcegraph/scip-python@${{ env.SCIP_PYTHON_VERSION }} + + - name: Install scip-typescript + if: matrix.language == 'typescript' + run: npm install -g @sourcegraph/scip-typescript@${{ env.SCIP_TYPESCRIPT_VERSION }} + - name: Set up Go toolchain if: matrix.language == 'go' uses: actions/setup-go@v6 with: - # Pinned to 1.23 — bumping requires a matching gopls pin bump. - # gopls v0.18.x is the newest line that builds on Go 1.23.4+. - # gopls v0.19+ requires Go 1.24; v0.21 requires Go 1.25/1.26. - # See docs/adr/0003-ci-toolchain-pins.md. 
- go-version: "1.23" + # scip-go @ v0.2.3 requires Go >= 1.25 (module declares it + # in its go.mod). Go 1.26 is the latest stable when this + # line last ran locally; 1.25 is the floor. See ADR 0006. + go-version: "1.26" cache: true - - name: Cache gopls binary + - name: Cache scip-go binary if: matrix.language == 'go' - id: cache-gopls + id: cache-scip-go uses: actions/cache@v4 with: path: ~/go/bin - key: gopls-${{ runner.os }}-v0.18.1 + key: scip-go-${{ runner.os }}-${{ env.SCIP_GO_VERSION }} - - name: Install gopls - if: matrix.language == 'go' && steps.cache-gopls.outputs.cache-hit != 'true' - run: go install golang.org/x/tools/gopls@v0.18.1 + - name: Install scip-go + if: matrix.language == 'go' && steps.cache-scip-go.outputs.cache-hit != 'true' + # Project moved from sourcegraph/scip-go to scip-code/scip-go + # upstream (go.mod now declares the new path). The GitHub repo + # at sourcegraph/scip-go redirects but `go install` resolves + # module-path mismatches the hard way, so we use the canonical + # scip-code/ path here and let Go's module cache follow. + run: go install github.com/scip-code/scip-go/cmd/scip-go@${{ env.SCIP_GO_VERSION }} - - name: Add gopls to PATH + - name: Add Go bin to PATH if: matrix.language == 'go' run: echo "$HOME/go/bin" >> "$GITHUB_PATH" - - name: Set up Rust toolchain (stable + rust-analyzer) + - name: Set up Rust toolchain (stable + rust-analyzer + rust-src) if: matrix.language == 'rust' uses: dtolnay/rust-toolchain@stable with: - components: rust-analyzer + # rust-analyzer's `scip` subcommand reads from stdlib for + # cross-crate resolution, so it requires the `rust-src` + # component alongside `rust-analyzer`. Without it the + # indexer exits with + # `error: component download failed for rust-src`. 
+ components: rust-analyzer, rust-src - name: Cache cargo registry if: matrix.language == 'rust' @@ -113,6 +140,25 @@ jobs: restore-keys: | pip-${{ runner.os }}- + - name: Install Python fixture dependencies + if: matrix.language == 'python' + # scip-python shells to `pip` to resolve installed package + # names and versions; the fixture repos need their deps + # installed before indexing. Use a single shared venv so + # scip-python can see every fixture's dep set. + run: | + python -m venv /tmp/gym-py-venv + . /tmp/gym-py-venv/bin/activate + for d in packages/gym/corpus/repos/python/*/; do + if [ -f "$d/pyproject.toml" ] || [ -f "$d/setup.py" ]; then + pip install --quiet -e "$d" || echo "warn: pip install -e $d failed" + elif [ -f "$d/requirements.txt" ]; then + pip install --quiet -r "$d/requirements.txt" || echo "warn: requirements install in $d failed" + fi + done + echo "VIRTUAL_ENV=/tmp/gym-py-venv" >> "$GITHUB_ENV" + echo "/tmp/gym-py-venv/bin" >> "$GITHUB_PATH" + - name: Install workspace dependencies run: pnpm install --frozen-lockfile @@ -121,6 +167,10 @@ jobs: - name: Run gym (${{ matrix.language }}) id: gym-run + env: + # rust-analyzer scip and scip-java run build scripts during + # indexing. Trusted corpora only. + CODEHUB_ALLOW_BUILD_SCRIPTS: "1" run: | node packages/gym/dist/cli.js run \ --corpus "packages/gym/corpus/${{ matrix.language }}/**/*.yaml" \ @@ -152,7 +202,8 @@ jobs: for (const line of records) { try { const rec = JSON.parse(line); - const key = `${rec.language ?? '?'}/${rec.server ?? '?'}/${rec.query ?? '?'}`; + const toolName = rec.tool?.name ?? '?'; + const key = `${rec.language ?? '?'}/${toolName}/${rec.request?.kind ?? '?'}`; const prev = rollups.get(key) ?? 
{ n: 0, f1Sum: 0 }; if (typeof rec.f1 === 'number') { prev.n += 1; @@ -175,7 +226,7 @@ jobs: const body = [ `### Gym regression: \`${language}\``, '', - `The gym gate failed for the **${language}** matrix cell on commit \`${context.sha.slice(0, 7)}\`.`, + `The SCIP gym gate failed for the **${language}** matrix cell on commit \`${context.sha.slice(0, 7)}\`.`, '', 'Rollup summary from this run:', '', @@ -183,7 +234,7 @@ jobs: '', `Full manifest artifact: \`gym-manifest-${language}-${context.sha}\` (uploaded on failure).`, '', - 'If this regression is intentional, update `packages/gym/baselines/manifest.jsonl` in a follow-up PR using `mise run gym:baseline`.', + 'If this regression is intentional, update `packages/gym/baselines/manifest.jsonl` via `mise run gym:baseline` and commit the refresh.', ].join('\n'); await github.rest.issues.createComment({ owner: context.repo.owner, @@ -196,7 +247,7 @@ jobs: name: gym (monorepo) needs: gym-matrix runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 25 permissions: contents: read steps: @@ -210,8 +261,6 @@ jobs: uses: jdx/mise-action@v4 - name: Install node-gyp on PATH - # See gym-matrix for rationale; pnpm v10 + lifecycle-script gating - # requires node-gyp available before `pnpm install`. 
run: npm install -g node-gyp@11 - name: Cache pnpm store @@ -222,6 +271,11 @@ jobs: restore-keys: | pnpm-store-${{ runner.os }}- + - name: Install scip-python + scip-typescript (monorepo exercises both) + run: | + npm install -g @sourcegraph/scip-python@${{ env.SCIP_PYTHON_VERSION }} + npm install -g @sourcegraph/scip-typescript@${{ env.SCIP_TYPESCRIPT_VERSION }} + - name: Install workspace dependencies run: pnpm install --frozen-lockfile diff --git a/.gitignore b/.gitignore index d39785e9..4b0ab1bd 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,10 @@ coverage/ examples/fixtures/**/.codehub/ # Local agent / planning scratch space -.erpaval/ +# .erpaval/sessions/ is per-session ephemeral state (intake, explore, +# research, lessons YAML); .erpaval/solutions/ + specs/ + INDEX.md are +# durable and stay committed. +.erpaval/sessions/ .claude/settings.local.json .claude/worktrees/ .handoff/ diff --git a/OBJECTIVES.md b/OBJECTIVES.md index e7f09765..df210ad5 100644 --- a/OBJECTIVES.md +++ b/OBJECTIVES.md @@ -26,11 +26,13 @@ scope. commit, and `scripts/acceptance.sh` gate 6 gates on exactly that invariant.* -4. **Cover the 14 GA languages with tree-sitter and upgrade four of - them (Python, TS/JS, Go, Rust) with real LSP oracles.** *Because - heuristic call-graph edges miss cross-module resolution, the - `confidence-demote` phase already exists to reconcile heuristic and - compiler-grade edges, and the gym harness gates per-language F1.* +4. 
**Cover the 14 GA languages with tree-sitter and upgrade five of + them (TypeScript, Python, Go, Rust, Java) with SCIP indexers.** + *Because heuristic call-graph edges miss cross-module resolution, + the `scip-index` phase runs each language's native SCIP indexer + once, the `confidence-demote` phase reconciles heuristic and + compiler-grade edges, and the gym harness gates per-language F1 + with SCIP-derived baselines.* ## Quality bar diff --git a/SPECS.md b/SPECS.md index 47626dbb..6d32b4df 100644 --- a/SPECS.md +++ b/SPECS.md @@ -10,12 +10,12 @@ Agents use it to answer "what breaks if I change this, what depends on it, where does this data flow" *before* they produce a diff. At ingestion time the system parses 14 languages via native `tree-sitter` -bindings (WASM fallback available), layers per-language LSP oracles for -Python, TypeScript/JavaScript, Go, and Rust to upgrade tree-sitter heuristic -edges to compiler-grade edges, clusters the graph into Communities and -Processes, and optionally populates embeddings from a pinned gte-modernbert-base -ONNX model (fp32 ~596 MB or int8 ~150 MB) or an OpenAI-compatible HTTP -endpoint. +bindings (WASM fallback available), runs native SCIP indexers for +TypeScript/JavaScript, Python, Go, Rust, and Java to upgrade tree-sitter +heuristic edges to compiler-grade edges, clusters the graph into +Communities and Processes, and optionally populates embeddings from a +pinned gte-modernbert-base ONNX model (fp32 ~596 MB or int8 ~150 MB) or +an OpenAI-compatible HTTP endpoint. At query time it exposes an MCP server with roughly 27 tools (`query`, `context`, `impact`, `detect_changes`, `rename`, `sql`, scanner / @@ -26,8 +26,8 @@ built-in MCP prompts. ## What this system is not -- Not a language server. It consumes LSP servers as oracles but does not - speak LSP to editors directly. +- Not a language server. It runs SCIP indexers as one-shot artifact + producers and does not speak LSP to editors directly. - Not a SaaS. 
There is no server to operate; the graph is a single DuckDB file under `~/.codehub/` (or `${CODEHUB_HOME}`). - Not a hosted vector DB. Embeddings are optional and local; there is no @@ -42,10 +42,10 @@ built-in MCP prompts. 1.1 The ingestion pipeline shall execute the default phase DAG (`scan → profile → structure → markdown → parse → incremental-scope → complexity → routes → openapi → tools → orm → cross-file → accesses → -lsp-python → lsp-typescript → lsp-go → lsp-rust → confidence-demote → -mro → communities → dead-code → processes → fetches → temporal → -cochange → ownership → dependencies → sbom → annotate → risk-snapshot → -summarize → embeddings`) in topological order. +mro → communities → dead-code → ownership → processes → fetches → +temporal → cochange → dependencies → sbom → annotate → risk-snapshot → +scip-index → confidence-demote → summarize → embeddings`) in +topological order. 1.2 When two analyze runs execute against identical input, the system shall produce a byte-identical `graphHash` across runs. @@ -68,14 +68,14 @@ output validated by `SymbolSummary.safeParse`. 1.7 If `--offline` is set, then the summarize and embedder-HTTP phases shall be hard no-ops regardless of other flags. -1.8 When the LSP oracle for a language fails to start, the system shall -continue ingestion with tree-sitter-only edges and attach provenance -indicating the oracle was unavailable. +1.8 When the SCIP indexer for a language is not on PATH or fails, the +system shall continue ingestion with tree-sitter-only edges and attach +provenance indicating the oracle was unavailable. 1.9 While the `confidence-demote` phase runs, the system shall demote confidence-0.5 heuristic CALLS/REFERENCES/EXTENDS edges whose -`(from, type, to)` triple is also present as a confidence-1.0 LSP edge to -confidence 0.2 with a `+lsp-unconfirmed` reason suffix. +`(from, type, to)` triple is also present as a confidence-1.0 SCIP edge +to confidence 0.2 with a `+scip-unconfirmed` reason suffix. 
1.10 Where `--skills` is included, the system shall emit one `SKILL.md` per `Community` with `symbolCount >= 5` under @@ -101,18 +101,23 @@ implementing the `LanguageProvider` interface, and registering the provider in `packages/ingestion/src/providers/registry.ts`; a missing registration shall fail the TypeScript build. -2.4 Where Python is detected in the repo, the system shall run the -`lsp-python` phase via pyright and upgrade heuristic edges with -pyright-resolved callers / references / implementations. +2.4 Where TypeScript, JavaScript, Python, Go, Rust, or Java is +detected, the system shall run the `scip-index` phase which invokes +the matching SCIP indexer (scip-typescript, scip-python, scip-go, +`rust-analyzer scip`, or scip-java), parses the resulting `.scip` +protobuf, and emits `CodeRelation` edges with confidence 1.0 and +`reason = "scip:<indexer>@<version>"`. -2.5 Where TypeScript/JavaScript is detected, the system shall run the -`lsp-typescript` phase via `typescript-language-server` driving tsserver. +2.5 When a SCIP indexer that runs workspace build scripts +(`rust-analyzer scip`, `scip-java`) is invoked, the system shall +require the operator to opt in via +`CODEHUB_ALLOW_BUILD_SCRIPTS=1` unless already enabled in +`codehub.config`. -2.6 Where Go is detected, the system shall run the `lsp-go` phase via -gopls. - -2.7 Where Rust is detected, the system shall run the `lsp-rust` phase via -rust-analyzer with `procMacro.enable=false`. +2.6 Cross-language references (e.g. JNI, wasm-bindgen) are +out of scope for `scip-index`; each language's `.scip` file is loaded +independently and joined on shared symbol strings only when the +indexers agree on `package{manager,name,version}`. --- @@ -322,13 +327,16 @@ fan-out on a single-repo failure. cases (7 MVP languages × 7 MCP tools) against the real `codehub mcp` stdio server; the acceptance gate requires ≥ 40 passes. 
-10.2 The gym harness (`packages/gym`) shall replay LSP oracle golden -manifests for Python, TypeScript, Go, and Rust and gate on three layers: -absolute F1 floor, relative F1 delta, and per-case non-regression. +10.2 The gym harness (`packages/gym`) shall replay SCIP indexer golden +manifests for TypeScript, Python, Go, Rust, and (optionally) Java, and +gate on three layers: absolute F1 floor, relative F1 delta, and +per-case non-regression. 10.3 Every gym run shall emit a JSONL freeze/replay manifest pinning `{manifest_version, corpus_commit, tool{name,version,sha256}, request, -result_set, captured_at}` for bit-exact replay without LSP spawn. +result_set, captured_at}` for bit-exact replay — the `tool.name` is the +SCIP indexer identifier (e.g. `scip-python`, `rust-analyzer`) and the +indexer is re-invoked per replay rather than a long-running server. 10.4 `scripts/acceptance.sh` shall execute 15 named gates and exit non-zero if any mandatory gate fails; soft gates (incremental p95, diff --git a/docs/adr/0003-ci-toolchain-pins.md b/docs/adr/0003-ci-toolchain-pins.md index fee2e1b7..271fe5ed 100644 --- a/docs/adr/0003-ci-toolchain-pins.md +++ b/docs/adr/0003-ci-toolchain-pins.md @@ -1,6 +1,12 @@ # ADR 0003 — CI toolchain pins (gopls ↔ Go, pnpm build-script allowlist) -Status: **Accepted** — 2026-04-24 +Status: **Superseded** — 2026-04-27, by ADR 0006 (SCIP indexer pins) + +> The gopls pin matrix below is historical. OpenCodeHub no longer runs +> any long-running language server — code-graph oracle edges are +> sourced from SCIP indexers. See ADR 0005 for the migration and +> ADR 0006 for the current pin table. The pnpm lifecycle-script +> guidance remains in force; ADR 0006 reiterates it. 
## Context diff --git a/docs/adr/0005-scip-replaces-lsp.md b/docs/adr/0005-scip-replaces-lsp.md new file mode 100644 index 00000000..41ee3015 --- /dev/null +++ b/docs/adr/0005-scip-replaces-lsp.md @@ -0,0 +1,136 @@ +# ADR 0005 — SCIP replaces LSP; repomix is output-side only + +- Status: accepted +- Date: 2026-04-26 +- Authors: @theagenticguy + Claude +- Branch: `feat/scip-replaces-lsp` + +## Context + +Through ADRs 0001–0004, OpenCodeHub ran four long-running language +servers (pyright, typescript-language-server, gopls, rust-analyzer) via +the `@opencodehub/lsp-oracle` package to upgrade tree-sitter heuristic +edges with compiler-grade references. This layer: + +- totalled ~10.6k LOC of client + 4 per-language ingestion phases; +- required shipping pyright / typescript-language-server binaries + as npm deps (transitive install cost + supply-chain surface); +- made indexing a stateful, long-running operation driven by + per-symbol JSON-RPC roundtrips; +- required ~2.5k LOC of gym harness, CI, and docs to validate. + +Two adjacent technologies matured to the point where they obsolete +this design: + +1. **SCIP** (https://scip-code.org) — Sourcegraph's precise-code-intel + format, with first-class indexers for TypeScript, Python, Go, Rust + (via `rust-analyzer scip`), and Java. Each indexer runs once per + repo and emits a single `.scip` protobuf file. No daemon, no + stateful client, no per-symbol roundtrips. +2. **repomix** 1.14 — a CLI that emits a single AST-compressed snapshot + of a repo suitable for dropping into an LLM context window. + +## Decision + +### SCIP replaces LSP, end to end + +- `@opencodehub/lsp-oracle` is deleted. +- The four per-language LSP ingestion phases + (`lsp-python / lsp-typescript / lsp-go / lsp-rust`) collapse into a + single `scip-index` phase in + `packages/ingestion/src/pipeline/phases/scip-index.ts`. 
+- A new workspace package `@opencodehub/scip-ingest` owns the SCIP + protobuf reader, derive-graph logic, and per-language indexer + runners. It is a dependency of `ingestion` and `gym`. +- Oracle-edge provenance switches from + `{pyright,typescript-language-server,gopls,rust-analyzer}@` + to `scip:@`. The constant `SCIP_PROVENANCE_PREFIXES` + replaces `LSP_PROVENANCE_PREFIXES` in `@opencodehub/core-types` + (the old name is kept as an alias for one release). +- The `+lsp-unconfirmed` reason suffix is renamed to + `+scip-unconfirmed`. `confidence-demote`, `summarize`, MCP's + confidence-breakdown helper, and the analyze-CLI auto-cap all + depend only on the new prefix list and the single `scip-index` phase + name. +- The gym's replay harness (`packages/gym/src/scip-factory.ts`) + reimplements the `LspClientLike` surface on top of + `@opencodehub/scip-ingest`: `start()` runs the indexer once (or + reuses a cached `.scip`) and builds an in-memory occurrence + + definition index; the three query methods answer from that index + without re-decoding. + +### Repomix is output-side only + +We considered leaning on `repomix --compress` as a replacement for the +tree-sitter chunker in `packages/ingestion/src/parse` and +`packages/ingestion/src/pipeline/phases/embeddings.ts`. We rejected +that plan after a sourced deep-dive (see +`.erpaval/sessions/session-f8a300bc/research-repomix-ast.yaml`). + +The rejection turns on four blockers: + +1. **Repomix `--compress` produces per-file text blobs, not + symbol-level chunks.** It discards the + `startLine / endLine / symbolName / nodeType` metadata that our + `parse` phase turns into Function/Method/Class nodes and into + CALLS / IMPORTS / EXTENDS / IMPLEMENTS / DEFINES edges. Replacing + the chunker would blow up graph extraction. +2. **Coverage gap.** Repomix compress omits `tsx` (folded into + `typescript`) and `kotlin`. OpenCodeHub lists both as first-class + languages. +3. 
**Tokenizer mismatch.** Repomix counts tokens with `o200k_base` + (GPT-4o); our embedder is `gte-modernbert-base`. Budget math + wouldn't line up. +4. **Determinism.** Our cache keys derive from + `(contentSha, grammarSha, pipelineVersion)`. Repomix exposes no + grammar sha, so cache invalidation becomes lossy. + +Repomix is therefore scoped to an output-side surface: + +- `codehub pack` CLI command (`packages/cli/src/commands/pack.ts`), +- `pack_codebase` MCP tool (`packages/mcp/src/tools/pack-codebase.ts`). + +Agents that want a broad repo snapshot call `pack_codebase`; agents +that want structural answers still call `query / context / impact`. + +## Consequences + +### Positive + +- Net –12k LOC across the feature branch (lsp-oracle delete + + 4 phases + Python spike + validate script, minus the + scip-ingest package, scip-index phase, scip-factory, and ADR). +- Indexing is no longer stateful. One indexer invocation per + language, one `.scip` file on disk, pure-function decode. +- Smaller node_modules (pyright + typescript-language-server go away). +- Cross-repo references become tractable the moment we wire up a + SCIP-merge step: symbol strings are already globally unique across + indexers. + +### Negative / follow-ups + +- The gym's legacy corpus YAMLs (`packages/gym/corpus/**/*.yaml`) and + baselines (`packages/gym/baselines/*`) were captured against the + old LSP clients. They still load and still drive the SCIP replay + harness, but their per-case expected result sets need regeneration + against scip-typescript / scip-python / scip-go / rust-analyzer / + scip-java. That regeneration is a follow-up; failures today are + content-level, not structural. +- `.github/workflows/gym.yml` still caches gopls / pyright / etc. + Retargeting the matrix to cache scip-* binaries is a follow-up. +- ADR 0003 (gopls ↔ Go pin ADR) is now obsolete and should be + superseded by an ADR pinning scip-* indexer versions. + +### Neutral + +- Tree-sitter grammars stay. 
Parse / structure / accesses / cross-file / + mro / complexity / markdown all continue to drive off tree-sitter + — that's the heuristic tier SCIP upgrades. + +## References + +- POC: `/tmp/scip-poc/scip-graph-poc` (SCIP → DuckDB → NetworkX). +- EARS spec: `.erpaval/specs/001-scip-replaces-lsp/spec.md` +- Research: `.erpaval/sessions/session-f8a300bc/research-scip-indexers.yaml`, + `research-repomix-ast.yaml` +- Supersedes parts of ADR 0003 (CI pins for LSP servers). diff --git a/docs/adr/0006-scip-indexer-pins.md b/docs/adr/0006-scip-indexer-pins.md new file mode 100644 index 00000000..56354fae --- /dev/null +++ b/docs/adr/0006-scip-indexer-pins.md @@ -0,0 +1,107 @@ +# ADR 0006 — SCIP indexer CI pins + +- Status: accepted +- Date: 2026-04-27 +- Authors: @theagenticguy + Claude +- Branch: `feat/scip-replaces-lsp` + +## Context + +After ADR 0005 migrated the code-graph oracle from LSP language servers +to SCIP indexers, the CI pin table in ADR 0003 (gopls ↔ Go) is no +longer load-bearing. The gym workflow (`.github/workflows/gym.yml`) +now installs one SCIP indexer per language and runs it against a +per-language fixture corpus. Each indexer has its own version cadence +and toolchain requirements. + +This ADR pins the current versions, documents why each one, and records +the bump procedure so the next bump is a one-PR change instead of a +multi-day scavenger hunt. 
+ +## Decision — pin table (2026-04-27) + +| Language | Indexer | Version tag | Install channel | +|------------|--------------------|--------------------------|-----------------------------------------------------------| +| TypeScript | scip-typescript | 0.4.0 | `npm install -g @sourcegraph/scip-typescript@<version>` | +| Python | scip-python | 0.6.6 | `npm install -g @sourcegraph/scip-python@<version>` | +| Go | scip-go | v0.2.3 | `go install github.com/scip-code/scip-go/cmd/scip-go@<version>` | +| Rust | rust-analyzer | stable component | `rustup component add rust-analyzer` | +| Java | scip-java | 0.12.3 | `coursier install scip-java` (future: installed on demand) | + +The versions are mirrored in `.github/workflows/gym.yml` env block and +in `packages/gym/baselines/performance.json` so the regression harness +has a single source of truth. + +### Why scip-go resolves to the scip-code fork + +The Go module name migrated mid-2025 from +`github.com/sourcegraph/scip-go` to `github.com/scip-code/scip-go`; +the go.mod at upstream declares the new path. `go install +github.com/sourcegraph/...` fails with a module-path mismatch even +though the GitHub repo still resolves. We install from the canonical +path (`github.com/scip-code/scip-go/cmd/scip-go`). Noted so the next +contributor does not spend an afternoon on the error. + +### rust-analyzer is rustup-sourced, not pinned by tag + +`rust-analyzer scip` is a built-in subcommand shipped with the +rust-analyzer component, which bumps every Monday off of master. Since +rust-analyzer's SCIP output shape is stable across these weekly bumps +(it has been unchanged since 2024), we track `rustup component add +rust-analyzer` against the stable toolchain. If a regression lands, we +can pin to an explicit `release-YYYY-MM-DD` tag via +`dtolnay/rust-toolchain@1.XX` + manual component install. + +### scip-java is optional in CI + +Java coverage is a stretch goal for OpenCodeHub's first SCIP release. 
+The gym workflow does not install `scip-java` today (no corpus), but +the runner and CI pin table are pre-wired so a follow-up PR only needs +to drop a `java/.yaml` corpus + a single install step. + +## Bump procedure + +1. **Update env block** in `.github/workflows/gym.yml`: + - `SCIP_TYPESCRIPT_VERSION`, `SCIP_PYTHON_VERSION`, `SCIP_GO_VERSION`, + or `SCIP_JAVA_VERSION`. +2. **Regenerate the baseline manifest**: + ```bash + # Install the new indexer locally at the target version. + # Then: + mise run gym:baseline + # (wraps `node packages/gym/dist/cli.js baseline --corpus ...`) + ``` +3. **Refresh the corpus `expected:` sets** by replaying the new + manifest into each YAML. A utility Python/TS script that walks the + new `manifest.jsonl` and rewrites each case's `expected` list lives + in `packages/gym/baselines/scripts/refresh-expected.py`. +4. **Update `performance.json`** toolchain section. +5. **Run `pnpm run check && node packages/gym/dist/cli.js run ...`** + locally against each fixture to make sure the F1 gates pass. +6. **Open the PR**. The gym matrix in CI will re-validate every cell + against the refreshed baseline. + +## Consequences + +### Positive + +- The gym workflow is self-documenting: the env block names every + indexer + version at the top of `gym.yml`. +- `ADR 0003`'s gopls pin matrix is obsolete; a single env variable + replaces the Go-toolchain ↔ gopls compatibility table. +- Java coverage is a one-PR addition when the fixture lands. + +### Negative + +- rust-analyzer's unpinned channel means a rustup bump can introduce a + silent baseline drift. Mitigation: the gym's `f1DeltaTolerance` + gate catches the drift in the PR that bumps the channel. If we see + repeated drift we'll explicitly pin. +- scip-java currently needs Coursier on the runner image; GitHub's + `ubuntu-latest` has it, but if we ever move to a minimal image we + need to install Coursier first. Not a blocker today. 
+ +## Related + +- ADR 0005: SCIP replaces LSP (migration rationale) +- ADR 0003: (obsoleted by this ADR; kept for history) diff --git a/mise.toml b/mise.toml index c92d3241..e15b6b82 100644 --- a/mise.toml +++ b/mise.toml @@ -107,11 +107,6 @@ description = "Run all package tests" depends = ["build"] run = "pnpm -r test" -[tasks."test:integration"] -description = "Run LSP oracle integration tests" -depends = ["build"] -run = "pnpm --filter @opencodehub/lsp-oracle run test:integration" - [tasks."test:eval"] description = "Run the Python parity/regression eval harness (pytest)" depends = ["install:eval", "build"] @@ -173,11 +168,6 @@ description = "Smoke test the MCP server over stdio" depends = ["build"] run = "bash scripts/smoke-mcp.sh" -[tasks."validate:lsp-oracle"] -description = "Run the standalone LSP oracle validator script" -depends = ["build"] -run = "pnpm exec tsx scripts/validate-lsp-oracle.ts" - [tasks.commit] description = "Commitizen-guided conventional commit" run = "pnpm exec cz" @@ -187,11 +177,11 @@ description = "Print tooling versions (for bug reports)" run = "pnpm exec envinfo --system --binaries --npmPackages --markdown" # --------------------------------------------------------------------------- -# Gym (differential LSP oracle) +# Gym (differential SCIP indexer) # --------------------------------------------------------------------------- [tasks.gym] -description = "Run differential LSP oracle gym against current baseline" +description = "Run the SCIP-indexer gym against the current baseline" depends = ["build"] run = "node packages/gym/dist/cli.js run" @@ -201,10 +191,14 @@ depends = ["build"] run = "node packages/gym/dist/cli.js baseline" [tasks."gym:replay"] -description = "Bit-exact replay of a frozen manifest (no LSP spawn)" +description = "Bit-exact replay of a frozen manifest by re-invoking the pinned SCIP indexer" depends = ["build"] run = "node packages/gym/dist/cli.js replay" +[tasks."gym:refresh-expected"] +description = "Refresh 
corpus `expected:` lists from the current manifest.jsonl" +run = "uv run packages/gym/baselines/scripts/refresh-expected.py packages/gym/baselines/manifest.jsonl" + # --------------------------------------------------------------------------- # Codehub CLI convenience passthroughs # --------------------------------------------------------------------------- diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index e10ee60e..57eb1682 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -77,7 +77,7 @@ export interface AnalyzeOptions { /** * Upper bound on Bedrock calls per run. Accepts either a non-negative * integer or the literal string `"auto"`. Default `"auto"` resolves to - * `min(floor(lspConfirmedCallableCount × 0.1), 500)` at run time, using + * `min(floor(scipConfirmedCallableCount × 0.1), 500)` at run time, using * a prior-run heuristic seeded from `store_meta.stats["embeddingsCount"]` * when available and falling back to 50 on first run. Any positive * integer caps the batch size at that value; `0` runs the phase in @@ -180,7 +180,7 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi : undefined; // Resolve `--max-summaries auto` against the prior run's callable count, - // if any. `auto` bounds the cap at 10% of the LSP-confirmed callable + // if any. `auto` bounds the cap at 10% of the SCIP-confirmed callable // symbols (capped at 500); on a cold first run the prior meta is absent // and we fall back to a conservative 50. `0` and positive integers pass // through unchanged. Unknown inputs (string without the "auto" literal) @@ -466,13 +466,13 @@ export function resolveSummariesEnabled( * numeric budget the pipeline can consume. * * Pre-run heuristic (P04): `auto` bounds the cap at - * `min(floor(lspConfirmedCallableCount × 0.1), 500)`. We cannot cheaply + * `min(floor(scipConfirmedCallableCount × 0.1), 500)`. 
We cannot cheaply * compute that before the pipeline runs (LSP phases haven't yielded * yet), so we use the prior run's stored counts when available: * * - If a DuckDB store is readable at the expected path, count nodes * whose kind is Function/Method/Class. That count is the best proxy - * for "LSP-confirmed callables" we can get before the parse phase. + * for "SCIP-confirmed callables" we can get before the parse phase. * - If no prior store exists (fresh clone, first analyze), fall back * to a conservative first-run cap of 50. The next invocation has * the prior counts and can resolve `auto` accurately. diff --git a/packages/cli/src/commands/pack.ts b/packages/cli/src/commands/pack.ts new file mode 100644 index 00000000..816db5d8 --- /dev/null +++ b/packages/cli/src/commands/pack.ts @@ -0,0 +1,104 @@ +/** + * `codehub pack [path]` — produce a single-file repo snapshot suitable + * for dropping into an LLM context window, via the `repomix` CLI. + * + * Repomix is invoked with `--style xml --compress` by default so the + * output is Anthropic-friendly and tree-sitter-signature-compressed. The + * command is an OUTPUT-side convenience; OpenCodeHub does NOT use + * repomix for indexing or embedding (see ADR 0004). + * + * This is a thin wrapper — we shell to `npx repomix@` so operators + * can override by running repomix directly. The wrapper exists to make + * the output path discoverable and to put the produced file under + * `.codehub/pack/` so it's ignored by the standard gitignore pattern. + */ + +import { spawn } from "node:child_process"; +import { existsSync, statSync } from "node:fs"; +import { mkdir } from "node:fs/promises"; +import { dirname, join, resolve } from "node:path"; + +export interface PackOptions { + /** Output style: xml (default, Anthropic-friendly), markdown, json, or plain. */ + readonly style?: "xml" | "markdown" | "json" | "plain"; + /** When true (default) apply `--compress` for tree-sitter signature compression. 
*/ + readonly compress?: boolean; + /** When true, strip comments in the packed output. */ + readonly removeComments?: boolean; + /** Custom output file path. Defaults to `/.codehub/pack/repo.`. */ + readonly outputPath?: string; + /** Pin for `npx repomix@`. Defaults to the latest verified-compatible version. */ + readonly repomixVersion?: string; + /** Timeout in ms. Defaults to 5 minutes. */ + readonly timeoutMs?: number; +} + +const DEFAULT_REPOMIX_VERSION = "1.14.0"; + +export interface PackResult { + readonly outputPath: string; + readonly bytes: number; + readonly durationMs: number; +} + +export async function runPack(path: string, opts: PackOptions = {}): Promise { + const start = Date.now(); + const repoPath = resolve(path); + const style = opts.style ?? "xml"; + const compress = opts.compress ?? true; + const version = opts.repomixVersion ?? DEFAULT_REPOMIX_VERSION; + const outputPath = opts.outputPath + ? resolve(opts.outputPath) + : join(repoPath, ".codehub", "pack", `repo.${extForStyle(style)}`); + + await mkdir(dirname(outputPath), { recursive: true }); + + const args = [`repomix@${version}`, "--style", style, "--output", outputPath]; + if (compress) args.push("--compress"); + if (opts.removeComments) args.push("--remove-comments"); + + await new Promise((res, rej) => { + const child = spawn("npx", args, { + cwd: repoPath, + env: { ...process.env }, + stdio: ["ignore", "inherit", "inherit"], + }); + const timer = opts.timeoutMs + ? setTimeout(() => { + child.kill("SIGTERM"); + }, opts.timeoutMs) + : undefined; + child.on("error", (err: NodeJS.ErrnoException) => { + if (timer) clearTimeout(timer); + if (err.code === "ENOENT") { + rej(new Error("codehub pack: `npx` not found on PATH. 
Install Node.js 20+.")); + } else { + rej(err); + } + }); + child.on("exit", (code) => { + if (timer) clearTimeout(timer); + if (code === 0) res(); + else rej(new Error(`codehub pack: repomix exited ${code}`)); + }); + }); + + if (!existsSync(outputPath)) { + throw new Error(`codehub pack: repomix did not produce ${outputPath}`); + } + const bytes = statSync(outputPath).size; + return { outputPath, bytes, durationMs: Date.now() - start }; +} + +function extForStyle(style: "xml" | "markdown" | "json" | "plain"): string { + switch (style) { + case "xml": + return "xml"; + case "markdown": + return "md"; + case "json": + return "json"; + case "plain": + return "txt"; + } +} diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index dd3ec9a9..39a685f1 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -43,7 +43,7 @@ program ) .option( "--max-summaries ", - 'Cap on Bedrock summarize calls per run. "auto" (default) scales the cap to 10% of the LSP-confirmed callable count (max 500).', + 'Cap on Bedrock summarize calls per run. "auto" (default) scales the cap to 10% of the SCIP-confirmed callable count (max 500).', "auto", ) .option( @@ -196,6 +196,27 @@ program await mod.runClean(path ?? process.cwd(), { all: opts["all"] === true }); }); +program + .command("pack [path]") + .description("Produce a single-file LLM-ready snapshot of the repo via repomix (AST-compressed).") + .option("--style