diff --git a/docs/mcp-tools.md b/docs/mcp-tools.md index 239bbdf9..696acf6d 100644 --- a/docs/mcp-tools.md +++ b/docs/mcp-tools.md @@ -49,13 +49,27 @@ Add `--watch-auto` to your MCP config args: } ``` -The watcher starts automatically on the first tool call — no hardcoded path needed. It re-extracts signatures for any changed source file and patches `llm-context.json` within ~500 ms of a save. If an embedding server is reachable, it also re-embeds changed functions into the vector index automatically. The call graph is not rebuilt on every change; it stays current via the [post-commit hook](#cicd-integration) (`openlore analyze --force`). +The watcher is **on by default** — it starts automatically on the first tool call +(no hardcoded path needed) and keeps the analysis fresh as you edit. To disable it, +start the server with `openlore mcp --no-watch-auto`. + +Freshness is **O(change), not O(repo)** (Spec 13.1): per-file save events are coalesced +into a single batched flush, the patched signatures are handed directly to the MCP read +cache (so the next tool call is a cache hit, not a cold re-parse of `llm-context.json`), +and the vector index is updated with row-level ops rather than a full-table rewrite. +A bulk event (branch switch / rebase / formatter) collapses to a single refresh. On large +repos (> 5000 source files) live embedding auto-degrades to signatures-only (logged once); +embeddings then refresh at commit. Set `OPENLORE_WATCH_DEBUG=1` for per-file stderr detail +(default is one summary line per batch). The call graph is not rebuilt on every change; it +stays current via the [post-commit hook](#cicd-integration) (`openlore analyze --force`). | Option | Default | Description | |---|---|---| -| `--watch-auto` | off | Auto-detect project root from first tool call | +| `--watch-auto` | **on** | Auto-detect project root from first tool call | +| `--no-watch-auto` | — | Disable the auto-watcher (one-shot tool calls) | | `--watch ` | — | Watch a fixed directory (alternative to `--watch-auto`) | -| `--watch-debounce ` | 400 | Delay before re-indexing after a file change | +| `--watch-debounce ` | 400 | Idle delay before a coalesced flush after a change | +| `--watch-no-embed` | off | Signatures-only: skip live re-embedding (refresh at commit) | ### Cline / Roo Code / Kilocode diff --git a/docs/specs/openlore-spec-13-context-substrate.md b/docs/specs/openlore-spec-13-context-substrate.md index 4bafde14..780d91f0 100644 --- a/docs/specs/openlore-spec-13-context-substrate.md +++ b/docs/specs/openlore-spec-13-context-substrate.md @@ -26,7 +26,8 @@ Branch: `openlore-spec-13-context-substrate`. Direction locked; claims verified; - [x] Competitive + market reality verified against primary sources (2026-05-30) - [x] Repo ground-truth established (what actually ships vs. what was claimed) - [x] Theses adversarially stress-tested; positioning corrected to survive the strongest attack -- [ ] **Spec 14** — Agent Token-Efficiency Benchmark Harness (WITH vs WITHOUT). *Do this first.* → [openlore-spec-14-agent-benchmark-harness.md](openlore-spec-14-agent-benchmark-harness.md) +- [ ] **Spec 13.1** — Make Incremental Freshness Cheap (Watch-Mode Performance). *Urgent regression fix — do this before 14; the watcher that backs this spec's "always-fresh" promise currently taxes every dogfooding session.* → [openlore-spec-13.1-watch-mode-performance.md](openlore-spec-13.1-watch-mode-performance.md) +- [ ] **Spec 14** — Agent Token-Efficiency Benchmark Harness (WITH vs WITHOUT). *Do this first (after 13.1).* → [openlore-spec-14-agent-benchmark-harness.md](openlore-spec-14-agent-benchmark-harness.md) - [ ] **Spec 15** — Decision & Drift Governance Dogfooding (turn the gate on in our own repo). → [openlore-spec-15-governance-dogfooding.md](openlore-spec-15-governance-dogfooding.md) - [ ] **Spec 16** — Architectural Decisions as First-Class Graph Nodes (`affects` edges). → [openlore-spec-16-decisions-as-graph-nodes.md](openlore-spec-16-decisions-as-graph-nodes.md) - [ ] **Spec 17** — Cross-Domain Impact Analysis (Code ↔ Infrastructure). → [openlore-spec-17-cross-domain-impact.md](openlore-spec-17-cross-domain-impact.md) diff --git a/docs/specs/openlore-spec-13.1-watch-mode-performance.md b/docs/specs/openlore-spec-13.1-watch-mode-performance.md new file mode 100644 index 00000000..68d8feac --- /dev/null +++ b/docs/specs/openlore-spec-13.1-watch-mode-performance.md @@ -0,0 +1,336 @@ +# OpenLore Spec 13.1 — Make Incremental Freshness Cheap (Watch-Mode Performance) + +> **Type:** urgent regression fix, not a feature. This is a child of [Spec 13](openlore-spec-13-context-substrate.md): +> Spec 13 promises OpenLore "stays fresh **incrementally** so it never carries the staleness tax +> Cherny rejected." The watcher is that mechanism — and today it imposes *its own* tax. This spec +> makes incremental freshness actually O(change), so `--watch-auto` can stay **on by default** +> (owner's decision, 2026-05-31) without degrading the agent session that depends on it. +> +> **Do this before Spec 14.** It is a live dogfooding-blocker (see "Why this jumps the queue"). + +--- + +## Progress + +Branch: `openlore-spec-13.1-watch-mode-performance` (proposed). Root cause confirmed against the code. + +- [x] Symptom reproduced from the field: multiple Claude Code sessions across multiple dogfooded + repos report *"severe, batched result-delivery latency — commands ran correctly on disk and + on the remote, but their output came back in large delayed drains."* Only began once the + `openlore` MCP server was registered in those repos. +- [x] Root cause traced to the watch-mode re-index pipeline (this document, "Root cause"). +- [x] **Step 1** — Coalesce per-file events into a single batched flush (one write per burst). + `McpWatcher` now uses one `pending` Set + a single debounce timer + a `maxBatchMs` + ceiling; `handleChange(path)` delegates to `handleBatch([path])`. +- [x] **Step 2** — Make the `llm-context.json` update cheap and stop busting the read cache. + Implemented 2a: `primeContextCache` (new export in `mcp-handlers/utils.ts`) hands the + patched context to the read cache so the next tool call is a HIT (0 ms vs ~19 ms cold), + and the ~2.7 MB disk write is write-behind (deferred + coalesced via `maxBatchMs`). +- [x] **Step 3** — Make the vector-index update a real incremental row op (no full-table rewrite). + New `VectorIndex.updateFiles()` does `delete("filePath" IN …) + add(rows)` for the changed + files only and patches the BM25 corpus cache in place; the cold `build()` path is untouched. +- [x] **Step 4** — Decouple embedding freshness from signature freshness (signatures land instantly). + Signatures persist synchronously first; the vector update runs on a separate lower-priority + embed lane. Added `--watch-no-embed` + auto-degrade above `WATCH_EMBED_FILE_CEILING` (5000). +- [x] **Step 5** — Backpressure + VCS-flood detection (branch switch ⟹ one refresh, not N). + A `.git` ref watcher (HEAD/index/MERGE_HEAD/ORIG_HEAD) + a `WATCH_BULK_THRESHOLD` (25) + batch-size trip collapse a bulk event into one settled refresh; single-flight never interleaves. +- [x] **Step 6** — stderr discipline (one summary line per batch; verbose behind a debug flag). + Default is one `[mcp-watcher] updated/coalesced N … (Mms)` line per batch; per-file/per-embed + detail is behind `OPENLORE_WATCH_DEBUG`. +- [x] **Step 7** — Reconcile docs/install text with the on-by-default reality. + `docs/mcp-tools.md`, `README.md`, `src/cli/install/index.ts`, and the orient skill wrapper + now state watch is on by default, cheap/batched, and how to disable / run signatures-only. +- [x] **Step 8** — Watch-mode microbenchmark + regression tests. + `scripts/bench-watch.ts` (+ `npm run bench:watch`, recorded in `scripts/BENCHMARKS.md`) plus + `mcp-watcher-incremental.test.ts` and `vector-index-updatefiles.test.ts`. + +**Measured (`npm run bench:watch`, synthetic 4.03 MB context, signatures-only):** single-save +flush **4.5 ms**; next-call read after save **0.02 ms** (in-memory cache HIT) vs **4.4 ms** cold +parse (≈256× on this fixture, widening with context size); 50-file burst → **1** flush (was 50 +full pipelines), coalesced flush **8 ms**. The decisive wins are the eliminated forced re-parse +(G1) and the single-flush coalescing (G2). Satisfies G1, G2, G3, G4, G5, G6; G7 protected (cold +`build()`/`analyze` paths untouched; full unit + relevant integration suites green). + +> **Not** addressed by PR #83 (Panic Response Layer). PR #83 touches `mcp.ts` and `vector-index.ts` +> for panic/gryph concerns only; it does not change the `mcp-watcher` re-index pipeline, +> `handleChange`, or `--watch-auto`. The two are independent — both edit `mcp.ts` but in different +> regions, so expect a small merge but no logical conflict. **PR #83 does not fix this.** + +--- + +## Symptom (from the field) + +Reported verbatim by multiple agent sessions, across multiple repos, only after dogfooding began: + +> "The tool-execution environment in this session had severe, batched result-delivery latency — +> commands ran correctly on disk and on the remote, but their output came back to me in large +> delayed drains, which is why this took many round-trips. Everything is confirmed landed." + +The tells: tool *execution* is fine (writes land), but tool *result delivery* arrives batched and +late. That is the signature of (a) a background process contending for CPU/IO with the agent's +session, and/or (b) a flood of child-process stderr the client must drain — not of failing commands. + +## What is shared across every affected repo + +Exactly one thing changed when dogfooding started: each repo's `.claude/settings.json` now registers + +```json +{ "mcpServers": { "openlore": { "command": "npx", "args": ["--yes", "openlore", "mcp"] } } } +``` + +A long-running `openlore mcp` stdio server, started with **no flags**. No git hooks are installed in +the affected repos (verified in `enklayve/.git/hooks` — empty), so the decisions/commit gate is **not** +the cause. The cause is in the MCP server's default behavior. + +## Root cause (grounded against the code) + +**`--watch-auto` defaults to `true`** — [src/cli/commands/mcp.ts:1610](../../src/cli/commands/mcp.ts#L1610): + +```ts +.option('--watch-auto', 'Auto-detect the project directory from the first tool call and start watching', true) +``` + +So plain `openlore mcp` silently arms a recursive `chokidar` watcher on the **first** tool call that +carries a `directory` ([mcp.ts:1347-1362](../../src/cli/commands/mcp.ts#L1347-L1362)). From then on, +**every file the agent edits** fires `McpWatcher.handleChange` +([mcp-watcher.ts:191-273](../../src/core/services/mcp-watcher.ts#L191-L273)). The directory pruning +itself is well guarded (node_modules/dist/target/etc. are excluded — the EMFILE fix at +[mcp-watcher.ts:60-100](../../src/core/services/mcp-watcher.ts#L60-L100)); the defect is the **per-save +cost**, which is O(repo), not O(change). On a real dogfood target (`enklayve`: 2.1 MB `call-graph.db`, +**2.1 MB `llm-context.json`**, a LanceDB `vector-index/`), a single save does all of: + +1. **Full `llm-context.json` rewrite.** `handleChange` reads → `JSON.parse` → patches one signature + entry → writes the **entire** file back ([mcp-watcher.ts:247-267](../../src/core/services/mcp-watcher.ts#L247-L267)). + That is a 2.1 MB parse + 2.1 MB write **per save**, regardless of edit size. + +2. **A forced 2.1 MB re-parse on the next tool call.** `readCachedContext` caches the parsed context + keyed on file **mtime** ([utils.ts:124-146](../../src/core/services/mcp-handlers/utils.ts#L124-L146)). + The rewrite in (1) bumps mtime, so the next MCP query (which `orient`, `analyze_impact`, + `get_subgraph`, `search_code`, etc. all depend on) must re-read and re-parse the whole 2.1 MB file + cold. The watcher's write therefore taxes the read path too. + +3. **A full vector-index read + overwrite.** `reEmbed` ([mcp-watcher.ts:269-319](../../src/core/services/mcp-watcher.ts#L269-L319)) + calls `VectorIndex.build(..., incremental=true)`. But the "incremental" path still + `openTable()` → `table.query().toArray()` — reads the **entire** corpus into memory + ([vector-index.ts:413-415](../../src/core/analyzer/vector-index.ts#L413-L415)) — then + `createTable(TABLE_NAME, ..., { mode: 'overwrite' })` — rewrites the **whole** table + ([vector-index.ts:472](../../src/core/analyzer/vector-index.ts#L472)). `incremental` only avoids + *re-embedding* unchanged functions; the storage read and rewrite are full-corpus every time. The + BM25-only path is the same shape (overwrite + corpus-cache bust). + +4. **A stderr line per change** (and another per embed) — [mcp-watcher.ts:238-239, 267, 311-315](../../src/core/services/mcp-watcher.ts#L238-L239). + +5. **No coalescing across files.** The debounce is **per-file** (a `setTimeout` per path, + [mcp-watcher.ts:165-183](../../src/core/services/mcp-watcher.ts#L165-L183)); the `running` flag + serializes but *reschedules* superseded work rather than dropping it. A bulk file event — `git + checkout`/`rebase`/`pull`, a formatter, a project-wide find-replace — touching N source files + therefore runs the full O(repo) pipeline **N times back-to-back**. A 50-file branch switch = + 50 full `llm-context.json` rewrites + 50 full vector-index overwrites, serialized. + +**Net:** the freshness mechanism that Spec 13 sells as cheap is, in the field, an O(repo) re-index + +re-embed pipeline that fires on every keystroke-save and storms on every VCS operation — saturating +CPU/IO and flooding stderr in the MCP child process while the agent is trying to work. That is the +"batched result-delivery latency." The call-graph subset rebuild +([mcp-watcher.ts:206-244](../../src/core/services/mcp-watcher.ts#L206-L244)) is correctly bounded +(changed file + ≤10 callers) and is **not** the problem — items 1–5 are. + +### Why this jumps the queue (ahead of Spec 14) + +Spec 13 says "run the benchmark before writing another line of *feature* code." This is not feature +code — it is a regression that degrades every dogfooding session, and dogfooding is how 14–23 get +validated. A Spec 14 token/latency benchmark run *through* the MCP server while this is live would +also be polluted by watcher contention. Fix the substrate's freshness tax first; then benchmark. + +--- + +## Goal & success criteria + +**Goal:** incremental freshness is O(change), not O(repo), and never storms — so `--watch-auto` +stays on by default and a watching session is indistinguishable from a non-watching one in latency. + +Verifiable criteria (see Step 8 for the harness): + +- **G1** — A single source-file save triggers **≤ 1** `llm-context` persistence and **≤ 1** vector + update, and does **not** force a full-file re-parse on the next tool call. +- **G2** — A burst of N saves within the debounce window coalesces to **1** flush, not N. +- **G3** — A VCS bulk event (≥ `BULK_THRESHOLD` files, or `.git/HEAD`/`.git/index` churn) produces + **at most one** deferred refresh, not one pipeline per file. +- **G4** — Per-save wall-clock and CPU on a 2 MB-context repo drop by **≥ 10×** vs. today + (measured; the benchmark sets the real number). +- **G5** — Watcher stderr emits **≤ 1** line per batch by default; per-file detail only with a debug flag. +- **G6** — `orient`/`search_code` still reflect a just-saved edit within the debounce window + (freshness preserved — this is the whole point of keeping watch on). +- **G7** — No regression in the cold `analyze`/`--watch` path or in MCP read latency + (`scripts/bench-mcp.ts`). + +--- + +## The fix — detailed steps + +> Design principle: **separate the two freshnesses.** *Signature/structure* freshness (what +> `orient`/`search_code` return as text) must land immediately and cheaply. *Embedding* freshness +> (semantic re-rank quality) may lag a few seconds and batch. Spec 13's thesis is the structural map; +> the vector layer is the optional semantic assist (Spec 06), so it can trail. + +### Step 1 — Coalesce per-file events into a single batched flush + +Replace the per-file timer map + reschedule loop ([mcp-watcher.ts:111, 165-183](../../src/core/services/mcp-watcher.ts#L165-L183)) +with **one** coalescing queue: + +- Maintain a `Set` of pending changed paths plus a single debounce timer. +- On each `change`, add the path and (re)arm one timer (`debounceMs`, default 400). Add a hard + **max-batch ceiling** so a continuous stream still flushes periodically + (`maxBatchMs`, e.g. 2000) — never starve. +- On flush, drain the whole Set and process it as **one batch**: one call-graph subset build over + all changed files, **one** `llm-context` persistence (Step 2), **one** vector update (Step 3). +- Keep single-flight: if a flush is running, accumulate into the next Set; do not interleave. + +`handleChange(path)` stays exported for unit tests but becomes `handleBatch(paths)` internally; the +single-file form delegates to a batch of one. + +### Step 2 — Make the `llm-context.json` update cheap and stop busting the read cache + +The 2.1 MB rewrite-per-save and the mtime-driven re-parse are the two biggest single-save costs. +Pick **2a** (smallest change, recommended) and add **2c**; consider **2b** as the durable form. + +- **2a — Write-behind + in-memory cache handoff (recommended first move).** Keep the patched context + in memory; flush to `llm-context.json` at most once per `flushIntervalMs` (e.g. 2000) or on idle, + not per save. Crucially, **update the read-path cache in place** so freshness does not require a + disk round-trip: expose a setter on `readCachedContext`'s `_contextCache` + ([utils.ts:115-146](../../src/core/services/mcp-handlers/utils.ts#L115-L146)) that the watcher calls + with the new in-memory context, so the next tool call is a cache **hit** (no 2.1 MB re-parse) even + before the disk flush. This satisfies G1, G2, G6 directly. +- **2b — Stop storing signatures in the monolith (durable form).** Signatures are the only thing the + watcher patches into `llm-context.json`. Move per-file signatures to an incrementally updatable + store — the `EdgeStore` SQLite already updated incrementally here is the natural home (one-row + upsert per file), or a per-file sidecar. Then a single-file change is an O(1) row write, and + `llm-context.json` is rebuilt only by `analyze`. Larger blast radius (read paths that consume + `context.signatures` must read the new store); schedule after 2a proves the model. +- **2c — Cache invalidation that survives partial writes.** If any path keeps rewriting + `llm-context.json`, make `readCachedContext` invalidation tolerate it: invalidate per-file rather + than busting the whole parsed object, or have the watcher push the updated object into the cache + (as in 2a) so an mtime bump never forces a cold full re-parse. + +### Step 3 — Make the vector-index update a real incremental row op + +Stop the full-table read+overwrite on the watch path. + +- Replace `query().toArray()` + `createTable(overwrite)` ([vector-index.ts:404-472](../../src/core/analyzer/vector-index.ts#L404-L472)) + with **row-level** ops for the changed functions only: LanceDB `delete(predicate)` for the changed + file's existing rows + `add(newRows)`, or `mergeInsert` keyed on function `id`. Add a dedicated + `VectorIndex.updateFiles(outputDir, changedNodes, …)` entry point for the watcher so the cold + `build()` path is untouched (protects G7 and the `analyze --embed` contract). +- For the **BM25-only** path (no embedder): update only the affected documents in the corpus and + surgically invalidate just those entries in `_bm25Cache` + ([vector-index.ts:191-202](../../src/core/analyzer/vector-index.ts#L191-L202)) instead of rebuilding + and dropping the whole corpus cache. +- Likewise invalidate only the changed rows in `_tableCache`, not the whole table handle. + +### Step 4 — Decouple embedding freshness from signature freshness + +- On flush, run Step 2 (signatures) **synchronously and first** so `orient`/`search_code` reflect the + edit immediately (G6). Schedule Step 3 (embedding/vector) as a **separate, lower-priority** task + that may batch across multiple flushes and run on idle. Never block a signature update on an embed. +- Add a `watchEmbed` switch (config + `--watch-no-embed`) so large repos can run **signatures-only** + live freshness and let embeddings refresh at commit (the post-commit `analyze --embed` the + watcher header already references, [mcp-watcher.ts:10-11](../../src/core/services/mcp-watcher.ts#L10-L11)). +- **Auto-degrade on big repos:** if the watched tree exceeds `WATCH_EMBED_FILE_CEILING` source files, + default to signatures-only live and log the decision once (no silent cap — state it, per Spec 13's + "no claim outruns the code" discipline). + +### Step 5 — Backpressure + VCS-flood detection + +- **VCS detection:** watch for `.git/HEAD`, `.git/index`, `.git/MERGE_HEAD`, `ORIG_HEAD` churn, or a + flush batch ≥ `BULK_THRESHOLD` files. On detection, **cancel** queued per-file work and schedule a + **single** coalesced refresh after the operation settles (a quiet period), rather than N pipelines. + A branch switch becomes one refresh (G3). +- **Backpressure:** if flush batches keep arriving faster than they drain (queue depth grows past a + bound), degrade to "mark stale + one batched refresh on the next idle window" and emit a single + `[mcp-watcher] coalesced N changes` line. Never let the queue grow unbounded. + +### Step 6 — stderr discipline + +- Default to **one summary line per batch** (`[mcp-watcher] updated N files (Mms)`); move the + per-file/per-embed lines ([mcp-watcher.ts:238-239, 267, 311-315](../../src/core/services/mcp-watcher.ts#L238-L239)) + behind `OPENLORE_WATCH_DEBUG`. This removes the stderr-flood contribution to the client's batched + result drain, independent of the CPU/IO win. + +### Step 7 — Reconcile the docs/install text with reality + +The current behavior contradicts the docs, which compounds the confusion: + +- [docs/mcp-tools.md:56](../../docs/mcp-tools.md#L56) lists `--watch-auto` default as **`off`** — it is + `true`. Fix the table to "on by default" and describe the new cheap batched behavior + the + `--watch-no-embed` / signatures-only auto-degrade. +- [src/cli/install/index.ts:235](../../src/cli/install/index.ts#L235) and [README.md:183](../../README.md#L183) + should state watch is on by default, why (live freshness), and how to disable + (`openlore mcp --no-watch-auto`) or run signatures-only. +- The orient skill's stdio fallback spawns `npx --yes openlore mcp` with no flags + ([skills/openlore-orient/scripts/orient-via-mcp.mjs:30](../../skills/openlore-orient/scripts/orient-via-mcp.mjs#L30)), + so it too arms the watcher; for one-shot orient it should pass `--no-watch-auto` (the option's own + help already claims the orient wrapper does this — make it true). + +### Step 8 — Watch-mode microbenchmark + regression tests + +- **Benchmark** (`scripts/bench-watch.ts`, sibling to `bench.ts`/`bench-mcp.ts`): on a fixture with a + ~2 MB context + populated vector index, measure (a) single-save flush latency + CPU, (b) a 50-file + bulk-change burst, asserting G1–G4. Record before/after in `BENCHMARKS.md`. +- **Tests** (extend `mcp-watcher.test.ts` / `.integration.test.ts`): + - N change events in one window ⟹ exactly 1 persistence + 1 vector update (G2). + - VCS-flood / ≥ BULK_THRESHOLD batch ⟹ exactly 1 deferred refresh (G3). + - A save updates the in-memory read cache: the next `readCachedContext` is a **hit**, no full + re-parse (G1). + - `VectorIndex.updateFiles` changes only the target file's rows; corpus rows for other files are + byte-identical (Step 3). + - Signatures reflect a just-saved symbol within the debounce window even when the embedder is + absent/slow (G4/G6, signatures-only path). + - stderr emits ≤ 1 line per batch unless `OPENLORE_WATCH_DEBUG` (G5). + +--- + +## Tunables (new) — single source of truth + +Add to constants and surface in `.openlore/config.json` (and `--watch-*` flags). Defaults chosen to +keep watch **on** and cheap: + +| Knob | Default | Purpose | +|---|---|---| +| `watchDebounceMs` | 400 | idle quiet period before a flush (existing) | +| `watchMaxBatchMs` | 2000 | hard flush ceiling under a continuous stream | +| `watchBulkThreshold` | 25 | batch size that trips VCS-flood handling | +| `watchEmbed` | `true` | run vector update live; `false` = signatures-only | +| `watchEmbedFileCeiling` | e.g. 5000 | above this, auto-degrade to signatures-only | +| `OPENLORE_WATCH_DEBUG` | unset | enable per-file/per-embed stderr lines | + +--- + +## Compatibility & scope guarantee (per Spec 13's prime constraint) + +This is **additive and behavior-preserving** for the frozen contract: + +- **`mcp` CLI surface preserved.** `--watch`/`--watch-auto`/`--watch-debounce` keep their meaning; + new flags (`--watch-no-embed`) are additive. Default stays on (owner's decision) — but now cheap. +- **Cold paths untouched.** `analyze` and `analyze --embed` build full `llm-context.json` and the full + vector index exactly as today; Step 3 adds a *new* `VectorIndex.updateFiles` reader beside the + existing `build()` rather than changing it (protects G7). +- **`orient()` response shape unchanged.** This is a latency/IO fix; no field is added, removed, or + retyped. +- **`llm-context.json` format unchanged** under 2a/2c. If 2b lands later, signatures move to a store + but the artifact stays valid (consumers migrate behind the existing readers); a `SCHEMA_VERSION` + bump rebuilds from source — one re-analyze, no migration (the Spec 13 safety property). +- **Freshness guarantee strengthened, not weakened.** The point of keeping watch on is preserved + (G6); we remove only the cost, not the freshness. + +--- + +## Relationship to existing specs + +- **Spec 13 (context substrate)** — direct parent. This is the "kept fresh incrementally so it never + carries the staleness tax" claim, made true in the field. Add a 13.1 line to Spec 13's Progress + list ahead of Spec 14. +- **Spec 06 (BM25 without embeddings)** — Step 4's signatures-only / `--watch-no-embed` mode is the + watch-time expression of the same "deterministic retrieval, network/embeddings optional" floor. +- **Spec 14 (benchmark harness)** — runs *after* this; a token/latency benchmark through the MCP + server is only trustworthy once the watcher no longer contends with the measured session. +- **PR #83 (Panic Response Layer)** — orthogonal; does not touch this pipeline (see Progress note). diff --git a/package.json b/package.json index 88da19eb..0ef8ae26 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "view": "tsx src/cli/index.ts view", "bench": "tsx scripts/bench.ts", "bench:mcp": "tsx scripts/bench-mcp.ts", + "bench:watch": "tsx scripts/bench-watch.ts", "test": "vitest", "test:run": "vitest run", "test:coverage": "vitest run --coverage", diff --git a/scripts/bench-watch.ts b/scripts/bench-watch.ts new file mode 100644 index 00000000..f5c2604e --- /dev/null +++ b/scripts/bench-watch.ts @@ -0,0 +1,155 @@ +/** + * bench-watch.ts — watch-mode (MCP incremental re-index) microbenchmark. + * + * Spec 13.1: freshness must be O(change), not O(repo). This measures the + * per-save and bulk-burst cost of the watcher pipeline on a fixture with a + * ~2 MB llm-context.json, and asserts the coalescing/cache guarantees: + * + * G1 — a single save triggers ≤ 1 llm-context persistence and the next read + * is a cache HIT (no cold full-file re-parse). + * G2 — a burst of N saves coalesces to ONE flush. + * G4 — per-save wall-clock stays small relative to the context size. + * + * Run: npm run bench:watch + * + * This is a manual benchmark (not part of CI). It builds its own throwaway + * fixture under the OS temp dir and cleans up afterwards. + */ +import { mkdtemp, mkdir, writeFile, readFile, rm, stat } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { performance } from 'node:perf_hooks'; +import { McpWatcher } from '../src/core/services/mcp-watcher.js'; +import { readCachedContext, _resetContextCacheForTesting } from '../src/core/services/mcp-handlers/utils.js'; + +const FILE_COUNT = 600; // synthetic source files in the context +const ENTRIES_PER_FILE = 20; // signatures per file +const SINGLE_SAVE_RUNS = 20; +const BURST_SIZE = 50; + +function median(xs: number[]): number { + const s = [...xs].sort((a, b) => a - b); + const m = Math.floor(s.length / 2); + return s.length % 2 ? s[m] : (s[m - 1] + s[m]) / 2; +} + +function synthSignatures(): Array<{ path: string; language: string; entries: Array<{ name: string; signature: string; docstring: string; line: number; kind: string }> }> { + const sigs = []; + for (let i = 0; i < FILE_COUNT; i++) { + const path = `src/module_${i}/file_${i}.ts`; + const entries = []; + for (let j = 0; j < ENTRIES_PER_FILE; j++) { + entries.push({ + name: `fn_${i}_${j}`, + signature: `export function fn_${i}_${j}(arg0: string, arg1: number, opts?: Record): Promise`, + docstring: `Function ${j} in module ${i}. Handles a representative unit of work for the benchmark fixture.`, + line: j * 7 + 1, + kind: 'function', + }); + } + sigs.push({ path, language: 'TypeScript', entries }); + } + return sigs; +} + +async function main(): Promise { + const root = await mkdtemp(join(tmpdir(), 'ol-benchwatch-')); + const analysisDir = join(root, '.openlore', 'analysis'); + await mkdir(analysisDir, { recursive: true }); + const contextPath = join(analysisDir, 'llm-context.json'); + + // Build a ~2 MB context. + const signatures = synthSignatures(); + await writeFile(contextPath, JSON.stringify({ signatures, callGraph: null }, null, 2), 'utf-8'); + const ctxBytes = (await stat(contextPath)).size; + + // Write the real source files so the watcher can read them on change. + for (let i = 0; i < FILE_COUNT; i++) { + const dir = join(root, 'src', `module_${i}`); + await mkdir(dir, { recursive: true }); + await writeFile(join(dir, `file_${i}.ts`), `export function fn_${i}_0() { return ${i}; }\n`, 'utf-8'); + } + + // embed:false → measure the signature/freshness pipeline (the per-save hot + // path the spec flagged: the 2 MB rewrite + the re-parse it used to force). + const watcher = new McpWatcher({ rootPath: root, embed: false }); + + // ── Single-save latency, including the simulated "next tool call" read ────── + const flushTimes: number[] = []; + const readTimes: number[] = []; + for (let r = 0; r < SINGLE_SAVE_RUNS; r++) { + const i = r % FILE_COUNT; + const f = join(root, 'src', `module_${i}`, `file_${i}.ts`); + await writeFile(f, `export function fn_${i}_0() { return ${i + r * 1000}; }\n`, 'utf-8'); + + const t0 = performance.now(); + await watcher.handleChange(f); + flushTimes.push(performance.now() - t0); + + // The next "tool call" read — must be a cache HIT (no 2 MB cold re-parse). + const t1 = performance.now(); + const ctx = await readCachedContext(root); + readTimes.push(performance.now() - t1); + if (!ctx) throw new Error('readCachedContext returned null after save'); + } + + // ── Cold read baseline (cache cleared → full 2 MB parse) for contrast ─────── + _resetContextCacheForTesting(); + const coldT0 = performance.now(); + await readCachedContext(root); + const coldRead = performance.now() - coldT0; + + // ── Bulk burst: BURST_SIZE files in one window must coalesce to ONE flush ─── + let summaries = 0; + const origWrite = process.stderr.write.bind(process.stderr); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (process.stderr as any).write = (chunk: any, ...rest: any[]): boolean => { + if (/\[mcp-watcher\] (updated|coalesced)/.test(String(chunk))) summaries++; + return origWrite(chunk, ...rest); + }; + const burst = new McpWatcher({ rootPath: root, embed: false, debounceMs: 50, maxBatchMs: 2000 }); + const burstFiles: string[] = []; + for (let i = 0; i < BURST_SIZE; i++) { + const f = join(root, 'src', `module_${i}`, `file_${i}.ts`); + await writeFile(f, `export function fn_${i}_0() { return ${i}*2; }\n`, 'utf-8'); + burstFiles.push(f); + } + const burstT0 = performance.now(); + for (const f of burstFiles) (burst as unknown as { enqueue(p: string): void }).enqueue(f); + // Wait for the single coalesced flush to complete. + await new Promise((res) => setTimeout(res, 400)); + const burstTime = performance.now() - burstT0; + (process.stderr as any).write = origWrite; + + const report = +`## Watch-mode benchmark (Spec 13.1) + +Fixture: ${FILE_COUNT} files × ${ENTRIES_PER_FILE} signatures, llm-context.json = ${(ctxBytes / 1_048_576).toFixed(2)} MB. + +| Metric | Result | +|--------|--------| +| Single-save flush (median of ${SINGLE_SAVE_RUNS}) | ${median(flushTimes).toFixed(1)} ms | +| Next-call read after save (median, cache HIT) | ${median(readTimes).toFixed(2)} ms | +| Cold read (cache cleared, full parse) | ${coldRead.toFixed(1)} ms | +| ${BURST_SIZE}-file burst → flushes | ${summaries} (expected 1) | +| ${BURST_SIZE}-file burst wall-clock | ${burstTime.toFixed(1)} ms | + +G1: next-call read is a cache hit — ${median(readTimes).toFixed(2)} ms vs ${coldRead.toFixed(1)} ms cold (${(coldRead / Math.max(median(readTimes), 0.001)).toFixed(0)}× faster). +G2: ${BURST_SIZE} saves coalesced to ${summaries} flush${summaries === 1 ? '' : 'es'}. +`; + + // eslint-disable-next-line no-console + console.log(report); + + // Assertions (fail loudly in CI-less manual runs). + if (summaries !== 1) throw new Error(`G2 violated: expected 1 coalesced flush, got ${summaries}`); + if (median(readTimes) >= coldRead) throw new Error('G1 violated: post-save read is not faster than a cold parse'); + + await rm(root, { recursive: true, force: true }); +} + +main().catch((err) => { + // eslint-disable-next-line no-console + console.error(err); + process.exit(1); +}); diff --git a/src/cli/commands/mcp.ts b/src/cli/commands/mcp.ts index 0840fab2..6e75ed45 100644 --- a/src/cli/commands/mcp.ts +++ b/src/cli/commands/mcp.ts @@ -1286,6 +1286,7 @@ interface McpServerOptions { watch?: string; watchAuto?: boolean; watchDebounce?: string; + watchNoEmbed?: boolean; minimal?: boolean; } @@ -1353,6 +1354,7 @@ async function startMcpServer(options: McpServerOptions = {}): Promise { autoWatcher = new McpWatcher({ rootPath: resolve(dir), debounceMs: isNaN(debounceMs) ? 400 : debounceMs, + embed: !options.watchNoEmbed, }); await autoWatcher.start(); const cleanup = () => autoWatcher!.stop().then(() => process.exit(0)); @@ -1592,6 +1594,7 @@ async function startMcpServer(options: McpServerOptions = {}): Promise { const watcher = new McpWatcher({ rootPath: resolve(options.watch), debounceMs: isNaN(debounceMs) ? 400 : debounceMs, + embed: !options.watchNoEmbed, }); await watcher.start(); const cleanup = () => watcher.stop().then(() => process.exit(0)); @@ -1610,5 +1613,6 @@ export const mcpCommand = new Command('mcp') .option('--watch-auto', 'Auto-detect the project directory from the first tool call and start watching', true) .option('--no-watch-auto', 'Disable auto-watch (use for one-shot tool calls, e.g. the orient skill wrapper)') .option('--watch-debounce ', 'Debounce delay in ms before re-indexing after a file change (default: 400)', '400') + .option('--watch-no-embed', 'Watch signatures only — skip live vector re-embedding (embeddings refresh at commit). Large repos auto-degrade to this.') .option('--minimal', 'Expose only core 5 tools (orient, search_code, record_decision, detect_changes, check_spec_drift). Pair with alwaysLoad: true in Claude Code for always-visible core tools.') .action((options: McpServerOptions) => startMcpServer(options)); diff --git a/src/constants.ts b/src/constants.ts index 7946bf76..c9355850 100644 --- a/src/constants.ts +++ b/src/constants.ts @@ -549,3 +549,36 @@ export const DECISIONS_CONSOLIDATION_MAX_TOKENS = 2_000; /** Max output tokens for verification LLM call */ export const DECISIONS_VERIFICATION_MAX_TOKENS = 1_500; + +// ============================================================================ +// WATCH MODE (MCP incremental re-index) — Spec 13.1 +// ============================================================================ +// Defaults chosen to keep --watch-auto on by default while making incremental +// freshness O(change), not O(repo). See docs/specs/openlore-spec-13.1-*. + +/** Idle quiet period (ms) before a coalesced flush after the last file change. */ +export const WATCH_DEBOUNCE_MS = 400; + +/** + * Hard ceiling (ms) that forces a flush even under a continuous change stream, + * so a steady drip of edits never starves the queue indefinitely. + */ +export const WATCH_MAX_BATCH_MS = 2000; + +/** + * Number of files in a single coalesced flush that trips VCS-flood handling + * (a branch switch / rebase / formatter touching many files at once). + */ +export const WATCH_BULK_THRESHOLD = 25; + +/** + * Above this many watched source files, live embedding auto-degrades to + * signatures-only; embeddings refresh at commit (post-commit analyze --embed). + */ +export const WATCH_EMBED_FILE_CEILING = 5000; + +/** + * Quiet period (ms) after a detected VCS bulk operation (.git/HEAD or index + * churn) before a single coalesced refresh runs, so the whole op settles first. + */ +export const WATCH_VCS_SETTLE_MS = 750; diff --git a/src/core/analyzer/vector-index-updatefiles.test.ts b/src/core/analyzer/vector-index-updatefiles.test.ts new file mode 100644 index 00000000..96f22358 --- /dev/null +++ b/src/core/analyzer/vector-index-updatefiles.test.ts @@ -0,0 +1,91 @@ +/** + * Spec 13.1 — VectorIndex.updateFiles row-level incremental update. + * + * Proves the watch path replaces ONLY the changed file's rows (delete + add) + * instead of the full-table read+overwrite build() performs: a sibling file's + * rows survive an update untouched, and the changed file's rows are replaced. + * Runs BM25-only (embedSvc = null) so it needs no embedding service. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { + VectorIndex, + _resetVectorIndexCachesForTesting, +} from './vector-index.js'; +import type { FunctionNode } from './call-graph.js'; + +function node(filePath: string, name: string): FunctionNode { + return { + id: `${filePath}::${name}`, + name, + filePath, + className: '', + language: 'TypeScript', + signature: `function ${name}()`, + docstring: '', + fanIn: 0, + fanOut: 0, + startIndex: 0, + endIndex: 0, + } as unknown as FunctionNode; +} + +let outputDir: string; + +beforeEach(async () => { + outputDir = await mkdtemp(join(tmpdir(), 'ol-vi-update-')); + _resetVectorIndexCachesForTesting(); +}); + +afterEach(async () => { + _resetVectorIndexCachesForTesting(); + await rm(outputDir, { recursive: true, force: true }); +}); + +async function names(query: string): Promise> { + const results = await VectorIndex.search(outputDir, query, null, { limit: 20 }); + return new Set(results.map((r) => r.record.name)); +} + +describe('VectorIndex.updateFiles — Spec 13.1 (BM25-only)', () => { + it('replaces only the changed file rows; sibling file rows survive', async () => { + // Build a BM25-only index with two files. + const initial = [node('alpha.ts', 'alphafn'), node('beta.ts', 'betafn')]; + const built = await VectorIndex.build( + outputDir, initial, [], new Set(), new Set(), null, undefined, false, + ); + expect(built.hasEmbeddings).toBe(false); + expect(built.total).toBe(2); + _resetVectorIndexCachesForTesting(); + + // Sanity: both functions are findable. + expect(await names('alphafn')).toContain('alphafn'); + expect(await names('betafn')).toContain('betafn'); + _resetVectorIndexCachesForTesting(); + + // Rename alpha.ts's function → updateFiles should drop the old row and add new. + const result = await VectorIndex.updateFiles( + outputDir, + [node('alpha.ts', 'gammafn')], + new Set(['alpha.ts']), + [], + new Set(), + new Set(), + null, + undefined, + ); + expect(result.hasEmbeddings).toBe(false); + _resetVectorIndexCachesForTesting(); + + // beta.ts is untouched (its row survived the row-level op). + expect(await names('betafn')).toContain('betafn'); + _resetVectorIndexCachesForTesting(); + // alpha.ts now has gammafn … + expect(await names('gammafn')).toContain('gammafn'); + _resetVectorIndexCachesForTesting(); + // … and the old alphafn row is gone (the delete predicate actually matched). + expect(await names('alphafn')).not.toContain('alphafn'); + }); +}); diff --git a/src/core/analyzer/vector-index.ts b/src/core/analyzer/vector-index.ts index 4d324c25..f4802103 100644 --- a/src/core/analyzer/vector-index.ts +++ b/src/core/analyzer/vector-index.ts @@ -204,6 +204,37 @@ export function _resetVectorIndexCachesForTesting(): void { _metaCache.clear(); } +/** + * Surgically patch the cached BM25 corpus for `dbPath` (Spec 13.1): drop the + * rows belonging to `changedFilePaths` and splice in `newRows`, then rebuild the + * in-memory corpus. No disk read — if nothing is cached yet this is a no-op and + * the next search builds the corpus fresh from the table. + */ +function patchBm25Cache(dbPath: string, changedFilePaths: Set, newRows: Record[]): void { + const entry = _bm25Cache.get(dbPath); + if (!entry) return; + const kept = entry.rows.filter((r) => !changedFilePaths.has(r.filePath as string)); + for (const r of newRows) kept.push(r); + const corpus = buildBm25Corpus(kept.map((r) => ({ id: r.id as string, text: r.text as string }))); + _bm25Cache.set(dbPath, { corpus, rowCount: kept.length, rows: kept }); +} + +/** + * Build a LanceDB `` `filePath` IN (...) `` predicate, SQL-escaping each path. + * + * The column identifier MUST be **backtick**-quoted, not double-quoted: LanceDB's + * datafusion filter parser treats a double-quoted token as a *string literal* + * (so `"filePath" = 'x'` compares the constant string 'filePath' to 'x' and is + * always false — a silent no-op delete), and a *bare* `filePath` is lowercased to + * `filepath`, which errors (no such column). Backticks are the only form that + * binds to the camelCase column. Verified empirically against @lancedb/lancedb. + */ +function filePathInPredicate(paths: Set): string | null { + if (paths.size === 0) return null; + const list = Array.from(paths).map((p) => `'${p.replace(/'/g, "''")}'`).join(', '); + return `\`filePath\` IN (${list})`; +} + // ============================================================================ // HELPERS // ============================================================================ @@ -492,6 +523,157 @@ export class VectorIndex { }; } + /** + * Watch-mode incremental update (Spec 13.1). Replace only the rows for the + * changed files with freshly-built records — a row-level delete+add instead of + * the full-corpus read+overwrite that build() performs. The cold build() path + * is untouched, protecting the `analyze --embed` contract (G7). + * + * - Embedded index: reuse existing vectors for rows whose embed-text is + * unchanged (queried for the changed files only, not the whole corpus), + * embed just the new/changed texts, then delete the changed files' old rows + * and add the rebuilt ones. The LanceDB table handle in _tableCache stays + * valid across row ops, so search() does not pay a reconnect. + * - BM25-only index: delete+add the changed files' documents and patch the + * cached BM25 corpus in place rather than dropping the whole corpus cache. + */ + static async updateFiles( + outputDir: string, + nodes: FunctionNode[], + changedFilePaths: Set, + signatures: FileSignatureMap[], + hubIds: Set, + entryPointIds: Set, + embedSvc: EmbeddingService | null | undefined, + fileContents?: Map, + ): Promise<{ embedded: number; reused: number; total: number; hasEmbeddings: boolean }> { + if (!VectorIndex.exists(outputDir)) { + return { embedded: 0, reused: 0, total: 0, hasEmbeddings: false }; + } + const dbPath = join(outputDir, DB_FOLDER); + const existingMeta = readMeta(outputDir); + const indexHasEmbeddings = existingMeta === null ? true : existingMeta.hasEmbeddings; + + // ── Build candidate records for the changed files' functions ────────────── + const sigIndex = buildSignatureIndex(signatures); + const nodeIds = new Set(nodes.map((n) => n.id)); + const candidates: Omit[] = nodes.map((node) => { + const cgDoc = node.docstring ?? ''; + const cgSig = node.signature ?? ''; + const { signature: regexSig, docstring: regexDoc } = findSignatureEntry(node, sigIndex); + const signature = cgSig || regexSig; + const docstring = cgDoc || regexDoc; + return { + id: node.id, + name: node.name, + filePath: node.filePath, + className: node.className ?? '', + language: node.language, + signature, + docstring, + fanIn: node.fanIn, + fanOut: node.fanOut, + isHub: hubIds.has(node.id), + isEntryPoint: entryPointIds.has(node.id), + text: buildText(node, signature, docstring, fileContents), + }; + }); + // Synthetic entries (constants / type aliases with no call-graph node) for + // the changed files only. + for (const fsm of signatures) { + if (!changedFilePaths.has(fsm.path)) continue; + for (const entry of fsm.entries) { + const syntheticId = `${fsm.path}::${entry.name}`; + if (nodeIds.has(syntheticId)) continue; + if (nodes.some((n) => n.filePath === fsm.path && n.name === entry.name)) continue; + const sig = entry.signature ?? ''; + const doc = entry.docstring ?? ''; + candidates.push({ + id: syntheticId, + name: entry.name, + filePath: fsm.path, + className: '', + language: fsm.language, + signature: sig, + docstring: doc, + fanIn: 0, + fanOut: 0, + isHub: false, + isEntryPoint: false, + text: `[${fsm.language}] ${fsm.path} ${entry.name}\n${sig}${doc ? '\n' + doc : ''}`, + }); + } + } + + const { connect } = await import('@lancedb/lancedb'); + const db = await connect(dbPath); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const table: any = await db.openTable(TABLE_NAME); + const predicate = filePathInPredicate(changedFilePaths); + + // ── BM25-only index ─────────────────────────────────────────────────────── + if (!embedSvc || !indexHasEmbeddings) { + if (predicate) await table.delete(predicate); + if (candidates.length > 0) { + await table.add(candidates as unknown as Record[]); + } + patchBm25Cache(dbPath, changedFilePaths, candidates as unknown as Record[]); + return { embedded: 0, reused: 0, total: candidates.length, hasEmbeddings: false }; + } + + // ── Embedded index: reuse unchanged vectors for the changed files only ──── + const cachedVectors = new Map(); // "id::text" → vector + if (predicate) { + try { + const existingRows = await table.query().where(predicate).toArray() as Record[]; + for (const row of existingRows) { + const id = row.id as string; + const text = row.text as string; + cachedVectors.set(`${id}::${text}`, Array.from(row.vector as ArrayLike)); + } + } catch { + // unreadable subset — embed everything fresh + } + } + + const toEmbed: typeof candidates = []; + const toEmbedIdx: number[] = []; + const cachedIdx: number[] = []; + for (let i = 0; i < candidates.length; i++) { + const key = `${candidates[i].id}::${candidates[i].text}`; + if (cachedVectors.has(key)) cachedIdx.push(i); + else { toEmbed.push(candidates[i]); toEmbedIdx.push(i); } + } + + let newVectors: number[][] = []; + if (toEmbed.length > 0) { + newVectors = await embedSvc.embed(toEmbed.map((r) => r.text)); + if (newVectors.length !== toEmbed.length) { + throw new Error(`Embedding count mismatch: expected ${toEmbed.length}, got ${newVectors.length}`); + } + } + + const fullRecords: FunctionRecord[] = new Array(candidates.length); + for (const idx of cachedIdx) { + const r = candidates[idx]; + fullRecords[idx] = { ...r, vector: cachedVectors.get(`${r.id}::${r.text}`)! }; + } + for (let i = 0; i < toEmbedIdx.length; i++) { + fullRecords[toEmbedIdx[i]] = { ...candidates[toEmbedIdx[i]], vector: newVectors[i] }; + } + + if (predicate) await table.delete(predicate); + if (fullRecords.length > 0) { + await table.add(fullRecords as unknown as Record[]); + } + + // Keep the table handle (_tableCache) — row ops don't invalidate it. Patch + // the BM25 corpus cache in place for the changed files. + patchBm25Cache(dbPath, changedFilePaths, fullRecords as unknown as Record[]); + + return { embedded: toEmbed.length, reused: cachedIdx.length, total: fullRecords.length, hasEmbeddings: true }; + } + /** * Hybrid search over the index: dense (ANN) + sparse (BM25) merged via RRF. * diff --git a/src/core/services/mcp-handlers/utils.ts b/src/core/services/mcp-handlers/utils.ts index 9a1059ff..c10b4c98 100644 --- a/src/core/services/mcp-handlers/utils.ts +++ b/src/core/services/mcp-handlers/utils.ts @@ -121,6 +121,38 @@ export function _resetContextCacheForTesting(): void { _contextCache.clear(); } +/** + * Watch-mode handoff (Spec 13.1). Push an updated context into the in-memory + * read cache so the next tool call is a cache HIT — no 2.1 MB disk re-parse — + * even though the watcher only patched a few signatures. Keyed identically to + * {@link readCachedContext} (resolved project directory). + * + * The cached `mtime` is set to the current on-disk `llm-context.json` mtime so + * the entry stays valid until the file genuinely changes on disk again: + * • watcher patches in memory but defers the disk write → disk mtime is + * unchanged → this entry matches → hit returns the patched context; + * • watcher writes the file then primes → disk mtime is the just-written one + * → this entry matches → hit, no cold re-parse of what we just wrote; + * • some other process (e.g. `openlore analyze`) rewrites the file → its mtime + * differs from this entry → next read MISSes and re-reads disk → correct. + */ +export async function primeContextCache(directory: string, ctx: CachedContext): Promise { + const analysisDir = join(directory, OPENLORE_DIR, OPENLORE_ANALYSIS_SUBDIR); + const filePath = join(analysisDir, ARTIFACT_LLM_CONTEXT); + let mtime: number; + try { + mtime = (await stat(filePath)).mtimeMs; + } catch { + return; // no artifact on disk yet — nothing to stay fresh against + } + const existing = _contextCache.get(directory); + // Preserve an already-open EdgeStore handle if the new ctx doesn't carry one. + if (existing?.ctx.edgeStore && !ctx.edgeStore) { + ctx.edgeStore = existing.ctx.edgeStore; + } + _contextCache.set(directory, { ctx, mtime }); +} + export async function readCachedContext(directory: string, timeout?: number): Promise { const analysisDir = join(directory, OPENLORE_DIR, OPENLORE_ANALYSIS_SUBDIR); const filePath = join(analysisDir, ARTIFACT_LLM_CONTEXT); diff --git a/src/core/services/mcp-watcher-incremental.test.ts b/src/core/services/mcp-watcher-incremental.test.ts new file mode 100644 index 00000000..7b6eb971 --- /dev/null +++ b/src/core/services/mcp-watcher-incremental.test.ts @@ -0,0 +1,166 @@ +/** + * Spec 13.1 — watch-mode performance regression tests. + * + * These cover the freshness/coalescing guarantees without needing a real + * chokidar watcher, an EdgeStore (call-graph.db), or a LanceDB vector index: + * • G1 — primeContextCache makes the next read a HIT (no cold re-parse of + * llm-context.json). + * • G2 — a burst of N events coalesces to exactly ONE flush / persistence. + * • G3 — a batch ≥ BULK_THRESHOLD is reported as a single coalesced refresh. + * • G5 — the watcher emits ≤ 1 summary line per batch by default. + * • G6 — signatures reflect a just-saved symbol after the flush, on disk. + */ +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtemp, mkdir, writeFile, readFile, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { McpWatcher } from './mcp-watcher.js'; +import { + readCachedContext, + primeContextCache, + _resetContextCacheForTesting, +} from './mcp-handlers/utils.js'; + +let root: string; +let analysisDir: string; +let contextPath: string; + +async function writeContext(signatures: unknown[] = []): Promise { + await writeFile(contextPath, JSON.stringify({ signatures, callGraph: null }, null, 2), 'utf-8'); +} + +beforeEach(async () => { + root = await mkdtemp(join(tmpdir(), 'ol-watch-')); + analysisDir = join(root, '.openlore', 'analysis'); + await mkdir(analysisDir, { recursive: true }); + contextPath = join(analysisDir, 'llm-context.json'); + _resetContextCacheForTesting(); +}); + +afterEach(async () => { + _resetContextCacheForTesting(); + vi.restoreAllMocks(); + await rm(root, { recursive: true, force: true }); +}); + +describe('McpWatcher — Spec 13.1 freshness', () => { + it('G6: a save patches the just-changed signature into llm-context.json on disk', async () => { + await writeContext([]); + await readCachedContext(root); // pre-warm + + const fooAbs = join(root, 'foo.ts'); + await writeFile(fooAbs, 'export function alpha() { return 1; }\n', 'utf-8'); + + const watcher = new McpWatcher({ rootPath: root, embed: false }); + await watcher.handleChange(fooAbs); + + const onDisk = JSON.parse(await readFile(contextPath, 'utf-8')) as { signatures: Array<{ path: string; entries: Array<{ name: string }> }> }; + const fooEntry = onDisk.signatures.find((s) => s.path === 'foo.ts'); + expect(fooEntry).toBeDefined(); + expect(fooEntry!.entries.some((e) => e.name === 'alpha')).toBe(true); + }); + + it('G6: a save preserves existing signatures (reads ground truth from disk, not a stale cache)', async () => { + // Seed an existing entry, then pre-poison the shared read cache with an + // EMPTY context for this directory. A writer that patched the cached object + // would drop src/existing.ts; reading disk ground truth preserves it. + await writeContext([{ path: 'src/existing.ts', entries: [{ name: 'existingFn', signature: '', docstring: '', line: 1, kind: 'function' }] }]); + await primeContextCache(root, { signatures: [] } as never); + + const fooAbs = join(root, 'src', 'newmod.ts'); + await mkdir(join(root, 'src'), { recursive: true }); + await writeFile(fooAbs, 'export function newFn() { return 42; }\n', 'utf-8'); + + const watcher = new McpWatcher({ rootPath: root, embed: false }); + await watcher.handleChange(fooAbs); + + const onDisk = JSON.parse(await readFile(contextPath, 'utf-8')) as { signatures: Array<{ path: string }> }; + const paths = onDisk.signatures.map((s) => s.path); + expect(paths).toContain('src/existing.ts'); + expect(paths).toContain('src/newmod.ts'); + }); + + it('G1: primeContextCache makes the next read a HIT — it returns the in-memory object, not what is on disk', async () => { + await writeContext([{ path: 'orig.ts', entries: [] }]); + const cold = await readCachedContext(root); + expect(cold).not.toBeNull(); + + // Prime the cache with a DIFFERENT object WITHOUT touching the file → the + // on-disk mtime is unchanged, so the entry stays valid. A subsequent read + // that hit the cache returns the primed object; a read that went to disk + // would return the original on-disk signatures instead. + await primeContextCache(root, { signatures: [{ path: 'patched.ts', entries: [{ name: 'beta', signature: '', docstring: '', line: 1, kind: 'function' }] }] } as never); + + const after = await readCachedContext(root); + const sigs = (after as { signatures: Array<{ path: string }> }).signatures; + expect(sigs.some((s) => s.path === 'patched.ts')).toBe(true); + expect(sigs.some((s) => s.path === 'orig.ts')).toBe(false); + + const onDisk = JSON.parse(await readFile(contextPath, 'utf-8')) as { signatures: Array<{ path: string }> }; + expect(onDisk.signatures.some((s) => s.path === 'orig.ts')).toBe(true); + }); + + it('G2/G5: a burst of N change events coalesces to exactly ONE flush + ONE summary line', async () => { + await writeContext([]); + const files = ['a.ts', 'b.ts', 'c.ts', 'd.ts']; + for (const f of files) { + await writeFile(join(root, f), `export function fn_${f.replace('.ts', '')}() {}\n`, 'utf-8'); + } + + const summaries: string[] = []; + vi.spyOn(process.stderr, 'write').mockImplementation((chunk: string | Uint8Array): boolean => { + const s = chunk.toString(); + if (/\[mcp-watcher\] (updated|coalesced)/.test(s)) summaries.push(s); + return true; + }); + + const watcher = new McpWatcher({ rootPath: root, embed: false, debounceMs: 30, maxBatchMs: 1000 }); + for (const f of files) (watcher as unknown as { enqueue(p: string): void }).enqueue(join(root, f)); + + await new Promise((r) => setTimeout(r, 200)); + + expect(summaries.length).toBe(1); + expect(summaries[0]).toContain('updated 4 files'); + + const ctx = await readCachedContext(root); + const paths = new Set((ctx as { signatures: Array<{ path: string }> }).signatures.map((s) => s.path)); + for (const f of files) expect(paths.has(f)).toBe(true); + }); + + it('G3: a batch ≥ BULK_THRESHOLD is reported as a single coalesced refresh', async () => { + await writeContext([]); + const files = ['x.ts', 'y.ts', 'z.ts']; + for (const f of files) await writeFile(join(root, f), `export const ${f.replace('.ts', '')} = 1;\n`, 'utf-8'); + + const summaries: string[] = []; + vi.spyOn(process.stderr, 'write').mockImplementation((chunk: string | Uint8Array): boolean => { + const s = chunk.toString(); + if (/\[mcp-watcher\] (updated|coalesced)/.test(s)) summaries.push(s); + return true; + }); + + const watcher = new McpWatcher({ rootPath: root, embed: false, debounceMs: 30, bulkThreshold: 3 }); + for (const f of files) (watcher as unknown as { enqueue(p: string): void }).enqueue(join(root, f)); + await new Promise((r) => setTimeout(r, 200)); + + expect(summaries.length).toBe(1); + expect(summaries[0]).toContain('coalesced 3 changes'); + }); + + it('the watcher-path flush persists the patched context to disk (freshness survives a process restart)', async () => { + await writeContext([]); + const fooAbs = join(root, 'foo.ts'); + await writeFile(fooAbs, 'export function delta() {}\n', 'utf-8'); + + const watcher = new McpWatcher({ rootPath: root, embed: false, debounceMs: 20, maxBatchMs: 1000 }); + (watcher as unknown as { enqueue(p: string): void }).enqueue(fooAbs); + await new Promise((r) => setTimeout(r, 150)); + + const onDisk = JSON.parse(await readFile(contextPath, 'utf-8')) as { signatures: Array<{ path: string; entries: Array<{ name: string }> }> }; + const foo = onDisk.signatures.find((s) => s.path === 'foo.ts'); + expect(foo).toBeDefined(); + expect(foo!.entries.some((e) => e.name === 'delta')).toBe(true); + + await watcher.stop(); + }); +}); diff --git a/src/core/services/mcp-watcher.test.ts b/src/core/services/mcp-watcher.test.ts index 6d905d01..42ad4d28 100644 --- a/src/core/services/mcp-watcher.test.ts +++ b/src/core/services/mcp-watcher.test.ts @@ -1,5 +1,12 @@ /** - * Tests for McpWatcher — handleChange (unit, no real FS watcher needed) + * Tests for McpWatcher — handleChange / handleBatch (unit, no real FS watcher). + * + * Spec 13.1 reshaped the watcher: a single coalescing queue (enqueue → flush → + * handleBatch) replaces the per-file timer map, and the vector update goes + * through VectorIndex.updateFiles (row-level) on a decoupled lane rather than + * reEmbed → VectorIndex.build. These tests track the new surface; the + * freshness/coalescing guarantees themselves live in + * mcp-watcher-incremental.test.ts. */ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; @@ -10,6 +17,7 @@ import type { LLMContext } from '../analyzer/artifact-generator.js'; import type { SerializedCallGraph } from '../analyzer/call-graph.js'; import { EdgeStore } from './edge-store.js'; import type { CallEdge } from '../analyzer/call-graph.js'; +import { _resetContextCacheForTesting } from './mcp-handlers/utils.js'; // ── chokidar mock (prevents real FS watcher from opening) ──────────────────── @@ -61,6 +69,10 @@ async function setupProject(ctx: LLMContext): Promise<{ rootPath: string; output return { rootPath, outputPath, contextPath }; } +beforeEach(() => { + _resetContextCacheForTesting(); +}); + // ── Tests ───────────────────────────────────────────────────────────────────── describe('McpWatcher.handleChange', () => { @@ -72,6 +84,7 @@ describe('McpWatcher.handleChange', () => { afterEach(() => { stderrSpy.mockRestore(); + _resetContextCacheForTesting(); }); it('updates signatures for a changed TypeScript file', async () => { @@ -304,15 +317,16 @@ describe('McpWatcher.handleChange', () => { const watcher = new McpWatcher({ rootPath, outputPath }); await watcher.handleChange(srcFile); - // llm-context.json must not be written (early return on hash hit) + // llm-context.json must not be written: the only changed file was a no-op + // autosave (hash hit), so the batch has nothing to persist. const after = await readFile(contextPath, 'utf-8'); expect(after).toBe(before); }); }); -// ── reEmbed paths ───────────────────────────────────────────────────────────── +// ── Vector update path (updateVectors → VectorIndex.updateFiles) ──────────────── -describe('McpWatcher.reEmbed', () => { +describe('McpWatcher vector update (Spec 13.1 — row-level updateFiles)', () => { let stderrSpy: ReturnType; beforeEach(() => { @@ -322,20 +336,21 @@ describe('McpWatcher.reEmbed', () => { afterEach(() => { stderrSpy.mockRestore(); vi.restoreAllMocks(); + vi.resetModules(); + _resetContextCacheForTesting(); }); - it('refreshes the BM25 index (embedSvc=null) when no embedding service is available', async () => { + it('calls VectorIndex.updateFiles with a null embedder when no embedding service is available (BM25 refresh)', async () => { const cg = makeCallGraph(); const ctx = makeContext({ callGraph: cg }); const { rootPath, outputPath } = await setupProject(ctx); - // Write a fake vector index marker so VectorIndex.exists returns true await mkdir(join(outputPath, 'vector-index'), { recursive: true }); await writeFile(join(outputPath, 'vector-index', '.keep'), '', 'utf-8'); - const mockBuild = vi.fn().mockResolvedValue({ embedded: 0, reused: 0, total: 1, hasEmbeddings: false }); + const mockUpdate = vi.fn().mockResolvedValue({ embedded: 0, reused: 0, total: 1, hasEmbeddings: false }); vi.doMock('../analyzer/vector-index.js', () => ({ - VectorIndex: { exists: vi.fn().mockReturnValue(true), build: mockBuild }, + VectorIndex: { exists: vi.fn().mockReturnValue(true), updateFiles: mockUpdate }, })); vi.doMock('../analyzer/embedding-service.js', () => ({ EmbeddingService: { @@ -354,32 +369,31 @@ describe('McpWatcher.reEmbed', () => { const watcher = new McpWatcher({ rootPath, outputPath }); await watcher.handleChange(srcFile); - // build is invoked with a null embedder (BM25 refresh), not skipped - expect(mockBuild).toHaveBeenCalledWith( + expect(mockUpdate).toHaveBeenCalledWith( outputPath, - cg.nodes, - expect.any(Array), - expect.any(Set), - expect.any(Set), - null, - expect.any(Map), - true, - ); - expect(stderrSpy).toHaveBeenCalledWith( - expect.stringContaining('refreshed BM25 index'), + expect.any(Array), // changed nodes (empty here — no edge store) + expect.any(Set), // changed file paths + expect.any(Array), // signatures + expect.any(Set), // hub ids + expect.any(Set), // entry ids + null, // embedder unavailable → BM25 refresh + expect.any(Map), // file contents ); }); - it('calls VectorIndex.build and logs when embedding succeeds', async () => { + it('calls VectorIndex.updateFiles with the embedder when one is available', async () => { const cg = makeCallGraph(); const ctx = makeContext({ callGraph: cg }); const { rootPath, outputPath } = await setupProject(ctx); - const mockBuild = vi.fn().mockResolvedValue({ embedded: 3, reused: 1, total: 4, hasEmbeddings: true }); + await mkdir(join(outputPath, 'vector-index'), { recursive: true }); + await writeFile(join(outputPath, 'vector-index', '.keep'), '', 'utf-8'); + + const mockUpdate = vi.fn().mockResolvedValue({ embedded: 3, reused: 1, total: 4, hasEmbeddings: true }); const mockEmbedSvc = {}; vi.doMock('../analyzer/vector-index.js', () => ({ - VectorIndex: { exists: vi.fn().mockReturnValue(true), build: mockBuild }, + VectorIndex: { exists: vi.fn().mockReturnValue(true), updateFiles: mockUpdate }, })); vi.doMock('../analyzer/embedding-service.js', () => ({ EmbeddingService: { @@ -398,30 +412,30 @@ describe('McpWatcher.reEmbed', () => { const watcher = new McpWatcher({ rootPath, outputPath }); await watcher.handleChange(srcFile); - expect(mockBuild).toHaveBeenCalledWith( + expect(mockUpdate).toHaveBeenCalledWith( outputPath, - cg.nodes, + expect.any(Array), + expect.any(Set), expect.any(Array), expect.any(Set), expect.any(Set), mockEmbedSvc, expect.any(Map), - true, - ); - expect(stderrSpy).toHaveBeenCalledWith( - expect.stringContaining('re-embedded'), ); }); - it('logs embed error and does not throw when VectorIndex.build throws', async () => { + it('logs an embed error and does not throw when VectorIndex.updateFiles throws', async () => { const cg = makeCallGraph(); const ctx = makeContext({ callGraph: cg }); const { rootPath, outputPath } = await setupProject(ctx); + await mkdir(join(outputPath, 'vector-index'), { recursive: true }); + await writeFile(join(outputPath, 'vector-index', '.keep'), '', 'utf-8'); + vi.doMock('../analyzer/vector-index.js', () => ({ VectorIndex: { exists: vi.fn().mockReturnValue(true), - build: vi.fn().mockRejectedValue(new Error('LanceDB connection failed')), + updateFiles: vi.fn().mockRejectedValue(new Error('LanceDB connection failed')), }, })); vi.doMock('../analyzer/embedding-service.js', () => ({ @@ -446,9 +460,9 @@ describe('McpWatcher.reEmbed', () => { }); }); -// ── Debounce ────────────────────────────────────────────────────────────────── +// ── Coalescing queue (Spec 13.1) ─────────────────────────────────────────────── -describe('McpWatcher debounce', () => { +describe('McpWatcher coalescing queue', () => { beforeEach(() => { vi.useFakeTimers(); }); @@ -457,79 +471,62 @@ describe('McpWatcher debounce', () => { vi.useRealTimers(); }); - it('coalesces rapid changes to the same file into one handleChange call', async () => { + it('coalesces rapid changes to the same file into a single batch flush', async () => { const { McpWatcher } = await import('./mcp-watcher.js'); - const watcher = new McpWatcher({ rootPath: '/tmp/proj', debounceMs: 200 }); - const spy = vi.spyOn(watcher, 'handleChange').mockResolvedValue(undefined); + const watcher = new McpWatcher({ rootPath: '/tmp/proj', debounceMs: 200, embed: false }); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const spy = vi.spyOn(watcher as any, 'handleBatch').mockResolvedValue(undefined); + const enqueue = (watcher as unknown as { enqueue(p: string): void }).enqueue.bind(watcher); - // Simulate 5 rapid saves - for (let i = 0; i < 5; i++) { - (watcher as unknown as { scheduleChange(p: string): void }).scheduleChange('/tmp/proj/src/foo.ts'); - } + for (let i = 0; i < 5; i++) enqueue('/tmp/proj/src/foo.ts'); await vi.runAllTimersAsync(); expect(spy).toHaveBeenCalledTimes(1); }); - it('fires separate handleChange for two different files', async () => { + it('coalesces changes across DIFFERENT files into ONE batch (G2)', async () => { const { McpWatcher } = await import('./mcp-watcher.js'); - const watcher = new McpWatcher({ rootPath: '/tmp/proj', debounceMs: 200 }); - const spy = vi.spyOn(watcher, 'handleChange').mockResolvedValue(undefined); + const watcher = new McpWatcher({ rootPath: '/tmp/proj', debounceMs: 200, embed: false }); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const spy = vi.spyOn(watcher as any, 'handleBatch').mockResolvedValue(undefined); + const enqueue = (watcher as unknown as { enqueue(p: string): void }).enqueue.bind(watcher); - (watcher as unknown as { scheduleChange(p: string): void }).scheduleChange('/tmp/proj/src/a.ts'); - (watcher as unknown as { scheduleChange(p: string): void }).scheduleChange('/tmp/proj/src/b.ts'); + enqueue('/tmp/proj/src/a.ts'); + enqueue('/tmp/proj/src/b.ts'); await vi.runAllTimersAsync(); - expect(spy).toHaveBeenCalledTimes(2); - }); -}); - -describe('McpWatcher reschedule-when-busy', () => { - let stderrSpy: ReturnType; - - beforeEach(() => { - stderrSpy = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); - vi.useFakeTimers(); - }); - - afterEach(() => { - stderrSpy.mockRestore(); - vi.useRealTimers(); + // One flush carrying both paths — not one flush per file. + expect(spy).toHaveBeenCalledTimes(1); + const batch = spy.mock.calls[0][0] as string[]; + expect(new Set(batch)).toEqual(new Set(['/tmp/proj/src/a.ts', '/tmp/proj/src/b.ts'])); }); - it('reschedules a change instead of dropping it when busy', async () => { + it('processes changes that arrive while a flush is in flight (no drop, single-flight)', async () => { const { McpWatcher } = await import('./mcp-watcher.js'); - const watcher = new McpWatcher({ rootPath: '/tmp/proj', debounceMs: 100 }); + const watcher = new McpWatcher({ rootPath: '/tmp/proj', debounceMs: 100, embed: false }); - // Make handleChange block until we resolve it let resolveFirst!: () => void; const firstCall = new Promise(r => { resolveFirst = r; }); - let callCount = 0; - vi.spyOn(watcher, 'handleChange').mockImplementation(async () => { - callCount++; - if (callCount === 1) await firstCall; + let calls = 0; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + vi.spyOn(watcher as any, 'handleBatch').mockImplementation(async () => { + calls++; + if (calls === 1) await firstCall; }); + const enqueue = (watcher as unknown as { enqueue(p: string): void }).enqueue.bind(watcher); - const schedule = (watcher as unknown as { scheduleChange(p: string): void }).scheduleChange.bind(watcher); - - // First change — will start processing after debounce - schedule('/tmp/proj/src/a.ts'); + enqueue('/tmp/proj/src/a.ts'); await vi.advanceTimersByTimeAsync(100); - // handleChange is now running (blocked on firstCall) - expect(callCount).toBe(1); + expect(calls).toBe(1); // first flush running, blocked - // Second change arrives while busy — should be rescheduled, not dropped - schedule('/tmp/proj/src/a.ts'); + // New change arrives while busy — accumulates in pending, not dropped. + enqueue('/tmp/proj/src/b.ts'); await vi.advanceTimersByTimeAsync(100); - // Still blocked — rescheduled change fires but sees busy, reschedules again - expect(callCount).toBe(1); + expect(calls).toBe(1); // still single-flight - // Unblock first handleChange resolveFirst(); await vi.advanceTimersByTimeAsync(200); - - // Rescheduled change should now have fired - expect(callCount).toBe(2); + expect(calls).toBe(2); // pending 'b.ts' flushed after the first finished }); }); @@ -614,16 +611,16 @@ describe('McpWatcher — real chokidar prunes build dirs (does not FD-storm targ // chokidar (not the module mock above) via a fresh dynamic import in an // isolated module registry. it('watches source but never opens target/ children', async () => { - const { mkdtemp, writeFile, mkdir } = await import('node:fs/promises'); - const { tmpdir } = await import('node:os'); + const { mkdtemp: mkdtempReal, writeFile: writeFileReal, mkdir: mkdirReal } = await import('node:fs/promises'); + const { tmpdir: tmpdirReal } = await import('node:os'); const { join: pjoin } = await import('node:path'); - const root = await mkdtemp(pjoin(tmpdir(), 'mcp-prune-')); - await mkdir(pjoin(root, 'src'), { recursive: true }); - await mkdir(pjoin(root, 'target', 'debug', 'deps'), { recursive: true }); - await writeFile(pjoin(root, 'src', 'main.rs'), 'fn main() {}'); + const root = await mkdtempReal(pjoin(tmpdirReal(), 'mcp-prune-')); + await mkdirReal(pjoin(root, 'src'), { recursive: true }); + await mkdirReal(pjoin(root, 'target', 'debug', 'deps'), { recursive: true }); + await writeFileReal(pjoin(root, 'src', 'main.rs'), 'fn main() {}'); for (let i = 0; i < 40; i++) { - await writeFile(pjoin(root, 'target', 'debug', 'deps', `f${i}.rs`), '// gen'); + await writeFileReal(pjoin(root, 'target', 'debug', 'deps', `f${i}.rs`), '// gen'); } // Use the real chokidar + the real ignore predicate, not the vi.mock. diff --git a/src/core/services/mcp-watcher.ts b/src/core/services/mcp-watcher.ts index 9d39a01b..0629d50f 100644 --- a/src/core/services/mcp-watcher.ts +++ b/src/core/services/mcp-watcher.ts @@ -8,19 +8,39 @@ * The call graph is deliberately excluded — rebuilding it requires full * tree-sitter analysis of all call sites and is too expensive for a watch loop. * It stays current via the post-commit hook (openlore analyze --force --embed). + * + * Spec 13.1 (watch-mode performance): freshness is O(change), not O(repo). + * • Per-file events COALESCE into one batched flush (single debounce timer + + * hard max-batch ceiling), so a burst / branch-switch runs the pipeline once, + * not once per file. + * • The patched llm-context is handed to the MCP read cache in place + * (primeContextCache), so the next tool call is a cache HIT — no 2.1 MB + * cold re-parse — even after the disk write. + * • Vector updates are row-level (VectorIndex.updateFiles), not a full-corpus + * read+overwrite, and run on a separate lower-priority lane so signature + * freshness never blocks on embedding. + * • VCS-flood / bulk batches are detected and collapsed to a single refresh. + * • stderr emits one summary line per batch by default (per-file detail behind + * OPENLORE_WATCH_DEBUG). */ -import { readFile, writeFile } from 'node:fs/promises'; +import { readFile, writeFile, readdir } from 'node:fs/promises'; import { createHash } from 'node:crypto'; import { join, relative } from 'node:path'; import chokidar, { type FSWatcher } from 'chokidar'; import { extractSignatures, detectLanguage } from '../analyzer/signature-extractor.js'; -import type { LLMContext } from '../analyzer/artifact-generator.js'; +import type { FunctionNode } from '../analyzer/call-graph.js'; import { EdgeStore } from './edge-store.js'; +import { primeContextCache, type CachedContext } from './mcp-handlers/utils.js'; import { OPENLORE_DIR, OPENLORE_ANALYSIS_SUBDIR, ARTIFACT_LLM_CONTEXT, + WATCH_DEBOUNCE_MS, + WATCH_MAX_BATCH_MS, + WATCH_BULK_THRESHOLD, + WATCH_EMBED_FILE_CEILING, + WATCH_VCS_SETTLE_MS, } from '../../constants.js'; const CALL_GRAPH_LANGS = new Set([ @@ -36,12 +56,25 @@ export interface McpWatcherOptions { rootPath: string; /** Absolute path to .openlore/analysis/ — where llm-context.json lives */ outputPath?: string; - /** Milliseconds to debounce file-change events (default: 400) */ + /** Milliseconds to debounce file-change events (default: WATCH_DEBOUNCE_MS) */ debounceMs?: number; + /** Hard flush ceiling under a continuous change stream (default: WATCH_MAX_BATCH_MS) */ + maxBatchMs?: number; + /** Batch size that trips VCS-flood handling (default: WATCH_BULK_THRESHOLD) */ + bulkThreshold?: number; + /** Run the live vector update; false = signatures-only (default: true) */ + embed?: boolean; + /** Above this many watched source files, auto-degrade to signatures-only */ + embedFileCeiling?: number; /** Extra glob patterns to ignore in addition to defaults */ ignore?: string[]; } +interface ChangedFile { + rel: string; + content: string; +} + const SOURCE_EXTENSIONS = /\.(ts|tsx|js|jsx|py|go|rs|rb|java|kt|php|cs|cpp|cc|cxx|h|hpp|c|swift)$/; // Directory NAMES that must never be watched. Build-output and dependency @@ -104,24 +137,63 @@ export function isIgnoredRelPath(relPath: string): boolean { export class McpWatcher { private readonly rootPath: string; private readonly outputPath: string; + private readonly contextPath: string; private readonly debounceMs: number; + private readonly maxBatchMs: number; + private readonly bulkThreshold: number; + private readonly embedFileCeiling: number; private readonly extraIgnore: string[]; + private readonly debug: boolean; private fsWatcher?: FSWatcher; - private timers = new Map>(); - private running = false; + private gitWatcher?: FSWatcher; + + // ── Coalescing queue (Step 1) ────────────────────────────────────────────── + private pending = new Set(); // absolute paths awaiting a flush + private debounceTimer?: ReturnType; + private maxBatchTimer?: ReturnType; + private running = false; // single-flight for the signature flush + private vcsBulkFlag = false; // set by the .git ref watcher + + // ── Embedding lane (Step 4 — decoupled, lower priority) ───────────────────── + private embed: boolean; + private embedDegraded = false; // auto-degraded on a too-large tree + private embedFiles = new Map(); // rel → content awaiting embed + private embedNodes = new Map(); // id → node awaiting embed + private embedTimer?: ReturnType; + private embedRunning = false; + private lastEmbedContext?: CachedContext; constructor(options: McpWatcherOptions) { this.rootPath = options.rootPath; this.outputPath = options.outputPath ?? join(options.rootPath, OPENLORE_DIR, OPENLORE_ANALYSIS_SUBDIR); - this.debounceMs = options.debounceMs ?? 400; + this.contextPath = join(this.outputPath, ARTIFACT_LLM_CONTEXT); + this.debounceMs = options.debounceMs ?? WATCH_DEBOUNCE_MS; + this.maxBatchMs = options.maxBatchMs ?? WATCH_MAX_BATCH_MS; + this.bulkThreshold = options.bulkThreshold ?? WATCH_BULK_THRESHOLD; + this.embedFileCeiling = options.embedFileCeiling ?? WATCH_EMBED_FILE_CEILING; + this.embed = options.embed ?? true; this.extraIgnore = options.ignore ?? []; + this.debug = !!process.env.OPENLORE_WATCH_DEBUG; } // ── Lifecycle ────────────────────────────────────────────────────────────── async start(): Promise { + // Auto-degrade live embedding on very large trees (Step 4). Counting is + // bounded — it stops as soon as the ceiling is exceeded. + if (this.embed) { + const count = await this.countSourceFiles(this.embedFileCeiling + 1); + if (count > this.embedFileCeiling) { + this.embedDegraded = true; + process.stderr.write( + `[mcp-watcher] ${count}+ source files exceed the live-embed ceiling ` + + `(${this.embedFileCeiling}); running signatures-only — embeddings refresh at commit\n` + ); + } + } + await new Promise((resolve, reject) => { const extraIgnore = this.extraIgnore; const rootPath = this.rootPath; @@ -142,7 +214,7 @@ export class McpWatcher { this.fsWatcher.on('change', (absPath: string) => { if (SOURCE_EXTENSIONS.test(absPath)) { - this.scheduleChange(absPath); + this.enqueue(absPath); } }); @@ -150,133 +222,306 @@ export class McpWatcher { this.fsWatcher.on('error', (err: unknown) => reject(err)); }); - process.stderr.write(`[mcp-watcher] watching ${this.rootPath}\n`); + // Best-effort VCS-flood detection (Step 5): a branch switch / rebase / merge + // bumps these refs. We never recurse into .git (it stays ignored above); we + // watch only these specific files, then collapse the churn into one refresh. + try { + const gitDir = join(this.rootPath, '.git'); + const refs = ['HEAD', 'index', 'MERGE_HEAD', 'ORIG_HEAD'].map((f) => join(gitDir, f)); + this.gitWatcher = chokidar.watch(refs, { + persistent: true, + ignoreInitial: true, + followSymlinks: false, + }); + this.gitWatcher.on('all', () => this.onVcsEvent()); + } catch { + // no .git, or watch failed — VCS detection falls back to the batch-size + // threshold in handleBatch, which is enough for G3. + } + + process.stderr.write( + `[mcp-watcher] watching ${this.rootPath}` + + `${this.embed && !this.embedDegraded ? '' : ' (signatures-only)'}\n` + ); } async stop(): Promise { - for (const t of this.timers.values()) clearTimeout(t); - this.timers.clear(); + if (this.debounceTimer) clearTimeout(this.debounceTimer); + if (this.maxBatchTimer) clearTimeout(this.maxBatchTimer); + if (this.embedTimer) clearTimeout(this.embedTimer); + this.debounceTimer = this.maxBatchTimer = this.embedTimer = undefined; + // Best-effort: persist anything still queued so a save right before shutdown + // is not lost. + if (this.pending.size > 0 && !this.running) { + const batch = Array.from(this.pending); + this.pending.clear(); + try { await this.handleBatch(batch, { syncFlush: true }); } catch { /* ignore */ } + } await this.fsWatcher?.close(); + await this.gitWatcher?.close(); process.stderr.write('[mcp-watcher] stopped\n'); } - // ── Debounce ─────────────────────────────────────────────────────────────── + // ── Coalescing (Step 1) ────────────────────────────────────────────────────── - private scheduleChange(absPath: string): void { - const existing = this.timers.get(absPath); - if (existing) clearTimeout(existing); + /** + * Add a changed path to the pending set and (re)arm a single debounce timer, + * plus a one-shot hard ceiling so a continuous stream still flushes. + */ + private enqueue(absPath: string): void { + this.pending.add(absPath); + if (this.debounceTimer) clearTimeout(this.debounceTimer); + this.debounceTimer = setTimeout(() => this.flush(), this.debounceMs); + if (!this.maxBatchTimer) { + this.maxBatchTimer = setTimeout(() => this.flush(), this.maxBatchMs); + } + } - const t = setTimeout(() => { - this.timers.delete(absPath); - if (this.running) { - // Re-schedule instead of dropping — ensures no changes are lost - this.scheduleChange(absPath); - return; - } - this.running = true; - this.handleChange(absPath) - .catch(err => process.stderr.write(`[mcp-watcher] error: ${(err as Error).message}\n`)) - .finally(() => { this.running = false; }); - }, this.debounceMs); + /** A .git ref changed — settle, then flush whatever changed as one bulk batch. */ + private onVcsEvent(): void { + this.vcsBulkFlag = true; + if (this.debounceTimer) clearTimeout(this.debounceTimer); + this.debounceTimer = setTimeout(() => this.flush(), WATCH_VCS_SETTLE_MS); + if (this.debug) { + process.stderr.write('[mcp-watcher] VCS operation detected — coalescing into one refresh\n'); + } + } - this.timers.set(absPath, t); + /** + * Drain the pending set into a single batch. Single-flight: if a flush is + * already running, leave the new paths in `pending` and reschedule once it + * finishes — never interleave two flushes. + */ + private flush(): void { + if (this.debounceTimer) { clearTimeout(this.debounceTimer); this.debounceTimer = undefined; } + if (this.maxBatchTimer) { clearTimeout(this.maxBatchTimer); this.maxBatchTimer = undefined; } + if (this.running) return; // a follow-up is scheduled in finally{} + if (this.pending.size === 0) return; + + const batch = Array.from(this.pending); + this.pending.clear(); + this.running = true; + this.handleBatch(batch) + .catch((err) => process.stderr.write(`[mcp-watcher] error: ${(err as Error).message}\n`)) + .finally(() => { + this.running = false; + if (this.pending.size > 0) { + this.debounceTimer = setTimeout(() => this.flush(), this.debounceMs); + } + }); } // ── Core re-index ────────────────────────────────────────────────────────── /** - * Re-index a single changed file. - * Exposed for unit testing without needing a real file watcher. + * Re-index a single changed file. Exposed for unit testing without needing a + * real file watcher; flushes synchronously so callers observe the update on + * disk immediately. Internally this is just a batch of one. */ async handleChange(absPath: string): Promise { - const rel = relative(this.rootPath, absPath); - - // Skip test files and unsupported languages - if (isTestFile(rel)) return; - if (detectLanguage(rel) === 'unknown') return; + await this.handleBatch([absPath], { syncFlush: true }); + } - // Read new file content (needed for hash check and re-parse) - let content: string; - try { - content = await readFile(absPath, 'utf-8'); - } catch { - return; // file may have been deleted between the event and now + /** + * Process a coalesced batch of changed files as ONE pipeline pass: + * • per-file incremental edge update (content-hash skip), all under one open + * EdgeStore; + * • ONE signature patch + ONE llm-context persist + ONE read-cache handoff; + * • ONE vector update (inline when syncFlush, else on the embed lane). + */ + private async handleBatch(absPaths: string[], opts: { syncFlush?: boolean } = {}): Promise { + const t0 = Date.now(); + const consumedVcsBulk = this.vcsBulkFlag; + this.vcsBulkFlag = false; + + // 1. Resolve + read candidate files (skip tests / unknown langs / deleted). + const files: Array<{ rel: string; abs: string; content: string }> = []; + for (const abs of absPaths) { + const rel = relative(this.rootPath, abs); + if (isTestFile(rel)) continue; + if (detectLanguage(rel) === 'unknown') continue; + let content: string; + try { + content = await readFile(abs, 'utf-8'); + } catch { + continue; // file may have been deleted between the event and now + } + files.push({ rel, abs, content }); } + if (files.length === 0) return; - // ── Incremental edge update (CGC _handle_modification algorithm) ────────── + // 2. Incremental edge update (CGC _handle_modification algorithm), one open + // store for the whole batch. Content-hash skip drops no-op autosaves. + const changedFiles: ChangedFile[] = []; + const changedNodes: FunctionNode[] = []; if (EdgeStore.exists(this.outputPath)) { const store = EdgeStore.open(EdgeStore.dbPath(this.outputPath)); try { - // Content hash — skip entirely on no-op IDE autosaves - const newHash = createHash('sha256').update(content).digest('hex'); - if (store.getFileHash(rel) === newHash) return; - - // Reverse lookup BEFORE delete so we know which files call into this one - // callerFiles are relative paths (DB stores relative paths) - const callerFiles = store.getCallerFiles(rel); - - // Re-parse BEFORE mutating DB — graph stays readable (old state) during parse. - // Seed resolution with all known nodes so the re-parsed caller files' - // calls into other files don't degrade to `external::` (they would - // otherwise, since the subset trie only holds the re-parsed files). - const resolutionNodes = store.getAllInternalNodes(); - const { edges: newEdges, nodes: newNodes } = await buildGraphSubset(rel, content, callerFiles, this.rootPath, resolutionNodes); - - // Atomic swap: delete stale data and insert fresh data in one transaction - // so concurrent MCP reads never see a torn graph - store.transaction(() => { - store.deleteEdgesForFile(rel); - for (const cf of callerFiles.slice(0, CALLER_REPARSE_LIMIT)) { - store.deleteOutgoingEdgesForFile(cf); + for (const f of files) { + const newHash = createHash('sha256').update(f.content).digest('hex'); + if (store.getFileHash(f.rel) === newHash) continue; // no-op autosave + + // Reverse lookup BEFORE delete so we know which files call into this one. + const callerFiles = store.getCallerFiles(f.rel); + // Re-parse BEFORE mutating DB — graph stays readable (old state) during + // parse. Seed resolution with all known nodes so re-parsed callers' + // cross-file calls don't degrade to `external::`. + const resolutionNodes = store.getAllInternalNodes(); + const { edges: newEdges, nodes: newNodes } = + await buildGraphSubset(f.rel, f.content, callerFiles, this.rootPath, resolutionNodes); + + // Atomic swap so concurrent MCP reads never see a torn graph. + store.transaction(() => { + store.deleteEdgesForFile(f.rel); + for (const cf of callerFiles.slice(0, CALLER_REPARSE_LIMIT)) { + store.deleteOutgoingEdgesForFile(cf); + } + store.deleteNodesForFile(f.rel); + store.insertNodes(newNodes); + store.insertEdges(newEdges); + store.setFileHash(f.rel, newHash); + }); + + changedFiles.push({ rel: f.rel, content: f.content }); + for (const n of newNodes) changedNodes.push(n); + if (this.debug) { + process.stderr.write( + `[mcp-watcher] graph: ${f.rel} (+${newNodes.length} nodes, +${newEdges.length} edges, ${callerFiles.length} callers)\n` + ); } - store.deleteNodesForFile(rel); - store.insertNodes(newNodes); - store.insertEdges(newEdges); - store.setFileHash(rel, newHash); - }); - - process.stderr.write( - `[mcp-watcher] updated graph: ${rel} (+${newNodes.length} nodes, +${newEdges.length} edges, ${callerFiles.length} callers re-parsed)\n` - ); + } } finally { store.close(); } + } else { + // No edge store yet — still refresh signatures for every candidate. + for (const f of files) changedFiles.push({ rel: f.rel, content: f.content }); } - // ── Signature patch ─────────────────────────────────────────────────────── - const contextPath = join(this.outputPath, ARTIFACT_LLM_CONTEXT); - let context: LLMContext; - try { - const raw = await readFile(contextPath, 'utf-8'); - context = JSON.parse(raw) as LLMContext; - } catch { - process.stderr.write(`[mcp-watcher] no context at ${contextPath} — run analyze first\n`); + if (changedFiles.length === 0) return; // every event was a no-op autosave + + // 3. Signatures: load context (shared in-memory cache), patch all changed + // files, then ONE persist + read-cache handoff (Step 2). The handoff + // means the next tool call is a cache HIT — no cold 2.1 MB re-parse. + const context = await this.loadContext(); + if (!context) { + process.stderr.write(`[mcp-watcher] no context at ${this.contextPath} — run analyze first\n`); return; } - - const newMap = extractSignatures(rel, content); if (!context.signatures) context.signatures = []; - const idx = context.signatures.findIndex(m => m.path === rel); - if (idx >= 0) { - context.signatures[idx] = newMap; - } else { - context.signatures.push(newMap); + for (const f of changedFiles) { + const newMap = extractSignatures(f.rel, f.content); + const idx = context.signatures.findIndex((m) => m.path === f.rel); + if (idx >= 0) context.signatures[idx] = newMap; + else context.signatures.push(newMap); } + await this.persistContext(context); + + // 4. Vector update — decoupled from signature freshness (Step 4). + const isBulk = consumedVcsBulk || changedFiles.length >= this.bulkThreshold; + if (this.embed && !this.embedDegraded && context.callGraph) { + if (opts.syncFlush) { + // Direct handleChange path: inline so callers/tests observe it. + await this.updateVectors(context, changedFiles, changedNodes); + } else { + // Watcher path: schedule on the lower-priority embed lane. On a bulk + // event this still collapses to a single deferred pass. + this.scheduleEmbed(context, changedFiles, changedNodes); + } + } + + // 5. One summary line per batch (Step 6). Per-file detail is behind debug. + const n = changedFiles.length; + process.stderr.write( + `[mcp-watcher] ${isBulk ? `coalesced ${n} changes` : `updated ${n} file${n === 1 ? '' : 's'}`} (${Date.now() - t0}ms)\n` + ); + } + + // ── llm-context load + persistence + read-cache handoff (Step 2) ───────────── - await writeFile(contextPath, JSON.stringify(context, null, 2), 'utf-8'); - process.stderr.write(`[mcp-watcher] re-indexed signatures: ${rel}\n`); + /** + * True when this watcher writes to the canonical `/.openlore/analysis` + * layout that the MCP read handlers cache against. Only then is the shared + * in-memory read cache (primeContextCache) the right channel to prime; a custom + * `outputPath` (tests / non-standard installs) writes only to disk. + */ + private get usesStandardLayout(): boolean { + return this.outputPath === join(this.rootPath, OPENLORE_DIR, OPENLORE_ANALYSIS_SUBDIR); + } - // Incremental vector re-embed — silently skipped if no embedding service available - if (context.callGraph) { - await this.reEmbed(context, rel, content); + /** + * Load the context the watcher is about to patch. This ALWAYS reads fresh from + * disk — never through the shared read cache — because the cache is a read-path + * (tool-call) optimization, and patching a possibly-stale cached object could + * silently drop signatures written by a concurrent `analyze` between events. + * The writer reads ground truth; persistContext then primes the read cache with + * the result so the next tool call is still a hit (Step 2a, G1). + */ + private async loadContext(): Promise { + try { + const raw = await readFile(this.contextPath, 'utf-8'); + return JSON.parse(raw) as CachedContext; + } catch { + return null; } } - // ── Embed step ───────────────────────────────────────────────────────────── + private async persistContext(context: CachedContext): Promise { + // Strip the runtime-only EdgeStore handle before serializing. + const { edgeStore: _edgeStore, ...serializable } = context as CachedContext & { edgeStore?: unknown }; + void _edgeStore; + await writeFile(this.contextPath, JSON.stringify(serializable, null, 2), 'utf-8'); + // Hand the patched object back to the read cache, aligned to the new on-disk + // mtime, so the next tool call is a cache hit (no cold re-parse). This is the + // fix for root-cause item 2 (mtime bump forcing a full re-read). Only valid + // for the canonical layout the read handlers cache against. + if (this.usesStandardLayout) await primeContextCache(this.rootPath, context); + } + + // ── Embedding lane (Step 4) ────────────────────────────────────────────────── - private async reEmbed(context: LLMContext, rel: string, content: string): Promise { + private scheduleEmbed(context: CachedContext, changedFiles: ChangedFile[], nodes: FunctionNode[]): void { + for (const f of changedFiles) this.embedFiles.set(f.rel, f.content); + for (const node of nodes) this.embedNodes.set(node.id, node); + this.lastEmbedContext = context; + if (this.embedTimer) clearTimeout(this.embedTimer); + // Slightly behind the signature debounce so structural freshness always lands + // first and multiple flushes batch into one embed pass. + this.embedTimer = setTimeout(() => void this.runEmbedLane(), this.debounceMs); + } + + private async runEmbedLane(): Promise { + if (this.embedRunning) { + // Re-arm: drain again once the in-flight pass finishes. + this.embedTimer = setTimeout(() => void this.runEmbedLane(), this.debounceMs); + return; + } + if (this.embedFiles.size === 0 || !this.lastEmbedContext) return; + const changedFiles: ChangedFile[] = Array.from(this.embedFiles, ([rel, content]) => ({ rel, content })); + const nodes = Array.from(this.embedNodes.values()); + const context = this.lastEmbedContext; + this.embedFiles.clear(); + this.embedNodes.clear(); + this.embedRunning = true; try { - const { VectorIndex } = await import('../analyzer/vector-index.js'); + await this.updateVectors(context, changedFiles, nodes); + } catch (err) { + process.stderr.write(`[mcp-watcher] embed error: ${(err as Error).message}\n`); + } finally { + this.embedRunning = false; + if (this.embedFiles.size > 0) { + this.embedTimer = setTimeout(() => void this.runEmbedLane(), this.debounceMs); + } + } + } + + /** + * Row-level vector update for the changed files only (Step 3). Falls back to a + * silent no-op when no embedding service and no index are available. + */ + private async updateVectors(context: CachedContext, changedFiles: ChangedFile[], changedNodes: FunctionNode[]): Promise { + try { + const { VectorIndex } = await import('../analyzer/vector-index.js'); const { EmbeddingService } = await import('../analyzer/embedding-service.js'); const { readOpenLoreConfig } = await import('./config-manager.js'); @@ -289,37 +534,74 @@ export class McpWatcher { const cfg = await readOpenLoreConfig(this.rootPath); embedSvc = cfg ? EmbeddingService.fromConfig(cfg) : null; } - // embedSvc may be null: VectorIndex.build then refreshes the BM25-only - // corpus rather than re-embedding. Keeps the keyword index live in watch mode. - - const cg = context.callGraph!; - const hubIds = new Set((cg.hubFunctions ?? []).map(f => f.id)); - const entryIds = new Set((cg.entryPoints ?? []).map(f => f.id)); - const fileContents = new Map([[rel, content]]); - - const { embedded, reused, total, hasEmbeddings } = await VectorIndex.build( + // embedSvc may be null: updateFiles then refreshes the BM25-only corpus + // rather than re-embedding, keeping the keyword index live in watch mode. + + const cg = context.callGraph; + if (!cg) return; + const hubIds = new Set((cg.hubFunctions ?? []).map((f) => f.id)); + const entryIds = new Set((cg.entryPoints ?? []).map((f) => f.id)); + const changedFilePaths = new Set(changedFiles.map((f) => f.rel)); + const fileContents = new Map(changedFiles.map((f) => [f.rel, f.content])); + // Prefer the freshly-parsed nodes; fall back to the (possibly stale) + // call-graph nodes for the changed files when no edge store seeded them. + const nodes = changedNodes.length > 0 + ? changedNodes + : (cg.nodes ?? []).filter((n) => changedFilePaths.has(n.filePath)); + + const { embedded, reused, total, hasEmbeddings } = await VectorIndex.updateFiles( this.outputPath, - cg.nodes, + nodes, + changedFilePaths, context.signatures ?? [], hubIds, entryIds, embedSvc, fileContents, - /* incremental */ true ); - process.stderr.write( - hasEmbeddings - ? `[mcp-watcher] re-embedded ${rel}: ${embedded} new, ${reused} reused\n` - : `[mcp-watcher] refreshed BM25 index for ${rel}: ${total} functions\n` - ); + if (this.debug) { + process.stderr.write( + hasEmbeddings + ? `[mcp-watcher] re-embedded ${changedFilePaths.size} file(s): ${embedded} new, ${reused} reused\n` + : `[mcp-watcher] refreshed BM25 index for ${changedFilePaths.size} file(s): ${total} functions\n` + ); + } } catch (err) { process.stderr.write(`[mcp-watcher] embed error: ${(err as Error).message}\n`); } } + + // ── Helpers ────────────────────────────────────────────────────────────────── + + /** Bounded count of watched source files; stops early once `cap` is exceeded. */ + private async countSourceFiles(cap: number): Promise { + let count = 0; + const walk = async (dir: string): Promise => { + if (count > cap) return; + let entries; + try { + entries = await readdir(dir, { withFileTypes: true }); + } catch { + return; + } + for (const entry of entries) { + if (count > cap) return; + const abs = join(dir, entry.name); + const rel = relative(this.rootPath, abs); + if (entry.isDirectory()) { + if (!isIgnoredRelPath(rel)) await walk(abs); + } else if (entry.isFile() && SOURCE_EXTENSIONS.test(entry.name) && !isIgnoredRelPath(rel)) { + count++; + } + } + }; + await walk(this.rootPath); + return count; + } } -// ── Helpers ─────────────────────────────────────────────────────────────────── +// ── Module helpers ────────────────────────────────────────────────────────────── function isTestFile(relPath: string): boolean { return ( @@ -368,7 +650,7 @@ async function buildGraphSubset( const result = await builder.build(files, undefined, undefined, resolutionNodes); // Only return nodes from changedFile — callerFiles nodes are already in DB and unchanged - const changedNodes = Array.from(result.nodes.values()).filter(n => n.filePath === changedRel); + const changedNodes = Array.from(result.nodes.values()).filter((n) => n.filePath === changedRel); return { edges: result.edges, nodes: changedNodes }; }