diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..314eb58
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,138 @@
+# Memscribe CI (whitepaper §8.9).
+#
+# Memscribe is deterministic and zero-LLM by construction, so CI is a hard gate,
+# not a smoke test: the same input bytes must always produce the same nodes, the
+# tree must be clippy- and rustfmt-clean, the dependency set must satisfy the
+# license/advisory policy, and the crate must keep building on its MSRV.
+#
+# The toolchain is pinned to match rust-toolchain.toml (1.96.0). The fuzz job is
+# best-effort: cargo-fuzz needs a nightly compiler, so it is allowed to fail
+# without failing the workflow.
+
+name: CI
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+  workflow_dispatch:
+
+# Cancel superseded runs on the same ref to save CI minutes.
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_BACKTRACE: "1"
+  # Pinned toolchain — keep in lockstep with rust-toolchain.toml.
+  RUST_PINNED: "1.96.0"
+  # Resilience against transient crates.io download blips (SSL EOFs, flaky
+  # mirrors): retry network ops aggressively, fetch the crates.io index over
+  # the sparse (HTTP) protocol, and use the git CLI for git dependencies,
+  # which recovers from partial transfers better than the built-in client.
+  CARGO_NET_RETRY: "10"
+  CARGO_NET_GIT_FETCH_WITH_CLI: "true"
+  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
+
+jobs:
+  # 1. The deterministic test suite: unit + golden + conformance + property.
+  test:
+    name: test (workspace, all-features)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pinned toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_PINNED }}
+      - uses: Swatinem/rust-cache@v2
+      - name: cargo test
+        run: cargo test --workspace --all-features --locked
+
+  # 2. Lints as errors. No warning escapes review.
+  clippy:
+    name: clippy (-D warnings)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pinned toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_PINNED }}
+          components: clippy
+      - uses: Swatinem/rust-cache@v2
+      - name: cargo clippy
+        run: cargo clippy --workspace --all-targets --all-features --locked -- -D warnings
+
+  # 3. Formatting. The output is byte-stable, so the source should be too.
+  fmt:
+    name: rustfmt (--check)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pinned toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_PINNED }}
+          components: rustfmt
+      - name: cargo fmt --check
+        run: cargo fmt --all --check
+
+  # 4. License + advisory gate (deny.toml).
+  deny:
+    name: cargo-deny
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: cargo-deny check
+        uses: EmbarkStudios/cargo-deny-action@v2
+        with:
+          command: check
+          arguments: --all-features
+
+  # 5. MSRV — the crate must build on its declared minimum (1.96).
+  # Build + check only: tests pin newer dev-deps and run under `test`.
+  msrv:
+    name: MSRV (1.96)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install MSRV toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: "1.96.0"
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: msrv
+      - name: cargo check (MSRV)
+        run: cargo check --workspace --all-features --locked
+
+  # 6. cargo-fuzz smoke build. Best-effort: cargo-fuzz needs nightly, and the
+  # fuzz/ targets may not be wired yet — never fail the workflow on this.
+  fuzz:
+    name: cargo-fuzz smoke build (best-effort)
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install nightly toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: fuzz
+      - name: Install cargo-fuzz
+        run: cargo install cargo-fuzz --locked
+      - name: Build fuzz targets (no run)
+        # Stay at the repo root: a missing fuzz/ must hit the guard, not fail the step.
+        run: |
+          if [ -f fuzz/Cargo.toml ]; then
+            cargo +nightly fuzz build
+          else
+            echo "fuzz/ has no Cargo.toml yet — nothing to build (best-effort job)."
+          fi
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..8fc4770 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,195 @@ +# Memscribe architecture + +Memscribe is a **deterministic, zero-LLM** pipeline that turns the transcript +logs AI coding agents already write into typed nodes the downstream +inference-and-governance layer (**MemCortex**) can consume. No model is ever +called: capture is reading and parsing, never summarizing. The output is an exact +function of the input, which is what makes the whole module golden-file, +property, and fuzz testable. + +It is the bottom of a three-layer stack — **Memtrace** uses **MemCortex**, and +**MemCortex** uses **Memscribe**. The dependency direction is strictly one-way: +each layer depends only on the one below it, and `memscribe-core` depends on +nothing else in the workspace. Memscribe never calls upward. + +--- + +## The pipeline + +A single, linear, deterministic pipeline. Each stage is a trait, so it can be +tested in isolation and swapped. Everything between **Source** and **Sink** is a +pure, synchronous function of the event stream. + +``` + Source Adapter Gate Segmenter Binder NodePrep Sink + (memscribe-io) (memscribe- (core) (core) (core) (core) (memscribe-sink) + adapters) + tail JSONL → parse one → admit? 
→ arc / turn → decision → assemble → NDJSON / SQLite + hook stdin RawRecord → commitment spans; ↔ edit, PreparedNode / MemDB + OTLP receiver CaptureEvent[] markers elevate gated PROV stream + (version- turns; seed (t_use + tolerant) decisions; ≤ t_gen) + collect edits + RawRecord CaptureEvent markers Segmentation BindingEdge PreparedNode (consumer) + (bytes + provenance) (normalized) stream +``` + +- **`Source → Adapter`** produces the normalized `CaptureEvent` stream — the + system of record. This is the only stage that touches tool-specific formats. +- **`Gate → Segmenter → Binder → NodePrep`** transform that stream into + `PreparedNode`s. Pure and synchronous given the events. +- An optional **redaction** pass runs over the prepared nodes before the sink. +- **`Sink`** writes the nodes out. It is the single seam that decouples + Memscribe from MemDB. + +The orchestration lives in `memscribe-core::pipeline::DefaultPipeline`: + +```rust +let nodes = DefaultPipeline::new() // redaction ON by default + .run_records(adapter.as_ref(), &records); // parse → prepare → redact +// or stream straight to a sink: +let n = DefaultPipeline::new() + .run_to_sink(adapter.as_ref(), &records, &mut sink)?; +``` + +`DefaultPipeline::prepare_events(&events)` is the **pure** core: its output is an +exact function of `events`. `without_redaction()` turns the redactor off (golden +tests assert on verbatim content), and `with_gate(..)` / `with_redactor(..)` +swap in config-driven stages. + +--- + +## Crate responsibilities + +| Crate | Responsibility | +|-------|----------------| +| `memscribe-core` | The frozen contract: the event model, the prepared-node output types, the `TranscriptAdapter` and `Sink` traits, and the deterministic pipeline (`gate` → `segmenter` → `binder` → `nodeprep`) plus the `redact` pass. Depends on nothing in the workspace. | +| `memscribe-adapters` | Per-tool parsers behind feature flags. Each implements `TranscriptAdapter`. 
The `registry` assembles the enabled set (`all_adapters`) and resolves one by `SourceKind` (`adapter_for`). | +| `memscribe-io` | Generic sources: a notify-based file tailer (offset resume), a hook server, and an OTLP receiver. Turns raw bytes into `RawRecord`s. | +| `memscribe-sink` | Concrete `Sink`s: `NdjsonSink` (canonical default), `SqliteSink` (feature `sqlite`), and `MemDbSink` (feature `memdb`, off by default). | +| `memscribe-cli` | The `memscribe` binary: `watch` / `hook` / `parse` / `replay` / `verify` / `redact`. | +| `memscribe-testkit` | The harness: `parse_events` / `prepare_nodes`, the invariant checks, golden-fixture loaders, and the cross-tool conformance scenario catalog. | + +--- + +## The contract types + +All of these live in `memscribe-core` and are re-exported from its crate root. +**Do not change their behavior or public shape** — the test suite and every +consumer depend on exact output. + +### Input: the normalized event model (`model.rs`) + +`CaptureEvent` is the system of record produced by adapters. Every field is +copied verbatim from the source or derived deterministically from it (`seq` from file order, the fallback `event_id` from a content hash); none is guessed. + +```rust +pub struct CaptureEvent { + pub schema_version: u16, // SCHEMA_VERSION; consumers gate on this + pub source: SourceKind, // which tool produced it + pub session_id: String, // tool-native session/thread id + pub seq: u64, // monotonic per-session, from file order + pub event_id: String, // tool-native id, or blake3(content) fallback + pub parent_id: Option<String>, // DAG link where the tool provides one + pub timestamp: OffsetDateTime, // RFC3339, verbatim + pub project: ProjectRef, // cwd / repo_root / git, from session start + pub kind: EventKind, // the payload + pub provenance: SourceLocation, // pointer back into the source bytes +} +``` + +`EventKind` is the payload enum. 
`EventKind::Unknown` is **load-bearing**: an +unrecognized record type or a new field is preserved verbatim and flagged, +never discarded — that is how the stream stays lossless across tool-version +churn. + +| `EventKind` variant | Meaning | +|---------------------|---------| +| `SessionStart` | cwd, git ref, model, tool version | +| `UserTurn` | a user message (flattened text + structured `Part`s) | +| `AssistantTurn` | an assistant message (text, thinking, model, usage, parts) | +| `ToolCall` | a tool invocation (`call_id`, name, raw args) | +| `ToolResult` | a tool result (`call_id`, `ok`, raw output) | +| `FileEdit` | a normalized `Diff` (from Edit/Write/apply_patch/replace) | +| `Compaction` | model-side history compaction — flagged, never stored as truth | +| `Rewind` | a user rewind back to an earlier event | +| `SessionEnd` | the session ended | +| `Unknown` | an unrecognized record, preserved verbatim and flagged | + +`SourceKind` enumerates the nine tools plus `Unknown`; `SourceKind::parse` maps +CLI/`--as` slugs (tolerant of aliases such as `claude` / `claude-code`). + +### Output: the prepared-node stream (`node.rs`) + +`PreparedNode` is the typed data a consumer ingests. It is a tagged enum: + +| `PreparedNode` variant | Payload | Meaning | +|------------------------|---------|---------| +| `Conversation` | `ConversationSpan` | a gated, verbatim dialogue span with the markers that fired | +| `Decision` | `DecisionRecord` | a deterministically-parsed decision (IBIS / QOC / MADR / Kruchten shape) | +| `Episode` | `CodeEpisode` | a code edit episode: path, `Diff`, git ref, deterministic `episode_id` | +| `Binding` | `BindingEdge` | a decision/conversation → episode edge carrying a `ProvRecord` | + +### Epistemic honesty: `FactStatus` + +Every node and edge carries a `FactStatus`. **Memscribe only ever emits the +first two**; the latter two are *flags* for a downstream inference layer — +values Memscribe never computes by guessing. 
This is the property that keeps the +module zero-LLM and its output golden-testable. + +| `FactStatus` | Who sets it | +|--------------|-------------| +| `Observed` | Memscribe — verbatim from the source | +| `DeterministicallyDerived` | Memscribe — a pure function of observed data | +| `StatisticallyRanked` | downstream — a statistical measure | +| `LlmHypothesis` | downstream — an LLM hypothesis; Memscribe only *flags* it | + +`ProvRecord` records `used(session, decision)` + `wasGeneratedBy(diff, session)` +with the temporal invariant `t_use ≤ t_gen` (`ProvRecord::is_temporally_valid`). + +--- + +## How to add a new adapter + +Adapters are the volatile part — every tool's format churns — so adding one is a +well-trodden, five-step path. The contract: a parser is **version-tolerant** +(it pattern-matches on the fields it needs and routes anything unrecognized to +`EventKind::Unknown`) and **must never panic**. + +1. **Add a `SourceKind` variant** (`memscribe-core/src/model.rs`). Wire its + stable snake_case slug into `SourceKind::as_str` and into `SourceKind::parse` + (include any aliases). This is the one allowed touch of `memscribe-core` for + a new tool — coordinate it, since the frozen contract is shared. + +2. **Add the adapter module** (`memscribe-adapters/src/<tool>.rs`) behind a + `#[cfg(feature = "<tool>")]` and a matching entry in the crate's `[features]` + table. Implement `TranscriptAdapter`: + - `source_kind()` — return your `SourceKind`. + - `discover(&DiscoverCfg)` — locate live & historical transcripts. Honor the + per-tool override key in `DiscoverCfg.overrides` (e.g. `CLAUDE_CONFIG_DIR`, + `CODEX_HOME`) and fall back to `cfg.home_dir()`. Return handles in a + deterministic (sorted) order. + - `parse(&RawRecord, &mut ParseCtx)` — turn ONE record into zero or more + `CaptureEvent`s. Use `ParseCtx::alloc_seq` for the monotonic `seq`, + `ParseCtx::first_seen` for dedup, and `ParseCtx::project_or_default` for the + project binding. 
Never panic; route unknowns to `EventKind::Unknown`. + - `schema_fingerprint(&RawRecord)` — return a `SchemaVariant` so the corpus + and runtime can version-gate the parser. + +3. **Register it** (`memscribe-adapters/src/registry.rs`). Add the cfg-gated + `push` in `all_adapters()` and the cfg-gated arm in `adapter_for()`. + +4. **Add fixtures** under `fixtures/<tool>/<version>/<scenario>.jsonl` for the + canonical scenarios in `memscribe-testkit::scenarios::SCENARIOS`, and bless + the expected outputs under `fixtures-expected/<tool>/<version>/` (see + [CONTRIBUTING.md](./CONTRIBUTING.md) for the capture → golden → bless flow). + +5. **Add tests.** Unit-test the parser; run the shared invariant checks from + `memscribe-testkit::invariants` (`check_monotonic_seq`, `check_lossless`, + `check_unique_event_ids`, `check_determinism`); and add a `cargo-fuzz` target + so the never-panic contract is enforced. Verify in isolation: + `cargo test -p memscribe-adapters --test <name>`. + +The conformance suite then asserts your tool normalizes the canonical scenarios +to the **same shape** as every other tool — that cross-tool equivalence is the +point of the thin-waist event model. diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..48e00ee --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,24 @@ +# Changelog + +All notable changes to Memscribe are documented here. The format follows +[Keep a Changelog](https://keepachangelog.com/), and the project adheres to +[Semantic Versioning](https://semver.org/). The event schema additionally +carries its own `schema_version` so the consumer layer (MemCortex) can refuse +or adapt to an incompatible event schema independently of the crate version. + +## [Unreleased] + +### Added +- **M1 — Core contract.** The frozen thin-waist: `CaptureEvent` / `EventKind` + normalized event model, `PreparedNode` output contract with `FactStatus`, + the `TranscriptAdapter` and `Sink` traits, and the deterministic pipeline + (Gate → Segmenter → Binder → NodePrep). 
+- **Adapters.** Claude Code, Codex CLI, Gemini CLI, OTel GenAI, plus + VS Code / Copilot / Cursor / Windsurf / Zed, each version-tolerant and + routing unknowns to `EventKind::Unknown`. +- **Sinks.** NDJSON (canonical default), SQLite, and a feature-gated MemDB sink. +- **IO sources.** notify-based file tailer with persisted byte-offset resume, + hook server, and an optional OTLP receiver. +- **CLI.** `watch`, `hook`, `parse`, `replay`, `verify`, `redact`. +- **Testkit.** Golden-file harness, cross-tool conformance suite, synthetic + generators, property tests, and fuzz targets. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..2f28d61 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,173 @@ +# Contributing to Memscribe + +Thanks for helping. Memscribe is the **deterministic, zero-LLM** data layer +beneath MemCortex (which Memtrace builds on). The bar for a change is unusual: the output must be +an *exact function* of the input, so the test suite is the contract. Read +[ARCHITECTURE.md](./ARCHITECTURE.md) first — it explains the pipeline and the +contract types you must not break. + +--- + +## Build & test + +The toolchain is pinned in `rust-toolchain.toml` (**1.96.0**); `rustup` picks it +up automatically. The MSRV is **1.96**. + +```console +# The whole gate, the way CI runs it: +cargo test --workspace --all-features --locked # unit + golden + conformance + property +cargo clippy --workspace --all-targets --all-features -- -D warnings +cargo fmt --all --check +cargo deny check # license + advisory policy (deny.toml) +``` + +When you are iterating on a single crate or a single integration test, build it +in **isolation** so you do not compile peers' in-progress test files: + +```console +cargo test -p memscribe-adapters --test <name> +cargo test -p memscribe-core +``` + +Do **not** run `cargo fmt` to reformat in a PR that you want reviewed cleanly — +write rustfmt-clean code and let `cargo fmt --all --check` verify it. 
+ +--- + +## The rules that make Memscribe Memscribe + +These are not style preferences; they are the invariants the test suite (and the +downstream consumer) rely on. A change that violates one is a bug. + +### 1. Determinism + +The same input bytes must always produce byte-identical output. No clocks, no +randomness, no hash-map iteration order leaking into output, no filesystem walk +order leaking into discovery (sort it). The property test +`invariants::check_determinism` runs two parses and asserts the serialized +events are identical — keep it green. + +### 2. No LLM, ever + +Capture is reading and parsing, never summarizing or inferring. Memscribe emits +nodes only with `FactStatus::Observed` (verbatim) or +`FactStatus::DeterministicallyDerived` (a pure function of observed data). +Anything that would require inference — fine-grained decision typing, concept +naming, statistical ranking — is **flagged** (`StatisticallyRanked`, +`LlmHypothesis`) for a downstream layer to compute, never guessed here. If you +find yourself reaching for a heuristic that "usually" gets it right, stop: that +belongs downstream. + +### 3. Losslessness + +Every non-blank source record maps to at least one event. Unrecognized records +and new fields are preserved verbatim and routed to `EventKind::Unknown` / +`Part::Other`, never dropped. `invariants::check_lossless` enforces the lower +bound. + +### 4. Monotonic, unique, idempotent + +`seq` is strictly increasing within a session and matches file order +(`check_monotonic_seq`). A record is deduplicated once on its tool-native +`event_id` (`check_unique_event_ids`), so re-ingesting the same input is +idempotent. + +### 5. Never panic + +A parser must never panic on any input — malformed, truncated, adversarial, or +from a tool version it has never seen. Every adapter parser has a `cargo-fuzz` +target that asserts this. Use the version-tolerant pattern: match the fields you +need, route the rest to `Unknown`. 
+ +--- + +## The adapter version-tolerance contract + +Tool transcript formats churn constantly; that is precisely why the adapters are +the open, community-versioned part of Memscribe. An adapter must: + +- **Pattern-match only the fields it needs** and route anything unrecognized to + `EventKind::Unknown` (with the raw record preserved) rather than failing the + stream. A new field in a record you understand must not break parsing. +- **Never panic** on any input (see rule 5). +- **Fingerprint** its input via `schema_fingerprint` → `SchemaVariant`, so the + corpus and runtime can version-gate the parser (e.g. `claude_code/2.1`, + `codex/rollout-v2`). When a tool ships an incompatible format, add a new + variant and a fixture under that version — do not silently widen the old one. +- **Honor `DiscoverCfg`** in `discover`: read the per-tool override key from + `DiscoverCfg.overrides` (e.g. `CLAUDE_CONFIG_DIR`, `CODEX_HOME`), fall back to + `cfg.home_dir()`, and return handles in a **sorted** (deterministic) order. + +The payoff is the conformance suite: the same canonical scenario, captured from +any tool, must normalize to the **same shape**. See ARCHITECTURE.md, "How to add +a new adapter," for the full five-step path. + +--- + +## The fixture-corpus workflow: capture → golden → bless + +Memscribe is tested the way a compiler is — fixtures in, exact expected output. +The corpus lives in two trees: + +```text +fixtures/<tool>/<version>/<scenario>.jsonl # input transcript +fixtures-expected/<tool>/<version>/<scenario>.events.json # expected CaptureEvent[] +fixtures-expected/<tool>/<version>/<scenario>.nodes.json # expected PreparedNode[] +``` + +The canonical scenario slugs are defined once in +`memscribe-testkit::scenarios::SCENARIOS` (e.g. +`happy_path_decision_then_edits`, `rejected_alternative`, `ban`, +`tool_failure`, …). Every tool's fixtures should cover them so the cross-tool +conformance suite can assert equivalence. + +**1. 
Capture.** Get a real (or hand-authored, minimal, redacted) transcript for +the scenario and drop it at `fixtures/<tool>/<version>/<scenario>.jsonl`. Real +captures must be scrubbed of secrets and personal paths first — run them through +the redactor and eyeball the result. For a live session you can snapshot: + +```console +memscribe verify --capture # (planned) snapshot a live session into a new fixture +``` + +**2. Golden.** Generate the expected output and inspect it by hand. The harness +parses with the adapter and runs the pipeline with **redaction off** so the +golden asserts on verbatim content: + +```console +# Eyeball the normalized events and prepared nodes for a fixture: +memscribe parse fixtures/<tool>/<version>/<scenario>.jsonl --as <tool> --no-redact +``` + +Confirm it satisfies the invariants and the scenario's stated expectation +(e.g. for `ban`, the `DecisionRecord.is_ban` flag is `true`). + +**3. Bless.** Write the reviewed output to the `fixtures-expected/` paths +(`*.events.json`, `*.nodes.json`). The golden tests use `insta`; accept a +reviewed snapshot with: + +```console +cargo insta review # interactively accept/reject changed snapshots +# or, after eyeballing the diff: +cargo insta accept +``` + +**Never bless a snapshot you have not read.** A blessed golden is a claim about +exactly what the deterministic pipeline produces; an unreviewed `accept` turns a +real regression into a permanent "expected" value. If a golden changes +unexpectedly, that is the suite doing its job — find out *why* before you +re-bless. + +--- + +## Submitting a change + +- Keep the change scoped; touch only the crate(s) you own. `memscribe-core` is a + frozen contract — changing a public type or its output ripples through every + consumer and every golden, so coordinate those separately. +- Make sure all five hard CI gates (test, clippy, rustfmt, cargo-deny, MSRV check) pass locally before you open the PR. +- Add or update fixtures + tests for any behavior change. A behavior change with + no golden delta is a red flag. 
+ +By contributing you agree your work is dual-licensed under **MIT OR Apache-2.0**, +matching the project. diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..1313272 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1691 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" + +[[package]] +name = "assert_cmd" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2aa3a22042e45de04255c7bf3626e239f450200fd0493c1e382263544b20aea6" +dependencies = [ + "anstyle", + "bstr", + "libc", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" 
+version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "block2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5" +dependencies = [ + "objc2", +] + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bytes" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" + +[[package]] +name = "cc" +version = "1.2.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e228eec9be7c17ccb640b59b36a5cd805ea2a564a4c5e162c2f659fea30d3b96" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "console" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" +dependencies = [ + "encode_unicode", + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies 
= [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "ctrlc" +version = "3.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0b1fab2ae45819af2d0731d60f2afe17227ebb1a1538a236da84c93e9a60162" +dependencies = [ + "dispatch2", + "nix", + "windows-sys 0.61.2", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "serde_core", +] + +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "dispatch2" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" +dependencies = [ + "bitflags 2.13.0", + "block2", + "libc", + "objc2", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "file-id" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1fc6a637b6dc58414714eddd9170ff187ecb0933d4c7024d1abbd23a3cc26e9" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "filetime" +version = "0.2.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759" +dependencies = [ + "cfg-if", + "libc", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", +] + +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + +[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" 
+dependencies = [ + "libc", +] + +[[package]] +name = "insta" +version = "1.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f0f8fee8c926415c58d6ae43a08523a26faccb2323f5e6b644fe7dd4ef6b82" +dependencies = [ + "console", + "once_cell", + "serde", + "similar", + "tempfile", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "kqueue" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "273c0752728918e0ac4976f2b275b6fefb9ecd400585dec929419f3844cd87b5" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07293a4e297ac234359b510362495713f75ea345d5307140414f20c69ffeb087" +dependencies = [ + "bitflags 2.13.0", + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libsqlite3-sys" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "memscribe-adapters" +version = "0.1.0" +dependencies = [ + "blake3", + "memscribe-core", + "proptest", + "regex", + "serde", + "serde_json", + "time", + "walkdir", + "zstd", +] + +[[package]] +name = "memscribe-cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "assert_cmd", + "clap", + "ctrlc", + "memscribe-adapters", + "memscribe-core", + "memscribe-io", + "memscribe-sink", + "memscribe-testkit", + "predicates", + "serde", + "serde_json", + "tempfile", + "toml", + "tracing", + "tracing-subscriber", + "zstd", +] + +[[package]] +name = "memscribe-core" +version = "0.1.0" +dependencies = [ + "blake3", + "proptest", + "regex", + "serde", + "serde_json", + "thiserror", + "time", +] + +[[package]] +name = "memscribe-io" +version = "0.1.0" +dependencies = [ + 
"memscribe-core", + "notify", + "notify-debouncer-full", + "rusqlite", + "serde", + "serde_json", + "tempfile", + "time", + "tokio", + "tracing", + "walkdir", + "zstd", +] + +[[package]] +name = "memscribe-sink" +version = "0.1.0" +dependencies = [ + "memscribe-core", + "rusqlite", + "serde", + "serde_json", + "tempfile", + "time", +] + +[[package]] +name = "memscribe-testkit" +version = "0.1.0" +dependencies = [ + "insta", + "memscribe-adapters", + "memscribe-core", + "memscribe-io", + "memscribe-sink", + "proptest", + "serde", + "serde_json", +] + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "nix" +version = "0.31.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf20d2fde8ff38632c426f1165ed7436270b44f199fc55284c38276f9db47c3d" +dependencies = [ + "bitflags 2.13.0", + "cfg-if", + "cfg_aliases", + "libc", +] + +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + +[[package]] +name = "notify" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +dependencies = [ + "bitflags 2.13.0", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio", + "walkdir", + "windows-sys 0.48.0", +] + +[[package]] +name = "notify-debouncer-full" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb7fd166739789c9ff169e654dc1501373db9d80a4c3f972817c8a4d7cf8f34e" +dependencies = [ + "crossbeam-channel", + "file-id", + "log", + "notify", + "parking_lot", + 
"walkdir", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num-conv" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "objc2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" +dependencies = [ + "objc2-encode", +] + +[[package]] +name = "objc2-encode" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "predicates" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ada8f2932f28a27ee7b70dd6c1c39ea0675c55a36879ab92f3a715eaa1e63cfe" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cad38746f3166b4031b1a0d39ad9f954dd291e7854fcc0eed52ee41a0b50d144" + +[[package]] +name = "predicates-tree" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0de1b847b39c8131db0467e9df1ff60e6d0562ab8e9a16e568ad0fdb372e2f2" +dependencies = [ + "predicates-core", + "termtree", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" 
+dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.13.0", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + 
+[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.13.0", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "rusqlite" +version = "0.32.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7753b721174eb8ff87a9a0e799e2d7bc3749323e773db92e0984debb00019d6e" +dependencies = [ + "bitflags 2.13.0", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags 2.13.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "similar" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + +[[package]] +name = "smallvec" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.3", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name 
= "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85c17d80feb7334b40c484e45ed1a5273dfd8bfda537c3be2e74a06a6686f327" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" + +[[package]] +name = "time-macros" +version = "0.2.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcef1a61bdb119096e153208ec5cbec23944ce8bca13be5c7f60c634f7403935" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "bytes", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + 
"serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = 
"0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.4+wasi-0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + 
"windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = 
"wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..af12e73 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,74 @@ +# Memscribe — a self-contained, open-source Rust workspace for deterministic, +# zero-LLM conversation capture from AI coding agents. +# +# The dependency direction is strictly one-way: Memtrace depends on Memscribe, +# never the reverse. 
See the implementation whitepaper, §2 and §9. + +[workspace] +resolver = "2" +members = [ + "crates/memscribe-core", + "crates/memscribe-adapters", + "crates/memscribe-io", + "crates/memscribe-sink", + "crates/memscribe-cli", + "crates/memscribe-testkit", +] +# The cargo-fuzz crate is a standalone, workspace-EXCLUDED member: it depends on +# libfuzzer-sys (a nightly-only sanitizer runtime) and must not be pulled into a +# plain `cargo build`/`cargo test` of the workspace. See fuzz/README.md. +exclude = ["fuzz"] + +[workspace.package] +version = "0.1.0" +edition = "2021" +rust-version = "1.96" +license = "MIT OR Apache-2.0" +repository = "https://github.com/Alex793x/Memscribe" +authors = ["Memrack / Syncable"] +description = "Deterministic, zero-LLM transcript capture and node preparation for AI coding agents." + +# Shared dependency versions. Pinned to major lines; exact versions resolve in +# Cargo.lock for reproducible builds (see the whitepaper's recommended stack). +[workspace.dependencies] +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1", features = ["preserve_order"] } +time = { version = "0.3", features = ["serde", "serde-well-known", "formatting", "parsing", "macros"] } +thiserror = "2" +blake3 = "1" +regex = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tokio = { version = "1", features = ["rt", "rt-multi-thread", "macros", "fs", "io-util", "sync", "time"] } +futures = "0.3" +notify = "6" +notify-debouncer-full = "0.3" +zstd = "0.13" +rusqlite = { version = "0.32", features = ["bundled"] } +clap = { version = "4", features = ["derive"] } +anyhow = "1" +similar = "2" +walkdir = "2" +# dev / test +proptest = "1" +insta = { version = "1", features = ["json"] } +assert_cmd = "2" +predicates = "3" +tempfile = "3" + +# Workspace-wide member crates depend on each other through these handles. 
+memscribe-core = { path = "crates/memscribe-core", version = "0.1.0" } +memscribe-adapters = { path = "crates/memscribe-adapters", version = "0.1.0" } +memscribe-io = { path = "crates/memscribe-io", version = "0.1.0" } +memscribe-sink = { path = "crates/memscribe-sink", version = "0.1.0" } +memscribe-testkit = { path = "crates/memscribe-testkit", version = "0.1.0" } + +[workspace.lints.rust] +unsafe_code = "forbid" + +[workspace.lints.clippy] +# Keep the deterministic core honest without drowning contributors in pedantry. +all = "warn" + +[profile.release] +lto = "thin" diff --git a/LICENSE b/LICENSE-APACHE similarity index 100% rename from LICENSE rename to LICENSE-APACHE diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..969d061 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..0f7aca7 --- /dev/null +++ b/README.md @@ -0,0 +1,190 @@ +
+ +Memscribe + +# Memscribe + +**Deterministic, zero-LLM conversation capture for AI coding agents.** + +Memscribe tails the transcript logs your AI coding agents already write — Claude Code, Codex, Gemini, Cursor, Windsurf, Zed, VS Code / Copilot, and any OpenTelemetry-instrumented agent — and prepares them into typed, queryable nodes. No model calls. Same bytes in, same nodes out, every time. + +[![CI](https://github.com/Alex793x/Memscribe/actions/workflows/ci.yml/badge.svg)](https://github.com/Alex793x/Memscribe/actions/workflows/ci.yml) +[![License: MIT OR Apache-2.0](https://img.shields.io/badge/license-MIT%20OR%20Apache--2.0-blue.svg)](#license) +[![Rust 1.96+](https://img.shields.io/badge/rust-1.96%2B-orange.svg)](rust-toolchain.toml) +[![Tests](https://img.shields.io/badge/tests-260%20passing-success.svg)](#testing) +[![Zero-LLM](https://img.shields.io/badge/LLM%20calls-0-1a1a2e.svg)](#why-deterministic-matters) + +
+ +--- + +## What it is + +A coding agent is a stream of decisions and edits — *"use Postgres instead of MySQL,"* followed by the diffs that implement it. That stream is gold for memory, audit, analytics, and replay, but it's buried in five different churning log formats. **Memscribe is the boring, deterministic half of a memory system:** it reads those logs, normalizes them, and emits typed nodes — and because it never calls a model, its output is an exact function of its input. + +That single property is the whole point. It makes capture **golden-file, property, and fuzz testable**, so the day a tool changes its format, the test suite fails loudly instead of silently corrupting your memory. + +### Where Memscribe sits + +Memscribe is the foundation of a three-layer stack. Each layer uses the one below it, and the dependency only ever points **downward**: + +| Layer | Role | Calls a model? | +|:------|:-----|:--------------:| +| [**Memtrace**](https://github.com/syncable-dev/memtrace) | The product — a code-intelligence graph with agent memory | — | +| **MemCortex** | Inference & governance — the judgment calls on top of the captured data | yes | +| **Memscribe** | Deterministic capture — normalizes transcripts into typed nodes *(this repo)* | **no** | + +Memtrace builds on MemCortex; MemCortex builds on Memscribe. Because Memscribe sits at the bottom, depends on nothing above it, and never calls a model, the boundary between the layers is a single stable data type — which is exactly what keeps this layer small, auditable, and exhaustively testable. + +## The pipeline + +One linear, deterministic pipeline. Every stage is a trait, so it can be tested in isolation and swapped. + +``` + Source (memscribe-io) Adapter (memscribe-adapters) + tail JSONL / hook stdin ─► parse_line ─► CaptureEvent[] + / OTLP receiver (version-tolerant) + │ RawRecord(bytes + provenance) │ normalized events + ▼ ▼ + Gate ─► Segmenter ─► Binder ─► NodePrep ─► Sink (memscribe-sink) + admit? 
arc / turn decision Prepared MemDB · ndjson · sqlite + markers spans ↔ edit Node +``` + +`Source → Adapter` produces a normalized `CaptureEvent` stream — the system of record. `Gate → Segmenter → Binder → NodePrep` turn that into `PreparedNode`s. The `Sink` writes them. Everything between Source and Sink is pure and synchronous given the event stream, which is what makes the whole thing golden-testable end to end. + +## Quick start + +```bash +# Parse a transcript to NDJSON (the workhorse — great for trying it out) +cargo run -p memscribe-cli -- parse ~/.claude/projects//.jsonl --as claude_code + +# Tail your agents live and write prepared nodes to a local SQLite store +cargo run -p memscribe-cli -- watch --tools claude,codex,gemini --sink sqlite --out memory.db + +# See exactly what the redaction pass would strip from a file +cargo run -p memscribe-cli -- redact session.jsonl +``` + +Every tool's transcript normalizes to the **same shape**. Here a Claude Code decision-and-edits session becomes four kinds of node: + +```jsonc +// memscribe parse fixtures/claude_code/2.0/happy_path_decision_then_edits.jsonl --as claude_code +{"node":"conversation","text":"Let's use Postgres instead of MySQL for the orders service.", + "markers":[{"rule_id":"decision_verb.use",...},{"rule_id":"rejection.instead_of",...}], "fact_status":"observed"} +{"node":"decision","epitome":"Let's use Postgres instead of MySQL ...", + "considered_options":[{"text":"MySQL","chosen":false},{"text":"Postgres","chosen":true}],"is_ban":false} +{"node":"episode","path":"src/db/config.rs","diff":{"added_lines":1,"removed_lines":1,...}} +{"node":"binding","relation":"produced","prov":{"t_use":"...10:00:00Z","t_gen":"...10:00:03Z"}, + "fact_status":"deterministically_derived","correlation":{...}} +``` + +## Supported tools + +Nine version-tolerant adapters, each behind a Cargo feature flag. 
Parsers pattern-match the fields they need and route anything unrecognized to `Unknown` — they never panic and never drop a record. + +| Tool | Transcript source | Status | +|:-----|:------------------|:-------| +| **Claude Code** | `~/.claude/projects//.jsonl` (append-only JSONL, DAG via `parentUuid`) | ✅ native | +| **Codex CLI** | `~/.codex/sessions/.../rollout-*.jsonl[.zst]` (`apply_patch` V4A diffs, transparent zstd) | ✅ native | +| **Gemini CLI** | `~/.gemini/tmp//chats/session-*.jsonl` (`$set` / `$rewindTo` control lines) | ✅ native | +| **OpenTelemetry** | OTLP / GenAI semconv records — the universal fallback for any instrumented agent | ✅ native | +| **Cursor** · **Windsurf** · **Zed** · **VS Code** · **Copilot** | exported chat JSON (desktop stores are SQLite/undocumented — export-based, per the whitepaper) | ✅ export-shape | + +All five **CLI/OTel** scenarios and the cross-tool conformance suite prove these adapters are interchangeable behind the contract. + +## Usable with MemDB — and fully usable without it + +The seam is the `Sink` trait. Nothing in the pipeline knows what a sink does with a node: + +```rust +pub trait Sink: Send { + fn emit(&mut self, node: &PreparedNode) -> Result<(), SinkError>; + fn flush(&mut self) -> Result<(), SinkError>; +} +``` + +| Sink | Feature | Use | +|:-----|:--------|:----| +| `NdjsonSink` | default | One JSON node per line — the canonical, audit-friendly default. | +| `SqliteSink` | default | A queryable local store with zero external services. | +| `MemDbSink` | `--features memdb` | Writes nodes into MemDB with bi-temporal headers, for Memtrace. **Off by default.** | + +Remove the `memdb` feature and Memscribe is a complete, auditable, local capture tool. See [`crates/memscribe-sink/MEMDB.md`](crates/memscribe-sink/MEMDB.md) for the integration design. + +## The output contract + +Memscribe only ever emits nodes with `Observed` or `DeterministicallyDerived` fact-status. 
Anything that would require inference (fine-grained decision typing, statistical ranking) is **flagged for a downstream layer, never guessed.** + +| Node | Meaning | Fact status | +|:-----|:--------|:------------| +| `Conversation` | A gated, verbatim dialogue span with the commitment markers that fired | `Observed` | +| `Decision` | Parsed deterministically (IBIS/QOC/MADR/Kruchten): epitome, options, `is_ban` | `Observed` | +| `Episode` | The edit(s): path, diff, git sha | `DeterministicallyDerived` | +| `Binding` | decision → episode, with PROV (`t_use ≤ t_gen`) + correlation tuple | `DeterministicallyDerived` | + +The **commitment-marker gate** (a config-driven, unit-tested rule table over decision verbs, rejections, bans, and imperatives) is the gate-before-store that the production audits showed is the difference between a working memory and a 97.8%-junk one. + +## Why deterministic matters + +| | Memscribe | LLM-based capture | +|:--|:--|:--| +| Output is a function of input | ✅ exact | ❌ varies run to run | +| Golden / property / fuzz testable | ✅ | ❌ | +| Cost per session | **$0.00** | API tokens | +| Reads your prompts & secrets | locally, redacted, auditable | sent to a model | +| Fails when a format changes | loudly (a test) | silently (bad data) | + +## Testing + +Because the pipeline is zero-LLM, it's tested the way a compiler is — fixtures in, exact expected output. The test corpus is a first-class deliverable. 
+ +- **Golden-file / snapshot** tests per tool, version, and scenario (`insta`) +- **Cross-tool conformance** — all 9 §8.2 scenarios (happy path, rejected alternative, ban, interleaved arcs, multi-edit, tool failure, rewind/compaction, subagent, no-marker) must normalize to the same shape regardless of tool +- **Property tests** (`proptest`): determinism, idempotency, monotonic seq, losslessness, gate purity, offset resumption +- **Fuzzing** (`cargo-fuzz`): one target per adapter — never panic, never loop, skip-and-continue +- **Redaction & privacy**, **crash/resume**, and a **cross-version corpus** + +```bash +cargo test --workspace --all-features # 260 tests +cargo clippy --workspace --all-targets --all-features -- -D warnings +cargo deny check +``` + +## CLI + +| Command | What it does | +|:--------|:-------------| +| `memscribe watch [--tools …] [--sink …] [--out …]` | The steady-state capture daemon: tail transcripts, write nodes. | +| `memscribe parse [--as ]` | One-shot parse a transcript to NDJSON (the workhorse for tests/debugging). | +| `memscribe replay ` | Re-run preparation over a historical session. | +| `memscribe verify [--capture --as ]` | Run the conformance summary; `--capture` snapshots a live session into a fixture. | +| `memscribe redact [--no-content]` | Show what the redaction pass would strip. | +| `memscribe hook` | The hook handler agents invoke (reads stdin, records, returns immediately). | + +Configure per-tool path overrides, a custom commitment-marker table, redaction patterns, and the sink target in `memscribe.toml` — see [`memscribe.example.toml`](memscribe.example.toml). + +## Workspace layout + +| Crate | Responsibility | +|:------|:---------------| +| [`memscribe-core`](crates/memscribe-core) | The contract: model, traits, gate, segmenter, binder, node-prep, redaction. Depends on nothing in the workspace. | +| [`memscribe-adapters`](crates/memscribe-adapters) | The 9 per-tool parsers, behind feature flags. 
| +| [`memscribe-io`](crates/memscribe-io) | Sources: file reader, crash-safe offset tailer, live notify watcher, hook handler, OTLP receiver. | +| [`memscribe-sink`](crates/memscribe-sink) | NDJSON, SQLite, and the feature-gated MemDB sink. | +| [`memscribe-cli`](crates/memscribe-cli) | The `memscribe` binary. | +| [`memscribe-testkit`](crates/memscribe-testkit) | Golden harness, conformance suite, synthetic generators, invariant checks. | + +See [`ARCHITECTURE.md`](ARCHITECTURE.md) for the deep dive and [`CONTRIBUTING.md`](CONTRIBUTING.md) to add an adapter. + +## Requirements + +- **Rust ≥ 1.96** (pinned in [`rust-toolchain.toml`](rust-toolchain.toml)) +- **Git** — for repo/branch binding on episodes +- No network in the core path; the optional OTLP receiver binds to loopback only. + +## License + +Dual-licensed under either of [MIT](LICENSE-MIT) or [Apache-2.0](LICENSE-APACHE), at your option. + +
Built by Memrack / Syncable · the deterministic half of the memory system.
diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..cb5aae9 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,27 @@ +# Security & Privacy + +Memscribe reads files that contain prompts, source code, and potentially +secrets (API keys, tokens, `.env` contents). It is designed to be safe for +security-conscious teams to run: + +- **Local-only, no network in the core path.** The Source → Adapter → Gate → + Segmenter → Binder → NodePrep → Sink pipeline performs no network I/O. The + optional OTLP receiver binds to loopback only. The optional MemDB sink is the + only component that talks to another process, and it is feature-gated off by + default. +- **Redaction on by default.** Known secret patterns (API keys, bearer tokens, + `.env` assignments, private-key blocks) are stripped before the Sink. See + `memscribe redact ` to preview exactly what would be removed. +- **`--no-content` mode.** Stores structure only (event kinds, spans, diffs + stats) with all verbatim text elided — for the most sensitive environments. +- **Honors tool suppression switches.** When a tool exposes a privacy switch + (e.g. `CLAUDE_CODE_SKIP_PROMPT_HISTORY`), Memscribe respects it and does not + capture the suppressed content. +- **Auditable.** Because the default sink is NDJSON and every node carries a + `SourceLocation` provenance pointer, you can audit exactly what was captured + and trace any node back to the byte range it came from. + +## Reporting a vulnerability + +Please report security issues privately to the maintainers via the repository's +security advisory channel rather than a public issue. 
diff --git a/assets/memscribe-logo.svg b/assets/memscribe-logo.svg new file mode 100644 index 0000000..d4ce547 --- /dev/null +++ b/assets/memscribe-logo.svg @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/crates/memscribe-adapters/Cargo.toml b/crates/memscribe-adapters/Cargo.toml new file mode 100644 index 0000000..825fc60 --- /dev/null +++ b/crates/memscribe-adapters/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "memscribe-adapters" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "Per-tool transcript adapters for Memscribe (Claude Code, Codex, Gemini, OTel, Cursor, Windsurf, Zed, VS Code, Copilot)." + +[lints] +workspace = true + +[features] +default = [ + "claude_code", + "codex", + "gemini", + "otel", + "cursor", + "windsurf", + "zed", + "vscode", + "copilot", +] +claude_code = [] +codex = ["dep:zstd"] +gemini = [] +otel = [] +cursor = [] +windsurf = [] +zed = [] +vscode = [] +copilot = [] + +[dependencies] +memscribe-core = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +regex = { workspace = true } +blake3 = { workspace = true } +walkdir = { workspace = true } +zstd = { workspace = true, optional = true } + +[dev-dependencies] +proptest = { workspace = true } +serde_json = { workspace = true } diff --git a/crates/memscribe-adapters/src/claude_code.rs b/crates/memscribe-adapters/src/claude_code.rs new file mode 100644 index 0000000..3374e48 --- /dev/null +++ b/crates/memscribe-adapters/src/claude_code.rs @@ -0,0 +1,844 @@ +//! Claude Code adapter. +//! +//! Transcripts: `~/.claude/projects//.jsonl`, append-only JSONL, +//! DAG via `parentUuid`. Dialogue: `type:user`/`assistant` records; +//! `message.content[]` text/thinking/tool_use/tool_result; `model`, `usage`. +//! Edits: `toolUseResult.structuredPatch` (Edit/Write/MultiEdit) → old/new/ +//! 
unified; `file-history-snapshot` baseline. Quirks: dedup by `uuid`; a +//! session's summary may live in another file (join by `leafUuid`); subagents +//! are separate `isSidechain` files. +//! +//! One on-disk record can normalize to several events: an assistant record with +//! a `tool_use` block emits an [`EventKind::AssistantTurn`] plus an +//! [`EventKind::ToolCall`]; a user record with a `tool_result` block emits an +//! [`EventKind::UserTurn`] plus an [`EventKind::ToolResult`]; an edit record's +//! top-level `toolUseResult.structuredPatch` emits an [`EventKind::FileEdit`]. +//! Every event for one record shares that record's `uuid` lineage; secondary +//! events derive a deterministic id from `uuid` + a stable discriminator so they +//! never collide. The whole record is deduplicated once, on its `uuid`. + +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, Part, + ProjectRef, RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, Usage, +}; +use std::path::PathBuf; + +const SOURCE: SourceKind = SourceKind::ClaudeCode; + +/// Adapter for Anthropic Claude Code transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct ClaudeCodeAdapter; + +impl TranscriptAdapter for ClaudeCodeAdapter { + fn source_kind(&self) -> SourceKind { + SOURCE + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + discover_transcripts(cfg) + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + // A blank line or non-JSON line carries nothing; stay lossless via the + // shared stub (which routes non-JSON to Unknown and skips blanks). 
+ let Some(value) = util::parse_json_line(raw) else { + return util::stub_parse(SOURCE, raw, ctx); + }; + Ok(parse_record(raw, ctx, &value)) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + fingerprint(sample) + } +} + +// --------------------------------------------------------------------------- +// Discovery +// --------------------------------------------------------------------------- + +/// Discover `/projects//.jsonl` transcripts. The config +/// dir is `CLAUDE_CONFIG_DIR` (override) else `/.claude`. +fn discover_transcripts(cfg: &DiscoverCfg) -> Vec { + let base = cfg + .overrides + .get("CLAUDE_CONFIG_DIR") + .cloned() + .unwrap_or_else(|| cfg.home_dir().join(".claude")); + let projects = base.join("projects"); + + let mut out = Vec::new(); + for entry in walkdir::WalkDir::new(&projects) + .into_iter() + .filter_map(Result::ok) + { + let path = entry.path(); + if !path.is_file() { + continue; + } + if path.extension().and_then(|e| e.to_str()) != Some("jsonl") { + continue; + } + let session_hint = path + .file_stem() + .and_then(|s| s.to_str()) + .map(str::to_string); + out.push(TranscriptHandle { + path: path.to_path_buf(), + source: SOURCE, + session_hint, + compressed: false, + }); + } + // Deterministic order regardless of filesystem walk order. + out.sort_by(|a, b| a.path.cmp(&b.path)); + out +} + +// --------------------------------------------------------------------------- +// Fingerprinting +// --------------------------------------------------------------------------- + +/// Fingerprint a sample record. Claude Code records are JSON objects carrying a +/// `type` plus `uuid`/`message`/`parentUuid` shape and (modern) a top-level +/// `version`. 
+fn fingerprint(sample: &RawRecord) -> SchemaVariant { + let Some(value) = util::parse_json_line(sample) else { + return SchemaVariant::unknown(SOURCE); + }; + let has_type = value.get("type").and_then(|v| v.as_str()).is_some(); + let looks_claude = value.get("uuid").is_some() + || value.get("parentUuid").is_some() + || value.get("sessionId").is_some() + || value.get("message").is_some(); + if !has_type || !looks_claude { + return SchemaVariant::unknown(SOURCE); + } + // The `2.x` line stamps a top-level `version`; older lines do not. + let variant = match value.get("version").and_then(|v| v.as_str()) { + Some(v) if v.starts_with("2.") => "claude_code/2.0", + Some(_) => "claude_code/1.x", + None => "claude_code/unknown", + }; + SchemaVariant::certain(SOURCE, variant) +} + +// --------------------------------------------------------------------------- +// Parsing +// --------------------------------------------------------------------------- + +/// Parse one record into zero or more normalized events. +fn parse_record( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, +) -> Vec { + // Learn the session id the first time we see it (subagents keep their own + // distinct sessionId — we never merge sidechains into the parent session). + if ctx.session_id.is_none() { + if let Some(sid) = value.get("sessionId").and_then(|v| v.as_str()) { + ctx.session_id = Some(sid.to_string()); + } + } + + // The record's native id and DAG parent. + let uuid = value + .get("uuid") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(|| content_id(&raw.bytes)); + let parent_uuid = value + .get("parentUuid") + .and_then(|v| v.as_str()) + .map(str::to_string); + + // Idempotency: a repeated record (same uuid) yields nothing on the replay. 
+ if !ctx.first_seen(&uuid) { + return Vec::new(); + } + + let ts = util::ts_from(value, &["timestamp", "time", "ts"]); + let rec_type = value.get("type").and_then(|v| v.as_str()).unwrap_or(""); + + // The very first record opens the session: capture cwd / git / model / + // version as a SessionStart, then continue parsing the same record's body. + let mut events: Vec = Vec::new(); + let is_session_start = ctx.project.is_none() && session_startable(rec_type); + if is_session_start { + let project = project_from(value); + ctx.project = Some(project.clone()); + let model = string_field(value, "model").or_else(|| { + value + .get("message") + .and_then(|m| m.get("model")) + .and_then(|v| v.as_str()) + .map(str::to_string) + }); + let tool_version = string_field(value, "version"); + events.push(util::mk_event( + SOURCE, + ctx, + raw, + session_start_id(&uuid), + parent_uuid.clone(), + ts, + EventKind::SessionStart { + cwd: project.cwd.clone(), + git: project.git.clone(), + model, + tool_version, + }, + )); + } + + match rec_type { + "user" => parse_turn(raw, ctx, value, &uuid, parent_uuid, ts, false, &mut events), + "assistant" => parse_turn(raw, ctx, value, &uuid, parent_uuid, ts, true, &mut events), + "summary" | "file-history-snapshot" | "system" => { + // Recognized container records we do not normalize into a turn: + // keep them lossless as Unknown (a summary explicitly maps to + // Unknown per the format spec). + events.push(util::unknown_event(SOURCE, ctx, raw, value.clone())); + } + _ => { + events.push(util::unknown_event(SOURCE, ctx, raw, value.clone())); + } + } + + events +} + +/// Whether a record type can open a session (only the dialogue records carry the +/// cwd/git/version we bind the project from). +fn session_startable(rec_type: &str) -> bool { + matches!(rec_type, "user" | "assistant" | "system") +} + +/// Parse a `user`/`assistant` record body into a turn plus any embedded +/// tool_use / tool_result / file-edit events. 
+#[allow(clippy::too_many_arguments)] +fn parse_turn( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, + uuid: &str, + parent_uuid: Option, + ts: memscribe_core::Timestamp, + is_assistant: bool, + events: &mut Vec, +) { + let message = value.get("message"); + let blocks = message.and_then(|m| m.get("content")); + + let mut text = String::new(); + let mut thinking = String::new(); + let mut parts: Vec = Vec::new(); + // Tool calls / results discovered inside the content blocks; emitted after + // the turn so the turn always precedes its embedded tool events. + let mut tool_calls: Vec<(String, String, serde_json::Value)> = Vec::new(); + let mut tool_results: Vec<(String, bool, serde_json::Value)> = Vec::new(); + + match blocks { + // content as a plain string. + Some(serde_json::Value::String(s)) => { + push_text(&mut text, s); + parts.push(Part::Text { text: s.clone() }); + } + // content as an array of typed blocks. + Some(serde_json::Value::Array(arr)) => { + for block in arr { + let btype = block.get("type").and_then(|v| v.as_str()).unwrap_or(""); + match btype { + "text" => { + let t = block.get("text").and_then(|v| v.as_str()).unwrap_or(""); + push_text(&mut text, t); + parts.push(Part::Text { + text: t.to_string(), + }); + } + "thinking" => { + let t = block.get("thinking").and_then(|v| v.as_str()).unwrap_or(""); + push_text(&mut thinking, t); + parts.push(Part::Thinking { + text: t.to_string(), + }); + } + "tool_use" => { + let call_id = block + .get("id") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let name = block + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let args = block + .get("input") + .cloned() + .unwrap_or(serde_json::Value::Null); + parts.push(Part::ToolUse { + call_id: call_id.clone(), + name: name.clone(), + args: args.clone(), + }); + tool_calls.push((call_id, name, args)); + } + "tool_result" => { + let call_id = block + .get("tool_use_id") + .and_then(|v| v.as_str()) + 
.unwrap_or("") + .to_string(); + let is_error = block + .get("is_error") + .and_then(serde_json::Value::as_bool) + .unwrap_or(false); + let output = block + .get("content") + .cloned() + .unwrap_or(serde_json::Value::Null); + parts.push(Part::ToolResult { + call_id: call_id.clone(), + output: output.clone(), + }); + tool_results.push((call_id, !is_error, output)); + } + "image" => { + let media_type = block + .get("source") + .and_then(|s| s.get("media_type")) + .and_then(|v| v.as_str()) + .map(str::to_string); + parts.push(Part::Image { media_type }); + } + _ => { + parts.push(Part::Other { raw: block.clone() }); + } + } + } + } + _ => {} + } + + // The turn event itself, carrying the record's native uuid. + if is_assistant { + let model = message + .and_then(|m| m.get("model")) + .and_then(|v| v.as_str()) + .map(str::to_string) + .or_else(|| string_field(value, "model")); + let usage = message.and_then(|m| m.get("usage")).map(usage_from); + events.push(util::mk_event( + SOURCE, + ctx, + raw, + uuid.to_string(), + parent_uuid.clone(), + ts, + EventKind::AssistantTurn { + text, + thinking: if thinking.is_empty() { + None + } else { + Some(thinking) + }, + model, + usage, + parts, + }, + )); + } else { + events.push(util::mk_event( + SOURCE, + ctx, + raw, + uuid.to_string(), + parent_uuid.clone(), + ts, + EventKind::UserTurn { text, parts }, + )); + } + + // Embedded tool calls (assistant) — one ToolCall per tool_use block. + for (idx, (call_id, name, args)) in tool_calls.into_iter().enumerate() { + events.push(util::mk_event( + SOURCE, + ctx, + raw, + derived_id(uuid, "tool_call", idx), + Some(uuid.to_string()), + ts, + EventKind::ToolCall { + call_id, + name, + args, + }, + )); + } + + // Embedded tool results (user) — one ToolResult per tool_result block. 
+ for (idx, (call_id, ok, output)) in tool_results.into_iter().enumerate() { + events.push(util::mk_event( + SOURCE, + ctx, + raw, + derived_id(uuid, "tool_result", idx), + Some(uuid.to_string()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )); + } + + // A top-level `toolUseResult` with a structured patch is a file edit. + if let Some(edit) = parse_file_edit(value) { + events.push(util::mk_event( + SOURCE, + ctx, + raw, + derived_id(uuid, "file_edit", 0), + Some(uuid.to_string()), + ts, + edit, + )); + } +} + +/// Build a [`EventKind::FileEdit`] from a record's top-level `toolUseResult`, +/// when it carries a `structuredPatch`. Returns `None` otherwise. +fn parse_file_edit(value: &serde_json::Value) -> Option { + let tur = value.get("toolUseResult")?; + let patch = tur.get("structuredPatch")?.as_array()?; + + let path = tur + .get("filePath") + .and_then(|v| v.as_str()) + .map(PathBuf::from) + .unwrap_or_default(); + let old = tur + .get("oldString") + .and_then(|v| v.as_str()) + .map(str::to_string); + let new = tur + .get("newString") + .and_then(|v| v.as_str()) + .map(str::to_string); + + let mut unified_lines: Vec = Vec::new(); + let mut added: u32 = 0; + let mut removed: u32 = 0; + for hunk in patch { + if let Some(lines) = hunk.get("lines").and_then(|v| v.as_array()) { + for line in lines { + if let Some(s) = line.as_str() { + if let Some(first) = s.chars().next() { + if first == '+' { + added += 1; + } else if first == '-' { + removed += 1; + } + } + unified_lines.push(s.to_string()); + } + } + } + } + let unified = if unified_lines.is_empty() { + None + } else { + Some(unified_lines.join("\n")) + }; + + // The originating tool call id. An edit's structuredPatch arrives on the + // tool_result record, so the call id is the content block's `tool_use_id` + // (a `tool_result`) — or, when the patch is colocated with the call, the + // `tool_use` block's `id`. Either block type resolves the same call. 
+ let call_id = value + .get("message") + .and_then(|m| m.get("content")) + .and_then(|c| c.as_array()) + .and_then(|arr| { + arr.iter() + .find_map(|b| match b.get("type").and_then(|v| v.as_str()) { + Some("tool_result") => b + .get("tool_use_id") + .and_then(|v| v.as_str()) + .map(str::to_string), + Some("tool_use") => b.get("id").and_then(|v| v.as_str()).map(str::to_string), + _ => None, + }) + }); + + Some(EventKind::FileEdit { + call_id, + diff: Diff { + path, + old, + new, + unified, + added_lines: added, + removed_lines: removed, + }, + }) +} + +// --------------------------------------------------------------------------- +// Small deterministic helpers +// --------------------------------------------------------------------------- + +/// Build the project binding from a session-opening record's cwd / git fields. +fn project_from(value: &serde_json::Value) -> ProjectRef { + let cwd = value + .get("cwd") + .and_then(|v| v.as_str()) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + let branch = value + .get("gitBranch") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_string); + let sha = value + .get("gitSha") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_default(); + let git = if branch.is_some() || !sha.is_empty() { + Some(GitRef { sha, branch }) + } else { + None + }; + ProjectRef { + cwd, + repo_root: None, + git, + } +} + +/// Read a top-level string field. +fn string_field(value: &serde_json::Value, key: &str) -> Option { + value.get(key).and_then(|v| v.as_str()).map(str::to_string) +} + +/// Build a [`Usage`] from a Claude Code `usage` object. 
+fn usage_from(u: &serde_json::Value) -> Usage { + let g = |k: &str| u.get(k).and_then(serde_json::Value::as_u64); + Usage { + input_tokens: g("input_tokens"), + output_tokens: g("output_tokens"), + cache_read_tokens: g("cache_read_input_tokens"), + cache_creation_tokens: g("cache_creation_input_tokens"), + } +} + +/// Append `s` to a text accumulator, separating multiple text blocks by a single +/// newline so concatenation stays readable and deterministic. +fn push_text(acc: &mut String, s: &str) { + if s.is_empty() { + return; + } + if !acc.is_empty() { + acc.push('\n'); + } + acc.push_str(s); +} + +/// A deterministic event id for the SessionStart synthesized from a record. +fn session_start_id(uuid: &str) -> String { + content_id(format!("claude_code/session_start/{uuid}").as_bytes()) +} + +/// A deterministic, collision-free id for a secondary event derived from a +/// record's `uuid` (the primary turn keeps the bare `uuid`). +fn derived_id(uuid: &str, kind: &str, idx: usize) -> String { + content_id(format!("claude_code/{kind}/{idx}/{uuid}").as_bytes()) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("session.jsonl", 0, 1)) + } + + fn parse_all(lines: &[&str]) -> Vec { + let adapter = ClaudeCodeAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for line in lines { + let evs = adapter.parse(&raw(line), &mut ctx).expect("never errors"); + out.extend(evs); + } + out + } + + fn tags(evs: &[CaptureEvent]) -> Vec<&'static str> { + evs.iter().map(|e| e.kind.tag()).collect() + } + + // --- TDD: the normalized sequence for a small dialogue -------------------- + + #[test] + fn first_user_record_yields_session_start_then_user_turn() { + let line = 
r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","gitBranch":"main","version":"2.0.1","message":{"role":"user","content":"Let's use Postgres instead of MySQL."}}"#; + let evs = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["session_start", "user_turn"]); + // Session + project were learned from the first record. + assert_eq!(evs[1].session_id, "s1"); + match &evs[0].kind { + EventKind::SessionStart { + cwd, + git, + tool_version, + .. + } => { + assert_eq!(cwd.to_str(), Some("/repo")); + assert_eq!(git.as_ref().and_then(|g| g.branch.as_deref()), Some("main")); + assert_eq!(tool_version.as_deref(), Some("2.0.1")); + } + other => panic!("expected session_start, got {other:?}"), + } + match &evs[1].kind { + EventKind::UserTurn { text, .. } => { + assert_eq!(text, "Let's use Postgres instead of MySQL."); + } + other => panic!("expected user_turn, got {other:?}"), + } + } + + #[test] + fn assistant_with_tool_use_yields_turn_then_tool_call() { + let session = r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","gitBranch":"main","version":"2.0.1","message":{"role":"user","content":"go"}}"#; + let asst = r#"{"type":"assistant","uuid":"a1","parentUuid":"u1","timestamp":"2026-06-22T10:00:01Z","sessionId":"s1","message":{"role":"assistant","model":"claude-opus-4-8","usage":{"input_tokens":10,"output_tokens":5,"cache_read_input_tokens":2,"cache_creation_input_tokens":1},"content":[{"type":"text","text":"Editing now."},{"type":"tool_use","id":"call_1","name":"Edit","input":{"file_path":"/repo/a.rs"}}]}}"#; + let evs = parse_all(&[session, asst]); + assert_eq!( + tags(&evs), + vec!["session_start", "user_turn", "assistant_turn", "tool_call"] + ); + match &evs[2].kind { + EventKind::AssistantTurn { + text, model, usage, .. 
+ } => { + assert_eq!(text, "Editing now."); + assert_eq!(model.as_deref(), Some("claude-opus-4-8")); + let u = usage.as_ref().expect("usage present"); + assert_eq!(u.input_tokens, Some(10)); + assert_eq!(u.cache_read_tokens, Some(2)); + assert_eq!(u.cache_creation_tokens, Some(1)); + } + other => panic!("expected assistant_turn, got {other:?}"), + } + match &evs[3].kind { + EventKind::ToolCall { call_id, name, .. } => { + assert_eq!(call_id, "call_1"); + assert_eq!(name, "Edit"); + } + other => panic!("expected tool_call, got {other:?}"), + } + // The secondary tool_call carries a distinct, deterministic id. + assert_ne!(evs[2].event_id, evs[3].event_id); + assert_eq!(evs[3].parent_id.as_deref(), Some("a1")); + } + + #[test] + fn tool_result_record_yields_user_turn_then_tool_result() { + let session = r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","version":"2.0.1","message":{"role":"user","content":"go"}}"#; + let res = r#"{"type":"user","uuid":"u2","parentUuid":"a1","timestamp":"2026-06-22T10:00:02Z","sessionId":"s1","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"call_1","content":"ok","is_error":false}]}}"#; + let evs = parse_all(&[session, res]); + // The session record emits session_start + user_turn; the tool_result + // record emits its own (empty) user_turn carrier + the tool_result. + assert_eq!( + tags(&evs), + vec!["session_start", "user_turn", "user_turn", "tool_result"] + ); + let res_ev = evs.iter().find(|e| e.kind.tag() == "tool_result").unwrap(); + match &res_ev.kind { + EventKind::ToolResult { call_id, ok, .. 
} => { + assert_eq!(call_id, "call_1"); + assert!(*ok); + } + other => panic!("expected tool_result, got {other:?}"), + } + } + + // --- TDD: a decision + an edit produces UserTurn then FileEdit ------------ + + #[test] + fn decision_then_edit_yields_user_turn_then_file_edit() { + let decision = r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","gitBranch":"main","version":"2.0.1","message":{"role":"user","content":"Let's use Postgres instead of MySQL."}}"#; + let edit = r#"{"type":"assistant","uuid":"a1","parentUuid":"u1","timestamp":"2026-06-22T10:00:01Z","sessionId":"s1","message":{"role":"assistant","model":"claude-opus-4-8","content":[{"type":"tool_use","id":"call_1","name":"Edit","input":{"file_path":"/repo/db.rs"}}]},"toolUseResult":{"filePath":"/repo/db.rs","oldString":"mysql","newString":"postgres","structuredPatch":[{"oldStart":1,"oldLines":1,"newStart":1,"newLines":1,"lines":["-mysql","+postgres"]}]}}"#; + let evs = parse_all(&[decision, edit]); + let t = tags(&evs); + // user_turn appears (the decision), and a file_edit appears. + assert!(t.contains(&"user_turn"), "tags: {t:?}"); + assert!(t.contains(&"file_edit"), "tags: {t:?}"); + // The user_turn precedes the file_edit. 
+ let ut = t.iter().position(|x| *x == "user_turn").unwrap(); + let fe = t.iter().position(|x| *x == "file_edit").unwrap(); + assert!(ut < fe, "user_turn must precede file_edit: {t:?}"); + + let edit_ev = evs.iter().find(|e| e.kind.tag() == "file_edit").unwrap(); + match &edit_ev.kind { + EventKind::FileEdit { call_id, diff } => { + assert_eq!(call_id.as_deref(), Some("call_1")); + assert_eq!(diff.path.to_str(), Some("/repo/db.rs")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + assert_eq!(diff.old.as_deref(), Some("mysql")); + assert_eq!(diff.new.as_deref(), Some("postgres")); + assert_eq!(diff.unified.as_deref(), Some("-mysql\n+postgres")); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + // --- TDD: tool failure → FileEdit references a failing result ------------- + + #[test] + fn failed_edit_keeps_call_id_so_segmenter_can_drop_it() { + // The FileEdit must carry the call_id, and a sibling tool_result with + // ok=false must exist, so the segmenter drops the episode. 
+ let session = r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","version":"2.0.1","message":{"role":"user","content":"edit"}}"#; + let edit = r#"{"type":"assistant","uuid":"a1","parentUuid":"u1","timestamp":"2026-06-22T10:00:01Z","sessionId":"s1","message":{"role":"assistant","content":[{"type":"tool_use","id":"call_x","name":"Edit","input":{}}]},"toolUseResult":{"filePath":"/repo/x.rs","oldString":"a","newString":"b","structuredPatch":[{"oldStart":1,"oldLines":1,"newStart":1,"newLines":1,"lines":["-a","+b"]}]}}"#; + let fail = r#"{"type":"user","uuid":"u2","parentUuid":"a1","timestamp":"2026-06-22T10:00:02Z","sessionId":"s1","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"call_x","content":"boom","is_error":true}]}}"#; + let evs = parse_all(&[session, edit, fail]); + + let edit_ev = evs.iter().find(|e| e.kind.tag() == "file_edit").unwrap(); + let edit_call = match &edit_ev.kind { + EventKind::FileEdit { call_id, .. } => call_id.clone(), + _ => unreachable!(), + }; + assert_eq!(edit_call.as_deref(), Some("call_x")); + + let res_ev = evs.iter().find(|e| e.kind.tag() == "tool_result").unwrap(); + match &res_ev.kind { + EventKind::ToolResult { call_id, ok, .. } => { + assert_eq!(call_id, "call_x"); + assert!(!ok, "the failing edit's result must be ok=false"); + } + _ => unreachable!(), + } + } + + // --- TDD: never panics on garbage ---------------------------------------- + + #[test] + fn garbage_input_never_panics_and_is_lossless() { + let adapter = ClaudeCodeAdapter; + let mut ctx = ParseCtx::new(); + // Not JSON at all. + let e1 = adapter.parse(&raw("}{ not json"), &mut ctx).unwrap(); + assert_eq!(e1.len(), 1); + assert_eq!(e1[0].kind.tag(), "unknown"); + // Valid JSON, unrecognized shape. + let e2 = adapter + .parse(&raw(r#"{"hello":"world"}"#), &mut ctx) + .unwrap(); + assert_eq!(e2.len(), 1); + assert_eq!(e2[0].kind.tag(), "unknown"); + // Blank line yields nothing. 
+ let e3 = adapter.parse(&raw(" "), &mut ctx).unwrap(); + assert!(e3.is_empty()); + // A record with a non-string/array content does not panic. + let e4 = adapter + .parse( + &raw(r#"{"type":"user","uuid":"z","message":{"content":42}}"#), + &mut ctx, + ) + .unwrap(); + assert!(!e4.is_empty()); + } + + // --- TDD: dedup / idempotency on a repeated record ----------------------- + + #[test] + fn repeated_uuid_is_deduped_to_empty() { + let adapter = ClaudeCodeAdapter; + let mut ctx = ParseCtx::new(); + let line = r#"{"type":"user","uuid":"dup","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","version":"2.0.1","message":{"role":"user","content":"hi"}}"#; + let first = adapter.parse(&raw(line), &mut ctx).unwrap(); + assert!(!first.is_empty()); + let second = adapter.parse(&raw(line), &mut ctx).unwrap(); + assert!(second.is_empty(), "a repeated uuid must yield nothing"); + } + + // --- summary → Unknown ---------------------------------------------------- + + #[test] + fn summary_record_is_unknown() { + let line = r#"{"type":"summary","summary":"A recap","leafUuid":"x"}"#; + let evs = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["unknown"]); + } + + // --- ban turn surfaces as a UserTurn (gate runs downstream) --------------- + + #[test] + fn ban_turn_is_a_user_turn() { + let line = r#"{"type":"user","uuid":"b1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","version":"2.0.1","message":{"role":"user","content":"We will never add a dependency on left-pad."}}"#; + let evs = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["session_start", "user_turn"]); + match &evs[1].kind { + EventKind::UserTurn { text, .. 
} => { + assert!(text.contains("never add a dependency")); + } + other => panic!("expected user_turn, got {other:?}"), + } + } + + // --- determinism ---------------------------------------------------------- + + #[test] + fn parsing_is_deterministic() { + let line = r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","gitBranch":"main","version":"2.0.1","message":{"role":"user","content":"Let's use Postgres."}}"#; + let a = parse_all(&[line]); + let b = parse_all(&[line]); + assert_eq!(a, b); + } + + // --- fingerprint ---------------------------------------------------------- + + #[test] + fn fingerprint_recognizes_claude_2x() { + let adapter = ClaudeCodeAdapter; + let v = adapter.schema_fingerprint(&raw( + r#"{"type":"user","uuid":"u1","sessionId":"s1","version":"2.0.1","message":{"content":"hi"}}"#, + )); + assert_eq!(v.source, SourceKind::ClaudeCode); + assert_eq!(v.variant, "claude_code/2.0"); + assert_eq!(v.confidence, 100); + } + + #[test] + fn fingerprint_unknown_for_foreign_json() { + let adapter = ClaudeCodeAdapter; + let v = adapter.schema_fingerprint(&raw(r#"{"foo":"bar"}"#)); + assert_eq!(v.confidence, 0); + } + + // An edit's structuredPatch arrives on the tool_result record, so the + // FileEdit must resolve its call_id from the `tool_result` block's + // `tool_use_id` — this is what lets the segmenter drop a failed edit. + #[test] + fn file_edit_on_result_record_resolves_call_id_from_tool_result_block() { + let session = r#"{"type":"user","uuid":"u1","parentUuid":null,"timestamp":"2026-06-22T10:00:00Z","sessionId":"s1","cwd":"/repo","version":"2.0.1","message":{"role":"user","content":"go"}}"#; + // The structuredPatch is colocated with the tool_result block (no + // tool_use block on this record), exactly as Claude Code writes it. 
+ let result = r#"{"type":"user","uuid":"u2","parentUuid":"a1","timestamp":"2026-06-22T10:00:02Z","sessionId":"s1","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"call_z","content":"ok","is_error":false}]},"toolUseResult":{"filePath":"/repo/z.rs","oldString":"a","newString":"b","structuredPatch":[{"oldStart":1,"oldLines":1,"newStart":1,"newLines":1,"lines":["-a","+b"]}]}}"#; + let evs = parse_all(&[session, result]); + let edit = evs.iter().find(|e| e.kind.tag() == "file_edit").unwrap(); + match &edit.kind { + EventKind::FileEdit { call_id, .. } => { + assert_eq!(call_id.as_deref(), Some("call_z")); + } + other => panic!("expected file_edit, got {other:?}"), + } + } +} diff --git a/crates/memscribe-adapters/src/codex.rs b/crates/memscribe-adapters/src/codex.rs new file mode 100644 index 0000000..ca54f95 --- /dev/null +++ b/crates/memscribe-adapters/src/codex.rs @@ -0,0 +1,1054 @@ +//! Codex CLI adapter. +//! +//! Transcripts: `~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl[.zst]`, +//! `{timestamp,type,payload}` JSONL. Dialogue: `response_item.message` +//! (role user/assistant), `function_call`/`function_call_output` paired by +//! `call_id`; `session_meta.git`. Edits: `function_call` `name=apply_patch` → +//! V4A diff in `arguments` → parse to `Diff`. Quirks: handle `.jsonl.zst`; the +//! protocol enum ≠ wire format (build to wire data); `history.jsonl` ≠ rollouts; +//! files may be `0644` (secrets). +//! +//! The io reader decompresses `.zst` before records reach `parse`, so this +//! module only ever sees plain JSON lines. Every record maps to zero or more +//! [`CaptureEvent`]s; any shape we do not recognize is routed to +//! [`memscribe_core::EventKind::Unknown`] via [`util::unknown_event`] so the +//! stream stays lossless. 
+ +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, Part, + ProjectRef, RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, +}; +use std::path::PathBuf; + +/// Adapter for OpenAI Codex CLI transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct CodexAdapter; + +impl TranscriptAdapter for CodexAdapter { + fn source_kind(&self) -> SourceKind { + SourceKind::Codex + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + discover_rollouts(cfg) + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + // Blank lines carry nothing; skip them (the io layer may hand us trailers). + let Some(value) = util::parse_json_line(raw) else { + return Ok(Vec::new()); + }; + Ok(parse_record(raw, ctx, &value)) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + match util::parse_json_line(sample) { + Some(v) if is_codex_record(&v) => { + SchemaVariant::certain(SourceKind::Codex, "codex/rollout-v2") + } + _ => SchemaVariant::unknown(SourceKind::Codex), + } + } +} + +/// A record is recognizably a Codex rollout line if it carries a top-level +/// `type` and `payload`. We keep this lenient so version churn still fingerprints. +fn is_codex_record(value: &serde_json::Value) -> bool { + value.get("type").and_then(|t| t.as_str()).is_some() && value.get("payload").is_some() +} + +/// Parse one decoded record into zero or more events. Never panics. 
+fn parse_record( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, +) -> Vec { + let rec_type = value.get("type").and_then(|t| t.as_str()); + let payload = value.get("payload"); + + match (rec_type, payload) { + (Some("session_meta"), Some(p)) => parse_session_meta(raw, ctx, value, p), + (Some("response_item"), Some(p)) => parse_response_item(raw, ctx, value, p), + // `event_msg` and `turn_context` are valid but carry no normalized + // payload of their own — preserve them losslessly as Unknown. + _ => vec![util::unknown_event( + SourceKind::Codex, + ctx, + raw, + value.clone(), + )], + } +} + +/// `session_meta` → [`EventKind::SessionStart`]. Sets `ctx.session_id` and +/// `ctx.project` so later records inherit the binding. +fn parse_session_meta( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, + payload: &serde_json::Value, +) -> Vec { + // Learn the session id (used by mk_event for every subsequent event). + if let Some(id) = payload.get("id").and_then(|v| v.as_str()) { + ctx.session_id = Some(id.to_string()); + } + + let cwd = payload + .get("cwd") + .and_then(|v| v.as_str()) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + + let git = parse_git(payload.get("git")); + + // Populate the project binding from the session-start record. + ctx.project = Some(ProjectRef { + cwd: cwd.clone(), + repo_root: None, + git: git.clone(), + }); + + // Codex labels its version under either `cli_version` or `originator`. 
+ let tool_version = payload + .get("cli_version") + .and_then(|v| v.as_str()) + .or_else(|| payload.get("originator").and_then(|v| v.as_str())) + .map(str::to_string); + + let model = payload + .get("model") + .and_then(|v| v.as_str()) + .map(str::to_string); + + let ts = util::ts_from(value, &["timestamp", "time", "ts"]); + let event_id = session_event_id(ctx, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + + let kind = EventKind::SessionStart { + cwd, + git, + model, + tool_version, + }; + vec![util::mk_event( + SourceKind::Codex, + ctx, + raw, + event_id, + None, + ts, + kind, + )] +} + +/// Build a stable event id for the session-start record: prefer the session id, +/// else a content hash, so dedup/idempotency holds for repeated meta lines. +fn session_event_id(ctx: &ParseCtx, raw: &RawRecord) -> String { + if let Some(id) = ctx.session_id.as_deref() { + return format!("session_meta:{id}"); + } + content_id(&raw.bytes) +} + +/// Parse a `git` object `{sha, branch}` into a [`GitRef`]. +fn parse_git(git: Option<&serde_json::Value>) -> Option { + let g = git?; + let sha = g.get("sha").and_then(|v| v.as_str())?.to_string(); + let branch = g.get("branch").and_then(|v| v.as_str()).map(str::to_string); + Some(GitRef { sha, branch }) +} + +/// A `response_item` payload has its own `type`. Dispatch on it. +fn parse_response_item( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, + payload: &serde_json::Value, +) -> Vec { + let item_type = payload.get("type").and_then(|t| t.as_str()); + let ts = util::ts_from(value, &["timestamp", "time", "ts"]); + + match item_type { + Some("message") => parse_message(raw, ctx, payload, ts), + Some("function_call") => parse_function_call(raw, ctx, payload, ts), + Some("function_call_output") => parse_function_call_output(raw, ctx, payload, ts), + // `reasoning` and any other item type carry no first-class mapping — + // keep them lossless as Unknown rather than guessing. 
+ _ => vec![util::unknown_event( + SourceKind::Codex, + ctx, + raw, + value.clone(), + )], + } +} + +/// `message` → [`EventKind::UserTurn`] / [`EventKind::AssistantTurn`]. +fn parse_message( + raw: &RawRecord, + ctx: &mut ParseCtx, + payload: &serde_json::Value, + ts: memscribe_core::Timestamp, +) -> Vec { + let role = payload.get("role").and_then(|v| v.as_str()).unwrap_or(""); + let (text, parts) = flatten_content(payload.get("content")); + + let event_id = item_event_id(payload, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + + let kind = match role { + "user" => EventKind::UserTurn { text, parts }, + "assistant" => EventKind::AssistantTurn { + text, + thinking: None, + model: payload + .get("model") + .and_then(|v| v.as_str()) + .map(str::to_string), + usage: None, + parts, + }, + // A message with an unexpected role: keep it lossless. + _ => { + return vec![util::unknown_event( + SourceKind::Codex, + ctx, + raw, + payload.clone(), + )]; + } + }; + vec![util::mk_event( + SourceKind::Codex, + ctx, + raw, + event_id, + None, + ts, + kind, + )] +} + +/// Flatten a `content` array of `{type:input_text|output_text, text}` parts into +/// a joined text blob and the structured [`Part`] list. 
+fn flatten_content(content: Option<&serde_json::Value>) -> (String, Vec) { + let mut text = String::new(); + let mut parts: Vec = Vec::new(); + let Some(items) = content.and_then(|c| c.as_array()) else { + return (text, parts); + }; + for item in items { + let ptype = item.get("type").and_then(|v| v.as_str()).unwrap_or(""); + match ptype { + "input_text" | "output_text" | "text" => { + if let Some(t) = item.get("text").and_then(|v| v.as_str()) { + if !text.is_empty() { + text.push('\n'); + } + text.push_str(t); + parts.push(Part::Text { + text: t.to_string(), + }); + } + } + "input_image" | "image" | "output_image" => { + parts.push(Part::Image { + media_type: item + .get("media_type") + .or_else(|| item.get("image_url")) + .and_then(|v| v.as_str()) + .map(str::to_string), + }); + } + _ => parts.push(Part::Other { raw: item.clone() }), + } + } + (text, parts) +} + +/// `function_call` → [`EventKind::ToolCall`], plus one +/// [`EventKind::FileEdit`] per file section when the call is `apply_patch`. +fn parse_function_call( + raw: &RawRecord, + ctx: &mut ParseCtx, + payload: &serde_json::Value, + ts: memscribe_core::Timestamp, +) -> Vec { + let name = payload + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let call_id = payload + .get("call_id") + .and_then(|v| v.as_str()) + .map(str::to_string); + + // `arguments` is a JSON *string* on the wire; parse it, falling back to a + // string value when it is not valid JSON (still lossless). + let args = parse_arguments(payload.get("arguments")); + + let event_id = call_id + .clone() + .unwrap_or_else(|| item_event_id(payload, raw)); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + + // Remember the call name so a later `function_call_output` can pair with it. 
+ if let Some(cid) = &call_id { + ctx.call_names.insert(cid.clone(), name.clone()); + } + + let mut events = Vec::new(); + events.push(util::mk_event( + SourceKind::Codex, + ctx, + raw, + event_id.clone(), + None, + ts, + EventKind::ToolCall { + call_id: call_id.clone().unwrap_or_default(), + name: name.clone(), + args: args.clone(), + }, + )); + + // apply_patch also yields one FileEdit per file section in the V4A patch. + if name == "apply_patch" { + if let Some(patch) = extract_patch_text(&args) { + for section in parse_v4a_patch(&patch) { + // A unique, deterministic id per FileEdit so dedup does not + // collapse multiple edits from one call. + let edit_id = + content_id(format!("{}:edit:{}", event_id, section.path.display()).as_bytes()); + if !ctx.first_seen(&edit_id) { + continue; + } + events.push(util::mk_event( + SourceKind::Codex, + ctx, + raw, + edit_id, + call_id.clone(), + ts, + EventKind::FileEdit { + call_id: call_id.clone(), + diff: section.into_diff(), + }, + )); + } + } + } + + events +} + +/// `function_call_output` → [`EventKind::ToolResult`]. `ok` is false when the +/// output indicates an error. +fn parse_function_call_output( + raw: &RawRecord, + ctx: &mut ParseCtx, + payload: &serde_json::Value, + ts: memscribe_core::Timestamp, +) -> Vec { + let call_id = payload + .get("call_id") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_default(); + + let output = payload + .get("output") + .cloned() + .unwrap_or(serde_json::Value::Null); + let ok = output_is_ok(&output); + + let event_id = if call_id.is_empty() { + item_event_id(payload, raw) + } else { + format!("{call_id}:output") + }; + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + + // Record the result outcome so downstream pairing (and the segmenter) can + // drop edits from a failed call. 
+ if !call_id.is_empty() { + ctx.call_ok.insert(call_id.clone(), ok); + } + + vec![util::mk_event( + SourceKind::Codex, + ctx, + raw, + event_id, + Some(call_id.clone()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )] +} + +/// Decide whether a tool output indicates success. Codex outputs are sometimes a +/// bare string, sometimes an object `{output, metadata:{exit_code}}` or carry a +/// `success` flag. We treat an explicit failure signal as not-ok; otherwise ok. +fn output_is_ok(output: &serde_json::Value) -> bool { + match output { + serde_json::Value::String(s) => !string_indicates_error(s), + serde_json::Value::Object(map) => { + // Explicit booleans win. + if let Some(b) = map.get("success").and_then(|v| v.as_bool()) { + return b; + } + if let Some(b) = map.get("ok").and_then(|v| v.as_bool()) { + return b; + } + // A non-zero exit code is a failure. + if let Some(code) = map + .get("exit_code") + .or_else(|| map.get("exitCode")) + .and_then(serde_json::Value::as_i64) + { + return code == 0; + } + if let Some(code) = map + .get("metadata") + .and_then(|m| m.get("exit_code").or_else(|| m.get("exitCode"))) + .and_then(serde_json::Value::as_i64) + { + return code == 0; + } + // Otherwise sniff the textual output for an error signature. + if let Some(s) = map.get("output").and_then(|v| v.as_str()) { + return !string_indicates_error(s); + } + true + } + // Null / numbers / arrays: assume ok (no negative signal). + _ => true, + } +} + +/// Heuristic, deterministic error detection for a free-text tool output. 
+fn string_indicates_error(s: &str) -> bool { + let lower = s.to_ascii_lowercase(); + lower.contains("error") + || lower.contains("failed") + || lower.contains("failure") + || lower.contains("traceback") + || lower.contains("exception") + || lower.contains("not found") + || lower.contains("no such file") + || lower.contains("patch does not apply") + || lower.contains("could not apply") +} + +/// `arguments` arrives as a JSON-encoded string. Parse it; if it is not a JSON +/// string (or not valid JSON), preserve whatever value was there verbatim. +fn parse_arguments(arguments: Option<&serde_json::Value>) -> serde_json::Value { + match arguments { + Some(serde_json::Value::String(s)) => { + serde_json::from_str(s).unwrap_or_else(|_| serde_json::Value::String(s.clone())) + } + Some(other) => other.clone(), + None => serde_json::Value::Null, + } +} + +/// Pull the V4A patch text out of parsed `apply_patch` arguments. Codex stores it +/// under `input` or `patch`; tolerate a bare string too. +fn extract_patch_text(args: &serde_json::Value) -> Option { + match args { + serde_json::Value::Object(map) => map + .get("input") + .or_else(|| map.get("patch")) + .and_then(|v| v.as_str()) + .map(str::to_string), + serde_json::Value::String(s) => Some(s.clone()), + _ => None, + } +} + +/// One file section parsed out of a V4A patch. +struct PatchSection { + path: PathBuf, + body: String, + added: u32, + removed: u32, +} + +impl PatchSection { + fn into_diff(self) -> Diff { + Diff { + path: self.path, + old: None, + new: None, + unified: Some(self.body), + added_lines: self.added, + removed_lines: self.removed, + } + } +} + +/// Parse a V4A patch string (the `*** Begin Patch` / `*** End Patch` envelope) +/// into one [`PatchSection`] per file. Deterministic, allocation-only, and +/// panic-free: it indexes nothing and never unwraps. 
+fn parse_v4a_patch(patch: &str) -> Vec { + let mut sections: Vec = Vec::new(); + let mut current: Option = None; + + for line in patch.lines() { + if let Some(path) = section_header(line) { + if let Some(sec) = current.take() { + sections.push(sec); + } + current = Some(PatchSection { + path: PathBuf::from(path), + body: String::new(), + added: 0, + removed: 0, + }); + continue; + } + + // The envelope markers themselves are not part of any section body. + if line.starts_with("*** Begin Patch") || line.starts_with("*** End Patch") { + continue; + } + + if let Some(sec) = current.as_mut() { + // Count added/removed lines. A leading '+'/'-' marks the change; '@@' + // and context (leading space) lines are body but not counted. + if let Some(first) = line.as_bytes().first() { + match first { + b'+' => sec.added += 1, + b'-' => sec.removed += 1, + _ => {} + } + } + if !sec.body.is_empty() { + sec.body.push('\n'); + } + sec.body.push_str(line); + } + } + + if let Some(sec) = current.take() { + sections.push(sec); + } + sections +} + +/// If `line` is a V4A file header (`*** Update/Add/Delete File: `), return +/// the path. Otherwise `None`. +fn section_header(line: &str) -> Option<&str> { + for prefix in ["*** Update File: ", "*** Add File: ", "*** Delete File: "] { + if let Some(rest) = line.strip_prefix(prefix) { + return Some(rest.trim()); + } + } + None +} + +/// A stable event id for a `response_item` that carries no native id: prefer an +/// explicit `id`, else a content hash of the raw record bytes. +fn item_event_id(payload: &serde_json::Value, raw: &RawRecord) -> String { + payload + .get("id") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(|| content_id(&raw.bytes)) +} + +/// Discover Codex rollout transcripts under `~/.codex/sessions/**/rollout-*`. +/// `history.jsonl` is the prompt history file, not a rollout, so it is skipped. 
+fn discover_rollouts(cfg: &DiscoverCfg) -> Vec { + let root = codex_sessions_root(cfg); + if !root.exists() { + return Vec::new(); + } + + let mut handles: Vec = Vec::new(); + for entry in walkdir::WalkDir::new(&root) + .follow_links(false) + .into_iter() + .filter_map(Result::ok) + { + if !entry.file_type().is_file() { + continue; + } + let path = entry.path(); + let Some(name) = path.file_name().and_then(|n| n.to_str()) else { + continue; + }; + if name == "history.jsonl" { + continue; // prompt history, not a rollout + } + if !name.starts_with("rollout-") { + continue; + } + let compressed = name.ends_with(".zst"); + let is_jsonl = name.ends_with(".jsonl") || name.ends_with(".jsonl.zst"); + if !is_jsonl { + continue; + } + handles.push(TranscriptHandle { + path: path.to_path_buf(), + source: SourceKind::Codex, + session_hint: session_hint_from_name(name), + compressed, + }); + } + + // Deterministic order regardless of filesystem walk order. + handles.sort_by(|a, b| a.path.cmp(&b.path)); + handles +} + +/// The `~/.codex/sessions` root, honoring a `CODEX_HOME` override. +fn codex_sessions_root(cfg: &DiscoverCfg) -> PathBuf { + if let Some(p) = cfg.overrides.get("CODEX_HOME") { + return p.join("sessions"); + } + cfg.home_dir().join(".codex").join("sessions") +} + +/// Derive a session-id hint from a `rollout-<...>.jsonl[.zst]` filename, if one +/// is embedded after the `rollout-` prefix. +fn session_hint_from_name(name: &str) -> Option { + let stem = name + .strip_suffix(".jsonl.zst") + .or_else(|| name.strip_suffix(".jsonl")) + .unwrap_or(name); + let rest = stem.strip_prefix("rollout-")?; + if rest.is_empty() { + None + } else { + Some(rest.to_string()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("rollout-test.jsonl", 0, 1)) + } + + /// Parse a whole multi-line transcript through one shared context, the way + /// the pipeline does. 
Returns the flat event stream. + fn parse_all(lines: &[&str]) -> Vec { + let adapter = CodexAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for (i, line) in lines.iter().enumerate() { + let r = RawRecord::from_line( + line, + SourceLocation::new("rollout-test.jsonl", i as u64, (i + 1) as u64), + ); + out.extend(adapter.parse(&r, &mut ctx).expect("never errors")); + } + out + } + + fn tags(events: &[CaptureEvent]) -> Vec<&'static str> { + events.iter().map(|e| e.kind.tag()).collect() + } + + const META: &str = r#"{"timestamp":"2026-06-22T10:00:00Z","type":"session_meta","payload":{"id":"sess-abc","cwd":"/home/u/proj","git":{"sha":"deadbeef","branch":"main"},"cli_version":"0.5.0"}}"#; + + #[test] + fn session_meta_maps_to_session_start_with_project() { + let events = parse_all(&[META]); + assert_eq!(tags(&events), vec!["session_start"]); + let ev = &events[0]; + assert_eq!(ev.session_id, "sess-abc"); + match &ev.kind { + EventKind::SessionStart { + cwd, + git, + tool_version, + .. + } => { + assert_eq!(cwd, &PathBuf::from("/home/u/proj")); + let g = git.as_ref().expect("git present"); + assert_eq!(g.sha, "deadbeef"); + assert_eq!(g.branch.as_deref(), Some("main")); + assert_eq!(tool_version.as_deref(), Some("0.5.0")); + } + other => panic!("expected SessionStart, got {other:?}"), + } + // Project binding is stamped from session_meta. 
+ assert_eq!(ev.project.cwd, PathBuf::from("/home/u/proj")); + assert!(ev.project.git.is_some()); + } + + #[test] + fn user_and_assistant_messages_map_to_turns() { + let user = r#"{"timestamp":"2026-06-22T10:00:01Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"Let's use Postgres instead of MySQL."}]}}"#; + let asst = r#"{"timestamp":"2026-06-22T10:00:02Z","type":"response_item","payload":{"type":"message","role":"assistant","content":[{"type":"output_text","text":"Sounds good."}]}}"#; + let events = parse_all(&[META, user, asst]); + assert_eq!( + tags(&events), + vec!["session_start", "user_turn", "assistant_turn"] + ); + match &events[1].kind { + EventKind::UserTurn { text, .. } => { + assert_eq!(text, "Let's use Postgres instead of MySQL."); + } + other => panic!("expected UserTurn, got {other:?}"), + } + // The user-turn inherits the session id learned from session_meta. + assert_eq!(events[1].session_id, "sess-abc"); + } + + #[test] + fn function_call_and_output_pair_by_call_id() { + let call = r#"{"timestamp":"2026-06-22T10:00:03Z","type":"response_item","payload":{"type":"function_call","name":"shell","arguments":"{\"command\":[\"ls\"]}","call_id":"call-1"}}"#; + let out = r#"{"timestamp":"2026-06-22T10:00:04Z","type":"response_item","payload":{"type":"function_call_output","call_id":"call-1","output":"file1\nfile2"}}"#; + let events = parse_all(&[META, call, out]); + assert_eq!( + tags(&events), + vec!["session_start", "tool_call", "tool_result"] + ); + match &events[1].kind { + EventKind::ToolCall { + call_id, + name, + args, + } => { + assert_eq!(call_id, "call-1"); + assert_eq!(name, "shell"); + // arguments string was parsed into JSON. + assert_eq!(args["command"][0], "ls"); + } + other => panic!("expected ToolCall, got {other:?}"), + } + match &events[2].kind { + EventKind::ToolResult { call_id, ok, .. 
} => { + assert_eq!(call_id, "call-1"); + assert!(*ok, "plain output should be ok"); + } + other => panic!("expected ToolResult, got {other:?}"), + } + } + + #[test] + fn apply_patch_emits_tool_call_then_one_file_edit_per_section() { + let patch = "*** Begin Patch\n*** Update File: src/db.rs\n@@\n-let url = \"mysql://...\";\n+let url = \"postgres://...\";\n*** Add File: src/pg.rs\n+pub fn connect() {}\n*** End Patch"; + let args = serde_json::json!({ "input": patch }).to_string(); + let call = serde_json::json!({ + "timestamp": "2026-06-22T10:00:05Z", + "type": "response_item", + "payload": { + "type": "function_call", + "name": "apply_patch", + "arguments": args, + "call_id": "call-edit" + } + }) + .to_string(); + let events = parse_all(&[META, &call]); + assert_eq!( + tags(&events), + vec!["session_start", "tool_call", "file_edit", "file_edit"] + ); + // First edit: Update File with one add + one remove. + match &events[2].kind { + EventKind::FileEdit { call_id, diff } => { + assert_eq!(call_id.as_deref(), Some("call-edit")); + assert_eq!(diff.path, PathBuf::from("src/db.rs")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + assert!(diff.unified.as_deref().unwrap().contains("postgres")); + } + other => panic!("expected FileEdit, got {other:?}"), + } + match &events[3].kind { + EventKind::FileEdit { diff, .. 
} => { + assert_eq!(diff.path, PathBuf::from("src/pg.rs")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 0); + } + other => panic!("expected FileEdit, got {other:?}"), + } + } + + #[test] + fn decision_then_edit_yields_user_turn_then_file_edit() { + let user = r#"{"timestamp":"2026-06-22T10:00:01Z","type":"response_item","payload":{"type":"message","role":"user","content":[{"type":"input_text","text":"Let's use Postgres instead of MySQL."}]}}"#; + let patch = "*** Begin Patch\n*** Update File: src/db.rs\n+let url = \"postgres://...\";\n*** End Patch"; + let args = serde_json::json!({ "patch": patch }).to_string(); + let call = serde_json::json!({ + "type": "response_item", + "payload": {"type":"function_call","name":"apply_patch","arguments":args,"call_id":"c1"} + }) + .to_string(); + let events = parse_all(&[META, user, &call]); + let t = tags(&events); + // The decision (UserTurn) precedes the FileEdit in stream order. + let user_idx = t.iter().position(|x| *x == "user_turn").unwrap(); + let edit_idx = t.iter().position(|x| *x == "file_edit").unwrap(); + assert!( + user_idx < edit_idx, + "user turn must precede file edit: {t:?}" + ); + } + + #[test] + fn failed_function_call_output_marks_not_ok() { + let out = r#"{"type":"response_item","payload":{"type":"function_call_output","call_id":"c9","output":"error: patch does not apply"}}"#; + let events = parse_all(&[META, out]); + match &events[1].kind { + EventKind::ToolResult { ok, .. } => assert!(!*ok, "error output must be not-ok"), + other => panic!("expected ToolResult, got {other:?}"), + } + } + + #[test] + fn exit_code_object_output_failure_is_not_ok() { + let out = r#"{"type":"response_item","payload":{"type":"function_call_output","call_id":"c8","output":{"output":"done","metadata":{"exit_code":1}}}}"#; + let events = parse_all(&[META, out]); + match &events[1].kind { + EventKind::ToolResult { ok, .. 
} => assert!(!*ok, "exit_code 1 must be not-ok"), + other => panic!("expected ToolResult, got {other:?}"), + } + } + + #[test] + fn unrecognized_record_routes_to_unknown_losslessly() { + let weird = + r#"{"timestamp":"2026-06-22T10:00:09Z","type":"turn_context","payload":{"foo":"bar"}}"#; + let events = parse_all(&[weird]); + assert_eq!(tags(&events), vec!["unknown"]); + match &events[0].kind { + EventKind::Unknown { raw_type, raw } => { + assert_eq!(raw_type, "turn_context"); + assert_eq!(raw["payload"]["foo"], "bar"); + } + other => panic!("expected Unknown, got {other:?}"), + } + } + + #[test] + fn reasoning_item_routes_to_unknown() { + let reasoning = + r#"{"type":"response_item","payload":{"type":"reasoning","summary":"thinking..."}}"#; + let events = parse_all(&[META, reasoning]); + assert_eq!(tags(&events), vec!["session_start", "unknown"]); + } + + #[test] + fn garbage_input_never_panics() { + // Invalid JSON, partial JSON, empty, non-record JSON, bare scalar. + let inputs = [ + "not json at all", + "{", + "", + " ", + "[1,2,3]", + "42", + r#"{"type":"session_meta"}"#, // missing payload + r#"{"payload":{"id":"x"}}"#, // missing type + r#"{"type":"response_item","payload":{}}"#, // item with no type + r#"{"type":"response_item","payload":{"type":"function_call","name":"apply_patch","arguments":"not-json"}}"#, + ]; + let adapter = CodexAdapter; + let mut ctx = ParseCtx::new(); + for s in inputs { + // Must not panic; result is fine either way. + let _ = adapter.parse(&raw(s), &mut ctx); + } + } + + #[test] + fn repeated_record_is_deduped() { + // Same session_meta twice → only one SessionStart. + let events = parse_all(&[META, META]); + assert_eq!(tags(&events), vec!["session_start"]); + + // Same function_call (same call_id) twice → only one ToolCall. 
+ let call = r#"{"type":"response_item","payload":{"type":"function_call","name":"shell","arguments":"{}","call_id":"dup-1"}}"#; + let events = parse_all(&[META, call, call]); + assert_eq!(tags(&events), vec!["session_start", "tool_call"]); + } + + #[test] + fn malformed_patch_is_panic_free_and_emits_only_tool_call() { + // apply_patch whose arguments are not valid JSON: no FileEdit, just the + // ToolCall, and the args are preserved as a string. + let call = r#"{"type":"response_item","payload":{"type":"function_call","name":"apply_patch","arguments":"*** Begin Patch (truncated","call_id":"cx"}}"#; + let events = parse_all(&[META, call]); + assert_eq!(tags(&events), vec!["session_start", "tool_call"]); + match &events[1].kind { + EventKind::ToolCall { args, .. } => { + assert!(args.is_string(), "non-JSON arguments preserved as string"); + } + other => panic!("expected ToolCall, got {other:?}"), + } + } + + #[test] + fn schema_fingerprint_recognizes_codex_records() { + let adapter = CodexAdapter; + let fp = adapter.schema_fingerprint(&raw(META)); + assert_eq!(fp.source, SourceKind::Codex); + assert_eq!(fp.confidence, 100); + assert_eq!(fp.variant, "codex/rollout-v2"); + + let fp2 = adapter.schema_fingerprint(&raw("not a codex record")); + assert_eq!(fp2.confidence, 0); + } + + /// Load and parse a fixture from `fixtures/codex/v2/` (workspace root). + fn parse_fixture(name: &str) -> Vec { + // CARGO_MANIFEST_DIR is .../crates/memscribe-adapters; fixtures live two + // levels up at the workspace root. 
+ let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("..") + .join("..") + .join("fixtures") + .join("codex") + .join("v2") + .join(name); + let body = std::fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("read fixture {}: {e}", path.display())); + let adapter = CodexAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for (i, line) in body.lines().enumerate() { + let r = RawRecord::from_line( + line, + SourceLocation::new(path.clone(), i as u64, (i + 1) as u64), + ); + out.extend(adapter.parse(&r, &mut ctx).expect("never errors")); + } + out + } + + #[test] + fn fixture_happy_path_decision_then_edits() { + let events = parse_fixture("happy_path_decision_then_edits.jsonl"); + let t = tags(&events); + // session_start, user decision, assistant, tool_call(apply_patch), + // two file_edits, tool_result, assistant. (reasoning/event_msg → unknown) + assert_eq!(t.iter().filter(|x| **x == "session_start").count(), 1); + assert_eq!(t.iter().filter(|x| **x == "user_turn").count(), 1); + assert_eq!(t.iter().filter(|x| **x == "file_edit").count(), 2, "{t:?}"); + // The decision precedes both edits. + let user_idx = t.iter().position(|x| *x == "user_turn").unwrap(); + let first_edit = t.iter().position(|x| *x == "file_edit").unwrap(); + assert!(user_idx < first_edit, "decision must precede edits: {t:?}"); + // The successful edit's tool result is ok. + let res = events + .iter() + .find_map(|e| match &e.kind { + EventKind::ToolResult { ok, call_id, .. } if call_id == "call_apply_patch_001" => { + Some(*ok) + } + _ => None, + }) + .expect("tool result present"); + assert!(res, "successful patch result must be ok"); + // Edits carry the originating call_id so the segmenter can pair them. + for e in &events { + if let EventKind::FileEdit { call_id, .. 
} = &e.kind { + assert_eq!(call_id.as_deref(), Some("call_apply_patch_001")); + } + } + } + + #[test] + fn fixture_rejected_alternative_has_decision_no_edits() { + let events = parse_fixture("rejected_alternative.jsonl"); + let t = tags(&events); + assert_eq!(t.iter().filter(|x| **x == "user_turn").count(), 1); + assert_eq!(t.iter().filter(|x| **x == "file_edit").count(), 0, "{t:?}"); + } + + #[test] + fn fixture_ban_has_decision_no_edits() { + let events = parse_fixture("ban.jsonl"); + let t = tags(&events); + assert_eq!(t.iter().filter(|x| **x == "user_turn").count(), 1); + assert_eq!(t.iter().filter(|x| **x == "file_edit").count(), 0, "{t:?}"); + // The ban text is preserved verbatim on the user turn. + let txt = events.iter().find_map(|e| match &e.kind { + EventKind::UserTurn { text, .. } => Some(text.clone()), + _ => None, + }); + assert!(txt.unwrap().contains("never add a dependency")); + } + + #[test] + fn fixture_tool_failure_marks_edit_result_not_ok() { + let events = parse_fixture("tool_failure.jsonl"); + // The edit IS emitted (losslessly) ... + let edit = events.iter().find_map(|e| match &e.kind { + EventKind::FileEdit { call_id, .. } => call_id.clone(), + _ => None, + }); + assert_eq!(edit.as_deref(), Some("call_apply_patch_fail_001")); + // ... but its paired tool result is NOT ok, so the segmenter will drop + // the Episode (verified there; here we lock the not-ok signal). + let ok = events + .iter() + .find_map(|e| match &e.kind { + EventKind::ToolResult { ok, call_id, .. } + if call_id == "call_apply_patch_fail_001" => + { + Some(*ok) + } + _ => None, + }) + .expect("tool result present"); + assert!(!ok, "failed patch result must be not-ok"); + } + + #[test] + fn discover_finds_rollouts_and_skips_history() { + // Build a fake $CODEX_HOME tree under a temp dir. 
+ let base = std::env::temp_dir().join(format!("codex-disc-{}", std::process::id())); + let day = base.join("sessions").join("2026").join("06").join("22"); + std::fs::create_dir_all(&day).expect("mkdir"); + std::fs::write(day.join("rollout-2026-06-22T10-00-00-sess.jsonl"), b"{}").unwrap(); + std::fs::write(day.join("rollout-cold.jsonl.zst"), b"{}").unwrap(); + std::fs::write(base.join("sessions").join("history.jsonl"), b"{}").unwrap(); + std::fs::write(day.join("notes.txt"), b"x").unwrap(); + + let mut overrides = std::collections::HashMap::new(); + overrides.insert("CODEX_HOME".to_string(), base.clone()); + let cfg = DiscoverCfg { + overrides, + ..Default::default() + }; + let handles = discover_rollouts(&cfg); + + // Two rollouts found; history.jsonl and notes.txt excluded. + assert_eq!(handles.len(), 2, "handles: {handles:?}"); + assert!(handles.iter().all(|h| h.source == SourceKind::Codex)); + assert!(handles.iter().any(|h| h.compressed)); + assert!(handles.iter().all(|h| h + .path + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("rollout-"))); + + std::fs::remove_dir_all(&base).ok(); + } +} diff --git a/crates/memscribe-adapters/src/copilot.rs b/crates/memscribe-adapters/src/copilot.rs new file mode 100644 index 0000000..f827e50 --- /dev/null +++ b/crates/memscribe-adapters/src/copilot.rs @@ -0,0 +1,797 @@ +//! GitHub Copilot adapter. +//! +//! Covers the GitHub Copilot CLI / Copilot chat export shape (distinct from the +//! VS Code Copilot Chat `workspaceStorage` handled by the `vscode` adapter). +//! Copilot's live store is an undocumented SQLite/`workspaceStorage` blob, so for +//! the initial deterministic model this adapter parses an **exported chat +//! JSON-lines** shape and routes anything unrecognized to +//! [`memscribe_core::EventKind::Unknown`] (losslessness). +//! +//! ## Exported record shape (one JSON object per line) +//! - A leading control record `{kind:"session_start", cwd, git:{sha,branch}, +//! 
toolVersion, model?, sessionId, ts}` binds the session/project. +//! - `{kind:"session_end", sessionId, reason?, ts}` closes it. +//! - Message records `{id, parentId, role:"user"|"assistant", ts, sessionId, +//! text, model?, usage:{input,output}?, toolCalls:[{id,name,args}]?, +//! toolResults:[{id,ok,output}]?, edits:[{path,oldText,newText,diff,added, +//! removed}]?}`. +//! +//! ## Mapping +//! - `session_start` → [`EventKind::SessionStart`]; `session_end` → +//! [`EventKind::SessionEnd`]. +//! - `role:"user"` → [`EventKind::UserTurn`]; `role:"assistant"` → +//! [`EventKind::AssistantTurn`] (`text`, `model`, `usage`, `parts`). +//! - `toolCalls[]` → [`EventKind::ToolCall`]; `toolResults[]` → +//! [`EventKind::ToolResult`] (`ok`); `edits[]` → [`EventKind::FileEdit`] +//! (`oldText`→`old`, `newText`→`new`, `diff`→`unified`, `added`/`removed`). +//! +//! `discover()` points at the real product paths (the binary store is not parsed +//! in this model). The parser is deterministic, never panics, and dedups repeated +//! records by their native id via [`ParseCtx::first_seen`]. + +use crate::util; +use memscribe_core::{ + CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, ProjectRef, + RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, Usage, +}; +use serde_json::Value; +use std::path::PathBuf; + +/// Adapter for GitHub Copilot transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct CopilotAdapter; + +impl TranscriptAdapter for CopilotAdapter { + fn source_kind(&self) -> SourceKind { + SourceKind::Copilot + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + discover_handles(cfg) + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + // Parse the line; blank lines yield nothing, invalid JSON is preserved as + // an Unknown (lossless) rather than failing the stream. 
+ let Some(value) = util::parse_json_line(raw) else { + // Distinguish a blank line (skip) from invalid-but-present JSON. + if raw.as_str().map(str::trim).unwrap_or("").is_empty() { + return Ok(Vec::new()); + } + return util::stub_parse(SourceKind::Copilot, raw, ctx); + }; + + Ok(parse_value(raw, ctx, value)) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + fingerprint(sample) + } +} + +// --------------------------------------------------------------------------- +// Discovery +// --------------------------------------------------------------------------- + +/// The Copilot product paths we point discovery at. We do not parse the binary +/// store in this model, but we surface its location so the runtime/UX can show +/// where Copilot history lives. +fn discover_handles(cfg: &DiscoverCfg) -> Vec { + let home = cfg.home_dir(); + let mut handles = Vec::new(); + + // GitHub Copilot CLI config directory. + let cli_dir = home.join(".config").join("github-copilot"); + // VS Code Copilot Chat workspace storage (handled in detail by the vscode + // adapter, but Copilot history physically lives here too). + let vscode_dir = home + .join(".config") + .join("Code") + .join("User") + .join("workspaceStorage"); + + for dir in [cli_dir, vscode_dir] { + let session_hint = dir.file_name().and_then(|s| s.to_str()).map(str::to_string); + handles.push(TranscriptHandle { + path: dir, + source: SourceKind::Copilot, + session_hint, + compressed: false, + }); + } + + handles +} + +// --------------------------------------------------------------------------- +// Fingerprinting +// --------------------------------------------------------------------------- + +fn fingerprint(sample: &RawRecord) -> SchemaVariant { + let Some(value) = util::parse_json_line(sample) else { + return SchemaVariant::unknown(SourceKind::Copilot); + }; + + // A control record names the variant with high confidence. 
+ if value.get("kind").and_then(Value::as_str) == Some("session_start") { + return SchemaVariant::certain(SourceKind::Copilot, "copilot/export-v1"); + } + + // A message record (role + id) is a reasonable but not definitive signal. + let has_role = value.get("role").and_then(Value::as_str).is_some(); + let has_id = value.get("id").and_then(Value::as_str).is_some(); + if has_role && has_id { + return SchemaVariant { + source: SourceKind::Copilot, + variant: "copilot/export-v1".to_string(), + confidence: 80, + }; + } + + SchemaVariant::unknown(SourceKind::Copilot) +} + +// --------------------------------------------------------------------------- +// Parsing +// --------------------------------------------------------------------------- + +/// Parse one already-decoded JSON record into zero or more events. +fn parse_value(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + match value.get("kind").and_then(Value::as_str) { + Some("session_start") => parse_session_start(raw, ctx, &value), + Some("session_end") => parse_session_end(raw, ctx, &value), + // A control record we do not recognize → Unknown (lossless). + Some(_) => vec![util::unknown_event(SourceKind::Copilot, ctx, raw, value)], + None => match value.get("role").and_then(Value::as_str) { + Some("user") | Some("assistant") => parse_message(raw, ctx, value), + // Not a control record and not a known message → Unknown. + _ => vec![util::unknown_event(SourceKind::Copilot, ctx, raw, value)], + }, + } +} + +fn parse_session_start(raw: &RawRecord, ctx: &mut ParseCtx, value: &Value) -> Vec { + // Bind the session id as soon as we learn it. 
+ if ctx.session_id.is_none() { + if let Some(sid) = value.get("sessionId").and_then(Value::as_str) { + ctx.session_id = Some(sid.to_string()); + } + } + + let cwd = value + .get("cwd") + .and_then(Value::as_str) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + let git = parse_git(value.get("git")); + let model = str_field(value, "model"); + let tool_version = str_field(value, "toolVersion"); + + // Populate the project binding from the session-start record. + if ctx.project.is_none() { + ctx.project = Some(ProjectRef { + cwd: cwd.clone(), + repo_root: None, + git: git.clone(), + }); + } + + let event_id = event_id_for(value, &raw.bytes); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = ts_for(value); + + vec![util::mk_event( + SourceKind::Copilot, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionStart { + cwd, + git, + model, + tool_version, + }, + )] +} + +fn parse_session_end(raw: &RawRecord, ctx: &mut ParseCtx, value: &Value) -> Vec { + if ctx.session_id.is_none() { + if let Some(sid) = value.get("sessionId").and_then(Value::as_str) { + ctx.session_id = Some(sid.to_string()); + } + } + let event_id = event_id_for(value, &raw.bytes); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = ts_for(value); + let reason = str_field(value, "reason"); + + vec![util::mk_event( + SourceKind::Copilot, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionEnd { reason }, + )] +} + +/// Parse a message record into a turn event, plus any embedded tool calls, tool +/// results, and file edits (each a distinct event with a derived id). 
+fn parse_message(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + if ctx.session_id.is_none() { + if let Some(sid) = value.get("sessionId").and_then(Value::as_str) { + ctx.session_id = Some(sid.to_string()); + } + } + + let msg_id = event_id_for(&value, &raw.bytes); + // Idempotency: a repeated message record (same id) is dropped wholesale, + // including all of its derived sub-events. + if !ctx.first_seen(&msg_id) { + return Vec::new(); + } + + let parent_id = str_field(&value, "parentId"); + let ts = ts_for(&value); + let text = str_field(&value, "text").unwrap_or_default(); + let role = value.get("role").and_then(Value::as_str).unwrap_or(""); + + let mut events = Vec::new(); + + // 1. The turn itself. + let kind = if role == "assistant" { + let model = str_field(&value, "model"); + let usage = parse_usage(value.get("usage")); + EventKind::AssistantTurn { + text, + thinking: str_field(&value, "thinking"), + model, + usage, + parts: Vec::new(), + } + } else { + EventKind::UserTurn { + text, + parts: Vec::new(), + } + }; + events.push(util::mk_event( + SourceKind::Copilot, + ctx, + raw, + msg_id.clone(), + parent_id.clone(), + ts, + kind, + )); + + // 2. Tool calls embedded in the turn. + if let Some(calls) = value.get("toolCalls").and_then(Value::as_array) { + for (i, call) in calls.iter().enumerate() { + if let Some(ev) = tool_call_event(raw, ctx, &msg_id, ts, call, i) { + events.push(ev); + } + } + } + + // 3. Tool results embedded in the turn. We record the success flag in the + // context so that any sibling FileEdit can be paired with its outcome. + if let Some(results) = value.get("toolResults").and_then(Value::as_array) { + for (i, result) in results.iter().enumerate() { + if let Some(ev) = tool_result_event(raw, ctx, &msg_id, ts, result, i) { + events.push(ev); + } + } + } + + // 4. File edits embedded in the turn. 
+ if let Some(edits) = value.get("edits").and_then(Value::as_array) { + for (i, edit) in edits.iter().enumerate() { + if let Some(ev) = file_edit_event(raw, ctx, &msg_id, ts, edit, i) { + events.push(ev); + } + } + } + + events +} + +fn tool_call_event( + raw: &RawRecord, + ctx: &mut ParseCtx, + msg_id: &str, + ts: memscribe_core::Timestamp, + call: &Value, + idx: usize, +) -> Option { + let call_id = str_field(call, "id").unwrap_or_else(|| format!("{msg_id}:call:{idx}")); + let name = str_field(call, "name").unwrap_or_default(); + let args = call.get("args").cloned().unwrap_or(Value::Null); + + // Remember the tool name for this call id (call/result pairing). + ctx.call_names.insert(call_id.clone(), name.clone()); + + let event_id = format!("{msg_id}:call:{call_id}"); + if !ctx.first_seen(&event_id) { + return None; + } + Some(util::mk_event( + SourceKind::Copilot, + ctx, + raw, + event_id, + Some(msg_id.to_string()), + ts, + EventKind::ToolCall { + call_id, + name, + args, + }, + )) +} + +fn tool_result_event( + raw: &RawRecord, + ctx: &mut ParseCtx, + msg_id: &str, + ts: memscribe_core::Timestamp, + result: &Value, + idx: usize, +) -> Option { + let call_id = str_field(result, "id").unwrap_or_else(|| format!("{msg_id}:result:{idx}")); + // `ok` defaults to true when absent (a present result with no flag is success). + let ok = result.get("ok").and_then(Value::as_bool).unwrap_or(true); + let output = result.get("output").cloned().unwrap_or(Value::Null); + + // Record the outcome so a sibling FileEdit can be paired with it downstream. 
+ ctx.call_ok.insert(call_id.clone(), ok); + + let event_id = format!("{msg_id}:result:{call_id}"); + if !ctx.first_seen(&event_id) { + return None; + } + Some(util::mk_event( + SourceKind::Copilot, + ctx, + raw, + event_id, + Some(msg_id.to_string()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )) +} + +fn file_edit_event( + raw: &RawRecord, + ctx: &mut ParseCtx, + msg_id: &str, + ts: memscribe_core::Timestamp, + edit: &Value, + idx: usize, +) -> Option { + let path = str_field(edit, "path")?; + // An edit may name the originating tool call id so downstream can join its + // ToolResult.ok (a failed edit must not become an Episode). + let call_id = str_field(edit, "callId").or_else(|| str_field(edit, "call_id")); + + let diff = Diff { + path: PathBuf::from(&path), + old: str_field(edit, "oldText"), + new: str_field(edit, "newText"), + unified: str_field(edit, "diff"), + added_lines: u32_field(edit, "added"), + removed_lines: u32_field(edit, "removed"), + }; + + let event_id = format!("{msg_id}:edit:{idx}:{path}"); + if !ctx.first_seen(&event_id) { + return None; + } + Some(util::mk_event( + SourceKind::Copilot, + ctx, + raw, + event_id, + Some(msg_id.to_string()), + ts, + EventKind::FileEdit { call_id, diff }, + )) +} + +// --------------------------------------------------------------------------- +// Field helpers (all total — never panic on missing/odd input) +// --------------------------------------------------------------------------- + +/// The native event id, else a stable content hash of the raw bytes. +fn event_id_for(value: &Value, bytes: &[u8]) -> String { + str_field(value, "id").unwrap_or_else(|| memscribe_core::content_id(bytes)) +} + +/// Pull a string field, treating empty/non-string as absent. +fn str_field(value: &Value, key: &str) -> Option { + value + .get(key) + .and_then(Value::as_str) + .map(str::to_string) + .filter(|s| !s.is_empty()) +} + +/// Pull a non-negative count as `u32`, clamping out-of-range/odd values to 0. 
+fn u32_field(value: &Value, key: &str) -> u32 { + value + .get(key) + .and_then(Value::as_u64) + .and_then(|n| u32::try_from(n).ok()) + .unwrap_or(0) +} + +/// The record timestamp via the shared helper, tolerant of RFC3339 and epoch. +fn ts_for(value: &Value) -> memscribe_core::Timestamp { + util::ts_from(value, &["ts", "timestamp", "time", "created_at"]) +} + +fn parse_git(value: Option<&Value>) -> Option { + let v = value?; + let sha = str_field(v, "sha")?; + Some(GitRef { + sha, + branch: str_field(v, "branch"), + }) +} + +fn parse_usage(value: Option<&Value>) -> Option { + let v = value?; + let input_tokens = v.get("input").and_then(Value::as_u64); + let output_tokens = v.get("output").and_then(Value::as_u64); + if input_tokens.is_none() && output_tokens.is_none() { + return None; + } + Some(Usage { + input_tokens, + output_tokens, + cache_read_tokens: v.get("cacheRead").and_then(Value::as_u64), + cache_creation_tokens: v.get("cacheCreation").and_then(Value::as_u64), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("copilot.jsonl", 0, 1)) + } + + fn parse_all(lines: &[&str]) -> Vec { + let adapter = CopilotAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for line in lines { + out.extend(adapter.parse(&raw(line), &mut ctx).expect("never errs")); + } + out + } + + const SESSION_START: &str = r#"{"kind":"session_start","cwd":"/Users/dev/projects/orbit","git":{"sha":"abc123","branch":"main"},"toolVersion":"copilot-cli 1.4.0","model":"gpt-4o","sessionId":"copilot-001","ts":"2026-06-22T10:00:00Z"}"#; + + #[test] + fn session_start_binds_session_and_project() { + let adapter = CopilotAdapter; + let mut ctx = ParseCtx::new(); + let evs = adapter.parse(&raw(SESSION_START), &mut ctx).unwrap(); + assert_eq!(evs.len(), 1); + assert_eq!(evs[0].kind.tag(), "session_start"); + assert_eq!(evs[0].session_id, "copilot-001"); + 
assert_eq!(ctx.session_id.as_deref(), Some("copilot-001")); + let proj = ctx.project.as_ref().expect("project bound"); + assert_eq!(proj.cwd, PathBuf::from("/Users/dev/projects/orbit")); + assert_eq!(proj.git.as_ref().map(|g| g.sha.as_str()), Some("abc123")); + match &evs[0].kind { + EventKind::SessionStart { + model, + tool_version, + .. + } => { + assert_eq!(model.as_deref(), Some("gpt-4o")); + assert_eq!(tool_version.as_deref(), Some("copilot-cli 1.4.0")); + } + other => panic!("expected session_start, got {other:?}"), + } + } + + #[test] + fn normalized_event_sequence_for_decision_then_edit() { + let lines = [ + SESSION_START, + r#"{"id":"m1","parentId":null,"role":"user","sessionId":"copilot-001","ts":"2026-06-22T10:00:05Z","text":"Let's use Postgres instead of MySQL."}"#, + r#"{"id":"m2","parentId":"m1","role":"assistant","sessionId":"copilot-001","ts":"2026-06-22T10:00:09Z","text":"Switching to Postgres.","model":"gpt-4o","usage":{"input":1000,"output":200},"edits":[{"path":"src/db.rs","oldText":"mysql","newText":"postgres","diff":"@@ -1 +1 @@\n-mysql\n+postgres","added":1,"removed":1}]}"#, + r#"{"kind":"session_end","sessionId":"copilot-001","reason":"done","ts":"2026-06-22T10:01:00Z"}"#, + ]; + let evs = parse_all(&lines); + let tags: Vec<&str> = evs.iter().map(|e| e.kind.tag()).collect(); + assert_eq!( + tags, + vec![ + "session_start", + "user_turn", + "assistant_turn", + "file_edit", + "session_end" + ] + ); + } + + #[test] + fn decision_turn_then_file_edit() { + let lines = [ + SESSION_START, + r#"{"id":"m1","role":"user","sessionId":"copilot-001","ts":"2026-06-22T10:00:05Z","text":"Use Redis for the cache."}"#, + r#"{"id":"m2","parentId":"m1","role":"assistant","sessionId":"copilot-001","ts":"2026-06-22T10:00:09Z","text":"Done.","edits":[{"path":"src/cache.rs","oldText":"a","newText":"b","diff":"d","added":3,"removed":2}]}"#, + ]; + let evs = parse_all(&lines); + // user_turn precedes file_edit. 
+ let user_idx = evs + .iter() + .position(|e| e.kind.tag() == "user_turn") + .unwrap(); + let edit_idx = evs + .iter() + .position(|e| e.kind.tag() == "file_edit") + .unwrap(); + assert!(user_idx < edit_idx); + match &evs[edit_idx].kind { + EventKind::FileEdit { diff, .. } => { + assert_eq!(diff.path, PathBuf::from("src/cache.rs")); + assert_eq!(diff.old.as_deref(), Some("a")); + assert_eq!(diff.new.as_deref(), Some("b")); + assert_eq!(diff.unified.as_deref(), Some("d")); + assert_eq!(diff.added_lines, 3); + assert_eq!(diff.removed_lines, 2); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + #[test] + fn tool_call_and_result_with_ok_flag() { + let lines = [ + SESSION_START, + r#"{"id":"m2","role":"assistant","sessionId":"copilot-001","ts":"2026-06-22T10:00:09Z","text":"Running.","toolCalls":[{"id":"c1","name":"apply_patch","args":{"path":"x.rs"}}],"toolResults":[{"id":"c1","ok":false,"output":"patch rejected"}]}"#, + ]; + let evs = parse_all(&lines); + let tags: Vec<&str> = evs.iter().map(|e| e.kind.tag()).collect(); + assert_eq!( + tags, + vec![ + "session_start", + "assistant_turn", + "tool_call", + "tool_result" + ] + ); + match &evs[3].kind { + EventKind::ToolResult { call_id, ok, .. } => { + assert_eq!(call_id, "c1"); + assert!(!ok, "failed tool result must carry ok=false"); + } + other => panic!("expected tool_result, got {other:?}"), + } + } + + #[test] + fn assistant_usage_and_model_captured() { + let lines = [ + SESSION_START, + r#"{"id":"m2","role":"assistant","sessionId":"copilot-001","ts":"2026-06-22T10:00:09Z","text":"Hi.","model":"gpt-4o-mini","usage":{"input":42,"output":7}}"#, + ]; + let evs = parse_all(&lines); + match &evs[1].kind { + EventKind::AssistantTurn { model, usage, .. 
} => { + assert_eq!(model.as_deref(), Some("gpt-4o-mini")); + let u = usage.as_ref().expect("usage present"); + assert_eq!(u.input_tokens, Some(42)); + assert_eq!(u.output_tokens, Some(7)); + } + other => panic!("expected assistant_turn, got {other:?}"), + } + } + + #[test] + fn no_panic_on_garbage_input() { + let adapter = CopilotAdapter; + let mut ctx = ParseCtx::new(); + for junk in [ + "not json at all", + "{", + "[]", + "12345", + "null", + "true", + r#"{"role":"user"}"#, // missing id/text + r#"{"kind":"weird_control"}"#, // unknown control + r#"{"id":"e","role":"alien"}"#, // unknown role + r#"{"id":"x","role":"assistant","text":null,"edits":"not-an-array"}"#, + r#"{"kind":"session_start"}"#, // missing all fields + ] { + let res = adapter.parse(&raw(junk), &mut ctx); + assert!(res.is_ok(), "parse must never error on: {junk}"); + } + } + + #[test] + fn unrecognized_records_route_to_unknown() { + let evs = parse_all(&[ + r#"{"kind":"telemetry_ping","seq":3}"#, + r#"{"id":"z","role":"system","text":"boot"}"#, + ]); + assert_eq!(evs.len(), 2); + assert!(evs.iter().all(|e| e.kind.tag() == "unknown")); + } + + #[test] + fn blank_lines_skipped() { + let evs = parse_all(&["", " ", "\t"]); + assert!(evs.is_empty()); + } + + #[test] + fn dedup_repeated_record_is_idempotent() { + let user = r#"{"id":"m1","role":"user","sessionId":"copilot-001","ts":"2026-06-22T10:00:05Z","text":"hello"}"#; + let adapter = CopilotAdapter; + let mut ctx = ParseCtx::new(); + let first = adapter.parse(&raw(user), &mut ctx).unwrap(); + assert_eq!(first.len(), 1); + // Re-ingesting the same record (same native id) yields nothing. 
+ let second = adapter.parse(&raw(user), &mut ctx).unwrap(); + assert!(second.is_empty(), "repeated record must dedup to empty"); + } + + #[test] + fn dedup_drops_derived_subevents_too() { + let msg = r#"{"id":"m2","role":"assistant","sessionId":"copilot-001","text":"x","edits":[{"path":"a.rs","oldText":"o","newText":"n","added":1,"removed":0}]}"#; + let adapter = CopilotAdapter; + let mut ctx = ParseCtx::new(); + let first = adapter.parse(&raw(msg), &mut ctx).unwrap(); + assert_eq!(first.len(), 2); // assistant_turn + file_edit + let second = adapter.parse(&raw(msg), &mut ctx).unwrap(); + assert!(second.is_empty()); + } + + #[test] + fn seq_is_monotonic_across_subevents() { + let evs = parse_all(&[ + SESSION_START, + r#"{"id":"m2","role":"assistant","sessionId":"copilot-001","text":"x","toolCalls":[{"id":"c1","name":"edit","args":{}}],"edits":[{"path":"a.rs","oldText":"o","newText":"n","added":1,"removed":0}]}"#, + ]); + let seqs: Vec = evs.iter().map(|e| e.seq).collect(); + assert_eq!(seqs, vec![0, 1, 2, 3]); + } + + #[test] + fn ban_turn_is_a_user_turn_with_verbatim_text() { + let evs = parse_all(&[ + SESSION_START, + r#"{"id":"m1","role":"user","sessionId":"copilot-001","ts":"2026-06-22T10:00:05Z","text":"We will never add a dependency on left-pad."}"#, + ]); + match &evs[1].kind { + EventKind::UserTurn { text, .. 
} => { + assert_eq!(text, "We will never add a dependency on left-pad."); + } + other => panic!("expected user_turn, got {other:?}"), + } + } + + #[test] + fn discover_points_at_product_paths() { + let cfg = DiscoverCfg { + home: Some(PathBuf::from("/home/dev")), + ..Default::default() + }; + let handles = CopilotAdapter.discover(&cfg); + assert!(!handles.is_empty()); + assert!(handles.iter().all(|h| h.source == SourceKind::Copilot)); + assert!(handles.iter().any(|h| h.path.ends_with("github-copilot"))); + } + + #[test] + fn fingerprint_recognizes_session_start_and_messages() { + let fp = CopilotAdapter.schema_fingerprint(&raw(SESSION_START)); + assert_eq!(fp.source, SourceKind::Copilot); + assert_eq!(fp.confidence, 100); + assert_eq!(fp.variant, "copilot/export-v1"); + + let msg = raw(r#"{"id":"m1","role":"user","text":"hi"}"#); + let fp2 = CopilotAdapter.schema_fingerprint(&msg); + assert_eq!(fp2.confidence, 80); + + let junk = raw("not json"); + let fp3 = CopilotAdapter.schema_fingerprint(&junk); + assert_eq!(fp3.confidence, 0); + } + + #[test] + fn invariants_hold_on_happy_path() { + let evs = parse_all(&[ + SESSION_START, + r#"{"id":"m1","role":"user","sessionId":"copilot-001","ts":"2026-06-22T10:00:05Z","text":"Use Postgres."}"#, + r#"{"id":"m2","parentId":"m1","role":"assistant","sessionId":"copilot-001","ts":"2026-06-22T10:00:09Z","text":"ok","edits":[{"path":"a.rs","oldText":"o","newText":"n","added":1,"removed":1}]}"#, + ]); + // Monotonic seq within the session. + let mut last = None; + for e in &evs { + if let Some(p) = last { + assert!(e.seq > p); + } + last = Some(e.seq); + } + // Unique event ids. 
+ let mut seen = std::collections::HashSet::new(); + for e in &evs { + assert!(seen.insert(e.event_id.clone()), "dup id {}", e.event_id); + } + } + + // --- Fixture-parity guards (mirror fixtures/copilot/v1/*.jsonl verbatim) --- + + #[test] + fn fixture_tool_failure_edit_has_failed_result() { + // The assistant record from fixtures/copilot/v1/tool_failure.jsonl: the + // edit's tool result failed, so the FileEdit must coexist with a + // ToolResult{ok:false} — that is the signal downstream uses to suppress + // a spurious Episode. + let line = r#"{"id":"msg-2","parentId":"msg-1","role":"assistant","ts":"2026-06-22T13:00:13Z","sessionId":"copilot-thread-004","text":"I'll apply the migration patch.","model":"gpt-4o","usage":{"input":720,"output":90},"toolCalls":[{"id":"call-z9","name":"apply_patch","args":{"path":"migrations/0007_email_not_null.sql"}}],"toolResults":[{"id":"call-z9","ok":false,"output":"error: patch did not apply cleanly: hunk #1 FAILED at line 3"}],"edits":[{"path":"migrations/0007_email_not_null.sql","callId":"call-z9","oldText":"email TEXT","newText":"email TEXT NOT NULL","diff":"@@ -3 +3 @@\n-email TEXT\n+email TEXT NOT NULL","added":1,"removed":1}]}"#; + let evs = parse_all(&[line]); + let tags: Vec<&str> = evs.iter().map(|e| e.kind.tag()).collect(); + assert_eq!( + tags, + vec!["assistant_turn", "tool_call", "tool_result", "file_edit"] + ); + // The failed result carries ok=false. + let failed = evs + .iter() + .find(|e| e.kind.tag() == "tool_result") + .expect("tool_result present"); + match &failed.kind { + EventKind::ToolResult { ok, call_id, .. } => { + assert!(!ok, "tool result must be ok=false"); + assert_eq!(call_id, "call-z9"); + } + other => panic!("expected tool_result, got {other:?}"), + } + // The FileEdit links back to the failing call id for downstream pairing. + let edit = evs + .iter() + .find(|e| e.kind.tag() == "file_edit") + .expect("file_edit present"); + match &edit.kind { + EventKind::FileEdit { call_id, .. 
} => { + assert_eq!(call_id.as_deref(), Some("call-z9")); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + #[test] + fn fixture_rejected_alternative_edit_succeeds() { + // The assistant record from fixtures/copilot/v1/rejected_alternative.jsonl: + // the edit's tool result succeeded (ok=true). + let line = r#"{"id":"msg-2","parentId":"msg-1","role":"assistant","ts":"2026-06-22T11:00:14Z","sessionId":"copilot-thread-002","text":"Understood. I'll wire up the Stripe SDK and drop the PayPal client.","model":"gpt-4o","usage":{"input":980,"output":210},"toolCalls":[{"id":"call-a1","name":"apply_patch","args":{"path":"src/payments/provider.rs"}}],"toolResults":[{"id":"call-a1","ok":true,"output":"patch applied (1 file changed)"}],"edits":[{"path":"src/payments/provider.rs","callId":"call-a1","oldText":"use paypal_sdk::Client;","newText":"use stripe::Client;","diff":"@@ -1 +1 @@\n-use paypal_sdk::Client;\n+use stripe::Client;","added":1,"removed":1}]}"#; + let evs = parse_all(&[line]); + let ok = evs.iter().any(|e| { + matches!(&e.kind, EventKind::ToolResult { ok: true, call_id, .. } if call_id == "call-a1") + }); + assert!(ok, "rejected_alternative edit result should be ok=true"); + } +} diff --git a/crates/memscribe-adapters/src/cursor.rs b/crates/memscribe-adapters/src/cursor.rs new file mode 100644 index 0000000..96df9c8 --- /dev/null +++ b/crates/memscribe-adapters/src/cursor.rs @@ -0,0 +1,820 @@ +//! Cursor adapter. +//! +//! Cursor is a VS Code fork; live chat state lives in the per-workspace +//! `state.vscdb` SQLite store under +//! `~/Library/Application Support/Cursor/User/workspaceStorage//` (and, +//! on newer builds, under `~/.cursor/`). That binary store is undocumented, so +//! the first deterministic model targets an **exported JSON-lines** transcript +//! with a stable `{role, text, ...}` shape; a SQLite reader can be layered into +//! `memscribe-io` later. This parser pattern-matches the fields it needs and +//! 
routes anything unrecognized to [`EventKind::Unknown`] so the stream stays +//! lossless across Cursor-version churn. +//! +//! Record shape (one JSON object per line): +//! - leading `{"kind":"session_start","cwd":..,"git":{"sha","branch"}, +//! "toolVersion":..,"sessionId":..}` → [`EventKind::SessionStart`] and seeds +//! `ctx.project` / `ctx.session_id`. +//! - message records +//! `{"id","parentId","role":"user"|"assistant","ts","sessionId","text", +//! "model","usage":{"input","output"},"toolCalls":[..],"toolResults":[..], +//! "edits":[..]}`. One record expands to multiple events, in a stable order: +//! the turn (`UserTurn` / `AssistantTurn`), then each `ToolCall`, each +//! `ToolResult`, then each `FileEdit`. +//! +//! `event_id` = the record's native `id`, else a `blake3(content)` fallback. + +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, Part, + ProjectRef, RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, Usage, +}; +use serde_json::Value; +use std::path::PathBuf; + +const SRC: SourceKind = SourceKind::Cursor; + +/// Adapter for Cursor transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct CursorAdapter; + +impl TranscriptAdapter for CursorAdapter { + fn source_kind(&self) -> SourceKind { + SRC + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + let mut out = Vec::new(); + let home = cfg.home_dir(); + // Point at the real product locations. We do not parse the binary store + // in this model, but discovery should surface where it lives so the + // runtime can wire a SQLite reader without re-deriving these paths. 
+ let roots = [ + home.join("Library/Application Support/Cursor/User/workspaceStorage"), + home.join(".cursor"), + ]; + for root in roots { + for entry in walkdir::WalkDir::new(&root) + .max_depth(3) + .into_iter() + .filter_map(std::result::Result::ok) + { + let path = entry.path(); + let name = match path.file_name().and_then(|n| n.to_str()) { + Some(n) => n, + None => continue, + }; + let is_store = name == "state.vscdb" + || name.ends_with(".jsonl") + || name.ends_with(".cursorchat"); + if is_store && path.is_file() { + let session_hint = path + .parent() + .and_then(|p| p.file_name()) + .and_then(|n| n.to_str()) + .map(str::to_string); + out.push(TranscriptHandle { + path: path.to_path_buf(), + source: SRC, + session_hint, + compressed: false, + }); + } + } + } + // Deterministic order regardless of filesystem iteration order. + out.sort_by(|a, b| a.path.cmp(&b.path)); + out + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + // Blank lines yield nothing; invalid JSON is preserved verbatim as an + // Unknown so the stream is still lossless (never an error here). + let value = match util::parse_json_line(raw) { + Some(v) => v, + None => { + let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + return Ok(vec![util::unknown_event( + SRC, + ctx, + raw, + Value::String(s.to_string()), + )]); + } + }; + + // We only know how to parse JSON objects; anything else is Unknown. + let obj = match value.as_object() { + Some(o) => o, + None => return Ok(vec![util::unknown_event(SRC, ctx, raw, value)]), + }; + + // Seed session id from any record that carries one (records are parsed + // in file order, so the first one wins for the whole stream). + if ctx.session_id.is_none() { + if let Some(sid) = str_field(obj, "sessionId") { + ctx.session_id = Some(sid.to_string()); + } + } + + // Dispatch on the record discriminator. 
A `kind` of `session_start` + // (and a couple of tolerant aliases) means the session header; a `role` + // means a dialogue turn. Everything else is Unknown. + if let Some(kind) = str_field(obj, "kind") { + match kind { + "session_start" | "session-start" | "sessionStart" => { + return Ok(parse_session_start(obj, ctx, raw)); + } + "session_end" | "session-end" | "sessionEnd" => { + return Ok(parse_session_end(obj, ctx, raw)); + } + _ => {} + } + } + + if str_field(obj, "role").is_some() { + return Ok(parse_message(obj, ctx, raw)); + } + + Ok(vec![util::unknown_event(SRC, ctx, raw, value)]) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + match util::parse_json_line(sample) + .as_ref() + .and_then(Value::as_object) + { + Some(obj) + if obj.contains_key("role") + || matches!(str_field(obj, "kind"), Some("session_start")) => + { + SchemaVariant::certain(SRC, "cursor/export-v1") + } + _ => SchemaVariant::unknown(SRC), + } + } +} + +/// Parse a `session_start` header: seeds `ctx.project` and emits `SessionStart`. +fn parse_session_start( + obj: &serde_json::Map, + ctx: &mut ParseCtx, + raw: &RawRecord, +) -> Vec { + let cwd_str = str_field(obj, "cwd").unwrap_or("."); + let cwd = PathBuf::from(cwd_str); + let git = parse_git(obj.get("git")); + let model = str_field(obj, "model").map(str::to_string); + let tool_version = str_field(obj, "toolVersion") + .or_else(|| str_field(obj, "tool_version")) + .map(str::to_string); + + // Bind the project for every subsequent event in this session. 
+ ctx.project = Some(ProjectRef { + cwd: cwd.clone(), + repo_root: str_field(obj, "repoRoot") + .or_else(|| str_field(obj, "repo_root")) + .map(PathBuf::from), + git: git.clone(), + }); + + let event_id = event_id_for(obj, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = ts_for(obj); + vec![util::mk_event( + SRC, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionStart { + cwd, + git, + model, + tool_version, + }, + )] +} + +/// Parse a `session_end` header into a `SessionEnd` event. +fn parse_session_end( + obj: &serde_json::Map, + ctx: &mut ParseCtx, + raw: &RawRecord, +) -> Vec { + let event_id = event_id_for(obj, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = ts_for(obj); + let reason = str_field(obj, "reason").map(str::to_string); + vec![util::mk_event( + SRC, + ctx, + raw, + event_id, + parent_field(obj), + ts, + EventKind::SessionEnd { reason }, + )] +} + +/// Parse a dialogue record into the turn event plus any embedded tool calls, +/// tool results, and file edits — in a stable, deterministic order. +fn parse_message( + obj: &serde_json::Map, + ctx: &mut ParseCtx, + raw: &RawRecord, +) -> Vec { + let base_id = event_id_for(obj, raw); + // Idempotency: a repeated record (same id) yields nothing. + if !ctx.first_seen(&base_id) { + return Vec::new(); + } + + let ts = ts_for(obj); + let parent = parent_field(obj); + let role = str_field(obj, "role").unwrap_or(""); + let text = str_field(obj, "text").unwrap_or("").to_string(); + + let mut events = Vec::new(); + + // 1) The turn itself. + let turn_kind = match role { + "user" => EventKind::UserTurn { + text, + parts: text_parts(obj), + }, + "assistant" => EventKind::AssistantTurn { + text, + thinking: str_field(obj, "thinking").map(str::to_string), + model: str_field(obj, "model").map(str::to_string), + usage: parse_usage(obj.get("usage")), + parts: text_parts(obj), + }, + _ => { + // A role we don't recognize → Unknown, but still keep ordering. 
+ EventKind::Unknown { + raw_type: role.to_string(), + raw: Value::Object(obj.clone()), + } + } + }; + events.push(util::mk_event( + SRC, + ctx, + raw, + base_id.clone(), + parent.clone(), + ts, + turn_kind, + )); + + // 2) Tool calls. Each gets a synthetic, deterministic id derived from the + // turn id + the call id so it never collides with the turn or siblings. + if let Some(calls) = obj.get("toolCalls").and_then(Value::as_array) { + for (i, call) in calls.iter().enumerate() { + let call_obj = match call.as_object() { + Some(o) => o, + None => continue, + }; + let call_id = str_field(call_obj, "id") + .map(str::to_string) + .unwrap_or_else(|| format!("{base_id}:call:{i}")); + let name = str_field(call_obj, "name").unwrap_or("").to_string(); + let args = call_obj.get("args").cloned().unwrap_or(Value::Null); + // Remember the name so a later result can be paired by call_id. + ctx.call_names.insert(call_id.clone(), name.clone()); + let ev_id = format!("{base_id}#toolcall:{call_id}"); + if !ctx.first_seen(&ev_id) { + continue; + } + events.push(util::mk_event( + SRC, + ctx, + raw, + ev_id, + Some(base_id.clone()), + ts, + EventKind::ToolCall { + call_id, + name, + args, + }, + )); + } + } + + // 3) Tool results. 
+ if let Some(results) = obj.get("toolResults").and_then(Value::as_array) { + for (i, res) in results.iter().enumerate() { + let res_obj = match res.as_object() { + Some(o) => o, + None => continue, + }; + let call_id = str_field(res_obj, "id") + .map(str::to_string) + .unwrap_or_else(|| format!("{base_id}:result:{i}")); + let ok = bool_field(res_obj, "ok").unwrap_or(true); + ctx.call_ok.insert(call_id.clone(), ok); + let output = res_obj.get("output").cloned().unwrap_or(Value::Null); + let ev_id = format!("{base_id}#toolresult:{call_id}"); + if !ctx.first_seen(&ev_id) { + continue; + } + events.push(util::mk_event( + SRC, + ctx, + raw, + ev_id, + Some(base_id.clone()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )); + } + } + + // 4) File edits. + if let Some(edits) = obj.get("edits").and_then(Value::as_array) { + for (i, edit) in edits.iter().enumerate() { + let edit_obj = match edit.as_object() { + Some(o) => o, + None => continue, + }; + let path = str_field(edit_obj, "path").unwrap_or("").to_string(); + let diff = Diff { + path: PathBuf::from(path), + old: str_field(edit_obj, "oldText").map(str::to_string), + new: str_field(edit_obj, "newText").map(str::to_string), + unified: str_field(edit_obj, "diff").map(str::to_string), + added_lines: u32_field(edit_obj, "added").unwrap_or(0), + removed_lines: u32_field(edit_obj, "removed").unwrap_or(0), + }; + let call_id = str_field(edit_obj, "callId") + .or_else(|| str_field(edit_obj, "call_id")) + .map(str::to_string); + let ev_id = format!("{base_id}#edit:{i}"); + if !ctx.first_seen(&ev_id) { + continue; + } + events.push(util::mk_event( + SRC, + ctx, + raw, + ev_id, + Some(base_id.clone()), + ts, + EventKind::FileEdit { call_id, diff }, + )); + } + } + + events +} + +/// Build text/thinking [`Part`]s from a message (best-effort, never fails). 
+fn text_parts(obj: &serde_json::Map<String, Value>) -> Vec<Part> {
+    let mut parts = Vec::new();
+    if let Some(t) = str_field(obj, "text") {
+        if !t.is_empty() {
+            parts.push(Part::Text {
+                text: t.to_string(),
+            });
+        }
+    }
+    if let Some(th) = str_field(obj, "thinking") {
+        if !th.is_empty() {
+            parts.push(Part::Thinking {
+                text: th.to_string(),
+            });
+        }
+    }
+    parts
+}
+
+/// Parse `usage:{input,output}` (also tolerant of token-suffixed keys).
+fn parse_usage(value: Option<&Value>) -> Option<Usage> {
+    let obj = value?.as_object()?;
+    let input = u64_field(obj, "input").or_else(|| u64_field(obj, "input_tokens"));
+    let output = u64_field(obj, "output").or_else(|| u64_field(obj, "output_tokens"));
+    if input.is_none() && output.is_none() {
+        return None; // an object with neither count carries no usage
+    }
+    Some(Usage {
+        input_tokens: input,
+        output_tokens: output,
+        cache_read_tokens: None,
+        cache_creation_tokens: None,
+    })
+}
+
+/// Parse a `{sha, branch}` git ref, if present (`sha` required, `branch` optional).
+fn parse_git(value: Option<&Value>) -> Option<GitRef> {
+    let obj = value?.as_object()?;
+    let sha = str_field(obj, "sha")?.to_string();
+    Some(GitRef {
+        sha,
+        branch: str_field(obj, "branch").map(str::to_string),
+    })
+}
+
+// ---- small, total field accessors (no panics, no indexing) ----
+
+fn str_field<'a>(obj: &'a serde_json::Map<String, Value>, key: &str) -> Option<&'a str> {
+    obj.get(key).and_then(Value::as_str)
+}
+
+fn bool_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<bool> {
+    obj.get(key).and_then(Value::as_bool)
+}
+
+fn u64_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<u64> {
+    obj.get(key).and_then(Value::as_u64)
+}
+
+fn u32_field(obj: &serde_json::Map<String, Value>, key: &str) -> Option<u32> {
+    obj.get(key)
+        .and_then(Value::as_u64)
+        .map(|n| u32::try_from(n).unwrap_or(u32::MAX)) // saturate instead of wrapping
+}
+
+fn parent_field(obj: &serde_json::Map<String, Value>) -> Option<String> {
+    str_field(obj, "parentId")
+        .or_else(|| str_field(obj, "parent_id"))
+        .map(str::to_string)
+}
+
+/// The event id for a record: native `id`, else a stable content hash.
+fn event_id_for(obj: &serde_json::Map, raw: &RawRecord) -> String { + str_field(obj, "id") + .map(str::to_string) + .unwrap_or_else(|| content_id(&raw.bytes)) +} + +/// Timestamp from any of the common keys, falling back to the epoch. +fn ts_for(obj: &serde_json::Map) -> memscribe_core::Timestamp { + util::ts_from( + &Value::Object(obj.clone()), + &["ts", "timestamp", "time", "created_at"], + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str, line: u64) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("cursor.jsonl", 0, line)) + } + + /// Parse a whole JSONL string through one shared context (file order), + /// returning the flattened event-kind tags. + fn parse_all(jsonl: &str) -> (Vec, ParseCtx) { + let adapter = CursorAdapter; + let mut ctx = ParseCtx::new(); + let mut events = Vec::new(); + for (i, line) in jsonl.lines().enumerate() { + let r = raw(line, i as u64 + 1); + let evs = adapter.parse(&r, &mut ctx).expect("parse never errors"); + events.extend(evs); + } + (events, ctx) + } + + fn tags(events: &[CaptureEvent]) -> Vec<&'static str> { + events.iter().map(|e| e.kind.tag()).collect() + } + + #[test] + fn session_start_then_decision_then_edit() { + let jsonl = r#"{"kind":"session_start","sessionId":"s1","cwd":"/work/app","git":{"sha":"abc123","branch":"main"},"toolVersion":"0.42.0"} +{"id":"m1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"s1","text":"Let's use Postgres instead of MySQL"} +{"id":"m2","parentId":"m1","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"s1","text":"Switching to Postgres.","model":"cursor-fast","usage":{"input":12,"output":7},"edits":[{"path":"db/config.toml","oldText":"engine=mysql","newText":"engine=postgres","diff":"@@\n-engine=mysql\n+engine=postgres","added":1,"removed":1}]}"#; + let (events, ctx) = parse_all(jsonl); + assert_eq!( + tags(&events), + vec!["session_start", "user_turn", "assistant_turn", "file_edit"] + ); + // Session + 
project were learned from the header. + assert_eq!(ctx.session_id.as_deref(), Some("s1")); + assert_eq!(events[1].session_id, "s1"); + assert_eq!(events[1].project.cwd, PathBuf::from("/work/app")); + // seq is monotonic from file order. + assert_eq!( + events.iter().map(|e| e.seq).collect::>(), + vec![0, 1, 2, 3] + ); + // The decision turn is a UserTurn carrying the text verbatim. + match &events[1].kind { + EventKind::UserTurn { text, .. } => { + assert_eq!(text, "Let's use Postgres instead of MySQL"); + } + other => panic!("expected user_turn, got {other:?}"), + } + // The FileEdit carries old/new/unified and line counts. + match &events[3].kind { + EventKind::FileEdit { diff, .. } => { + assert_eq!(diff.path, PathBuf::from("db/config.toml")); + assert_eq!(diff.old.as_deref(), Some("engine=mysql")); + assert_eq!(diff.new.as_deref(), Some("engine=postgres")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + #[test] + fn assistant_usage_and_model_captured() { + let jsonl = r#"{"id":"a1","role":"assistant","sessionId":"s","text":"hi","model":"cursor-pro","usage":{"input":5,"output":9}}"#; + let (events, _) = parse_all(jsonl); + match &events[0].kind { + EventKind::AssistantTurn { + model, usage, text, .. 
+ } => { + assert_eq!(text, "hi"); + assert_eq!(model.as_deref(), Some("cursor-pro")); + let u = usage.as_ref().expect("usage"); + assert_eq!(u.input_tokens, Some(5)); + assert_eq!(u.output_tokens, Some(9)); + } + other => panic!("expected assistant_turn, got {other:?}"), + } + } + + #[test] + fn tool_call_then_result_pairing() { + let jsonl = r#"{"id":"t1","role":"assistant","sessionId":"s","text":"running","toolCalls":[{"id":"c1","name":"shell","args":{"cmd":"ls"}}],"toolResults":[{"id":"c1","ok":true,"output":"a\nb"}]}"#; + let (events, ctx) = parse_all(jsonl); + assert_eq!( + tags(&events), + vec!["assistant_turn", "tool_call", "tool_result"] + ); + assert_eq!(ctx.call_names.get("c1").map(String::as_str), Some("shell")); + assert_eq!(ctx.call_ok.get("c1").copied(), Some(true)); + match &events[2].kind { + EventKind::ToolResult { call_id, ok, .. } => { + assert_eq!(call_id, "c1"); + assert!(*ok); + } + other => panic!("expected tool_result, got {other:?}"), + } + } + + #[test] + fn failed_tool_result_is_marked_not_ok() { + // Mirrors the tool_failure fixture: edit's result failed. + let jsonl = r#"{"id":"f1","role":"assistant","sessionId":"s","text":"trying","toolCalls":[{"id":"e1","name":"edit","args":{"path":"x.rs"}}],"toolResults":[{"id":"e1","ok":false,"output":"permission denied"}],"edits":[{"path":"x.rs","oldText":"a","newText":"b","added":1,"removed":1}]}"#; + let (events, ctx) = parse_all(jsonl); + assert_eq!( + tags(&events), + vec!["assistant_turn", "tool_call", "tool_result", "file_edit"] + ); + // The failed result is observable so downstream can suppress the Episode. + assert_eq!(ctx.call_ok.get("e1").copied(), Some(false)); + match &events[2].kind { + EventKind::ToolResult { ok, .. 
} => assert!(!*ok), + other => panic!("expected tool_result, got {other:?}"), + } + } + + #[test] + fn dedup_repeated_record_is_idempotent() { + let line = r#"{"id":"dup","role":"user","sessionId":"s","text":"hello"}"#; + let jsonl = format!("{line}\n{line}"); + let (events, _) = parse_all(&jsonl); + // The second identical record yields nothing. + assert_eq!(tags(&events), vec!["user_turn"]); + } + + #[test] + fn unrecognized_record_routes_to_unknown() { + let jsonl = r#"{"kind":"telemetry_ping","payload":{"x":1}} +{"id":"w1","role":"wizard","sessionId":"s","text":"???"}"#; + let (events, _) = parse_all(jsonl); + // A record with neither a known kind nor a role → Unknown; a record with + // an unknown role also degrades to Unknown rather than panicking. + assert_eq!(tags(&events), vec!["unknown", "unknown"]); + } + + #[test] + fn garbage_never_panics() { + let adapter = CursorAdapter; + let mut ctx = ParseCtx::new(); + for bad in [ + "", + " ", + "not json at all", + "{", + "[1,2,3]", + "42", + "true", + "null", + r#"{"role":42}"#, + r#"{"kind":"session_start","git":"oops","cwd":12}"#, + r#"{"id":"x","role":"user","toolCalls":"not-an-array","edits":{"nope":1}}"#, + ] { + let r = raw(bad, 1); + let evs = adapter.parse(&r, &mut ctx).expect("never errors"); + // Blank lines produce nothing; everything else is lossless (>=1). + if bad.trim().is_empty() { + assert!(evs.is_empty()); + } else { + assert!(!evs.is_empty(), "lossless for {bad:?}"); + } + } + } + + #[test] + fn invalid_json_is_preserved_as_unknown() { + let (events, _) = parse_all("this is not json"); + assert_eq!(tags(&events), vec!["unknown"]); + match &events[0].kind { + EventKind::Unknown { raw, .. 
} => { + assert_eq!(raw, &Value::String("this is not json".to_string())); + } + other => panic!("expected unknown, got {other:?}"), + } + } + + #[test] + fn schema_fingerprint_detects_export() { + let adapter = CursorAdapter; + let hdr = raw(r#"{"kind":"session_start","cwd":"/x"}"#, 1); + let msg = raw(r#"{"id":"m","role":"user","text":"hi"}"#, 2); + let junk = raw(r#"{"kind":"telemetry"}"#, 3); + assert_eq!(adapter.schema_fingerprint(&hdr).confidence, 100); + assert_eq!(adapter.schema_fingerprint(&msg).confidence, 100); + assert_eq!(adapter.schema_fingerprint(&junk).confidence, 0); + } + + #[test] + fn session_id_set_before_header_is_learned_from_record() { + // Even without a header, the first record carrying sessionId seeds ctx. + let jsonl = r#"{"id":"m1","role":"user","sessionId":"late","text":"hi"}"#; + let (events, ctx) = parse_all(jsonl); + assert_eq!(ctx.session_id.as_deref(), Some("late")); + assert_eq!(events[0].session_id, "late"); + } + + // ---- on-disk fixture conformance ---- + // + // The fixtures under `fixtures/cursor/v1/` ARE this tool's real record + // shape and feed the Phase-2 conformance suite. These tests parse them + // through the live adapter to guarantee the two never drift apart. + + fn fixture(name: &str) -> String { + let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../../fixtures/cursor/v1") + .join(name); + std::fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("read fixture {}: {e}", path.display())) + } + + #[test] + fn fixture_happy_path_decision_then_edits() { + let (events, ctx) = parse_all(&fixture("happy_path_decision_then_edits.jsonl")); + assert_eq!(ctx.session_id.as_deref(), Some("cur-sess-001")); + // header, decision (user), assistant+call+result+edit, assistant+call+result+edit, end. 
+ assert_eq!( + tags(&events), + vec![ + "session_start", + "user_turn", + "assistant_turn", + "tool_call", + "tool_result", + "file_edit", + "assistant_turn", + "tool_call", + "tool_result", + "file_edit", + "session_end", + ] + ); + // The decision turn is the user's, and a FileEdit follows. + match &events[1].kind { + EventKind::UserTurn { text, .. } => assert!(text.contains("Postgres")), + other => panic!("expected user_turn, got {other:?}"), + } + assert!(events.iter().any(|e| matches!( + &e.kind, + EventKind::FileEdit { diff, .. } + if diff.path == std::path::Path::new("config/database.toml") + ))); + // Project binding came from the session_start header. + assert_eq!( + events[1].project.cwd, + PathBuf::from("/Users/dev/projects/orders-api") + ); + // Every tool result in this fixture succeeded. + assert!(ctx.call_ok.values().all(|ok| *ok)); + } + + #[test] + fn fixture_rejected_alternative_parses() { + let (events, _) = parse_all(&fixture("rejected_alternative.jsonl")); + // Contains a user decision to reject Redux followed by an edit. + assert!(events.iter().any( + |e| matches!(&e.kind, EventKind::UserTurn { text, .. } if text.contains("reject")) + )); + assert!(events + .iter() + .any(|e| matches!(&e.kind, EventKind::FileEdit { .. }))); + } + + #[test] + fn fixture_ban_parses_decision_and_edit() { + let (events, _) = parse_all(&fixture("ban.jsonl")); + assert!(events.iter().any(|e| matches!( + &e.kind, + EventKind::UserTurn { text, .. } if text.contains("never") + ))); + assert!(events + .iter() + .any(|e| matches!(&e.kind, EventKind::FileEdit { .. }))); + } + + #[test] + fn fixture_tool_failure_edit_has_failed_result() { + // The edit's tool result FAILED → downstream must NOT mint an Episode. + // At the event level: there IS a FileEdit, but it is LINKED by call_id to + // a ToolResult with ok=false, so the segmenter drops it (no spurious + // episode, §8.2). 
+ let (events, ctx) = parse_all(&fixture("tool_failure.jsonl")); + // The edit event still exists (losslessness), keyed to call-edit-4. + let edit = events + .iter() + .find_map(|e| match &e.kind { + EventKind::FileEdit { call_id, diff } => Some((call_id.clone(), diff.clone())), + _ => None, + }) + .expect("an edit event"); + assert_eq!(edit.1.path, PathBuf::from("deploy.sh")); + // The edit is tied to the failing call so the segmenter can drop it. + assert_eq!(edit.0.as_deref(), Some("call-edit-4")); + // The failing result is observable by call_id → the gate for "no Episode". + assert_eq!(ctx.call_ok.get("call-edit-4").copied(), Some(false)); + // And the ToolResult event itself is marked not-ok. + assert!(events.iter().any(|e| matches!( + &e.kind, + EventKind::ToolResult { call_id, ok: false, .. } if call_id == "call-edit-4" + ))); + } + + #[test] + fn fixture_tool_failure_yields_no_episode_via_segmenter() { + // End-to-end through the segmenter: the failed edit must NOT mint an + // Episode, and the happy path must still mint two. 
+ use memscribe_core::gate::CommitmentGate; + use memscribe_core::segmenter::{DefaultSegmenter, Segmenter}; + + let gate = CommitmentGate::default(); + let seg = DefaultSegmenter; + + let (fail_events, _) = parse_all(&fixture("tool_failure.jsonl")); + let fail_seg = seg.segment(&fail_events, &gate); + assert_eq!( + fail_seg.episodes.len(), + 0, + "a failed edit must produce no episode" + ); + + let (ok_events, _) = parse_all(&fixture("happy_path_decision_then_edits.jsonl")); + let ok_seg = seg.segment(&ok_events, &gate); + assert_eq!( + ok_seg.episodes.len(), + 2, + "the happy path must still produce two episodes" + ); + } + + #[test] + fn all_fixtures_lossless_and_never_error() { + for name in [ + "happy_path_decision_then_edits.jsonl", + "rejected_alternative.jsonl", + "ban.jsonl", + "tool_failure.jsonl", + ] { + let (events, _) = parse_all(&fixture(name)); + // No record silently vanished: a non-empty fixture yields events, + // and none degraded to Unknown (the shapes are all recognized). + assert!(!events.is_empty(), "{name} produced no events"); + assert!( + events.iter().all(|e| e.kind.tag() != "unknown"), + "{name} produced an Unknown event" + ); + } + } +} diff --git a/crates/memscribe-adapters/src/gemini.rs b/crates/memscribe-adapters/src/gemini.rs new file mode 100644 index 0000000..608ef7b --- /dev/null +++ b/crates/memscribe-adapters/src/gemini.rs @@ -0,0 +1,823 @@ +//! Gemini CLI adapter. +//! +//! Transcripts: `~/.gemini/tmp//chats/session-*.jsonl`, append-only JSONL; +//! also a legacy single-blob `.json` history. Each non-control line is a message +//! record: `{role: user|gemini|model, text|content|parts, timestamp, thoughts, +//! tokens:{input,output}|tokenCount, toolCalls:[{name, args, resultDisplay}]}`. +//! Control records: `{"$set":{...}}` (session/cwd metadata) and +//! `{"$rewindTo": }` (logical truncation). +//! +//! Mapping (whitepaper §5 + Appendix A): +//! - `role:user` → [`EventKind::UserTurn`]. +//! 
- `role:gemini|model` → [`EventKind::AssistantTurn`] with `thinking` from +//! `thoughts`, `usage` from `tokens`, and structured `parts`. +//! - nested `toolCalls[]` → a [`EventKind::ToolCall`], and when `resultDisplay` +//! is present a [`EventKind::ToolResult`]; a `FileDiff`-shaped `resultDisplay` +//! additionally yields a [`EventKind::FileEdit`]. +//! - `{"$rewindTo"}` → [`EventKind::Rewind`]. +//! - `{"$set"}` that carries a cwd/project → [`EventKind::SessionStart`], else +//! [`EventKind::Unknown`]. +//! +//! Quirks: tolerate the legacy single-blob `.json` and the `$set`/`$rewindTo` +//! control records; prefer `chats/*.jsonl` over `logs.json`. The parser never +//! panics, is fully deterministic, and routes anything unrecognized to +//! [`EventKind::Unknown`] so the stream stays lossless. + +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, Part, + ProjectRef, RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, Usage, +}; +use serde_json::Value; +use std::path::{Path, PathBuf}; +use walkdir::WalkDir; + +const SOURCE: SourceKind = SourceKind::Gemini; + +/// Adapter for Google Gemini CLI transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct GeminiAdapter; + +impl TranscriptAdapter for GeminiAdapter { + fn source_kind(&self) -> SourceKind { + SOURCE + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + discover_transcripts(cfg) + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + let Some(value) = util::parse_json_line(raw) else { + // Blank line or invalid JSON: nothing to emit (blank) or an Unknown + // for non-empty-but-unparseable bytes — keep losslessness. 
+ let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + return Ok(vec![util::unknown_event( + SOURCE, + ctx, + raw, + Value::String(s.to_string()), + )]); + }; + Ok(parse_value(raw, ctx, value)) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + match util::parse_json_line(sample) { + Some(v) if v.get("$set").is_some() || v.get("$rewindTo").is_some() => { + SchemaVariant::certain(SOURCE, "gemini/control") + } + Some(v) if v.get("role").is_some() => SchemaVariant::certain(SOURCE, "gemini/chat-v1"), + Some(_) => SchemaVariant::unknown(SOURCE), + None => SchemaVariant::unknown(SOURCE), + } + } +} + +/// Discover Gemini transcripts under `/.gemini/tmp//`. +/// +/// Prefers `chats/session-*.jsonl` (and any `chats/*.jsonl`) over the legacy +/// `logs.json`; only when a project directory has no JSONL chat does it fall +/// back to a `logs.json` / `*.json` blob. Output is sorted for determinism. +fn discover_transcripts(cfg: &DiscoverCfg) -> Vec { + let root = cfg.home_dir().join(".gemini").join("tmp"); + if !root.is_dir() { + return Vec::new(); + } + + let mut jsonl: Vec = Vec::new(); + let mut blob: Vec = Vec::new(); + for entry in WalkDir::new(&root) + .into_iter() + .filter_map(std::result::Result::ok) + { + let path = entry.path(); + if !path.is_file() { + continue; + } + match path.extension().and_then(|e| e.to_str()) { + Some("jsonl") => jsonl.push(path.to_path_buf()), + Some("json") => blob.push(path.to_path_buf()), + _ => {} + } + } + + // Project hash = the directory directly under `tmp/`. If any `.jsonl` chat + // exists for a project, drop that project's `.json` blobs (prefer chats). 
+ let projects_with_jsonl: std::collections::HashSet = + jsonl.iter().filter_map(|p| project_dir(&root, p)).collect(); + blob.retain(|p| match project_dir(&root, p) { + Some(proj) => !projects_with_jsonl.contains(&proj), + None => true, + }); + + let mut handles: Vec = jsonl + .into_iter() + .chain(blob) + .map(|path| TranscriptHandle { + session_hint: session_hint_of(&path), + path, + source: SOURCE, + compressed: false, + }) + .collect(); + handles.sort_by(|a, b| a.path.cmp(&b.path)); + handles +} + +/// The project-hash directory directly beneath `tmp/` for a transcript path. +fn project_dir(root: &Path, path: &Path) -> Option { + let rel = path.strip_prefix(root).ok()?; + let first = rel.components().next()?; + Some(root.join(first.as_os_str())) +} + +/// Derive a session hint from a `session-.jsonl` filename. +fn session_hint_of(path: &Path) -> Option { + let stem = path.file_stem().and_then(|s| s.to_str())?; + Some(stem.strip_prefix("session-").unwrap_or(stem).to_string()) +} + +/// Parse one already-decoded JSON record into zero or more events. +fn parse_value(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + if value.get("$rewindTo").is_some() { + return parse_rewind(raw, ctx, value); + } + if value.get("$set").is_some() { + return parse_set(raw, ctx, value); + } + if value.get("role").is_some() { + return parse_message(raw, ctx, value); + } + vec![util::unknown_event(SOURCE, ctx, raw, value)] +} + +/// `{"$rewindTo": }` → [`EventKind::Rewind`]. 
+fn parse_rewind(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + let target = value.get("$rewindTo"); + let to_event = match target { + Some(Value::String(s)) => s.clone(), + Some(other) => other.to_string(), + None => String::new(), + }; + let event_id = record_id(&value, &raw.bytes); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = util::ts_from(&value, TS_KEYS); + vec![util::mk_event( + SOURCE, + ctx, + raw, + event_id, + None, + ts, + EventKind::Rewind { to_event }, + )] +} + +/// `{"$set": {...}}` → [`EventKind::SessionStart`] when it carries a cwd/project, +/// otherwise [`EventKind::Unknown`] (lossless). +fn parse_set(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + let set = value.get("$set"); + // Learn the session id if present anywhere in the $set payload. + if let Some(sid) = set + .and_then(|s| first_str(s, &["sessionId", "session_id", "id"])) + .or_else(|| first_str(&value, &["sessionId", "session_id"])) + { + if ctx.session_id.is_none() { + ctx.session_id = Some(sid); + } + } + + let cwd = set.and_then(|s| first_str(s, &["cwd", "projectRoot", "project_root", "workspace"])); + let event_id = record_id(&value, &raw.bytes); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = util::ts_from(&value, TS_KEYS); + + let Some(cwd) = cwd else { + // A `$set` with no project binding is metadata we don't model yet. + return vec![util::unknown_event(SOURCE, ctx, raw, value)]; + }; + + let git = git_ref_from(set.unwrap_or(&value)); + let model = set.and_then(|s| first_str(s, &["model", "modelName", "model_name"])); + let tool_version = set.and_then(|s| first_str(s, &["version", "cliVersion", "cli_version"])); + + // Stamp the project binding so every later event inherits it. 
+ let repo_root = set + .and_then(|s| first_str(s, &["repoRoot", "repo_root"])) + .map(PathBuf::from); + ctx.project = Some(ProjectRef { + cwd: PathBuf::from(&cwd), + repo_root, + git: git.clone(), + }); + + vec![util::mk_event( + SOURCE, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionStart { + cwd: PathBuf::from(cwd), + git, + model, + tool_version, + }, + )] +} + +/// A message record (`role: user|gemini|model`) → one turn plus any nested +/// `toolCalls[]` as ToolCall/ToolResult/FileEdit events. +fn parse_message(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + let role = value.get("role").and_then(Value::as_str).unwrap_or(""); + let event_id = record_id(&value, &raw.bytes); + if !ctx.first_seen(&event_id) { + // Idempotency: a repeated record produces nothing on re-ingest. + return Vec::new(); + } + let ts = util::ts_from(&value, TS_KEYS); + let text = flatten_text(&value); + + let mut out = Vec::new(); + let kind = match role { + "user" => EventKind::UserTurn { + text, + parts: message_parts(&value), + }, + "gemini" | "model" | "assistant" => { + let thinking = first_str(&value, &["thoughts", "thinking", "reasoning"]); + EventKind::AssistantTurn { + text, + thinking, + model: first_str(&value, &["model", "modelName", "model_name"]), + usage: usage_from(&value), + parts: message_parts(&value), + } + } + _ => { + // An unrecognized role is still a valid record: keep it verbatim. + return vec![util::unknown_event(SOURCE, ctx, raw, value)]; + } + }; + out.push(util::mk_event( + SOURCE, + ctx, + raw, + event_id.clone(), + None, + ts, + kind, + )); + + // Nested tool calls become their own events, parented to the turn. 
+ if let Some(calls) = value.get("toolCalls").and_then(Value::as_array) { + for (i, call) in calls.iter().enumerate() { + out.extend(parse_tool_call(raw, ctx, &event_id, ts, i, call)); + } + } + out +} + +/// One nested `toolCalls[]` entry → a ToolCall and, when `resultDisplay` is +/// present, a ToolResult (+ a FileEdit for a `FileDiff`-shaped result). +fn parse_tool_call( + raw: &RawRecord, + ctx: &mut ParseCtx, + turn_id: &str, + ts: memscribe_core::Timestamp, + index: usize, + call: &Value, +) -> Vec { + let name = first_str(call, &["name", "tool", "toolName"]).unwrap_or_default(); + let args = call + .get("args") + .or_else(|| call.get("arguments")) + .or_else(|| call.get("input")) + .cloned() + .unwrap_or(Value::Null); + // A deterministic, stable call id: native id if present, else turn+index. + let call_id = first_str(call, &["callId", "call_id", "id"]) + .unwrap_or_else(|| format!("{turn_id}:tool:{index}")); + + let mut out = Vec::new(); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{call_id}:call"), + Some(turn_id.to_string()), + ts, + EventKind::ToolCall { + call_id: call_id.clone(), + name, + args, + }, + )); + + let Some(result) = call.get("resultDisplay").or_else(|| call.get("result")) else { + return out; + }; + + let ok = result_ok(call, result); + ctx.call_ok.insert(call_id.clone(), ok); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{call_id}:result"), + Some(turn_id.to_string()), + ts, + EventKind::ToolResult { + call_id: call_id.clone(), + ok, + output: result.clone(), + }, + )); + + if let Some(diff) = file_diff_from(result) { + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{call_id}:edit"), + Some(turn_id.to_string()), + ts, + EventKind::FileEdit { + call_id: Some(call_id), + diff, + }, + )); + } + out +} + +/// Timestamp keys Gemini may use, in priority order. 
+const TS_KEYS: &[&str] = &["timestamp", "time", "ts", "createdAt", "created_at"]; + +/// A record's native id, falling back to a `blake3` content hash. +fn record_id(value: &Value, bytes: &[u8]) -> String { + first_str(value, &["id", "messageId", "message_id", "uuid"]) + .unwrap_or_else(|| content_id(bytes)) +} + +/// The first string-valued key from `keys` present on `value` (non-empty). +fn first_str(value: &Value, keys: &[&str]) -> Option { + for k in keys { + if let Some(s) = value.get(*k).and_then(Value::as_str) { + if !s.is_empty() { + return Some(s.to_string()); + } + } + } + None +} + +/// Flatten a message's textual content from `text`, `content`, or `parts[]`. +fn flatten_text(value: &Value) -> String { + if let Some(s) = value.get("text").and_then(Value::as_str) { + return s.to_string(); + } + if let Some(s) = value.get("content").and_then(Value::as_str) { + return s.to_string(); + } + if let Some(parts) = value + .get("parts") + .or_else(|| value.get("content")) + .and_then(Value::as_array) + { + let mut buf = String::new(); + for p in parts { + if let Some(s) = p.as_str() { + buf.push_str(s); + } else if let Some(s) = p.get("text").and_then(Value::as_str) { + buf.push_str(s); + } + } + return buf; + } + String::new() +} + +/// Structured `parts[]`, preserving anything we don't recognize as [`Part::Other`]. +fn message_parts(value: &Value) -> Vec { + let Some(parts) = value.get("parts").and_then(Value::as_array) else { + return Vec::new(); + }; + parts + .iter() + .map(|p| { + if let Some(s) = p.as_str() { + Part::Text { + text: s.to_string(), + } + } else if let Some(s) = p.get("text").and_then(Value::as_str) { + Part::Text { + text: s.to_string(), + } + } else if let Some(s) = p.get("thought").and_then(Value::as_str) { + Part::Thinking { + text: s.to_string(), + } + } else { + Part::Other { raw: p.clone() } + } + }) + .collect() +} + +/// Token usage from `tokens:{input,output,...}` or a flat `tokenCount`. 
+fn usage_from(value: &Value) -> Option { + if let Some(tokens) = value.get("tokens").filter(|v| v.is_object()) { + let usage = Usage { + input_tokens: u64_at(tokens, &["input", "inputTokens", "prompt", "promptTokens"]), + output_tokens: u64_at( + tokens, + &["output", "outputTokens", "completion", "completionTokens"], + ), + cache_read_tokens: u64_at(tokens, &["cacheRead", "cached", "cachedContentTokens"]), + cache_creation_tokens: u64_at(tokens, &["cacheCreation", "cacheWrite"]), + }; + if usage != Usage::default() { + return Some(usage); + } + } + if let Some(total) = u64_at(value, &["tokenCount", "totalTokens"]) { + return Some(Usage { + output_tokens: Some(total), + ..Usage::default() + }); + } + None +} + +/// First unsigned-integer value among `keys` on `value`. +fn u64_at(value: &Value, keys: &[&str]) -> Option { + for k in keys { + if let Some(n) = value.get(*k).and_then(Value::as_u64) { + return Some(n); + } + } + None +} + +/// A git ref from `commit`/`sha` (+ optional `branch`) within a `$set` payload. +fn git_ref_from(value: &Value) -> Option { + let sha = first_str(value, &["commit", "sha", "head", "gitCommit"])?; + Some(GitRef { + sha, + branch: first_str(value, &["branch", "gitBranch"]), + }) +} + +/// Whether a tool result is a success. A `FileDiff`-shaped result is a success +/// by construction; otherwise an explicit `error`/`success`/`ok`/`status` field +/// decides, defaulting to success when none is present. 
+fn result_ok(call: &Value, result: &Value) -> bool { + for v in [call, result] { + if let Some(b) = v.get("success").and_then(Value::as_bool) { + return b; + } + if let Some(b) = v.get("ok").and_then(Value::as_bool) { + return b; + } + if let Some(b) = v.get("error").and_then(Value::as_bool) { + return !b; + } + if let Some(s) = v.get("status").and_then(Value::as_str) { + let s = s.to_ascii_lowercase(); + if s == "error" || s == "failed" || s == "failure" || s == "rejected" { + return false; + } + if s == "success" || s == "ok" || s == "completed" { + return true; + } + } + // A non-empty `error` string/object means failure. + match v.get("error") { + Some(Value::String(s)) if !s.is_empty() => return false, + Some(Value::Object(o)) if !o.is_empty() => return false, + _ => {} + } + } + true +} + +/// A normalized [`Diff`] from a `FileDiff`-shaped `resultDisplay`, if it looks +/// like one (`fileName`/`filePath` plus diff content). Returns `None` for +/// non-edit results. +fn file_diff_from(result: &Value) -> Option { + let obj = result.as_object()?; + let path = first_str(result, &["fileName", "filePath", "file", "path"])?; + let has_edit_shape = obj.contains_key("originalContent") + || obj.contains_key("newContent") + || obj.contains_key("fileDiff") + || obj.contains_key("diff") + || obj.contains_key("diffStat"); + if !has_edit_shape { + return None; + } + + let old = first_str(result, &["originalContent", "oldContent", "old"]); + let new = first_str(result, &["newContent", "new"]); + let unified = first_str(result, &["fileDiff", "diff", "unified"]); + + let (added, removed) = diff_stat(result); + Some(Diff { + path: PathBuf::from(path), + old, + new, + unified, + added_lines: added, + removed_lines: removed, + }) +} + +/// Added/removed line counts from `diffStat:{added,removed}` or the +/// `model_added_lines`/`model_removed_lines` shape. 
+fn diff_stat(result: &Value) -> (u32, u32) { + if let Some(stat) = result.get("diffStat").filter(|v| v.is_object()) { + let added = u64_at(stat, &["added", "additions", "model_added_lines"]).unwrap_or(0); + let removed = u64_at(stat, &["removed", "deletions", "model_removed_lines"]).unwrap_or(0); + return (clamp_u32(added), clamp_u32(removed)); + } + let added = u64_at(result, &["model_added_lines", "added"]).unwrap_or(0); + let removed = u64_at(result, &["model_removed_lines", "removed"]).unwrap_or(0); + (clamp_u32(added), clamp_u32(removed)) +} + +/// Saturate a `u64` line count into the model's `u32` field. +fn clamp_u32(n: u64) -> u32 { + u32::try_from(n).unwrap_or(u32::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(line: &str) -> RawRecord { + RawRecord::from_line(line, SourceLocation::new("session-x.jsonl", 0, 1)) + } + + fn parse_all(lines: &[&str]) -> Vec { + let adapter = GeminiAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for (i, l) in lines.iter().enumerate() { + let r = + RawRecord::from_line(l, SourceLocation::new("session-x.jsonl", 0, i as u64 + 1)); + out.extend(adapter.parse(&r, &mut ctx).expect("never errors")); + } + out + } + + fn tags(events: &[CaptureEvent]) -> Vec<&'static str> { + events.iter().map(|e| e.kind.tag()).collect() + } + + #[test] + fn set_with_cwd_is_session_start_and_binds_project() { + let line = r#"{"$set":{"sessionId":"sess-1","cwd":"/home/u/app","model":"gemini-2.5-pro","branch":"main","commit":"abc123"}}"#; + let evs = parse_all(&[line]); + assert_eq!(tags(&evs), ["session_start"]); + assert_eq!(evs[0].session_id, "sess-1"); + assert_eq!(evs[0].project.cwd, PathBuf::from("/home/u/app")); + match &evs[0].kind { + EventKind::SessionStart { + cwd, git, model, .. 
+ } => { + assert_eq!(cwd, &PathBuf::from("/home/u/app")); + assert_eq!(model.as_deref(), Some("gemini-2.5-pro")); + let git = git.as_ref().expect("git ref"); + assert_eq!(git.sha, "abc123"); + assert_eq!(git.branch.as_deref(), Some("main")); + } + other => panic!("expected SessionStart, got {other:?}"), + } + } + + #[test] + fn set_without_project_is_unknown_not_session_start() { + let evs = parse_all(&[r#"{"$set":{"theme":"dark"}}"#]); + assert_eq!(tags(&evs), ["unknown"]); + } + + #[test] + fn user_then_assistant_with_edit_yields_decision_then_file_edit() { + // A user decision turn, then an assistant turn whose tool call edits a + // file — the canonical happy path: UserTurn then (eventually) FileEdit. + let user = r#"{"id":"m1","role":"user","text":"Let's use Postgres instead of MySQL.","timestamp":"2026-06-22T10:00:00Z"}"#; + let asst = r#"{"id":"m2","role":"model","text":"Switching the driver.","thoughts":"swap the dep","tokens":{"input":12,"output":34},"timestamp":"2026-06-22T10:00:01Z","toolCalls":[{"name":"write_file","args":{"path":"db.rs"},"resultDisplay":{"fileName":"db.rs","originalContent":"mysql","newContent":"postgres","fileDiff":"@@ -1 +1 @@\n-mysql\n+postgres","diffStat":{"added":1,"removed":1}}}]}"#; + let evs = parse_all(&[user, asst]); + assert_eq!( + tags(&evs), + [ + "user_turn", + "assistant_turn", + "tool_call", + "tool_result", + "file_edit", + ] + ); + + // UserTurn first. + match &evs[0].kind { + EventKind::UserTurn { text, .. } => { + assert!(text.contains("instead of MySQL")); + } + other => panic!("expected UserTurn, got {other:?}"), + } + // AssistantTurn carries thinking + usage. + match &evs[1].kind { + EventKind::AssistantTurn { + thinking, usage, .. 
+ } => { + assert_eq!(thinking.as_deref(), Some("swap the dep")); + let u = usage.as_ref().expect("usage"); + assert_eq!(u.input_tokens, Some(12)); + assert_eq!(u.output_tokens, Some(34)); + } + other => panic!("expected AssistantTurn, got {other:?}"), + } + // FileEdit normalized from the FileDiff. + match &evs[4].kind { + EventKind::FileEdit { call_id, diff } => { + assert!(call_id.is_some()); + assert_eq!(diff.path, PathBuf::from("db.rs")); + assert_eq!(diff.old.as_deref(), Some("mysql")); + assert_eq!(diff.new.as_deref(), Some("postgres")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + assert!(diff.unified.as_deref().unwrap().contains("+postgres")); + } + other => panic!("expected FileEdit, got {other:?}"), + } + } + + #[test] + fn failed_tool_result_reports_ok_false() { + // An edit whose tool result failed must surface ToolResult.ok=false so + // the downstream segmenter suppresses the episode. The FileEdit still + // carries the same call_id the failed result keys on. + let line = r#"{"id":"m9","role":"model","text":"trying","toolCalls":[{"callId":"c7","name":"write_file","args":{"path":"x.rs"},"resultDisplay":{"fileName":"x.rs","newContent":"...","fileDiff":"@@","error":"permission denied"}}]}"#; + let evs = parse_all(&[line]); + assert_eq!( + tags(&evs), + ["assistant_turn", "tool_call", "tool_result", "file_edit"] + ); + let result = evs + .iter() + .find(|e| matches!(e.kind, EventKind::ToolResult { .. })) + .unwrap(); + let edit = evs + .iter() + .find(|e| matches!(e.kind, EventKind::FileEdit { .. })) + .unwrap(); + match (&result.kind, &edit.kind) { + ( + EventKind::ToolResult { + call_id: rid, ok, .. + }, + EventKind::FileEdit { + call_id: Some(eid), .. 
+ }, + ) => { + assert!(!ok, "failed result must be ok=false"); + assert_eq!(rid, eid, "edit and failed result must share call_id"); + } + other => panic!("unexpected kinds: {other:?}"), + } + } + + #[test] + fn rewind_control_record_maps_to_rewind() { + let evs = parse_all(&[r#"{"$rewindTo":"m1"}"#]); + assert_eq!(tags(&evs), ["rewind"]); + match &evs[0].kind { + EventKind::Rewind { to_event } => assert_eq!(to_event, "m1"), + other => panic!("expected Rewind, got {other:?}"), + } + } + + #[test] + fn rewind_to_numeric_index_stringifies() { + let evs = parse_all(&[r#"{"$rewindTo":3}"#]); + match &evs[0].kind { + EventKind::Rewind { to_event } => assert_eq!(to_event, "3"), + other => panic!("expected Rewind, got {other:?}"), + } + } + + #[test] + fn garbage_never_panics_and_is_lossless() { + // Invalid JSON, an empty object, a number, an unknown role, a blank + // line: none may panic, none may be silently dropped (except blanks). + let evs = parse_all(&[ + "not json at all", + "{}", + "42", + r#"{"role":"system","text":"?"}"#, + " ", + r#"{"foo":"bar"}"#, + ]); + // The blank line yields nothing; everything else is at least Unknown. + assert_eq!(evs.len(), 5); + assert!(evs.iter().all(|e| { + matches!( + e.kind.tag(), + "unknown" | "user_turn" | "assistant_turn" | "session_start" + ) + })); + // The unknown-role record is preserved verbatim, not dropped. + assert!(evs.iter().any(|e| e.kind.tag() == "unknown")); + } + + #[test] + fn repeated_record_is_deduped_for_idempotency() { + let line = r#"{"id":"dup-1","role":"user","text":"hello"}"#; + let once = parse_all(&[line]); + assert_eq!(tags(&once), ["user_turn"]); + // Re-ingesting the SAME record id within the session yields nothing. 
+ let twice = parse_all(&[line, line]); + assert_eq!(tags(&twice), ["user_turn"]); + } + + #[test] + fn parse_is_deterministic() { + let lines = [ + r#"{"$set":{"sessionId":"s","cwd":"/w"}}"#, + r#"{"id":"a","role":"user","text":"go with Stripe instead of PayPal"}"#, + r#"{"id":"b","role":"model","text":"ok","toolCalls":[{"name":"edit","args":{},"resultDisplay":{"fileName":"a.rs","newContent":"x","fileDiff":"@@","diffStat":{"added":1,"removed":0}}}]}"#, + ]; + let a = parse_all(&lines); + let b = parse_all(&lines); + assert_eq!( + serde_json::to_string(&a).unwrap(), + serde_json::to_string(&b).unwrap() + ); + } + + #[test] + fn seq_is_monotonic_across_a_message_with_tool_calls() { + let lines = [ + r#"{"id":"a","role":"user","text":"hi"}"#, + r#"{"id":"b","role":"model","text":"editing","toolCalls":[{"name":"e","args":{},"resultDisplay":{"fileName":"f","newContent":"n","diff":"d"}}]}"#, + ]; + let evs = parse_all(&lines); + for w in evs.windows(2) { + assert!(w[1].seq > w[0].seq, "seq must strictly increase"); + } + } + + #[test] + fn legacy_text_only_assistant_has_no_usage_when_absent() { + let evs = parse_all(&[r#"{"id":"z","role":"gemini","content":"plain reply"}"#]); + match &evs[0].kind { + EventKind::AssistantTurn { text, usage, .. } => { + assert_eq!(text, "plain reply"); + assert!(usage.is_none()); + } + other => panic!("expected AssistantTurn, got {other:?}"), + } + } + + #[test] + fn token_count_flat_field_becomes_output_usage() { + let evs = parse_all(&[r#"{"id":"t","role":"model","text":"x","tokenCount":99}"#]); + match &evs[0].kind { + EventKind::AssistantTurn { usage, .. 
} => { + assert_eq!(usage.as_ref().unwrap().output_tokens, Some(99)); + } + other => panic!("expected AssistantTurn, got {other:?}"), + } + } + + #[test] + fn schema_fingerprint_classifies_records() { + let a = GeminiAdapter; + assert_eq!( + a.schema_fingerprint(&raw(r#"{"$set":{"cwd":"/w"}}"#)) + .variant, + "gemini/control" + ); + assert_eq!( + a.schema_fingerprint(&raw(r#"{"$rewindTo":1}"#)).variant, + "gemini/control" + ); + assert_eq!( + a.schema_fingerprint(&raw(r#"{"role":"user","text":"hi"}"#)) + .variant, + "gemini/chat-v1" + ); + assert_eq!(a.schema_fingerprint(&raw("garbage")).confidence, 0); + } +} diff --git a/crates/memscribe-adapters/src/lib.rs b/crates/memscribe-adapters/src/lib.rs new file mode 100644 index 0000000..9857030 --- /dev/null +++ b/crates/memscribe-adapters/src/lib.rs @@ -0,0 +1,37 @@ +//! # memscribe-adapters +//! +//! Per-tool transcript adapters. Each tool implements +//! [`memscribe_core::TranscriptAdapter`]: where its logs live, and how to turn +//! one raw record into normalized [`memscribe_core::CaptureEvent`]s. Parsers are +//! **version-tolerant** — they route anything unrecognized to +//! [`memscribe_core::EventKind::Unknown`] rather than failing — and **must never +//! panic** (every parser has a fuzz target). +//! +//! Adapters are behind feature flags so a consumer can compile only the tools it +//! needs. The [`registry`] assembles the set of enabled adapters. 
+#![forbid(unsafe_code)] + +pub mod util; + +#[cfg(feature = "claude_code")] +pub mod claude_code; +#[cfg(feature = "codex")] +pub mod codex; +#[cfg(feature = "copilot")] +pub mod copilot; +#[cfg(feature = "cursor")] +pub mod cursor; +#[cfg(feature = "gemini")] +pub mod gemini; +#[cfg(feature = "otel")] +pub mod otel; +#[cfg(feature = "vscode")] +pub mod vscode; +#[cfg(feature = "windsurf")] +pub mod windsurf; +#[cfg(feature = "zed")] +pub mod zed; + +pub mod registry; + +pub use registry::{adapter_for, all_adapters}; diff --git a/crates/memscribe-adapters/src/otel.rs b/crates/memscribe-adapters/src/otel.rs new file mode 100644 index 0000000..3fe9ba2 --- /dev/null +++ b/crates/memscribe-adapters/src/otel.rs @@ -0,0 +1,806 @@ +//! OpenTelemetry GenAI adapter. +//! +//! Source: OTLP push (local collector / file), OTel GenAI semconv records as +//! JSON / NDJSON — the universal fallback channel for any instrumented agent. +//! Each line is a log record or span. Attributes come in two shapes and this +//! adapter accepts both: +//! +//! - **flat**: `{ "gen_ai.input.messages": [...], "gen_ai.system": "...", ... }` +//! - **OTLP nested**: `{ "attributes": [ { "key": "...", +//! "value": { "stringValue" | "intValue" | "arrayValue" } }, ... ] }` +//! +//! Mapping (whitepaper §5): +//! - `gen_ai.input.messages` (role `user`) and `gen_ai.cli.user_prompt` +//! → [`EventKind::UserTurn`] +//! - `gen_ai.output.messages` → [`EventKind::AssistantTurn`] (with model + usage) +//! - `execute_tool` span → [`EventKind::ToolCall`] (+ [`EventKind::ToolResult`] +//! when result attributes are present) +//! - `file_operation` span (and `execute_tool` edits) → [`EventKind::FileEdit`] +//! with `file.path`, `model_added_lines`/`code.added_lines`, +//! `model_removed_lines`/`code.removed_lines` +//! - `gen_ai.conversation.id` → session id; record `time` → timestamp. +//! +//! Anything well-formed but unrecognized is routed to [`EventKind::Unknown`] so +//! the stream stays lossless. 
The parser never panics. + +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, Part, + ProjectRef, RawRecord, SchemaVariant, SourceKind, Timestamp, TranscriptAdapter, + TranscriptHandle, Usage, +}; +use serde_json::Value; +use std::path::PathBuf; + +/// Adapter for OpenTelemetry GenAI records. +#[derive(Debug, Default, Clone, Copy)] +pub struct OtelAdapter; + +impl TranscriptAdapter for OtelAdapter { + fn source_kind(&self) -> SourceKind { + SourceKind::Otel + } + + fn discover(&self, _cfg: &DiscoverCfg) -> Vec { + // OTel records are pushed (collector / file tail) rather than discovered + // in a well-known per-tool directory, so there is nothing to glob. + Vec::new() + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + let Some(value) = util::parse_json_line(raw) else { + // Blank line → nothing; non-JSON → lossless Unknown of the raw text. + let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + return Ok(vec![util::unknown_event( + SourceKind::Otel, + ctx, + raw, + Value::String(s.to_string()), + )]); + }; + + // Normalize both shapes into a flat attribute view. + let attrs = Attrs::from_record(&value); + + // Learn the session id and project binding as early as we can. 
+ if ctx.session_id.is_none() { + if let Some(sid) = attrs.str("gen_ai.conversation.id") { + ctx.session_id = Some(sid.to_string()); + } + } + + let ts = attrs.timestamp(); + let op = attrs.operation_name(); + + let events = match op.as_deref() { + Some("session.start") | Some("session_start") | Some("gen_ai.session.start") => { + vec![self.session_start(ctx, raw, &attrs, ts)] + } + Some("session.end") | Some("session_end") | Some("gen_ai.session.end") => { + vec![mk( + ctx, + raw, + derive_id(raw, "session_end", 0), + ts, + EventKind::SessionEnd { + reason: attrs.str("reason").map(str::to_string), + }, + )] + } + Some("execute_tool") | Some("gen_ai.execute_tool") => { + self.execute_tool(ctx, raw, &attrs, ts) + } + Some("file_operation") | Some("gen_ai.file_operation") => { + self.file_operation(ctx, raw, &attrs, ts) + } + // Chat / inference records carry the dialogue. + _ => self.dialogue(ctx, raw, &attrs, ts, &value), + }; + + // Dedup / idempotency: drop any event whose id we have already emitted. + Ok(self.dedup(ctx, events)) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + let Some(value) = util::parse_json_line(sample) else { + return SchemaVariant::unknown(SourceKind::Otel); + }; + // The OTLP nested shape carries an `attributes` array of {key,value}. + if value + .get("attributes") + .and_then(Value::as_array) + .is_some_and(|a| a.iter().any(|e| e.get("key").is_some())) + { + return SchemaVariant::certain(SourceKind::Otel, "otel/genai-otlp"); + } + // The flat shape uses dotted `gen_ai.*` keys directly on the object. 
+ if value + .as_object() + .is_some_and(|m| m.keys().any(|k| k.starts_with("gen_ai."))) + { + return SchemaVariant::certain(SourceKind::Otel, "otel/genai-flat"); + } + SchemaVariant::unknown(SourceKind::Otel) + } +} + +impl OtelAdapter { + fn session_start( + &self, + ctx: &mut ParseCtx, + raw: &RawRecord, + attrs: &Attrs, + ts: Timestamp, + ) -> CaptureEvent { + let cwd: PathBuf = attrs + .str("cwd") + .or_else(|| attrs.str("gen_ai.cli.cwd")) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + let git = attrs.str("git.commit").map(|sha| GitRef { + sha: sha.to_string(), + branch: attrs.str("git.branch").map(str::to_string), + }); + // Stamp the project binding for the rest of the session. + if ctx.project.is_none() { + ctx.project = Some(ProjectRef { + cwd: cwd.clone(), + repo_root: attrs.str("repo_root").map(PathBuf::from), + git: git.clone(), + }); + } + mk( + ctx, + raw, + derive_id(raw, "session_start", 0), + ts, + EventKind::SessionStart { + cwd, + git, + model: attrs.str("gen_ai.request.model").map(str::to_string), + tool_version: attrs + .str("gen_ai.tool.version") + .or_else(|| attrs.str("service.version")) + .map(str::to_string), + }, + ) + } + + /// `execute_tool` span → a ToolCall, plus a ToolResult when the span carries + /// result/error attributes, plus a FileEdit when it carries `file.path`. 
+ fn execute_tool( + &self, + ctx: &mut ParseCtx, + raw: &RawRecord, + attrs: &Attrs, + ts: Timestamp, + ) -> Vec { + let mut out = Vec::new(); + let name = attrs.str("gen_ai.tool.name").unwrap_or("tool").to_string(); + let call_id = attrs + .str("gen_ai.tool.call.id") + .map(str::to_string) + .unwrap_or_else(|| content_id(&raw.bytes)); + let args = attrs + .value("gen_ai.tool.call.arguments") + .cloned() + .unwrap_or(Value::Null); + + ctx.call_names.insert(call_id.clone(), name.clone()); + out.push(mk( + ctx, + raw, + derive_id(raw, "tool_call", 0), + ts, + EventKind::ToolCall { + call_id: call_id.clone(), + name: name.clone(), + args, + }, + )); + + // A result is present iff the span reports a status/result/error. + let ok = attrs.tool_ok(); + if let Some(ok) = ok { + ctx.call_ok.insert(call_id.clone(), ok); + let output = attrs + .value("gen_ai.tool.result") + .cloned() + .unwrap_or(Value::Null); + out.push(mk( + ctx, + raw, + derive_id(raw, "tool_result", 0), + ts, + EventKind::ToolResult { + call_id: call_id.clone(), + ok, + output, + }, + )); + } + + // An edit-shaped tool span also yields a FileEdit (keyed to the call). + if let Some(diff) = attrs.file_diff() { + out.push(mk( + ctx, + raw, + derive_id(raw, "file_edit", 0), + ts, + EventKind::FileEdit { + call_id: Some(call_id), + diff, + }, + )); + } + out + } + + /// `file_operation` span → a FileEdit (no call id). + fn file_operation( + &self, + ctx: &mut ParseCtx, + raw: &RawRecord, + attrs: &Attrs, + ts: Timestamp, + ) -> Vec { + match attrs.file_diff() { + Some(diff) => vec![mk( + ctx, + raw, + derive_id(raw, "file_edit", 0), + ts, + EventKind::FileEdit { + call_id: None, + diff, + }, + )], + // A file_operation without a path is unrecognized → lossless Unknown. 
+ None => vec![util::unknown_event( + SourceKind::Otel, + ctx, + raw, + attrs.raw().clone(), + )], + } + } + + /// A chat / inference record: zero or more UserTurns from input messages and + /// the CLI prompt, then zero or more AssistantTurns from output messages. + fn dialogue( + &self, + ctx: &mut ParseCtx, + raw: &RawRecord, + attrs: &Attrs, + ts: Timestamp, + value: &Value, + ) -> Vec { + let mut out = Vec::new(); + + // `gen_ai.cli.user_prompt` → one UserTurn. + if let Some(prompt) = attrs.str("gen_ai.cli.user_prompt") { + if !prompt.is_empty() { + out.push(mk( + ctx, + raw, + derive_id(raw, "user_prompt", 0), + ts, + EventKind::UserTurn { + text: prompt.to_string(), + parts: vec![Part::Text { + text: prompt.to_string(), + }], + }, + )); + } + } + + // `gen_ai.input.messages` → one UserTurn per user-role message. + if let Some(msgs) = attrs.array("gen_ai.input.messages") { + for (i, m) in msgs.iter().enumerate() { + if !is_user_role(m) { + continue; + } + let text = message_text(m); + out.push(mk( + ctx, + raw, + derive_id(raw, "input_msg", i), + ts, + EventKind::UserTurn { + text: text.clone(), + parts: vec![Part::Text { text }], + }, + )); + } + } + + // `gen_ai.output.messages` → one AssistantTurn per message. + if let Some(msgs) = attrs.array("gen_ai.output.messages") { + let model = attrs.str("gen_ai.request.model").map(str::to_string); + let usage = attrs.usage(); + for (i, m) in msgs.iter().enumerate() { + let text = message_text(m); + out.push(mk( + ctx, + raw, + derive_id(raw, "output_msg", i), + ts, + EventKind::AssistantTurn { + text: text.clone(), + thinking: None, + model: model.clone(), + usage: usage.clone(), + parts: vec![Part::Text { text }], + }, + )); + } + } + + // A record with none of the recognized dialogue fields is preserved + // verbatim so the stream stays lossless. 
+ if out.is_empty() { + out.push(util::unknown_event( + SourceKind::Otel, + ctx, + raw, + value.clone(), + )); + } + out + } + + /// Drop events whose ids have already been emitted (dedup / idempotency). + fn dedup(&self, ctx: &mut ParseCtx, events: Vec) -> Vec { + events + .into_iter() + .filter(|e| ctx.first_seen(&e.event_id)) + .collect() + } +} + +/// Build a normalized event with the OTel source. +fn mk( + ctx: &mut ParseCtx, + raw: &RawRecord, + event_id: String, + ts: Timestamp, + kind: EventKind, +) -> CaptureEvent { + util::mk_event(SourceKind::Otel, ctx, raw, event_id, None, ts, kind) +} + +/// A deterministic per-logical-event id: the record's content hash plus a stable +/// `kind`/index suffix so multiple events from one record don't collide and a +/// repeated record dedups to the same ids. +fn derive_id(raw: &RawRecord, kind: &str, index: usize) -> String { + format!("{}:{kind}:{index}", content_id(&raw.bytes)) +} + +/// Is a `gen_ai.*.messages` entry a user-role message? +fn is_user_role(m: &Value) -> bool { + m.get("role") + .and_then(Value::as_str) + .map(|r| r.eq_ignore_ascii_case("user")) + .unwrap_or(false) +} + +/// Flatten a GenAI message's text from `content` (string or parts array) or +/// `parts` (array of `{type,text}` / `{content}` / strings). +fn message_text(m: &Value) -> String { + if let Some(s) = m.get("content").and_then(Value::as_str) { + return s.to_string(); + } + let mut buf = String::new(); + for key in ["parts", "content"] { + if let Some(arr) = m.get(key).and_then(Value::as_array) { + for p in arr { + if let Some(s) = p.as_str() { + push_part(&mut buf, s); + } else if let Some(s) = p + .get("text") + .or_else(|| p.get("content")) + .and_then(Value::as_str) + { + push_part(&mut buf, s); + } + } + } + } + buf +} + +fn push_part(buf: &mut String, s: &str) { + if !buf.is_empty() { + buf.push('\n'); + } + buf.push_str(s); +} + +/// A flat view over a GenAI record's attributes, hiding the flat-vs-OTLP shape. 
+struct Attrs<'a> { + /// The flat record object, when the record is already flat. + flat: Option<&'a serde_json::Map>, + /// Materialized {key → value} from the OTLP `attributes` array, when nested. + nested: Option>, + /// The original record (for lossless Unknown fallbacks). + raw: &'a Value, +} + +impl<'a> Attrs<'a> { + fn from_record(value: &'a Value) -> Self { + // OTLP nested: an `attributes: [{key, value:{...}}]` array. + if let Some(arr) = value.get("attributes").and_then(Value::as_array) { + let mut map = std::collections::HashMap::new(); + for entry in arr { + if let Some(key) = entry.get("key").and_then(Value::as_str) { + if let Some(v) = entry.get("value").map(otlp_value) { + map.insert(key.to_string(), v); + } + } + } + return Attrs { + flat: None, + nested: Some(map), + raw: value, + }; + } + Attrs { + flat: value.as_object(), + nested: None, + raw: value, + } + } + + fn raw(&self) -> &'a Value { + self.raw + } + + fn value(&self, key: &str) -> Option<&Value> { + if let Some(m) = self.flat { + return m.get(key); + } + self.nested.as_ref().and_then(|m| m.get(key)) + } + + fn str(&self, key: &str) -> Option<&str> { + self.value(key).and_then(Value::as_str) + } + + fn array(&self, key: &str) -> Option<&Vec> { + self.value(key).and_then(Value::as_array) + } + + fn u64(&self, key: &str) -> Option { + let v = self.value(key)?; + if let Some(n) = v.as_u64() { + return Some(n); + } + // OTLP intValue is often a stringified integer. + v.as_str().and_then(|s| s.trim().parse::().ok()) + } + + fn u32(&self, key: &str) -> Option { + self.u64(key).and_then(|n| u32::try_from(n).ok()) + } + + fn operation_name(&self) -> Option { + self.str("gen_ai.operation.name") + .or_else(|| self.str("operation.name")) + .or_else(|| self.str("name")) + .map(str::to_string) + } + + fn timestamp(&self) -> Timestamp { + // Prefer record-level time fields (which may live outside `attributes`). 
+ util::ts_from( + self.raw, + &[ + "time", + "timestamp", + "timeUnixNano", + "observedTimeUnixNano", + "ts", + ], + ) + } + + fn usage(&self) -> Option { + let input = self.u64("gen_ai.usage.input_tokens"); + let output = self.u64("gen_ai.usage.output_tokens"); + if input.is_none() && output.is_none() { + return None; + } + Some(Usage { + input_tokens: input, + output_tokens: output, + cache_read_tokens: self.u64("gen_ai.usage.cache_read_tokens"), + cache_creation_tokens: self.u64("gen_ai.usage.cache_creation_tokens"), + }) + } + + /// The success flag of a tool span, if any result/error attribute is present. + /// `None` means "no result observed on this span". + fn tool_ok(&self) -> Option { + if let Some(status) = self + .str("gen_ai.tool.result.status") + .or_else(|| self.str("otel.status_code")) + .or_else(|| self.str("status")) + { + let s = status.trim().to_ascii_lowercase(); + return Some(!matches!(s.as_str(), "error" | "failed" | "failure" | "ko")); + } + if self.value("error.type").is_some() || self.value("exception.type").is_some() { + return Some(false); + } + if self.value("gen_ai.tool.result").is_some() { + return Some(true); + } + None + } + + /// A normalized diff from a file-edit span, if it carries a `file.path`. + fn file_diff(&self) -> Option { + let path = self + .str("file.path") + .or_else(|| self.str("code.filepath"))?; + let added = self + .u32("model_added_lines") + .or_else(|| self.u32("code.added_lines")) + .unwrap_or(0); + let removed = self + .u32("model_removed_lines") + .or_else(|| self.u32("code.removed_lines")) + .unwrap_or(0); + Some(Diff { + path: PathBuf::from(path), + old: None, + new: None, + unified: None, + added_lines: added, + removed_lines: removed, + }) + } +} + +/// Collapse an OTLP `value` object (`stringValue` / `intValue` / `boolValue` / +/// `doubleValue` / `arrayValue` / `kvlistValue`) to a plain JSON value. 
+fn otlp_value(v: &Value) -> Value { + if let Some(s) = v.get("stringValue") { + return s.clone(); + } + if let Some(i) = v.get("intValue") { + return i.clone(); + } + if let Some(b) = v.get("boolValue") { + return b.clone(); + } + if let Some(d) = v.get("doubleValue") { + return d.clone(); + } + if let Some(arr) = v.get("arrayValue").and_then(|a| a.get("values")) { + if let Some(items) = arr.as_array() { + return Value::Array(items.iter().map(otlp_value).collect()); + } + } + if let Some(kv) = v.get("kvlistValue").and_then(|k| k.get("values")) { + if let Some(items) = kv.as_array() { + let mut map = serde_json::Map::new(); + for item in items { + if let Some(key) = item.get("key").and_then(Value::as_str) { + if let Some(val) = item.get("value").map(otlp_value) { + map.insert(key.to_string(), val); + } + } + } + return Value::Object(map); + } + } + // Already a plain scalar/array, or an unrecognized shape: pass through. + v.clone() +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("otel.jsonl", 0, 1)) + } + + fn parse_all(lines: &[&str]) -> (Vec, ParseCtx) { + let adapter = OtelAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for l in lines { + let evs = adapter.parse(&raw(l), &mut ctx).expect("never errors"); + out.extend(evs); + } + (out, ctx) + } + + fn tags(evs: &[CaptureEvent]) -> Vec<&'static str> { + evs.iter().map(|e| e.kind.tag()).collect() + } + + const SESSION_START: &str = r#"{"time":"2026-06-22T10:00:00Z","gen_ai.operation.name":"session.start","gen_ai.conversation.id":"sess-1","gen_ai.request.model":"claude-opus-4-8","cwd":"/home/dev/svc","repo_root":"/home/dev/svc","git.commit":"abc1234","git.branch":"main"}"#; + const USER_PROMPT: &str = r#"{"time":"2026-06-22T10:00:05Z","gen_ai.operation.name":"chat","gen_ai.conversation.id":"sess-1","gen_ai.cli.user_prompt":"Let's use Postgres instead of MySQL."}"#; + const 
FILE_EDIT: &str = r#"{"time":"2026-06-22T10:00:15Z","gen_ai.operation.name":"file_operation","gen_ai.conversation.id":"sess-1","file.path":"db/config.rs","code.added_lines":12,"code.removed_lines":4}"#; + + #[test] + fn normalized_sequence_for_a_small_session() { + let assistant = r#"{"gen_ai.operation.name":"chat","gen_ai.conversation.id":"sess-1","gen_ai.request.model":"claude-opus-4-8","gen_ai.usage.input_tokens":42,"gen_ai.usage.output_tokens":8,"gen_ai.output.messages":[{"role":"assistant","content":"Switching to Postgres."}]}"#; + let session_end = r#"{"gen_ai.operation.name":"session.end","gen_ai.conversation.id":"sess-1","reason":"done"}"#; + let (evs, ctx) = parse_all(&[ + SESSION_START, + USER_PROMPT, + assistant, + FILE_EDIT, + session_end, + ]); + assert_eq!( + tags(&evs), + vec![ + "session_start", + "user_turn", + "assistant_turn", + "file_edit", + "session_end" + ] + ); + // Session id is learned from gen_ai.conversation.id and stamped. + assert_eq!(ctx.session_id.as_deref(), Some("sess-1")); + assert!(evs.iter().all(|e| e.session_id == "sess-1")); + // Project binding came from the session-start record. + let proj = ctx.project.expect("project set at session start"); + assert_eq!(proj.cwd, PathBuf::from("/home/dev/svc")); + assert_eq!(proj.git.as_ref().map(|g| g.sha.as_str()), Some("abc1234")); + } + + #[test] + fn decision_then_edit_produces_user_turn_then_file_edit() { + let (evs, _) = parse_all(&[USER_PROMPT, FILE_EDIT]); + assert_eq!(tags(&evs), vec!["user_turn", "file_edit"]); + match &evs[0].kind { + EventKind::UserTurn { text, .. 
} => { + assert!(text.contains("Postgres")); + } + other => panic!("expected user_turn, got {other:?}"), + } + match &evs[1].kind { + EventKind::FileEdit { diff, call_id } => { + assert_eq!(diff.path, PathBuf::from("db/config.rs")); + assert_eq!(diff.added_lines, 12); + assert_eq!(diff.removed_lines, 4); + assert!(call_id.is_none()); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + #[test] + fn input_messages_become_user_turns() { + let line = r#"{"gen_ai.operation.name":"chat","gen_ai.conversation.id":"s","gen_ai.input.messages":[{"role":"user","content":"hello"},{"role":"system","content":"ignore"},{"role":"user","parts":[{"type":"text","text":"world"}]}]}"#; + let (evs, _) = parse_all(&[line]); + // Only the two user-role messages map to UserTurns. + assert_eq!(tags(&evs), vec!["user_turn", "user_turn"]); + match &evs[1].kind { + EventKind::UserTurn { text, .. } => assert_eq!(text, "world"), + other => panic!("expected user_turn, got {other:?}"), + } + } + + #[test] + fn execute_tool_emits_call_result_and_edit() { + let line = r#"{"gen_ai.operation.name":"execute_tool","gen_ai.conversation.id":"s","gen_ai.tool.name":"edit_file","gen_ai.tool.call.id":"c1","file.path":"a.rs","model_added_lines":3,"model_removed_lines":1,"gen_ai.tool.result":"ok"}"#; + let (evs, _) = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["tool_call", "tool_result", "file_edit"]); + match &evs[1].kind { + EventKind::ToolResult { ok, call_id, .. } => { + assert!(*ok); + assert_eq!(call_id, "c1"); + } + other => panic!("expected tool_result, got {other:?}"), + } + } + + #[test] + fn failed_tool_result_is_marked_not_ok() { + // tool_failure scenario: the ToolResult must be ok=false so downstream + // produces no spurious Episode. 
+ let line = r#"{"gen_ai.operation.name":"execute_tool","gen_ai.conversation.id":"s","gen_ai.tool.name":"edit_file","gen_ai.tool.call.id":"cf","file.path":"a.rs","model_added_lines":3,"model_removed_lines":1,"gen_ai.tool.result.status":"error","error.type":"PatchConflict","gen_ai.tool.result":"hunk failed"}"#; + let (evs, _) = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["tool_call", "tool_result", "file_edit"]); + match &evs[1].kind { + EventKind::ToolResult { ok, .. } => assert!(!*ok, "failed tool must be ok=false"), + other => panic!("expected tool_result, got {other:?}"), + } + } + + #[test] + fn otlp_nested_shape_is_supported() { + // The same edit, expressed in the OTLP attributes-array shape. + let line = r#"{"timeUnixNano":"1750586400000000000","attributes":[{"key":"gen_ai.operation.name","value":{"stringValue":"file_operation"}},{"key":"gen_ai.conversation.id","value":{"stringValue":"nested-1"}},{"key":"file.path","value":{"stringValue":"src/main.rs"}},{"key":"code.added_lines","value":{"intValue":"7"}},{"key":"code.removed_lines","value":{"intValue":"2"}}]}"#; + let (evs, ctx) = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["file_edit"]); + assert_eq!(ctx.session_id.as_deref(), Some("nested-1")); + match &evs[0].kind { + EventKind::FileEdit { diff, .. } => { + assert_eq!(diff.path, PathBuf::from("src/main.rs")); + assert_eq!(diff.added_lines, 7); + assert_eq!(diff.removed_lines, 2); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + #[test] + fn unrecognized_record_routes_to_unknown() { + let line = r#"{"gen_ai.operation.name":"telemetry.heartbeat","gen_ai.conversation.id":"s","foo":"bar"}"#; + let (evs, _) = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["unknown"]); + } + + #[test] + fn garbage_input_never_panics_and_stays_lossless() { + let adapter = OtelAdapter; + let mut ctx = ParseCtx::new(); + // Non-JSON garbage → one lossless Unknown. 
+ let evs = adapter + .parse(&raw("}{ not json at all <<<"), &mut ctx) + .expect("never errors"); + assert_eq!(tags(&evs), vec!["unknown"]); + // Blank line → nothing. + let evs = adapter.parse(&raw(" "), &mut ctx).expect("never errors"); + assert!(evs.is_empty()); + // Truncated / weird JSON values must not panic. + for g in [ + "null", + "[]", + "123", + "\"a string\"", + r#"{"attributes":"not-an-array"}"#, + r#"{"gen_ai.input.messages":42}"#, + r#"{"gen_ai.output.messages":[{}]}"#, + ] { + let _ = adapter.parse(&raw(g), &mut ctx).expect("never errors"); + } + } + + #[test] + fn repeated_record_dedups_to_empty() { + let adapter = OtelAdapter; + let mut ctx = ParseCtx::new(); + let first = adapter.parse(&raw(FILE_EDIT), &mut ctx).expect("ok"); + assert_eq!(tags(&first), vec!["file_edit"]); + // The very same record again → idempotent, emits nothing. + let second = adapter.parse(&raw(FILE_EDIT), &mut ctx).expect("ok"); + assert!(second.is_empty(), "repeat must dedup to empty"); + } + + #[test] + fn ban_prompt_is_carried_as_user_turn() { + let line = r#"{"gen_ai.operation.name":"chat","gen_ai.conversation.id":"s","gen_ai.cli.user_prompt":"We will never add a dependency on left-pad."}"#; + let (evs, _) = parse_all(&[line]); + assert_eq!(tags(&evs), vec!["user_turn"]); + match &evs[0].kind { + EventKind::UserTurn { text, .. 
} => assert!(text.contains("never add a dependency")), + other => panic!("expected user_turn, got {other:?}"), + } + } + + #[test] + fn fingerprint_distinguishes_flat_and_otlp() { + let adapter = OtelAdapter; + let flat = adapter.schema_fingerprint(&raw(USER_PROMPT)); + assert_eq!(flat.variant, "otel/genai-flat"); + assert_eq!(flat.confidence, 100); + let nested = raw(r#"{"attributes":[{"key":"gen_ai.system","value":{"stringValue":"x"}}]}"#); + let otlp = adapter.schema_fingerprint(&nested); + assert_eq!(otlp.variant, "otel/genai-otlp"); + } +} diff --git a/crates/memscribe-adapters/src/registry.rs b/crates/memscribe-adapters/src/registry.rs new file mode 100644 index 0000000..d6fb08b --- /dev/null +++ b/crates/memscribe-adapters/src/registry.rs @@ -0,0 +1,56 @@ +//! The adapter registry: assemble the set of enabled adapters and resolve one +//! by [`SourceKind`]. + +use memscribe_core::{SourceKind, TranscriptAdapter}; + +/// Every enabled adapter, in a stable order. +#[must_use] +#[allow(clippy::vec_init_then_push)] // pushes are cfg-gated; a vec! literal won't work +pub fn all_adapters() -> Vec> { + let mut v: Vec> = Vec::new(); + #[cfg(feature = "claude_code")] + v.push(Box::new(crate::claude_code::ClaudeCodeAdapter)); + #[cfg(feature = "codex")] + v.push(Box::new(crate::codex::CodexAdapter)); + #[cfg(feature = "gemini")] + v.push(Box::new(crate::gemini::GeminiAdapter)); + #[cfg(feature = "otel")] + v.push(Box::new(crate::otel::OtelAdapter)); + #[cfg(feature = "cursor")] + v.push(Box::new(crate::cursor::CursorAdapter)); + #[cfg(feature = "windsurf")] + v.push(Box::new(crate::windsurf::WindsurfAdapter)); + #[cfg(feature = "zed")] + v.push(Box::new(crate::zed::ZedAdapter)); + #[cfg(feature = "vscode")] + v.push(Box::new(crate::vscode::VsCodeAdapter)); + #[cfg(feature = "copilot")] + v.push(Box::new(crate::copilot::CopilotAdapter)); + v +} + +/// Resolve the adapter for a given source, if its feature is enabled. 
+#[must_use] +pub fn adapter_for(kind: SourceKind) -> Option> { + match kind { + #[cfg(feature = "claude_code")] + SourceKind::ClaudeCode => Some(Box::new(crate::claude_code::ClaudeCodeAdapter)), + #[cfg(feature = "codex")] + SourceKind::Codex => Some(Box::new(crate::codex::CodexAdapter)), + #[cfg(feature = "gemini")] + SourceKind::Gemini => Some(Box::new(crate::gemini::GeminiAdapter)), + #[cfg(feature = "otel")] + SourceKind::Otel => Some(Box::new(crate::otel::OtelAdapter)), + #[cfg(feature = "cursor")] + SourceKind::Cursor => Some(Box::new(crate::cursor::CursorAdapter)), + #[cfg(feature = "windsurf")] + SourceKind::Windsurf => Some(Box::new(crate::windsurf::WindsurfAdapter)), + #[cfg(feature = "zed")] + SourceKind::Zed => Some(Box::new(crate::zed::ZedAdapter)), + #[cfg(feature = "vscode")] + SourceKind::VsCode => Some(Box::new(crate::vscode::VsCodeAdapter)), + #[cfg(feature = "copilot")] + SourceKind::Copilot => Some(Box::new(crate::copilot::CopilotAdapter)), + _ => None, + } +} diff --git a/crates/memscribe-adapters/src/util.rs b/crates/memscribe-adapters/src/util.rs new file mode 100644 index 0000000..322fd3c --- /dev/null +++ b/crates/memscribe-adapters/src/util.rs @@ -0,0 +1,186 @@ +//! Shared, deterministic helpers for adapters. **Do not add tool-specific logic +//! here** — keep that in each tool's module so adapters can be maintained +//! independently. These helpers build normalized events with correct +//! provenance, sequencing, and ids. + +use memscribe_core::{ + content_id, CaptureEvent, EventKind, ParseCtx, ParseError, RawRecord, SourceKind, Timestamp, + SCHEMA_VERSION, +}; +use time::format_description::well_known::Rfc3339; +use time::OffsetDateTime; + +/// Parse a JSONL line into a JSON value. Returns `None` for blank lines or +/// invalid JSON (the caller decides whether that is an `Unknown` or a skip). 
+#[must_use] +pub fn parse_json_line(raw: &RawRecord) -> Option { + let s = raw.as_str()?.trim(); + if s.is_empty() { + return None; + } + serde_json::from_str(s).ok() +} + +/// Parse a timestamp from RFC3339, or from epoch seconds/milliseconds. Returns +/// `None` if neither parses. +#[must_use] +pub fn parse_ts(s: &str) -> Option { + if let Ok(t) = OffsetDateTime::parse(s.trim(), &Rfc3339) { + return Some(t); + } + let n: i64 = s.trim().parse().ok()?; + // Heuristic: values above ~year 2286 in seconds are really milliseconds. + let (secs, nanos) = if n.abs() > 10_000_000_000 { + (n / 1000, (n % 1000) * 1_000_000) + } else { + (n, 0) + }; + OffsetDateTime::from_unix_timestamp(secs) + .ok() + .map(|t| t + time::Duration::nanoseconds(nanos)) +} + +/// Extract an RFC3339/epoch timestamp from a JSON object under any of the given +/// keys, falling back to the Unix epoch (so output stays deterministic even when +/// a record carries no timestamp). +#[must_use] +pub fn ts_from(value: &serde_json::Value, keys: &[&str]) -> Timestamp { + for k in keys { + if let Some(v) = value.get(*k) { + if let Some(s) = v.as_str() { + if let Some(t) = parse_ts(s) { + return t; + } + } else if let Some(n) = v.as_i64() { + if let Some(t) = parse_ts(&n.to_string()) { + return t; + } + } + } + } + OffsetDateTime::UNIX_EPOCH +} + +/// The Unix epoch — a stable default timestamp. +#[must_use] +pub fn epoch() -> Timestamp { + OffsetDateTime::UNIX_EPOCH +} + +/// Build a normalized [`CaptureEvent`], allocating the monotonic `seq` from the +/// context and stamping the session/project binding. 
+#[allow(clippy::too_many_arguments)] +#[must_use] +pub fn mk_event( + source: SourceKind, + ctx: &mut ParseCtx, + raw: &RawRecord, + event_id: String, + parent_id: Option, + timestamp: Timestamp, + kind: EventKind, +) -> CaptureEvent { + let seq = ctx.alloc_seq(); + let session_id = ctx + .session_id + .clone() + .unwrap_or_else(|| "unknown".to_string()); + CaptureEvent { + schema_version: SCHEMA_VERSION, + source, + session_id, + seq, + event_id, + parent_id, + timestamp, + project: ctx.project_or_default(), + kind, + provenance: raw.location.clone(), + } +} + +/// Build an [`EventKind::Unknown`] event from a raw JSON value — the lossless, +/// version-tolerant fallback every adapter uses for records it does not yet +/// understand. +#[must_use] +pub fn unknown_event( + source: SourceKind, + ctx: &mut ParseCtx, + raw: &RawRecord, + value: serde_json::Value, +) -> CaptureEvent { + let raw_type = value + .get("type") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(|| "unknown".to_string()); + let timestamp = ts_from(&value, &["timestamp", "time", "ts", "created_at"]); + let id = content_id(&raw.bytes); + mk_event( + source, + ctx, + raw, + id, + None, + timestamp, + EventKind::Unknown { + raw_type, + raw: value, + }, + ) +} + +/// The default skeleton parse: emit exactly one `Unknown` event per non-blank +/// record (so the stream is lossless even before a real parser exists). Tool +/// modules replace this with real parsing but should preserve the losslessness +/// guarantee for records they do not recognize. +/// +/// # Errors +/// Never returns an error — present for signature parity with `parse`. 
+pub fn stub_parse( + source: SourceKind, + raw: &RawRecord, + ctx: &mut ParseCtx, +) -> Result, ParseError> { + let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + let value = + serde_json::from_str(s).unwrap_or_else(|_| serde_json::Value::String(s.to_string())); + Ok(vec![unknown_event(source, ctx, raw, value)]) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("t.jsonl", 0, 1)) + } + + #[test] + fn parse_ts_rfc3339_and_epoch() { + assert!(parse_ts("2026-06-22T10:00:00Z").is_some()); + assert!(parse_ts("1750000000").is_some()); + assert!(parse_ts("1750000000000").is_some()); + assert!(parse_ts("not a time").is_none()); + } + + #[test] + fn stub_parse_is_lossless_for_nonblank() { + let mut ctx = ParseCtx::new(); + let evs = stub_parse(SourceKind::Unknown, &raw("{\"type\":\"x\"}"), &mut ctx).unwrap(); + assert_eq!(evs.len(), 1); + assert_eq!(evs[0].kind.tag(), "unknown"); + } + + #[test] + fn stub_parse_skips_blank() { + let mut ctx = ParseCtx::new(); + assert!(stub_parse(SourceKind::Unknown, &raw(" "), &mut ctx) + .unwrap() + .is_empty()); + } +} diff --git a/crates/memscribe-adapters/src/vscode.rs b/crates/memscribe-adapters/src/vscode.rs new file mode 100644 index 0000000..e200612 --- /dev/null +++ b/crates/memscribe-adapters/src/vscode.rs @@ -0,0 +1,796 @@ +//! VS Code adapter (Copilot Chat / chat sessions). +//! +//! VS Code stores chat sessions under +//! `/workspaceStorage//chatSessions/*.json` (and +//! `chatEditingSessions` for edits). That on-disk store is an undocumented, +//! version-churning JSON blob, so this adapter parses two shapes: +//! +//! 1. A stable, **exported** chat JSON-lines shape (one record per line) that a +//! companion exporter writes — a leading `{kind:session_start, cwd, git, +//! toolVersion}` followed by message records `{id, parentId, role, ts, +//! 
sessionId, text, model, usage, toolCalls, toolResults, edits}`. +//! 2. The **native** `chatSessions` JSON shape, where a single object carries +//! `{version, requesterUsername, responderUsername, requests:[{message, +//! response}]}`; each request maps to a `UserTurn` and its response to an +//! `AssistantTurn`. +//! +//! Anything unrecognized-but-valid routes to [`memscribe_core::EventKind::Unknown`] +//! via [`util::unknown_event`], so the stream stays lossless across VS Code +//! version churn. The parser is fully deterministic and never panics. + +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, + ProjectRef, RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, Usage, +}; +use serde_json::Value; +use std::path::PathBuf; + +const SOURCE: SourceKind = SourceKind::VsCode; + +/// Adapter for VS Code chat-session transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct VsCodeAdapter; + +impl TranscriptAdapter for VsCodeAdapter { + fn source_kind(&self) -> SourceKind { + SOURCE + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + // Point at the real product path; we don't parse the binary store here. + // `Application Support/Code/User/workspaceStorage//chatSessions/*.json` + let home = cfg.home_dir(); + let base = home + .join("Library") + .join("Application Support") + .join("Code") + .join("User") + .join("workspaceStorage"); + let mut handles = Vec::new(); + // Walk workspaceStorage//chatSessions/*.json deterministically. 
+ let mut hashes: Vec = Vec::new(); + if let Ok(entries) = std::fs::read_dir(&base) { + for entry in entries.flatten() { + let p = entry.path(); + if p.is_dir() { + hashes.push(p); + } + } + } + hashes.sort(); + for ws in hashes { + let sessions_dir = ws.join("chatSessions"); + let mut files: Vec = Vec::new(); + if let Ok(entries) = std::fs::read_dir(&sessions_dir) { + for entry in entries.flatten() { + let p = entry.path(); + if p.extension().and_then(|e| e.to_str()) == Some("json") { + files.push(p); + } + } + } + files.sort(); + for f in files { + let session_hint = f.file_stem().and_then(|s| s.to_str()).map(str::to_string); + handles.push(TranscriptHandle { + path: f, + source: SOURCE, + session_hint, + compressed: false, + }); + } + } + handles + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + let Some(value) = util::parse_json_line(raw) else { + // Blank line → nothing; non-JSON but non-blank → lossless Unknown. + let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + return Ok(vec![util::unknown_event( + SOURCE, + ctx, + raw, + Value::String(s.to_string()), + )]); + }; + + // The native chatSessions shape: a single object with a `requests` array. + if value.get("requests").and_then(Value::as_array).is_some() { + return Ok(parse_native_session(raw, ctx, &value)); + } + + // Otherwise treat it as one exported JSON-lines record. 
+ Ok(parse_exported_record(raw, ctx, value)) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + let Some(value) = util::parse_json_line(sample) else { + return SchemaVariant::unknown(SOURCE); + }; + if value.get("requests").and_then(Value::as_array).is_some() { + return SchemaVariant::certain(SOURCE, "vscode/chat-sessions-native"); + } + if value.get("kind").and_then(Value::as_str) == Some("session_start") + || value.get("role").and_then(Value::as_str).is_some() + { + return SchemaVariant::certain(SOURCE, "vscode/exported-jsonl-v1"); + } + SchemaVariant::unknown(SOURCE) + } +} + +/// Parse one record of the exported JSON-lines shape into zero or more events. +fn parse_exported_record(raw: &RawRecord, ctx: &mut ParseCtx, value: Value) -> Vec { + let kind = value.get("kind").and_then(Value::as_str); + + // Record-level dedup / idempotency: a repeated record yields nothing. + let record_id = record_event_id(raw, &value); + if !ctx.first_seen(&record_id) { + return Vec::new(); + } + + match kind { + Some("session_start") => { + apply_session_start(ctx, &value); + let ts = util::ts_from(&value, &["ts", "timestamp", "time"]); + let git = parse_git(value.get("git")); + let cwd = string_field(&value, "cwd").unwrap_or_else(|| ".".to_string()); + let model = string_field(&value, "model"); + let tool_version = string_field(&value, "toolVersion"); + vec![util::mk_event( + SOURCE, + ctx, + raw, + record_id, + None, + ts, + EventKind::SessionStart { + cwd: PathBuf::from(cwd), + git, + model, + tool_version, + }, + )] + } + Some("session_end") => { + adopt_session(ctx, &value); + let ts = util::ts_from(&value, &["ts", "timestamp", "time"]); + let reason = string_field(&value, "reason"); + vec![util::mk_event( + SOURCE, + ctx, + raw, + record_id, + None, + ts, + EventKind::SessionEnd { reason }, + )] + } + _ => parse_message_record(raw, ctx, &value, record_id), + } +} + +/// Parse one message record (`role: user|assistant`, with optional toolCalls, +/// 
toolResults, and edits) into an ordered list of events. +fn parse_message_record( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &Value, + record_id: String, +) -> Vec { + adopt_session(ctx, value); + let ts = util::ts_from(value, &["ts", "timestamp", "time"]); + let parent_id = string_field(value, "parentId"); + let role = value.get("role").and_then(Value::as_str); + let text = string_field(value, "text").unwrap_or_default(); + + let mut out = Vec::new(); + + match role { + Some("user") => out.push(util::mk_event( + SOURCE, + ctx, + raw, + record_id.clone(), + parent_id.clone(), + ts, + EventKind::UserTurn { + text, + parts: Vec::new(), + }, + )), + Some("assistant") => out.push(util::mk_event( + SOURCE, + ctx, + raw, + record_id.clone(), + parent_id.clone(), + ts, + EventKind::AssistantTurn { + text, + thinking: None, + model: string_field(value, "model"), + usage: parse_usage(value.get("usage")), + parts: Vec::new(), + }, + )), + // A record with edits/tool data but no recognized role is still + // valuable; if it carries no actionable role and nothing else, fall + // through to the sub-records below and, if none, emit Unknown. + _ => { + let has_children = value + .get("toolCalls") + .and_then(Value::as_array) + .is_some_and(|a| !a.is_empty()) + || value + .get("toolResults") + .and_then(Value::as_array) + .is_some_and(|a| !a.is_empty()) + || value + .get("edits") + .and_then(Value::as_array) + .is_some_and(|a| !a.is_empty()); + if !has_children { + return vec![util::unknown_event(SOURCE, ctx, raw, value.clone())]; + } + } + } + + // Tool calls. 
+ if let Some(calls) = value.get("toolCalls").and_then(Value::as_array) { + for (i, call) in calls.iter().enumerate() { + let call_id = + string_field(call, "id").unwrap_or_else(|| format!("{record_id}:call:{i}")); + let name = string_field(call, "name").unwrap_or_default(); + let args = call.get("args").cloned().unwrap_or(Value::Null); + ctx.call_names.insert(call_id.clone(), name.clone()); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + derive_id(&record_id, "toolcall", i, &call_id), + Some(record_id.clone()), + ts, + EventKind::ToolCall { + call_id, + name, + args, + }, + )); + } + } + + // Tool results. + if let Some(results) = value.get("toolResults").and_then(Value::as_array) { + for (i, res) in results.iter().enumerate() { + let call_id = + string_field(res, "id").unwrap_or_else(|| format!("{record_id}:result:{i}")); + let ok = res.get("ok").and_then(Value::as_bool).unwrap_or(true); + let output = res.get("output").cloned().unwrap_or(Value::Null); + ctx.call_ok.insert(call_id.clone(), ok); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + derive_id(&record_id, "toolresult", i, &call_id), + Some(record_id.clone()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )); + } + } + + // File edits. + if let Some(edits) = value.get("edits").and_then(Value::as_array) { + for (i, edit) in edits.iter().enumerate() { + let diff = parse_edit(edit); + let edit_id = string_field(edit, "id") + .or_else(|| diff.path.to_str().map(str::to_string)) + .unwrap_or_default(); + // Link the edit to its originating tool call when the export carries + // one (`callId`/`call_id`). This is what lets the segmenter drop an + // edit whose paired ToolResult failed (ok=false) — "a tool failure → + // no spurious episode" (§8.2). Absent the field, the edit stands on + // its own (the happy path), matching the prior behavior. 
+ let call_id = string_field(edit, "callId").or_else(|| string_field(edit, "call_id")); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + derive_id(&record_id, "edit", i, &edit_id), + Some(record_id.clone()), + ts, + EventKind::FileEdit { call_id, diff }, + )); + } + } + + out +} + +/// Parse the native `chatSessions` shape: `requests[].message` → `UserTurn`, +/// `requests[].response[]` → `AssistantTurn`. +fn parse_native_session(raw: &RawRecord, ctx: &mut ParseCtx, value: &Value) -> Vec { + // Record-level idempotency on the whole session object. + let record_id = record_event_id(raw, value); + if !ctx.first_seen(&record_id) { + return Vec::new(); + } + adopt_session(ctx, value); + if ctx.session_id.is_none() { + // Native files have no `sessionId`; derive a stable one from content. + ctx.session_id = Some(format!("vscode-{}", &record_id[..record_id.len().min(16)])); + } + + let ts = util::ts_from(value, &["ts", "timestamp", "time"]); + let responder = string_field(value, "responderUsername"); + let mut out = Vec::new(); + + let Some(requests) = value.get("requests").and_then(Value::as_array) else { + return vec![util::unknown_event(SOURCE, ctx, raw, value.clone())]; + }; + + for (i, req) in requests.iter().enumerate() { + // User turn from `message`. + let user_text = req + .get("message") + .map(flatten_native_text) + .unwrap_or_default(); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{record_id}:req:{i}:user"), + None, + ts, + EventKind::UserTurn { + text: user_text, + parts: Vec::new(), + }, + )); + + // Assistant turn from `response` (an array of parts). 
+ let resp_text = flatten_native_response(req.get("response")); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{record_id}:req:{i}:asst"), + Some(format!("{record_id}:req:{i}:user")), + ts, + EventKind::AssistantTurn { + text: resp_text, + thinking: None, + model: responder.clone(), + usage: None, + parts: Vec::new(), + }, + )); + } + + out +} + +// --------------------------------------------------------------------------- +// helpers +// --------------------------------------------------------------------------- + +/// The deterministic record-level event id: the native `id` field, else a +/// `blake3` of the raw bytes (per the format spec: `event_id = id else content_id`). +fn record_event_id(raw: &RawRecord, value: &Value) -> String { + string_field(value, "id").unwrap_or_else(|| content_id(&raw.bytes)) +} + +/// Derive a stable, collision-free child event id under a parent record id. +fn derive_id(record_id: &str, kind: &str, index: usize, native: &str) -> String { + format!("{record_id}:{kind}:{index}:{native}") +} + +/// Read a string field, ignoring empty/non-string values. +fn string_field(value: &Value, key: &str) -> Option { + value + .get(key) + .and_then(Value::as_str) + .map(str::to_string) + .filter(|s| !s.is_empty()) +} + +/// Set `ctx.session_id` from `sessionId` if not already set. +fn adopt_session(ctx: &mut ParseCtx, value: &Value) { + if ctx.session_id.is_none() { + if let Some(sid) = string_field(value, "sessionId") { + ctx.session_id = Some(sid); + } + } +} + +/// Apply a session-start record to the context: session id + project binding. +fn apply_session_start(ctx: &mut ParseCtx, value: &Value) { + adopt_session(ctx, value); + let cwd = string_field(value, "cwd").unwrap_or_else(|| ".".to_string()); + let git = parse_git(value.get("git")); + ctx.project = Some(ProjectRef { + cwd: PathBuf::from(cwd), + repo_root: None, + git, + }); +} + +/// Parse a `{sha, branch}` git object. 
+fn parse_git(value: Option<&Value>) -> Option { + let obj = value?; + let sha = string_field(obj, "sha")?; + let branch = string_field(obj, "branch"); + Some(GitRef { sha, branch }) +} + +/// Parse a `{input, output}` usage object into [`Usage`]. +fn parse_usage(value: Option<&Value>) -> Option { + let obj = value?; + let input_tokens = obj.get("input").and_then(Value::as_u64); + let output_tokens = obj.get("output").and_then(Value::as_u64); + if input_tokens.is_none() && output_tokens.is_none() { + return None; + } + Some(Usage { + input_tokens, + output_tokens, + cache_read_tokens: None, + cache_creation_tokens: None, + }) +} + +/// Parse an `{path, oldText, newText, diff, added, removed}` edit into a [`Diff`]. +fn parse_edit(edit: &Value) -> Diff { + let path = string_field(edit, "path").unwrap_or_default(); + Diff { + path: PathBuf::from(path), + old: string_field(edit, "oldText"), + new: string_field(edit, "newText"), + unified: string_field(edit, "diff"), + added_lines: edit + .get("added") + .and_then(Value::as_u64) + .unwrap_or(0) + .min(u64::from(u32::MAX)) as u32, + removed_lines: edit + .get("removed") + .and_then(Value::as_u64) + .unwrap_or(0) + .min(u64::from(u32::MAX)) as u32, + } +} + +/// Flatten a native `message` object (`{text, parts:[{kind:text,text}]}`). +fn flatten_native_text(message: &Value) -> String { + if let Some(t) = string_field(message, "text") { + return t; + } + flatten_text_parts(message.get("parts")) +} + +/// Flatten a native `response` (an array of `{kind:text,text}` parts). +fn flatten_native_response(response: Option<&Value>) -> String { + match response { + Some(Value::Array(_)) => flatten_text_parts(response), + Some(Value::String(s)) => s.clone(), + Some(obj @ Value::Object(_)) => flatten_native_text(obj), + _ => String::new(), + } +} + +/// Concatenate the `text` of every `{kind:"text", text}` part in an array. 
+fn flatten_text_parts(parts: Option<&Value>) -> String { + let Some(arr) = parts.and_then(Value::as_array) else { + return String::new(); + }; + let mut chunks: Vec = Vec::new(); + for part in arr { + if part.get("kind").and_then(Value::as_str) == Some("text") { + if let Some(t) = string_field(part, "text") { + chunks.push(t); + } + } + } + chunks.join("") +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(line: &str) -> RawRecord { + RawRecord::from_line(line, SourceLocation::new("vscode.jsonl", 0, 1)) + } + + /// Run a slice of JSONL lines through the adapter, threading one context. + fn run(lines: &[&str]) -> Vec { + let adapter = VsCodeAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for line in lines { + let evs = adapter.parse(&raw(line), &mut ctx).expect("never errors"); + out.extend(evs); + } + out + } + + fn tags(events: &[CaptureEvent]) -> Vec<&'static str> { + events.iter().map(|e| e.kind.tag()).collect() + } + + const SESSION_START: &str = r#"{"kind":"session_start","sessionId":"s1","cwd":"/work","git":{"sha":"abc","branch":"main"},"toolVersion":"1.92.0","model":"gpt-4o"}"#; + + #[test] + fn session_start_sets_session_and_project() { + let evs = run(&[SESSION_START]); + assert_eq!(tags(&evs), vec!["session_start"]); + assert_eq!(evs[0].session_id, "s1"); + assert_eq!(evs[0].project.cwd, PathBuf::from("/work")); + let git = evs[0].project.git.as_ref().expect("git set from start"); + assert_eq!(git.sha, "abc"); + assert_eq!(git.branch.as_deref(), Some("main")); + match &evs[0].kind { + EventKind::SessionStart { + model, + tool_version, + .. 
+ } => { + assert_eq!(model.as_deref(), Some("gpt-4o")); + assert_eq!(tool_version.as_deref(), Some("1.92.0")); + } + other => panic!("expected session_start, got {other:?}"), + } + } + + #[test] + fn normalized_sequence_decision_then_edits() { + let user = r#"{"id":"m1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"s1","text":"Let's use Postgres instead of MySQL."}"#; + let asst = r#"{"id":"m2","parentId":"m1","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"s1","text":"Switching now.","model":"gpt-4o","usage":{"input":10,"output":3},"edits":[{"path":"src/db.ts","oldText":"mysql","newText":"postgres","added":1,"removed":1}]}"#; + let evs = run(&[SESSION_START, user, asst]); + // session_start, user_turn, assistant_turn, file_edit + assert_eq!( + tags(&evs), + vec!["session_start", "user_turn", "assistant_turn", "file_edit"] + ); + // The decision turn is a UserTurn carrying the decision text. + match &evs[1].kind { + EventKind::UserTurn { text, .. } => { + assert!(text.contains("Postgres")); + } + other => panic!("expected user_turn, got {other:?}"), + } + // The edit is a FileEdit with the diff fields mapped. + match &evs[3].kind { + EventKind::FileEdit { diff, .. } => { + assert_eq!(diff.path, PathBuf::from("src/db.ts")); + assert_eq!(diff.old.as_deref(), Some("mysql")); + assert_eq!(diff.new.as_deref(), Some("postgres")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + } + other => panic!("expected file_edit, got {other:?}"), + } + // The edit's parent links back to the assistant turn record id. 
+ assert_eq!(evs[3].parent_id.as_deref(), Some("m2")); + } + + #[test] + fn tool_call_then_result_failure() { + let asst = r#"{"id":"t2","role":"assistant","ts":"2026-06-22T13:00:07Z","sessionId":"s4","text":"applying","toolCalls":[{"id":"c1","name":"applyEdit","args":{"path":"x"}}],"edits":[{"path":"x","callId":"c1","oldText":"a","newText":"b","added":1,"removed":1}]}"#; + let res = r#"{"id":"t3","role":"assistant","ts":"2026-06-22T13:00:09Z","sessionId":"s4","text":"","toolResults":[{"id":"c1","ok":false,"output":"FAILED"}]}"#; + let evs = run(&[asst, res]); + // assistant_turn, tool_call, file_edit, assistant_turn, tool_result + assert_eq!( + tags(&evs), + vec![ + "assistant_turn", + "tool_call", + "file_edit", + "assistant_turn", + "tool_result" + ] + ); + // The edit is linked to the failing call by call_id, so the segmenter + // drops it (no spurious episode for a failed edit). + match &evs[2].kind { + EventKind::FileEdit { call_id, .. } => assert_eq!(call_id.as_deref(), Some("c1")), + other => panic!("expected file_edit, got {other:?}"), + } + // The failed result must carry ok=false (so no Episode is produced + // downstream for the failed edit). + match &evs[4].kind { + EventKind::ToolResult { ok, call_id, .. } => { + assert!(!ok); + assert_eq!(call_id, "c1"); + } + other => panic!("expected tool_result, got {other:?}"), + } + } + + /// On-disk fixture conformance: the `tool_failure` fixture must, end-to-end + /// through the segmenter, mint NO episode; the happy path must still mint two. 
+ fn vscode_fixture(name: &str) -> String { + let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../../fixtures/vscode/v1") + .join(name); + std::fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("read fixture {}: {e}", path.display())) + } + + fn run_file(jsonl: &str) -> Vec { + let adapter = VsCodeAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for line in jsonl.lines() { + let evs = adapter.parse(&raw(line), &mut ctx).expect("never errors"); + out.extend(evs); + } + out + } + + #[test] + fn fixture_tool_failure_yields_no_episode_via_segmenter() { + use memscribe_core::gate::CommitmentGate; + use memscribe_core::segmenter::{DefaultSegmenter, Segmenter}; + + let gate = CommitmentGate::default(); + let seg = DefaultSegmenter; + + let fail_events = run_file(&vscode_fixture("tool_failure.jsonl")); + // The failed edit is linked to a failing ToolResult by call_id. + assert!(fail_events.iter().any(|e| matches!( + &e.kind, + EventKind::FileEdit { call_id, .. } if call_id.as_deref() == Some("call-edit-1") + ))); + let fail_seg = seg.segment(&fail_events, &gate); + assert_eq!( + fail_seg.episodes.len(), + 0, + "a failed edit must produce no episode" + ); + + let ok_events = run_file(&vscode_fixture("happy_path_decision_then_edits.jsonl")); + let ok_seg = seg.segment(&ok_events, &gate); + assert_eq!( + ok_seg.episodes.len(), + 2, + "the happy path must still produce two episodes" + ); + } + + #[test] + fn never_panics_on_garbage() { + // Invalid JSON, empty, and structurally-weird-but-valid inputs. + let garbage = run(&[ + "not json at all {{{", + "", + " ", + "42", + "true", + r#"{"role":12345}"#, + r#"{"kind":"session_start"}"#, + r#"{"requests":"not-an-array"}"#, + r#"{"id":"x","edits":[{}]}"#, + ]); + // Nothing panicked; every non-blank record produced at least an event. + // Blank lines produce nothing, so the count is < the input count but > 0. 
+ assert!(!garbage.is_empty()); + // A non-JSON line is preserved as Unknown (lossless). + assert!(garbage.iter().any(|e| e.kind.tag() == "unknown")); + } + + #[test] + fn unrecognized_valid_record_routes_to_unknown() { + let evs = run(&[r#"{"id":"weird","kind":"telemetry","payload":{"a":1}}"#]); + assert_eq!(tags(&evs), vec!["unknown"]); + match &evs[0].kind { + EventKind::Unknown { raw_type, raw } => { + assert_eq!(raw_type, "unknown"); + assert!(raw.get("payload").is_some()); + } + other => panic!("expected unknown, got {other:?}"), + } + } + + #[test] + fn dedup_repeated_record_is_idempotent() { + let user = + r#"{"id":"m1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"s1","text":"hi"}"#; + // Same record twice → only one event. + let evs = run(&[SESSION_START, user, user]); + assert_eq!(tags(&evs), vec!["session_start", "user_turn"]); + // Sequence numbers are still monotonic and gap-free for what was kept. + assert_eq!(evs[0].seq, 0); + assert_eq!(evs[1].seq, 1); + } + + #[test] + fn idempotent_record_with_children_dedups_whole_record() { + let asst = r#"{"id":"m2","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"s1","text":"x","edits":[{"path":"a.ts","oldText":"1","newText":"2","added":1,"removed":0}]}"#; + let evs = run(&[asst, asst]); + // First time: assistant_turn + file_edit. Second time: nothing. + assert_eq!(tags(&evs), vec!["assistant_turn", "file_edit"]); + } + + #[test] + fn child_event_ids_do_not_collide_with_turn() { + let asst = r#"{"id":"m9","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"s1","text":"x","toolCalls":[{"id":"m9","name":"t","args":{}}]}"#; + // The tool-call's native id collides with the record id; derivation must + // keep the events distinct so both survive. 
+ let evs = run(&[asst]); + assert_eq!(tags(&evs), vec!["assistant_turn", "tool_call"]); + assert_ne!(evs[0].event_id, evs[1].event_id); + } + + #[test] + fn epoch_ms_timestamp_parses() { + let user = r#"{"id":"m1","role":"user","ts":1750000000000,"sessionId":"s1","text":"hi"}"#; + let evs = run(&[user]); + assert_eq!(tags(&evs), vec!["user_turn"]); + // 1_750_000_000_000 ms = 2025-06-15ish — well after the epoch. + assert!(evs[0].timestamp.unix_timestamp() > 1_700_000_000); + } + + #[test] + fn native_chatsession_shape_maps_requests() { + let native = r#"{"version":3,"requesterUsername":"dev","responderUsername":"Copilot","requests":[{"message":{"text":"Add a health check","parts":[{"kind":"text","text":"Add a health check"}]},"response":[{"kind":"text","text":"Adding GET /healthz."},{"kind":"text","text":" Done."}]}]}"#; + let evs = run(&[native]); + assert_eq!(tags(&evs), vec!["user_turn", "assistant_turn"]); + match &evs[0].kind { + EventKind::UserTurn { text, .. } => assert_eq!(text, "Add a health check"), + other => panic!("expected user_turn, got {other:?}"), + } + match &evs[1].kind { + EventKind::AssistantTurn { text, model, .. } => { + assert_eq!(text, "Adding GET /healthz. Done."); + assert_eq!(model.as_deref(), Some("Copilot")); + } + other => panic!("expected assistant_turn, got {other:?}"), + } + } + + #[test] + fn schema_fingerprint_distinguishes_shapes() { + let adapter = VsCodeAdapter; + let exported = adapter.schema_fingerprint(&raw(SESSION_START)); + assert_eq!(exported.variant, "vscode/exported-jsonl-v1"); + assert_eq!(exported.confidence, 100); + + let native = adapter.schema_fingerprint(&raw(r#"{"requests":[]}"#)); + assert_eq!(native.variant, "vscode/chat-sessions-native"); + + let unknown = adapter.schema_fingerprint(&raw("garbage")); + assert_eq!(unknown.confidence, 0); + } + + #[test] + fn session_id_falls_back_to_unknown_without_start() { + // A bare message with no sessionId and no prior session_start. 
+ let user = r#"{"id":"m1","role":"user","ts":"2026-06-22T10:00:00Z","text":"hi"}"#; + let evs = run(&[user]); + assert_eq!(evs[0].session_id, "unknown"); + } + + #[test] + fn determinism_same_input_same_output() { + let lines = [ + SESSION_START, + r#"{"id":"m1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"s1","text":"a"}"#, + r#"{"id":"m2","parentId":"m1","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"s1","text":"b","edits":[{"path":"p","oldText":"x","newText":"y","added":2,"removed":1}]}"#, + ]; + let a = run(&lines); + let b = run(&lines); + assert_eq!(a, b); + } +} diff --git a/crates/memscribe-adapters/src/windsurf.rs b/crates/memscribe-adapters/src/windsurf.rs new file mode 100644 index 0000000..fc641ad --- /dev/null +++ b/crates/memscribe-adapters/src/windsurf.rs @@ -0,0 +1,527 @@ +//! Windsurf (Codeium) adapter. +//! +//! Windsurf is a VS Code-based editor whose Cascade agent stores chat in an +//! undocumented binary/SQLite store under `~/.codeium/windsurf/` and +//! `~/Library/Application Support/Windsurf/User/`. We do not parse that binary +//! store in this model. Instead this adapter targets a deterministic **exported +//! Cascade chat JSON-Lines** shape (one JSON object per line) and routes any +//! unrecognized-but-valid record to [`memscribe_core::EventKind::Unknown`] so the +//! stream stays lossless. +//! +//! Record shape (see `fixtures/windsurf/v1/`): +//! - a leading session header: `{"kind":"session_start","cwd":..,"git":{"sha","branch"},"toolVersion":..,"sessionId":..,"model":..}` +//! - message records: `{"id","parentId","role":"user"|"assistant","ts","sessionId","text","model","usage":{"input","output"},"toolCalls":[{"id","name","args"}],"toolResults":[{"id","ok","output"}],"edits":[{"path","oldText","newText","diff","added","removed"}]}` +//! +//! Mapping: `session_start` → `SessionStart`; `role:user` → `UserTurn`; +//! `role:assistant` → `AssistantTurn`; each `toolCalls[]` → `ToolCall`; each +//! 
`toolResults[]` → `ToolResult{ok}`; each `edits[]` → `FileEdit`. +//! +//! Hard rules honored: never panics (no unwrap/expect/indexing on parsed input); +//! deterministic (no clock/random/global state); `ctx.session_id` is set from the +//! first record carrying it; `ctx.project` is populated from the session-start +//! record; repeated records dedup via `ctx.first_seen(event_id)`. + +use crate::util; +use memscribe_core::{ + content_id, CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, + ProjectRef, RawRecord, SchemaVariant, SourceKind, TranscriptHandle, Usage, +}; +use std::path::PathBuf; + +/// Adapter for Windsurf transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct WindsurfAdapter; + +const SOURCE: SourceKind = SourceKind::Windsurf; + +impl memscribe_core::TranscriptAdapter for WindsurfAdapter { + fn source_kind(&self) -> SourceKind { + SOURCE + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + // The real product stores chat in a binary/SQLite store; we do not parse + // it here, but discovery still points at the on-disk locations so a + // future exporter / probe has the canonical paths. Order is stable. + let home = cfg.home_dir(); + let mut out = Vec::new(); + let candidates = [ + home.join(".codeium").join("windsurf"), + home.join("Library") + .join("Application Support") + .join("Windsurf") + .join("User"), + ]; + for path in candidates { + out.push(TranscriptHandle { + path, + source: SOURCE, + session_hint: None, + compressed: false, + }); + } + out + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + // Blank lines / invalid JSON: skip (blank) or fall through to a string + // Unknown so nothing is lost. 
+ let Some(value) = util::parse_json_line(raw) else { + let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + let v = serde_json::Value::String(s.to_string()); + return Ok(vec![util::unknown_event(SOURCE, ctx, raw, v)]); + }; + + let kind = str_field(&value, "kind"); + let role = str_field(&value, "role"); + + if kind.as_deref() == Some("session_start") { + return Ok(parse_session_start(raw, ctx, value)); + } + match role.as_deref() { + Some("user") => Ok(parse_message(raw, ctx, value, false)), + Some("assistant") => Ok(parse_message(raw, ctx, value, true)), + // A valid JSON record we don't recognize: lossless Unknown. + _ => Ok(vec![util::unknown_event(SOURCE, ctx, raw, value)]), + } + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + match util::parse_json_line(sample) { + Some(v) + if str_field(&v, "kind").as_deref() == Some("session_start") + || str_field(&v, "role").is_some() => + { + SchemaVariant::certain(SOURCE, "windsurf/cascade-export-v1") + } + _ => SchemaVariant::unknown(SOURCE), + } + } +} + +/// Parse the session-start header: stamp `ctx.session_id` and `ctx.project`, +/// then emit a `SessionStart` event (deduped). +fn parse_session_start( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: serde_json::Value, +) -> Vec { + if let Some(sid) = str_field(&value, "sessionId") { + if ctx.session_id.is_none() { + ctx.session_id = Some(sid); + } + } + + let cwd = str_field(&value, "cwd") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + let git = parse_git(value.get("git")); + let model = str_field(&value, "model"); + let tool_version = str_field(&value, "toolVersion"); + + // Bind the project for every event in this session. 
+ ctx.project = Some(ProjectRef { + cwd: cwd.clone(), + repo_root: None, + git: git.clone(), + }); + + let event_id = event_id_for(&value, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = ts_of(&value); + vec![util::mk_event( + SOURCE, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionStart { + cwd, + git, + model, + tool_version, + }, + )] +} + +/// Parse a user/assistant message record into its turn event plus any nested +/// tool calls, tool results, and file edits (in a deterministic order). +fn parse_message( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: serde_json::Value, + assistant: bool, +) -> Vec { + // Pick up a session id if the header was missing. + if ctx.session_id.is_none() { + if let Some(sid) = str_field(&value, "sessionId") { + ctx.session_id = Some(sid); + } + } + + let turn_id = event_id_for(&value, raw); + // Idempotency: a repeated record (same id) emits nothing. + if !ctx.first_seen(&turn_id) { + return Vec::new(); + } + + let parent_id = str_field(&value, "parentId"); + let ts = ts_of(&value); + let text = str_field(&value, "text").unwrap_or_default(); + + let mut out = Vec::new(); + + let turn_kind = if assistant { + let model = str_field(&value, "model"); + let usage = parse_usage(value.get("usage")); + EventKind::AssistantTurn { + text, + thinking: None, + model, + usage, + parts: Vec::new(), + } + } else { + EventKind::UserTurn { + text, + parts: Vec::new(), + } + }; + out.push(util::mk_event( + SOURCE, + ctx, + raw, + turn_id.clone(), + parent_id, + ts, + turn_kind, + )); + + // Tool calls — record call name so a later result/edit can pair by call_id. 
+ if let Some(calls) = value.get("toolCalls").and_then(|v| v.as_array()) { + for (i, call) in calls.iter().enumerate() { + let call_id = str_field(call, "id").unwrap_or_else(|| format!("{turn_id}/call/{i}")); + let name = str_field(call, "name").unwrap_or_default(); + let args = call.get("args").cloned().unwrap_or(serde_json::Value::Null); + ctx.call_names.insert(call_id.clone(), name.clone()); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{turn_id}#toolcall:{call_id}"), + Some(turn_id.clone()), + ts, + EventKind::ToolCall { + call_id, + name, + args, + }, + )); + } + } + + // Tool results — `ok` flag is recorded so edits can detect tool failures. + if let Some(results) = value.get("toolResults").and_then(|v| v.as_array()) { + for (i, res) in results.iter().enumerate() { + let call_id = str_field(res, "id").unwrap_or_else(|| format!("{turn_id}/result/{i}")); + let ok = res + .get("ok") + .and_then(serde_json::Value::as_bool) + .unwrap_or(true); + let output = res + .get("output") + .cloned() + .unwrap_or(serde_json::Value::Null); + ctx.call_ok.insert(call_id.clone(), ok); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{turn_id}#toolresult:{call_id}"), + Some(turn_id.clone()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )); + } + } + + // File edits — normalized to FileEdit{diff}. 
+ if let Some(edits) = value.get("edits").and_then(|v| v.as_array()) { + for (i, edit) in edits.iter().enumerate() { + let path = str_field(edit, "path").unwrap_or_default(); + let diff = Diff { + path: PathBuf::from(path), + old: str_field(edit, "oldText"), + new: str_field(edit, "newText"), + unified: str_field(edit, "diff"), + added_lines: u32_field(edit, "added"), + removed_lines: u32_field(edit, "removed"), + }; + let call_id = str_field(edit, "id"); + out.push(util::mk_event( + SOURCE, + ctx, + raw, + format!("{turn_id}#edit:{i}"), + Some(turn_id.clone()), + ts, + EventKind::FileEdit { call_id, diff }, + )); + } + } + + out +} + +/// The event id: tool-native `id` when present, else a content hash of the bytes. +fn event_id_for(value: &serde_json::Value, raw: &RawRecord) -> String { + str_field(value, "id").unwrap_or_else(|| content_id(&raw.bytes)) +} + +/// The record timestamp, via the shared `parse_ts` over the common keys. +fn ts_of(value: &serde_json::Value) -> memscribe_core::Timestamp { + util::ts_from(value, &["ts", "timestamp", "time", "created_at"]) +} + +/// Read a string field, returning `None` when absent or not a string. +fn str_field(value: &serde_json::Value, key: &str) -> Option { + value.get(key).and_then(|v| v.as_str()).map(str::to_string) +} + +/// Read a non-negative integer field as `u32`, clamped, defaulting to 0. +fn u32_field(value: &serde_json::Value, key: &str) -> u32 { + value + .get(key) + .and_then(serde_json::Value::as_u64) + .map(|n| u32::try_from(n).unwrap_or(u32::MAX)) + .unwrap_or(0) +} + +/// Parse the optional `git` object into a `GitRef`. +fn parse_git(value: Option<&serde_json::Value>) -> Option { + let g = value?; + let sha = str_field(g, "sha")?; + let branch = str_field(g, "branch"); + Some(GitRef { sha, branch }) +} + +/// Parse the optional `usage` object into a `Usage`. 
+fn parse_usage(value: Option<&serde_json::Value>) -> Option { + let u = value?; + let input_tokens = u.get("input").and_then(serde_json::Value::as_u64); + let output_tokens = u.get("output").and_then(serde_json::Value::as_u64); + Some(Usage { + input_tokens, + output_tokens, + cache_read_tokens: None, + cache_creation_tokens: None, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::{SourceLocation, TranscriptAdapter}; + use std::path::Path; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("cascade.jsonl", 0, 1)) + } + + fn parse_all(lines: &[&str]) -> Vec { + let a = WindsurfAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for l in lines { + out.extend(a.parse(&raw(l), &mut ctx).expect("parse never errors")); + } + out + } + + const SESSION_START: &str = r#"{"kind":"session_start","sessionId":"ws-1","cwd":"/home/dev/proj","git":{"sha":"abc123","branch":"main"},"toolVersion":"1.2.3","model":"cascade-base"}"#; + + #[test] + fn session_start_sets_session_and_project() { + let evs = parse_all(&[SESSION_START]); + assert_eq!(evs.len(), 1); + let e = &evs[0]; + assert_eq!(e.kind.tag(), "session_start"); + assert_eq!(e.session_id, "ws-1"); + assert_eq!(e.project.cwd, PathBuf::from("/home/dev/proj")); + let git = e.project.git.as_ref().expect("git bound"); + assert_eq!(git.sha, "abc123"); + assert_eq!(git.branch.as_deref(), Some("main")); + match &e.kind { + EventKind::SessionStart { + model, + tool_version, + .. 
+ } => { + assert_eq!(model.as_deref(), Some("cascade-base")); + assert_eq!(tool_version.as_deref(), Some("1.2.3")); + } + other => panic!("expected session_start, got {other:?}"), + } + } + + #[test] + fn normalized_sequence_user_then_assistant_with_tools_and_edit() { + let user = r#"{"id":"u1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"ws-1","text":"Let's use Postgres instead of MySQL"}"#; + let asst = r#"{"id":"a1","parentId":"u1","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"ws-1","text":"On it.","model":"cascade-base","usage":{"input":10,"output":4},"toolCalls":[{"id":"c1","name":"edit_file","args":{"path":"db.rs"}}],"toolResults":[{"id":"c1","ok":true,"output":"done"}],"edits":[{"id":"c1","path":"db.rs","oldText":"mysql","newText":"postgres","diff":"@@ -1 +1 @@","added":1,"removed":1}]}"#; + let tags: Vec<&str> = parse_all(&[SESSION_START, user, asst]) + .iter() + .map(|e| e.kind.tag()) + .collect(); + assert_eq!( + tags, + vec![ + "session_start", + "user_turn", + "assistant_turn", + "tool_call", + "tool_result", + "file_edit", + ] + ); + } + + #[test] + fn decision_then_edit_produces_user_turn_then_file_edit() { + let user = r#"{"id":"u1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"ws-1","text":"Let's use Postgres instead of MySQL","edits":[{"path":"schema.sql","oldText":"a","newText":"b","added":1,"removed":1}]}"#; + let evs = parse_all(&[user]); + assert_eq!(evs.len(), 2); + assert_eq!(evs[0].kind.tag(), "user_turn"); + assert_eq!(evs[1].kind.tag(), "file_edit"); + match &evs[0].kind { + EventKind::UserTurn { text, .. } => { + assert_eq!(text, "Let's use Postgres instead of MySQL"); + } + other => panic!("expected user_turn, got {other:?}"), + } + match &evs[1].kind { + EventKind::FileEdit { diff, .. 
} => { + assert_eq!(diff.path, PathBuf::from("schema.sql")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + } + other => panic!("expected file_edit, got {other:?}"), + } + } + + #[test] + fn assistant_usage_and_model_are_copied() { + let asst = r#"{"id":"a1","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"ws-1","text":"hi","model":"cascade-pro","usage":{"input":100,"output":42}}"#; + let evs = parse_all(&[asst]); + match &evs[0].kind { + EventKind::AssistantTurn { model, usage, .. } => { + assert_eq!(model.as_deref(), Some("cascade-pro")); + let u = usage.as_ref().expect("usage present"); + assert_eq!(u.input_tokens, Some(100)); + assert_eq!(u.output_tokens, Some(42)); + } + other => panic!("expected assistant_turn, got {other:?}"), + } + } + + #[test] + fn tool_failure_result_marks_not_ok() { + let asst = r#"{"id":"a1","role":"assistant","ts":"2026-06-22T10:00:05Z","sessionId":"ws-1","text":"trying","toolResults":[{"id":"c1","ok":false,"output":"permission denied"}],"edits":[{"id":"c1","path":"locked.rs","oldText":"x","newText":"y","added":1,"removed":1}]}"#; + let evs = parse_all(&[asst]); + let tr = evs + .iter() + .find(|e| e.kind.tag() == "tool_result") + .expect("tool_result present"); + match &tr.kind { + EventKind::ToolResult { ok, .. } => assert!(!ok), + other => panic!("expected tool_result, got {other:?}"), + } + // The edit is still captured (losslessness); episode-building downstream + // decides not to mint an Episode for a failed edit. + assert!(evs.iter().any(|e| e.kind.tag() == "file_edit")); + } + + #[test] + fn unrecognized_valid_record_becomes_unknown() { + let weird = r#"{"kind":"telemetry","payload":{"latency_ms":12}}"#; + let evs = parse_all(&[weird]); + assert_eq!(evs.len(), 1); + assert_eq!(evs[0].kind.tag(), "unknown"); + } + + #[test] + fn garbage_never_panics_and_is_lossless() { + // Invalid JSON, a bare scalar, and blank input. 
+ let a = WindsurfAdapter; + let mut ctx = ParseCtx::new(); + let garbage = a.parse(&raw("{not json at all"), &mut ctx).unwrap(); + assert_eq!(garbage.len(), 1); + assert_eq!(garbage[0].kind.tag(), "unknown"); + let scalar = a.parse(&raw("42"), &mut ctx).unwrap(); + assert_eq!(scalar.len(), 1); + assert_eq!(scalar[0].kind.tag(), "unknown"); + let blank = a.parse(&raw(" "), &mut ctx).unwrap(); + assert!(blank.is_empty()); + } + + #[test] + fn repeated_record_is_deduped() { + let user = r#"{"id":"u1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"ws-1","text":"hello"}"#; + let evs = parse_all(&[user, user]); + assert_eq!(evs.len(), 1, "second identical record dedups to empty"); + assert_eq!(evs[0].kind.tag(), "user_turn"); + } + + #[test] + fn seq_is_monotonic_and_deterministic() { + let user = r#"{"id":"u1","role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"ws-1","text":"a"}"#; + let asst = r#"{"id":"a1","role":"assistant","ts":"2026-06-22T10:00:01Z","sessionId":"ws-1","text":"b"}"#; + let evs = parse_all(&[SESSION_START, user, asst]); + let seqs: Vec = evs.iter().map(|e| e.seq).collect(); + assert_eq!(seqs, vec![0, 1, 2]); + } + + #[test] + fn no_id_falls_back_to_content_hash() { + let rec = + r#"{"role":"user","ts":"2026-06-22T10:00:00Z","sessionId":"ws-1","text":"no id here"}"#; + let evs = parse_all(&[rec]); + assert_eq!(evs.len(), 1); + assert!(!evs[0].event_id.is_empty()); + // Deterministic: same bytes → same id. 
+ let again = parse_all(&[rec]); + assert_eq!(evs[0].event_id, again[0].event_id); + } + + #[test] + fn discover_points_at_real_product_paths() { + let cfg = DiscoverCfg { + home: Some(PathBuf::from("/home/dev")), + ..DiscoverCfg::default() + }; + let handles = WindsurfAdapter.discover(&cfg); + assert_eq!(handles.len(), 2); + assert!(handles.iter().all(|h| h.source == SourceKind::Windsurf)); + assert!(handles + .iter() + .any(|h| h.path == Path::new("/home/dev/.codeium/windsurf"))); + } + + #[test] + fn schema_fingerprint_recognizes_cascade_export() { + let fp = WindsurfAdapter.schema_fingerprint(&raw(SESSION_START)); + assert_eq!(fp.variant, "windsurf/cascade-export-v1"); + assert_eq!(fp.confidence, 100); + let unknown = WindsurfAdapter.schema_fingerprint(&raw("{not json")); + assert_eq!(unknown.confidence, 0); + } +} diff --git a/crates/memscribe-adapters/src/zed.rs b/crates/memscribe-adapters/src/zed.rs new file mode 100644 index 0000000..0b1ac5d --- /dev/null +++ b/crates/memscribe-adapters/src/zed.rs @@ -0,0 +1,776 @@ +//! Zed adapter. +//! +//! Zed's assistant/agent stores threads under its application support directory +//! (`~/Library/Application Support/Zed/threads/` on macOS, `~/.local/share/zed/` +//! on Linux) in an undocumented SQLite/JSON store. We do not parse that binary +//! store in this model; instead this adapter targets an **exported JSON-lines** +//! thread shape and routes anything unrecognized to [`EventKind::Unknown`] so the +//! stream stays lossless across Zed's frequent format churn. +//! +//! ## Exported record shape (one JSON object per line) +//! +//! A leading session header: +//! ```json +//! {"kind":"session_start","cwd":"…","git":{"sha":"…","branch":"…"}, +//! "toolVersion":"zed 0.182.0","sessionId":"…","ts":"2026-06-22T10:00:00Z"} +//! ``` +//! followed by message records: +//! ```json +//! {"id":"…","parentId":"…","role":"user|assistant","ts":"…","sessionId":"…", +//! "text":"…","model":"…","usage":{"input":N,"output":N}, +//! 
"toolCalls":[{"id":"…","name":"…","args":{…}}], +//! "toolResults":[{"id":"…","ok":true,"output":…}], +//! "edits":[{"path":"…","oldText":"…","newText":"…","diff":"…", +//! "added":N,"removed":N}]} +//! ``` +//! and an optional `{"kind":"session_end","reason":"…"}` trailer. +//! +//! ## Mapping +//! - `kind:session_start` → [`EventKind::SessionStart`] (also binds +//! `ctx.session_id` and `ctx.project`). +//! - `role:user` → [`EventKind::UserTurn`]. +//! - `role:assistant` → [`EventKind::AssistantTurn`] (`text`, `model`, `usage`). +//! - `toolCalls[]` → [`EventKind::ToolCall`]. +//! - `toolResults[]` → [`EventKind::ToolResult`] (`ok`), and the `ok` flag is +//! remembered so downstream can suppress episodes for failed edits. +//! - `edits[]` → [`EventKind::FileEdit`] with a normalized [`Diff`]. +//! +//! ## Invariants +//! Never panics (no `unwrap`/`expect`/indexing on parsed input); fully +//! deterministic (no clock/random/global state); deduplicates by record id via +//! [`ParseCtx::first_seen`]; any valid-but-unrecognized record becomes +//! [`EventKind::Unknown`]. + +use crate::util; +use memscribe_core::{ + CaptureEvent, Diff, DiscoverCfg, EventKind, GitRef, ParseCtx, ParseError, ProjectRef, + RawRecord, SchemaVariant, SourceKind, TranscriptAdapter, TranscriptHandle, Usage, +}; +use std::path::PathBuf; + +/// Adapter for Zed transcripts. +#[derive(Debug, Default, Clone, Copy)] +pub struct ZedAdapter; + +impl TranscriptAdapter for ZedAdapter { + fn source_kind(&self) -> SourceKind { + SourceKind::Zed + } + + fn discover(&self, cfg: &DiscoverCfg) -> Vec { + let home = cfg.home_dir(); + // Zed's real on-disk thread stores. We point at them so the runtime can + // surface where Zed history lives even though this model parses exported + // JSONL rather than the binary store. 
+ let roots = [ + home.join("Library/Application Support/Zed/threads"), + home.join(".local/share/zed/threads"), + home.join(".local/share/zed"), + ]; + let mut handles = Vec::new(); + for root in roots { + if !root.is_dir() { + continue; + } + for entry in walkdir::WalkDir::new(&root) + .max_depth(4) + .into_iter() + .filter_map(Result::ok) + { + let path = entry.path(); + if !path.is_file() { + continue; + } + let ext_ok = path + .extension() + .and_then(|e| e.to_str()) + .map(|e| matches!(e, "jsonl" | "json" | "ndjson")) + .unwrap_or(false); + if !ext_ok { + continue; + } + let session_hint = path + .file_stem() + .and_then(|s| s.to_str()) + .map(str::to_string); + handles.push(TranscriptHandle { + path: path.to_path_buf(), + source: SourceKind::Zed, + session_hint, + compressed: false, + }); + } + } + // Deterministic ordering across platforms / filesystem iteration order. + handles.sort_by(|a, b| a.path.cmp(&b.path)); + handles + } + + fn parse(&self, raw: &RawRecord, ctx: &mut ParseCtx) -> Result, ParseError> { + let Some(value) = util::parse_json_line(raw) else { + // Blank line → nothing; non-JSON garbage → lossless Unknown. + let s = raw.as_str().map(str::trim).unwrap_or(""); + if s.is_empty() { + return Ok(Vec::new()); + } + let v = serde_json::Value::String(s.to_string()); + return Ok(vec![util::unknown_event(SourceKind::Zed, ctx, raw, v)]); + }; + + // `kind`-tagged control records (session lifecycle). + if let Some(kind) = value.get("kind").and_then(|v| v.as_str()) { + match kind { + "session_start" => return Ok(parse_session_start(raw, ctx, &value)), + "session_end" => return Ok(parse_session_end(raw, ctx, &value)), + _ => return Ok(vec![util::unknown_event(SourceKind::Zed, ctx, raw, value)]), + } + } + + // Otherwise it should be a `role`-tagged message record. + if value.get("role").and_then(|v| v.as_str()).is_some() { + return Ok(parse_message(raw, ctx, &value)); + } + + // Valid JSON we don't recognize → Unknown (losslessness). 
+ Ok(vec![util::unknown_event(SourceKind::Zed, ctx, raw, value)]) + } + + fn schema_fingerprint(&self, sample: &RawRecord) -> SchemaVariant { + let Some(value) = util::parse_json_line(sample) else { + return SchemaVariant::unknown(SourceKind::Zed); + }; + let looks_like_zed = value.get("kind").and_then(|v| v.as_str()) == Some("session_start") + || (value.get("role").is_some() + && (value.get("toolCalls").is_some() + || value.get("toolResults").is_some() + || value.get("edits").is_some() + || value.get("sessionId").is_some())); + if looks_like_zed { + SchemaVariant::certain(SourceKind::Zed, "zed/export-v1") + } else { + SchemaVariant::unknown(SourceKind::Zed) + } + } +} + +/// Parse a `kind:session_start` header, binding session + project on `ctx`. +fn parse_session_start( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, +) -> Vec { + if let Some(sid) = value.get("sessionId").and_then(|v| v.as_str()) { + ctx.session_id = Some(sid.to_string()); + } + let cwd = value + .get("cwd") + .and_then(|v| v.as_str()) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + let git = parse_git(value.get("git")); + let model = value + .get("model") + .and_then(|v| v.as_str()) + .map(str::to_string); + let tool_version = value + .get("toolVersion") + .and_then(|v| v.as_str()) + .map(str::to_string); + + // Bind the project for every subsequent event in this session. + ctx.project = Some(ProjectRef { + cwd: cwd.clone(), + repo_root: None, + git: git.clone(), + }); + + let event_id = event_id_for(value, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = util::ts_from(value, &["ts", "timestamp", "time"]); + vec![util::mk_event( + SourceKind::Zed, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionStart { + cwd, + git, + model, + tool_version, + }, + )] +} + +/// Parse a `kind:session_end` trailer. 
+fn parse_session_end( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, +) -> Vec { + if let Some(sid) = value.get("sessionId").and_then(|v| v.as_str()) { + if ctx.session_id.is_none() { + ctx.session_id = Some(sid.to_string()); + } + } + let event_id = event_id_for(value, raw); + if !ctx.first_seen(&event_id) { + return Vec::new(); + } + let ts = util::ts_from(value, &["ts", "timestamp", "time"]); + let reason = value + .get("reason") + .and_then(|v| v.as_str()) + .map(str::to_string); + vec![util::mk_event( + SourceKind::Zed, + ctx, + raw, + event_id, + None, + ts, + EventKind::SessionEnd { reason }, + )] +} + +/// Parse a `role`-tagged message record into its turn plus any embedded +/// tool calls, tool results, and file edits (one record fans out to many events). +fn parse_message( + raw: &RawRecord, + ctx: &mut ParseCtx, + value: &serde_json::Value, +) -> Vec { + if let Some(sid) = value.get("sessionId").and_then(|v| v.as_str()) { + if ctx.session_id.is_none() { + ctx.session_id = Some(sid.to_string()); + } + } + + let record_id = event_id_for(value, raw); + // Idempotency: a repeated record (same id) yields nothing. + if !ctx.first_seen(&record_id) { + return Vec::new(); + } + + let ts = util::ts_from(value, &["ts", "timestamp", "time", "created_at"]); + let parent_id = value + .get("parentId") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_string); + let role = value.get("role").and_then(|v| v.as_str()).unwrap_or(""); + let text = value + .get("text") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let mut events = Vec::new(); + + // 1) The turn itself. 
+ let turn_kind = match role { + "user" => EventKind::UserTurn { + text, + parts: Vec::new(), + }, + "assistant" => { + let model = value + .get("model") + .and_then(|v| v.as_str()) + .map(str::to_string); + let thinking = value + .get("thinking") + .and_then(|v| v.as_str()) + .map(str::to_string); + let usage = parse_usage(value.get("usage")); + EventKind::AssistantTurn { + text, + thinking, + model, + usage, + parts: Vec::new(), + } + } + _ => { + // Unknown role → lossless Unknown for the whole record. + return vec![util::unknown_event( + SourceKind::Zed, + ctx, + raw, + value.clone(), + )]; + } + }; + events.push(util::mk_event( + SourceKind::Zed, + ctx, + raw, + record_id.clone(), + parent_id, + ts, + turn_kind, + )); + + // 2) Tool calls embedded in the turn. + if let Some(calls) = value.get("toolCalls").and_then(|v| v.as_array()) { + for (i, call) in calls.iter().enumerate() { + let call_id = call + .get("id") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(|| format!("{record_id}#call{i}")); + let name = call + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let args = call.get("args").cloned().unwrap_or(serde_json::Value::Null); + // Remember the call name for pairing with results/edits. + ctx.call_names.insert(call_id.clone(), name.clone()); + let child_id = format!("{record_id}:call:{call_id}"); + if !ctx.first_seen(&child_id) { + continue; + } + events.push(util::mk_event( + SourceKind::Zed, + ctx, + raw, + child_id, + Some(record_id.clone()), + ts, + EventKind::ToolCall { + call_id, + name, + args, + }, + )); + } + } + + // 3) Tool results embedded in the turn. + if let Some(results) = value.get("toolResults").and_then(|v| v.as_array()) { + for (i, result) in results.iter().enumerate() { + let call_id = result + .get("id") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(|| format!("{record_id}#res{i}")); + // `ok` defaults to true when omitted; an explicit `false` marks failure. 
+ let ok = result.get("ok").and_then(|v| v.as_bool()).unwrap_or(true); + let output = result + .get("output") + .cloned() + .unwrap_or(serde_json::Value::Null); + // Remember success/failure so downstream can suppress failed-edit episodes. + ctx.call_ok.insert(call_id.clone(), ok); + let child_id = format!("{record_id}:result:{call_id}"); + if !ctx.first_seen(&child_id) { + continue; + } + events.push(util::mk_event( + SourceKind::Zed, + ctx, + raw, + child_id, + Some(record_id.clone()), + ts, + EventKind::ToolResult { + call_id, + ok, + output, + }, + )); + } + } + + // 4) File edits embedded in the turn. + if let Some(edits) = value.get("edits").and_then(|v| v.as_array()) { + for (i, edit) in edits.iter().enumerate() { + let path = edit + .get("path") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let old = edit + .get("oldText") + .and_then(|v| v.as_str()) + .map(str::to_string); + let new = edit + .get("newText") + .and_then(|v| v.as_str()) + .map(str::to_string); + let unified = edit + .get("diff") + .and_then(|v| v.as_str()) + .map(str::to_string); + let added_lines = edit.get("added").and_then(|v| v.as_u64()).unwrap_or(0) as u32; + let removed_lines = edit.get("removed").and_then(|v| v.as_u64()).unwrap_or(0) as u32; + // Correlate the edit to a tool call in the same record, if exactly one + // exists (so downstream can join the edit to its result's `ok` flag). 
+ let call_id = edit + .get("callId") + .and_then(|v| v.as_str()) + .map(str::to_string) + .or_else(|| sole_tool_call_id(value)); + let child_id = format!("{record_id}:edit:{i}"); + if !ctx.first_seen(&child_id) { + continue; + } + events.push(util::mk_event( + SourceKind::Zed, + ctx, + raw, + child_id, + Some(record_id.clone()), + ts, + EventKind::FileEdit { + call_id, + diff: Diff { + path: PathBuf::from(path), + old, + new, + unified, + added_lines, + removed_lines, + }, + }, + )); + } + } + + events +} + +/// Resolve the `event_id`: tool-native `id` when present, else a content hash. +fn event_id_for(value: &serde_json::Value, raw: &RawRecord) -> String { + value + .get("id") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_string) + .unwrap_or_else(|| memscribe_core::content_id(&raw.bytes)) +} + +/// Parse an optional `git` object into a [`GitRef`]. A missing/blank sha yields +/// `None` rather than an empty ref. +fn parse_git(value: Option<&serde_json::Value>) -> Option { + let g = value?; + let sha = g.get("sha").and_then(|v| v.as_str())?; + if sha.is_empty() { + return None; + } + let branch = g + .get("branch") + .and_then(|v| v.as_str()) + .filter(|s| !s.is_empty()) + .map(str::to_string); + Some(GitRef { + sha: sha.to_string(), + branch, + }) +} + +/// Parse an optional `usage` object. Returns `None` when no fields are present. +fn parse_usage(value: Option<&serde_json::Value>) -> Option { + let u = value?; + let input_tokens = u + .get("input") + .or_else(|| u.get("input_tokens")) + .and_then(|v| v.as_u64()); + let output_tokens = u + .get("output") + .or_else(|| u.get("output_tokens")) + .and_then(|v| v.as_u64()); + if input_tokens.is_none() && output_tokens.is_none() { + return None; + } + Some(Usage { + input_tokens, + output_tokens, + cache_read_tokens: None, + cache_creation_tokens: None, + }) +} + +/// If a record carries exactly one tool call, return its id — used to correlate +/// a sibling file edit to that call. 
Returns `None` for zero or many calls. +fn sole_tool_call_id(value: &serde_json::Value) -> Option { + let calls = value.get("toolCalls").and_then(|v| v.as_array())?; + if calls.len() != 1 { + return None; + } + calls + .first() + .and_then(|c| c.get("id")) + .and_then(|v| v.as_str()) + .map(str::to_string) +} + +#[cfg(test)] +mod tests { + use super::*; + use memscribe_core::SourceLocation; + + fn raw(s: &str) -> RawRecord { + RawRecord::from_line(s, SourceLocation::new("zed.jsonl", 0, 1)) + } + + /// Parse a whole JSONL string through one shared ctx, mirroring runtime use. + fn parse_all(lines: &str) -> (Vec, ParseCtx) { + let adapter = ZedAdapter; + let mut ctx = ParseCtx::new(); + let mut out = Vec::new(); + for line in lines.lines() { + let evs = adapter.parse(&raw(line), &mut ctx).expect("never errors"); + out.extend(evs); + } + (out, ctx) + } + + fn tags(evs: &[CaptureEvent]) -> Vec<&'static str> { + evs.iter().map(|e| e.kind.tag()).collect() + } + + #[test] + fn session_start_binds_session_and_project() { + let line = r#"{"kind":"session_start","cwd":"/w/orbit","git":{"sha":"abc","branch":"main"},"toolVersion":"zed 0.1","sessionId":"s1","ts":"2026-06-22T10:00:00Z"}"#; + let (evs, ctx) = parse_all(line); + assert_eq!(tags(&evs), vec!["session_start"]); + assert_eq!(ctx.session_id.as_deref(), Some("s1")); + assert_eq!(evs[0].session_id, "s1"); + match &evs[0].kind { + EventKind::SessionStart { + cwd, + git, + tool_version, + .. + } => { + assert_eq!(cwd.as_path(), std::path::Path::new("/w/orbit")); + assert_eq!(git.as_ref().map(|g| g.sha.as_str()), Some("abc")); + assert_eq!(tool_version.as_deref(), Some("zed 0.1")); + } + other => panic!("expected SessionStart, got {other:?}"), + } + // Project propagated from session start. 
+ assert_eq!(evs[0].project.cwd, std::path::Path::new("/w/orbit")); + } + + #[test] + fn decision_turn_then_edit_sequence() { + let lines = concat!( + r#"{"kind":"session_start","cwd":"/w","git":{"sha":"a"},"sessionId":"s","ts":"2026-06-22T10:00:00Z"}"#, + "\n", + r#"{"id":"u1","role":"user","ts":"2026-06-22T10:00:05Z","sessionId":"s","text":"Let's use Postgres instead of MySQL"}"#, + "\n", + r#"{"id":"a1","parentId":"u1","role":"assistant","ts":"2026-06-22T10:00:09Z","sessionId":"s","text":"ok","model":"m","usage":{"input":10,"output":3},"edits":[{"path":"src/db.rs","oldText":"mysql","newText":"postgres","diff":"d","added":1,"removed":1}]}"#, + ); + let (evs, _) = parse_all(lines); + // A decision (UserTurn) followed by a FileEdit must appear in order. + assert_eq!( + tags(&evs), + vec!["session_start", "user_turn", "assistant_turn", "file_edit"] + ); + // The user decision text is preserved verbatim. + match &evs[1].kind { + EventKind::UserTurn { text, .. } => assert!(text.contains("Postgres")), + other => panic!("expected UserTurn, got {other:?}"), + } + // The edit normalizes old/new/unified + line counts. + match &evs[3].kind { + EventKind::FileEdit { diff, .. } => { + assert_eq!(diff.path, PathBuf::from("src/db.rs")); + assert_eq!(diff.old.as_deref(), Some("mysql")); + assert_eq!(diff.new.as_deref(), Some("postgres")); + assert_eq!(diff.unified.as_deref(), Some("d")); + assert_eq!(diff.added_lines, 1); + assert_eq!(diff.removed_lines, 1); + } + other => panic!("expected FileEdit, got {other:?}"), + } + // Seq is monotonic across the fanned-out events. 
+ let seqs: Vec = evs.iter().map(|e| e.seq).collect(); + assert_eq!(seqs, vec![0, 1, 2, 3]); + } + + #[test] + fn tool_call_and_result_ok_recorded() { + let lines = concat!( + r#"{"id":"a","role":"assistant","sessionId":"s","text":"calling","toolCalls":[{"id":"c1","name":"read_file","args":{"path":"x"}}]}"#, + "\n", + r#"{"id":"b","role":"assistant","sessionId":"s","text":"got it","toolResults":[{"id":"c1","ok":true,"output":"data"}]}"#, + ); + let (evs, ctx) = parse_all(lines); + assert_eq!( + tags(&evs), + vec![ + "assistant_turn", + "tool_call", + "assistant_turn", + "tool_result" + ] + ); + assert_eq!( + ctx.call_names.get("c1").map(String::as_str), + Some("read_file") + ); + assert_eq!(ctx.call_ok.get("c1"), Some(&true)); + } + + #[test] + fn failed_tool_result_marks_call_not_ok() { + // An edit whose tool result failed: the edit is captured but the result's + // ok:false is recorded so downstream can suppress the episode. + let lines = concat!( + r#"{"id":"a","role":"assistant","sessionId":"s","text":"editing","toolCalls":[{"id":"c9","name":"edit_file","args":{}}],"edits":[{"path":"src/c.rs","oldText":"x","newText":"y","added":1,"removed":1}]}"#, + "\n", + r#"{"id":"b","role":"assistant","sessionId":"s","text":"failed","toolResults":[{"id":"c9","ok":false,"output":"locked"}]}"#, + ); + let (evs, ctx) = parse_all(lines); + assert_eq!( + tags(&evs), + vec![ + "assistant_turn", + "tool_call", + "file_edit", + "assistant_turn", + "tool_result" + ] + ); + // The edit was correlated to the sole tool call in its record. + match &evs[2].kind { + EventKind::FileEdit { call_id, .. } => { + assert_eq!(call_id.as_deref(), Some("c9")); + } + other => panic!("expected FileEdit, got {other:?}"), + } + // The failure is recorded against the call id → downstream drops the episode. + assert_eq!(ctx.call_ok.get("c9"), Some(&false)); + match &evs[4].kind { + EventKind::ToolResult { ok, .. 
} => assert!(!ok), + other => panic!("expected ToolResult, got {other:?}"), + } + } + + #[test] + fn dedup_repeated_record_is_idempotent() { + let line = r#"{"id":"u1","role":"user","sessionId":"s","text":"hi"}"#; + let adapter = ZedAdapter; + let mut ctx = ParseCtx::new(); + let first = adapter.parse(&raw(line), &mut ctx).unwrap(); + let second = adapter.parse(&raw(line), &mut ctx).unwrap(); + assert_eq!(tags(&first), vec!["user_turn"]); + assert!(second.is_empty(), "repeated record must yield nothing"); + } + + #[test] + fn unknown_record_kind_is_lossless() { + let line = r#"{"kind":"telemetry_ping","payload":42}"#; + let (evs, _) = parse_all(line); + assert_eq!(tags(&evs), vec!["unknown"]); + match &evs[0].kind { + EventKind::Unknown { raw, .. } => { + assert_eq!(raw.get("payload").and_then(|v| v.as_i64()), Some(42)); + } + other => panic!("expected Unknown, got {other:?}"), + } + } + + #[test] + fn unknown_role_is_lossless() { + let line = r#"{"id":"x","role":"system","sessionId":"s","text":"boot"}"#; + let (evs, _) = parse_all(line); + assert_eq!(tags(&evs), vec!["unknown"]); + } + + #[test] + fn garbage_input_never_panics_and_is_lossless() { + let adapter = ZedAdapter; + let mut ctx = ParseCtx::new(); + // Non-JSON line → Unknown, no panic. + let g = adapter.parse(&raw("}{ not json at all"), &mut ctx).unwrap(); + assert_eq!(tags(&g), vec!["unknown"]); + // Blank line → nothing. + let blank = adapter.parse(&raw(" "), &mut ctx).unwrap(); + assert!(blank.is_empty()); + // Truncated / weird JSON shapes must not panic. 
+ for s in [ + "{", + "[1,2,3]", + "null", + "12345", + r#"{"role":"assistant"}"#, + r#"{"role":"assistant","edits":[{}]}"#, + r#"{"kind":"session_start"}"#, + r#"{"role":"assistant","toolCalls":"not-an-array"}"#, + r#"{"role":"assistant","usage":"oops"}"#, + ] { + let _ = adapter.parse(&raw(s), &mut ctx).unwrap(); + } + } + + #[test] + fn session_end_emits_session_end() { + let line = r#"{"kind":"session_end","sessionId":"s","reason":"user_closed","ts":"2026-06-22T10:01:30Z"}"#; + let (evs, _) = parse_all(line); + assert_eq!(tags(&evs), vec!["session_end"]); + match &evs[0].kind { + EventKind::SessionEnd { reason } => assert_eq!(reason.as_deref(), Some("user_closed")), + other => panic!("expected SessionEnd, got {other:?}"), + } + } + + #[test] + fn no_id_falls_back_to_content_hash() { + let line = r#"{"role":"user","sessionId":"s","text":"anon"}"#; + let (evs, _) = parse_all(line); + assert_eq!(tags(&evs), vec!["user_turn"]); + // 64-hex blake3 content id (no native id present). + assert_eq!(evs[0].event_id.len(), 64); + assert!(evs[0].event_id.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn fingerprint_recognizes_zed_export() { + let adapter = ZedAdapter; + let start = raw(r#"{"kind":"session_start","sessionId":"s","cwd":"/w"}"#); + assert_eq!(adapter.schema_fingerprint(&start).confidence, 100); + let msg = raw(r#"{"id":"a","role":"assistant","sessionId":"s","edits":[]}"#); + assert_eq!(adapter.schema_fingerprint(&msg).confidence, 100); + let foreign = raw(r#"{"type":"summary","text":"x"}"#); + assert_eq!(adapter.schema_fingerprint(&foreign).confidence, 0); + } + + #[test] + fn full_happy_path_fixture_shape_parses() { + // Mirrors fixtures/zed/v1/happy_path_decision_then_edits.jsonl in shape. 
+ let lines = concat!( + r#"{"kind":"session_start","cwd":"/w","git":{"sha":"a","branch":"main"},"toolVersion":"zed 0.1","sessionId":"t1","ts":"2026-06-22T10:00:00Z"}"#, + "\n", + r#"{"id":"m1","parentId":null,"role":"user","ts":"2026-06-22T10:00:05Z","sessionId":"t1","text":"Let's use Postgres instead of MySQL."}"#, + "\n", + r#"{"id":"m2","parentId":"m1","role":"assistant","ts":"2026-06-22T10:00:09Z","sessionId":"t1","text":"ok","model":"m","usage":{"input":1,"output":1},"edits":[{"path":"a.rs","oldText":"x","newText":"y","diff":"d","added":1,"removed":1},{"path":"b.rs","oldText":"p","newText":"q","diff":"d2","added":2,"removed":1}]}"#, + "\n", + r#"{"kind":"session_end","sessionId":"t1","ts":"2026-06-22T10:01:30Z","reason":"user_closed"}"#, + ); + let (evs, _) = parse_all(lines); + assert_eq!( + tags(&evs), + vec![ + "session_start", + "user_turn", + "assistant_turn", + "file_edit", + "file_edit", + "session_end" + ] + ); + // Every event carries the bound session + project. + assert!(evs.iter().all(|e| e.session_id == "t1")); + assert!(evs + .iter() + .all(|e| e.project.cwd == std::path::Path::new("/w"))); + } +} diff --git a/crates/memscribe-cli/Cargo.toml b/crates/memscribe-cli/Cargo.toml new file mode 100644 index 0000000..7e37530 --- /dev/null +++ b/crates/memscribe-cli/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "memscribe-cli" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "The `memscribe` binary: watch / hook / parse / replay / verify / redact." 
+ +[lints] +workspace = true + +[[bin]] +name = "memscribe" +path = "src/main.rs" + +[dependencies] +memscribe-core = { workspace = true } +memscribe-adapters = { workspace = true } +memscribe-sink = { workspace = true } +memscribe-io = { workspace = true, features = ["watch", "cursor-store"] } +memscribe-testkit = { workspace = true } +clap = { workspace = true } +ctrlc = "3" +anyhow = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +toml = "0.8" +zstd = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } + +[dev-dependencies] +assert_cmd = { workspace = true } +predicates = { workspace = true } +tempfile = { workspace = true } diff --git a/crates/memscribe-cli/README.md b/crates/memscribe-cli/README.md new file mode 100644 index 0000000..8f4b6e7 --- /dev/null +++ b/crates/memscribe-cli/README.md @@ -0,0 +1,131 @@ +# memscribe (CLI) + +The `memscribe` binary — the daemon and the toolbox for Memscribe's +deterministic, zero-LLM transcript capture. It wraps the workspace crates +(`memscribe-core`, `-adapters`, `-io`, `-sink`) behind six subcommands. + +```console +cargo run -p memscribe-cli -- [args] +# or, once installed: +memscribe [args] +``` + +The pipeline is deterministic and never calls a model. By default the redaction +pass is **on**, so secrets are stripped before anything is written. See the +workspace [ARCHITECTURE.md](../../ARCHITECTURE.md) for the pipeline and +[memscribe.example.toml](../../memscribe.example.toml) for the config surface. + +--- + +## Commands + +### `watch` — the steady-state capture daemon + +Tail discovered transcripts (and serve the hook endpoint), preparing nodes to a +sink as they arrive. + +```console +memscribe watch [--tools claude,codex,gemini] [--sink ndjson|sqlite|memdb] \ + [--out FILE|-] [--root DIR ...] 
[--once] [--config memscribe.toml] + ``` + +| Flag | Default | Meaning | +|------|---------|---------| +| `--tools` | every adapter | Comma-separated tool slugs to watch (`SourceKind::parse` values). | +| `--sink` | `ndjson` | Sink target: `ndjson`, `sqlite`, or `memdb` (`memdb` needs the `memdb` feature). | +| `--out` | `-` (stdout) | Where prepared nodes go (a file for `ndjson`/`sqlite`); `-` is stdout. | +| `--root` | `$HOME` | Directory root(s) to scan for transcripts; repeatable. | +| `--once` | off | Drain what already exists and exit, instead of tailing live. | +| `--config` | — | Path to a `memscribe.toml` (see `memscribe.example.toml`). | + +### `hook` — the hook handler + +Reads a hook payload from stdin, records it, and exits `0` immediately. It never +blocks the agent and never invokes a model. Agents wire this as their hook +command. + +```console +memscribe hook < payload.json +``` + +### `parse` — one-shot parse a transcript to NDJSON + +The workhorse for tests and debugging: run one transcript file through the +adapter and the full pipeline, emitting prepared nodes as NDJSON on stdout. + +```console +memscribe parse <FILE> [--as TOOL] [--no-redact] +``` + +- `--as TOOL` forces a specific adapter (`claude_code`, `codex`, `gemini`, + `otel`, `cursor`, `windsurf`, `zed`, `vscode`, `copilot`). Omit it to infer + the tool from the path; if inference fails the command tells you to pass + `--as`. +- `--no-redact` emits verbatim content (used by golden tests that assert on + exact text). Redaction is on otherwise. + +```console +memscribe parse ~/.claude/projects/foo/session.jsonl --as claude_code +``` + +### `replay` — re-run preparation over a historical session + +Re-prepares a transcript file with the current pipeline (redaction on). Useful +after an adapter or pipeline change to see the new node stream for an old +session. 
+ +```console +memscribe replay [--as TOOL] +``` + +### `verify` — the conformance smoke suite + +Parses every fixture under `fixtures/` and prints a per-tool `CASES / OK / NODES` +table, exiting non-zero on any failure. This is the fast, shellable summary the +daemon ships with; full cross-tool conformance and the §8.3 invariants live in +the testkit (`cargo test -p memscribe-testkit`). + +```console +memscribe verify +memscribe verify --capture # (planned) snapshot a live session into a new fixture +``` + +### `redact` — preview the redaction pass + +Reads a file and prints it with secrets replaced by `[REDACTED: