syncable-dev · Alex793x · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,138 @@
+# Memscribe CI (whitepaper §8.9).
+#
+# Memscribe is deterministic and zero-LLM by construction, so CI is a hard gate,
+# not a smoke test: the same input bytes must always produce the same nodes, the
+# tree must be clippy- and rustfmt-clean, the dependency set must satisfy the
+# license/advisory policy, and the crate must keep building on its MSRV.
+#
+# The toolchain is pinned to match rust-toolchain.toml (1.96.0). The fuzz job is
+# best-effort: cargo-fuzz needs a nightly compiler, so it is allowed to fail
+# without failing the workflow.
+
+name: CI
+
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches: ["main"]
+
+# A newer run on the same ref supersedes (and cancels) the older one.
+concurrency:
+  cancel-in-progress: true
+  group: "ci-${{ github.workflow }}-${{ github.ref }}"
+
+permissions:
+  contents: "read"
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_BACKTRACE: "1"
+  # Pinned toolchain — keep in lockstep with rust-toolchain.toml.
+  RUST_PINNED: "1.96.0"
+  # Resilience against transient crates.io download blips (SSL EOFs, flaky
+  # mirrors): retry network ops aggressively, fetch the crates.io index over
+  # the sparse (HTTP) protocol, and use the git CLI rather than cargo's
+  # built-in git library for git-based fetches (git deps, git registries).
+  CARGO_NET_RETRY: "10"
+  CARGO_NET_GIT_FETCH_WITH_CLI: "true"
+  CARGO_REGISTRIES_CRATES_IO_PROTOCOL: sparse
+
+jobs:
+  # 1. Deterministic test suite (unit + golden + conformance + property), run on env.RUST_PINNED.
+  test:
+    name: test (workspace, all-features)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pinned toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_PINNED }}
+      - uses: Swatinem/rust-cache@v2
+      - name: cargo test
+        run: cargo test --workspace --all-features --locked
+
+  # 2. Lints as errors, on the pinned toolchain (env.RUST_PINNED) and the committed lockfile.
+  clippy:
+    name: clippy (-D warnings)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pinned toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_PINNED }}
+          components: clippy
+      - uses: Swatinem/rust-cache@v2
+      - name: cargo clippy
+        run: cargo clippy --workspace --all-targets --all-features --locked -- -D warnings
+
+  # 3. Formatting (rustfmt from env.RUST_PINNED). The output is byte-stable, so the source should be too.
+  fmt:
+    name: rustfmt (--check)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install pinned toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ env.RUST_PINNED }}
+          components: rustfmt
+      - name: cargo fmt --check
+        run: cargo fmt --all --check
+
+  # 4. Dependency policy gate: licenses + advisories, as configured in deny.toml.
+  deny:
+    runs-on: ubuntu-latest
+    name: cargo-deny
+    steps:
+      - uses: actions/checkout@v4
+      - uses: EmbarkStudios/cargo-deny-action@v2
+        name: cargo-deny check
+        with:
+          arguments: "--all-features"
+          command: "check"
+
+  # 5. MSRV gate: the workspace must still check on its declared minimum (1.96).
+  #    Tests are skipped here: they pin newer dev-deps and run under `test`.
+  msrv:
+    runs-on: ubuntu-latest
+    name: "MSRV (1.96)"
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@master
+        name: Install MSRV toolchain
+        with:
+          toolchain: "1.96.0"
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: "msrv"
+      - run: cargo check --workspace --all-features --locked
+        name: cargo check (MSRV)
+
+  # 6. cargo-fuzz smoke build. Best-effort: cargo-fuzz needs nightly, and the
+  #    fuzz/ targets may not be wired yet — never fail the workflow on this.
+  fuzz:
+    name: cargo-fuzz smoke build (best-effort)
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install nightly toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly
+      - uses: Swatinem/rust-cache@v2
+        with:
+          key: fuzz
+      - name: Install cargo-fuzz
+        run: cargo install cargo-fuzz --locked
+      - name: Build fuzz targets (no run)
+        # Probe from the repo root so a missing fuzz/ dir is skipped, not a step error.
+        run: |
+          if [ -f fuzz/Cargo.toml ]; then
+            cd fuzz && cargo +nightly fuzz build
+          else
+            echo "fuzz/ has no Cargo.toml yet — nothing to build (best-effort job)."
+          fi
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
@@ -0,0 +1,195 @@
+# Memscribe architecture
+
+Memscribe is a **deterministic, zero-LLM** pipeline that turns the transcript
+logs AI coding agents already write into typed nodes the downstream
+inference-and-governance layer (**MemCortex**) can consume. No model is ever
+called: capture is reading and parsing, never summarizing. The output is an exact
+function of the input, which is what makes the whole module golden-file,
+property, and fuzz testable.
+
+It is the bottom of a three-layer stack — **Memtrace** uses **MemCortex**, and
+**MemCortex** uses **Memscribe**. The dependency direction is strictly one-way:
+each layer depends only on the one below it, and `memscribe-core` depends on
+nothing else in the workspace. Memscribe never calls upward.
+
+---
+
+## The pipeline
+
+A single, linear, deterministic pipeline. Each stage is a trait, so it can be
+tested in isolation and swapped. Everything between **Source** and **Sink** is a
+pure, synchronous function of the event stream.
+
+```
+ Source                Adapter           Gate        Segmenter      Binder        NodePrep        Sink
+ (memscribe-io)        (memscribe-       (core)      (core)         (core)        (core)          (memscribe-sink)
+                        adapters)
+ tail JSONL        →   parse one     →   admit?  →   arc / turn  →  decision  →   assemble    →   NDJSON / SQLite
+ hook stdin            RawRecord →       commitment  spans;         ↔ edit,       PreparedNode    / MemDB
+ OTLP receiver         CaptureEvent[]    markers     elevate gated  PROV          stream
+                       (version-                     turns; seed    (t_use
+                        tolerant)                     decisions;    ≤ t_gen)
+                                                      collect edits
+   RawRecord               CaptureEvent      markers    Segmentation   BindingEdge   PreparedNode    (consumer)
+   (bytes + provenance)    (normalized)                                              stream
+```
+
+- **`Source → Adapter`** produces the normalized `CaptureEvent` stream — the
+  system of record. This is the only stage that touches tool-specific formats.
+- **`Gate → Segmenter → Binder → NodePrep`** transform that stream into
+  `PreparedNode`s. Pure and synchronous given the events.
+- An optional **redaction** pass runs over the prepared nodes before the sink.
+- **`Sink`** writes the nodes out. It is the single seam that decouples
+  Memscribe from MemDB.
+
+The orchestration lives in `memscribe-core::pipeline::DefaultPipeline`:
+
+```rust
+let nodes = DefaultPipeline::new()                 // redaction ON by default
+    .run_records(adapter.as_ref(), &records);      // parse → prepare → redact
+// or stream straight to a sink:
+let n = DefaultPipeline::new()
+    .run_to_sink(adapter.as_ref(), &records, &mut sink)?;
+```
+
+`DefaultPipeline::prepare_events(&events)` is the **pure** core: its output is an
+exact function of `events`. `without_redaction()` turns the redactor off (golden
+tests assert on verbatim content), and `with_gate(..)` / `with_redactor(..)`
+swap in config-driven stages.
+
+---
+
+## Crate responsibilities
+
+| Crate | Responsibility |
+|-------|----------------|
+| `memscribe-core` | The frozen contract: the event model, the prepared-node output types, the `TranscriptAdapter` and `Sink` traits, and the deterministic pipeline (`gate` → `segmenter` → `binder` → `nodeprep`) plus the `redact` pass. Depends on nothing in the workspace. |
+| `memscribe-adapters` | Per-tool parsers behind feature flags. Each implements `TranscriptAdapter`. The `registry` assembles the enabled set (`all_adapters`) and resolves one by `SourceKind` (`adapter_for`). |
+| `memscribe-io` | Generic sources: a notify-based file tailer (offset resume), a hook server, and an OTLP receiver. Turns raw bytes into `RawRecord`s. |
+| `memscribe-sink` | Concrete `Sink`s: `NdjsonSink` (canonical default), `SqliteSink` (feature `sqlite`), and `MemDbSink` (feature `memdb`, off by default). |
+| `memscribe-cli` | The `memscribe` binary: `watch` / `hook` / `parse` / `replay` / `verify` / `redact`. |
+| `memscribe-testkit` | The harness: `parse_events` / `prepare_nodes`, the invariant checks, golden-fixture loaders, and the cross-tool conformance scenario catalog. |
+
+---
+
+## The contract types
+
+All of these live in `memscribe-core` and are re-exported from its crate root.
+**Do not change their behavior or public shape** — the test suite and every
+consumer depend on exact output.
+
+### Input: the normalized event model (`model.rs`)
+
+`CaptureEvent` is the system of record produced by adapters. Every field is copied verbatim
+from the source or derived deterministically from it (`seq`, the fallback `event_id`); none is guessed.
+
+```rust
+pub struct CaptureEvent {
+    pub schema_version: u16,        // SCHEMA_VERSION; consumers gate on this
+    pub source: SourceKind,         // which tool produced it
+    pub session_id: String,         // tool-native session/thread id
+    pub seq: u64,                   // monotonic per-session, from file order
+    pub event_id: String,           // tool-native id, or blake3(content) fallback
+    pub parent_id: Option<String>,  // DAG link where the tool provides one
+    pub timestamp: OffsetDateTime,  // RFC3339, verbatim
+    pub project: ProjectRef,        // cwd / repo_root / git, from session start
+    pub kind: EventKind,            // the payload
+    pub provenance: SourceLocation, // pointer back into the source bytes
+}
+```
+
+`EventKind` is the payload enum. `EventKind::Unknown` is **load-bearing**: an
+unrecognized record type or a new field is preserved verbatim and flagged,
+never discarded — that is how the stream stays lossless across tool-version
+churn.
+
+| `EventKind` variant | Meaning |
+|---------------------|---------|
+| `SessionStart` | cwd, git ref, model, tool version |
+| `UserTurn` | a user message (flattened text + structured `Part`s) |
+| `AssistantTurn` | an assistant message (text, thinking, model, usage, parts) |
+| `ToolCall` | a tool invocation (`call_id`, name, raw args) |
+| `ToolResult` | a tool result (`call_id`, `ok`, raw output) |
+| `FileEdit` | a normalized `Diff` (from Edit/Write/apply_patch/replace) |
+| `Compaction` | model-side history compaction — flagged, never stored as truth |
+| `Rewind` | a user rewind back to an earlier event |
+| `SessionEnd` | the session ended |
+| `Unknown` | an unrecognized record, preserved verbatim and flagged |
+
+`SourceKind` enumerates the nine tools plus `Unknown`; `SourceKind::parse` maps
+CLI/`--as` slugs to variants (tolerant of aliases such as `claude` / `claude-code`).
+
+### Output: the prepared-node stream (`node.rs`)
+
+`PreparedNode` is the typed data a consumer ingests. It is a tagged enum:
+
+| `PreparedNode` variant | Payload | Meaning |
+|------------------------|---------|---------|
+| `Conversation` | `ConversationSpan` | a gated, verbatim dialogue span with the markers that fired |
+| `Decision` | `DecisionRecord` | a deterministically-parsed decision (IBIS / QOC / MADR / Kruchten shape) |
+| `Episode` | `CodeEpisode` | a code edit episode: path, `Diff`, git ref, deterministic `episode_id` |
+| `Binding` | `BindingEdge` | a decision/conversation → episode edge carrying a `ProvRecord` |
+
+### Epistemic honesty: `FactStatus`
+
+Every node and edge carries a `FactStatus`. **Memscribe only ever emits the
+first two**; the latter two are *flags* for a downstream inference layer —
+values Memscribe never computes by guessing. This is the property that keeps the
+module zero-LLM and its output golden-testable.
+
+| `FactStatus` | Who sets it |
+|--------------|-------------|
+| `Observed` | Memscribe — verbatim from the source |
+| `DeterministicallyDerived` | Memscribe — a pure function of observed data |
+| `StatisticallyRanked` | downstream — a statistical measure |
+| `LlmHypothesis` | downstream — an LLM hypothesis; Memscribe only *flags* it |
+
+`ProvRecord` records `used(session, decision)` + `wasGeneratedBy(diff, session)`
+with the temporal invariant `t_use ≤ t_gen` (`ProvRecord::is_temporally_valid`).
+
+---
+
+## How to add a new adapter
+
+Adapters are the volatile part — every tool's format churns — so adding one is a
+well-trodden, five-step path. The contract: a parser is **version-tolerant**
+(it pattern-matches on the fields it needs and routes anything unrecognized to
+`EventKind::Unknown`) and **must never panic**.
+
+1. **Add a `SourceKind` variant** (`memscribe-core/src/model.rs`). Wire its
+   stable snake_case slug into `SourceKind::as_str` and into `SourceKind::parse`
+   (include any aliases). This is the one allowed touch of `memscribe-core` for
+   a new tool — coordinate it, since the frozen contract is shared.
+
+2. **Add the adapter module** (`memscribe-adapters/src/<tool>.rs`) behind a
+   `#[cfg(feature = "<tool>")]` and a matching entry in the crate's `[features]`
+   table. Implement `TranscriptAdapter`:
+   - `source_kind()` — return your `SourceKind`.
+   - `discover(&DiscoverCfg)` — locate live & historical transcripts. Honor the
+     per-tool override key in `DiscoverCfg.overrides` (e.g. `CLAUDE_CONFIG_DIR`,
+     `CODEX_HOME`) and fall back to `cfg.home_dir()`. Return handles in a
+     deterministic (sorted) order.
+   - `parse(&RawRecord, &mut ParseCtx)` — turn ONE record into zero or more
+     `CaptureEvent`s. Use `ParseCtx::alloc_seq` for the monotonic `seq`,
+     `ParseCtx::first_seen` for dedup, and `ParseCtx::project_or_default` for the
+     project binding. Never panic; route unknowns to `EventKind::Unknown`.
+   - `schema_fingerprint(&RawRecord)` — return a `SchemaVariant` so the corpus
+     and runtime can version-gate the parser.
+
+3. **Register it** (`memscribe-adapters/src/registry.rs`). Add the cfg-gated
+   `push` in `all_adapters()` and the cfg-gated arm in `adapter_for()`.
+
+4. **Add fixtures** under `fixtures/<tool>/<version>/<scenario>.jsonl` for the
+   canonical scenarios in `memscribe-testkit::scenarios::SCENARIOS`, and bless
+   the expected outputs under `fixtures-expected/<tool>/<version>/` (see
+   [CONTRIBUTING.md](./CONTRIBUTING.md) for the capture → golden → bless flow).
+
+5. **Add tests.** Unit-test the parser; run the shared invariant checks from
+   `memscribe-testkit::invariants` (`check_monotonic_seq`, `check_lossless`,
+   `check_unique_event_ids`, `check_determinism`); and add a `cargo-fuzz` target
+   so the never-panic contract is enforced. Verify in isolation:
+   `cargo test -p memscribe-adapters --test <your_file_stem>`.
+
+The conformance suite then asserts your tool normalizes the canonical scenarios
+to the **same shape** as every other tool — that cross-tool equivalence is the
+point of the thin-waist event model.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,24 @@
+# Changelog
+
+All notable changes to Memscribe are documented here. The format follows
+[Keep a Changelog](https://keepachangelog.com/), and the project adheres to
+[Semantic Versioning](https://semver.org/). The event schema additionally
+carries its own `schema_version` so the consumer layer (MemCortex) can refuse
+or adapt to an incompatible event schema independently of the crate version.
+
+## [Unreleased]
+
+### Added
+- **M1 — Core contract.** The frozen thin-waist: `CaptureEvent` / `EventKind`
+  normalized event model, `PreparedNode` output contract with `FactStatus`,
+  the `TranscriptAdapter` and `Sink` traits, and the deterministic pipeline
+  (Gate → Segmenter → Binder → NodePrep).
+- **Adapters.** Claude Code, Codex CLI, Gemini CLI, OTel GenAI, plus
+  VS Code / Copilot / Cursor / Windsurf / Zed, each version-tolerant and
+  routing unknowns to `EventKind::Unknown`.
+- **Sinks.** NDJSON (canonical default), SQLite, and a feature-gated MemDB sink.
+- **IO sources.** notify-based file tailer with persisted byte-offset resume,
+  hook server, and an optional OTLP receiver.
+- **CLI.** `watch`, `hook`, `parse`, `replay`, `verify`, `redact`.
+- **Testkit.** Golden-file harness, cross-tool conformance suite, synthetic
+  generators, property tests, and fuzz targets.