diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index bb3bff3..42a2b86 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -13,16 +13,23 @@ TCP Client
 RespCodec (beyond_resp)     ← RESP2/RESP3 framing
   │ RESP Array → Bytes
   ▼
-Command::parse()            ← command.rs  — stack-allocated parsing, arity check
+Command::parse()            ← command.rs — stack-allocated parsing, arity check
   │ Command::Set { key, value, args }
+  │   bad arity / unknown option ─────────────────────────────► ERR (connection stays open)
   ▼
 dispatch()                  ← dispatch.rs — NX/XX condition, TTL conversion
-  │ SetOptions { ttl: Duration, metadata }
+  │ SetOptions { ttl, metadata }
+  │   value > KV_MAX_VALUE_BYTES ─────────────────────────────► ERR 413 / RESP ERR
   ▼
 ShardStore::set()           ← store.rs (async)
-  ├─ record::encode(tstamp, flags, expires_at_ms, key, value, metadata)
-  ├─ NamespaceLog::put_full → active_file.append(buf) → fsync   ← L2 write (io_uring)
-  └─ MemCache::insert(key, value, ...)                          ← L1 write
+  │   frozen (handoff seal in progress) ──────────────────────► ERR Frozen
+  ├─ NamespaceLog::put_full
+  │    ├─ value ≥ 128 KiB → ValueStore::put(value)             ← blob write (io_uring, write-once)
+  │    │    io error ──────────────────────────────────────────► ERR propagated to client
+  │    ├─ record::encode(tstamp, flags, expires_at_ms, key, value-or-hash, metadata)
+  │    └─ active_file.append(buf) → page cache                  ← L2 write (io_uring); fsync deferred to the 1 s sync timer
+  │         io error → file poisoned ──────────────────────────► ERR; subsequent writes fail until restart
+  └─ MemCache::insert(key, value, ...)                          ← L1 write (stores full value)
   │
   ▼
 r::ok()                     ← response.rs
@@ -41,17 +48,22 @@ Command::Get { key }
   │
   ▼
 ShardStore::get() (async)
-  ├─ MemCache::get(key, now_ms)  ── hit? ──► check expiry ──► return Entry  (L1 fast path)
+  ├─ MemCache::get(key, now_ms)  ── hit? ──► check expiry ──► return Entry  (L1 fast path; full value)
   │                                                │ expired
   │                                                ▼
   │                                  remove from L1, append tombstone, return None
   │
   └─ miss? ──► NsIndex::get(key)
-                 ├─ None ──────────────────────────────────────────► return None
-                 ├─ expired (TTL sidecar) ──► append tombstone ────► None
-                 └─ live ──► file.read_at(record_offset, record_size)  (single io_uring SQE)
-                                ├─ parse header → slice value/metadata
-                                └─ MemCache::insert ──► return Entry
+                 ├─ None ──────────────────────────────────────────────────► return None
+                 ├─ expired (TTL sidecar) ──► append tombstone ────────────► None
+                 └─ live ──► file.read_at(record_offset, record_size)        (one io_uring SQE)
+                                │ parse header → slice value field
+                                ├─ VALUE_SEP flag clear: value field IS the value
+                                │    └─ MemCache::insert(full value) ──────► return Entry
+                                └─ VALUE_SEP flag set: value field is 16-byte hash
+                                     └─ ValueStore::get(hash)               (one io_uring SQE)
+                                          ├─ blob missing ─────────────────► ERR BadRecord
+                                          └─ ok ──► MemCache::insert ──────► return Entry
   │
   ▼
 r::bulk(entry.value) or r::nil()
@@ -91,6 +103,39 @@ http.rs router
 HTTP Client
 ```
 
+### Startup / Recovery (per shard, per namespace)
+
+```
+ShardStore::open()
+  └─ for each namespace dir found on disk:
+       NamespaceLog::open()
+         ├─ recover::open_namespace()
+         │    ├─ for each sealed data-*.log (ascending file_id):
+         │    │    ├─ read_footer()  ── magic matches? ──► apply_footer_entries()  (O(1), no body scan)
+         │    │    │                                         └─ rebuilds index + TTL + valsep sidecars
+         │    │    └─ magic mismatch / CRC fail ──► rebuild_from_records()  (full body scan, fallback)
+         │    └─ highest file:
+         │         ├─ footer present (clean shutdown) ──► treat as sealed, open new active
+         │         └─ no footer (crash) ──► replay_active()
+         │              └─ scan records; bad CRC → truncate at last good boundary
+         ├─ for each live value-separated key: ValueStore::incr_ref(hash)
+         └─ ValueStore::sweep_orphans()  ──► delete values/blob-* with no live key reference
+```
+
+### Background Durability (per shard, every 1 second)
+
+```
+ShardStore::sync_all()
+  └─ for each open namespace:
+       NamespaceLog::sync()
+         └─ unsynced_bytes > 0? ──► active_file.fsync()  (io_uring)
+              io error ──► kv_log_sync_failures_total++ ──► /readyz 503 after threshold
+```
+
+This IS the durability mechanism — `appendfsync everysec`. Individual writes call `write_all_at` (goes to the OS page cache; not yet on stable storage) and increment `unsynced_bytes`. The 1-second timer is the only thing that calls `fsync`. A crash before the next timer fires can lose up to ~1 second of writes. The meaningful secondary effect is on `/readyz`: fsync failures increment `readyz_sync_failure_count`; once it exceeds `KV_READYZ_SYNC_FAILURE_THRESHOLD` the shard reports degraded and `/readyz` returns 503.
+
+**New-file directory durability.** Whenever a new `data-*.log` is created or renamed into place (fresh namespace, rotate, reclaim, FLUSHDB, clean-shutdown recovery), the engine fsyncs the _namespace directory_ (`file.rs:sync_dir`) so the file's directory entry is durable — not just its bytes. Without this a power loss could leave a created file's fsynced records unreachable (data present, name lost), violating the everysec contract for any file past the first. This runs only on those rare paths, never on the per-write hot path. (Residual assumption: that `fsync` is honest down through the filesystem/GlideFS/hardware stack — not verifiable in software.)
+
 ### TTL Expiry
 
 ```
@@ -142,18 +187,22 @@ SCAN iterates shards sequentially: when a shard's inner cursor returns `"0"` (ex
 
 ## Concepts & Terminology
 
-| Term                   | What It Controls                                                                                                                                                                                | NOT                                                                                               |
-| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- |
-| Namespace (`ns`)       | Which `NamespaceLog` (and therefore which on-disk directory) receives reads/writes; set by `SELECT <n>` (RESP, any non-negative integer) or `/namespaces/{ns}/` (HTTP); max 1024 open per shard | Not an auth or tenant boundary                                                                    |
-| Shard / ShardStore     | One independent storage unit per OS thread — lazily-opened `NamespaceLog` per namespace + L1 cache                                                                                              | A partition of the keyspace: a key lives on exactly one shard, picked by `FxHash(key) % n_shards` |
-| L1 / MemCache          | In-process S3-FIFO cache that short-circuits disk reads                                                                                                                                         | Not write-through durable storage                                                                 |
-| L2 / NamespaceLog      | Persistent on-disk store; in-RAM hash index over an append-only log file; authoritative source of truth                                                                                         | Not the hot path for reads after first access                                                     |
-| Active file            | The currently-writable log file. Records are appended, fsynced, then made visible via the index                                                                                                 | Not modified in place; only appended                                                              |
-| Sealed file            | A previously-active file that has been merged through reclaim. Read-only, has a footer of live entries                                                                                          | Not deleted until reclaim runs again                                                              |
-| Ghost Set              | MemCache tracking of recently evicted keys; a ghost hit promotes the next insert directly to the Main queue                                                                                     | Not a tombstone or deletion marker                                                                |
-| Cursor `"0"`           | SCAN sentinel meaning "start from beginning" or "scan complete" — the same value signals both states                                                                                            | Not a literal zero integer                                                                        |
-| `\x01`-prefixed cursor | Single-shard continuation cursor: `b"\x01"` + last_key from the previous page                                                                                                                   | Not a user-visible value; internal to scan                                                        |
-| `\x02`-prefixed cursor | Multi-shard continuation cursor: `b"\x02"` + `[shard_idx: u8]` + per-shard inner cursor; only emitted when `n_shards > 1`                                                                       | Never produced by single-shard deployments; not a user-visible value                              |
+| Term                   | What It Controls                                                                                                                                                                                                   | NOT                                                                                                                               |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------- |
+| Namespace (`ns`)       | Which `NamespaceLog` (and therefore which on-disk directory) receives reads/writes; set by `SELECT <n>` (RESP, any non-negative integer) or `/namespaces/{ns}/` (HTTP); max 1024 open per shard                    | Not an auth or tenant boundary                                                                                                    |
+| Shard / ShardStore     | One independent storage unit per OS thread — lazily-opened `NamespaceLog` per namespace + L1 cache                                                                                                                 | A partition of the keyspace: a key lives on exactly one shard, picked by `FxHash(key) % n_shards`                                 |
+| L1 / MemCache          | In-process S3-FIFO cache that short-circuits disk reads                                                                                                                                                            | Not write-through durable storage                                                                                                 |
+| L2 / NamespaceLog      | Persistent on-disk store; ordered in-RAM index (`BTreeMap`) over an append-only log file + a blob store; authoritative source of truth                                                                             | Not the hot path for reads after first access                                                                                     |
+| Active file            | The currently-writable log file. Records are appended, then made visible via the index; fsync is deferred to the 1 s sync timer                                                                                    | Not modified in place; only appended                                                                                              |
+| Sealed file            | A previously-active file that has been merged through reclaim. Read-only, has a footer of live entries                                                                                                             | Not deleted until reclaim runs again                                                                                              |
+| Run / Level            | A run is one sealed file; its level is its size-tier. Reclaim merges `fanout` runs at level L into one run at L+1 — bounds write amplification to O(log N)                                                         | Not persisted: a restart resets every run to level 0                                                                              |
+| Write stripe (`wlock`) | One of 64 per-namespace async mutexes; a write locks `stripe[FxHash(key) % 64]` for check→append→commit. Serializes same-key writes; makes CAS/INCR atomic                                                         | Not cross-thread (shard is single-threaded); not taken by reads; not per-key (stripes are shared, collisions just over-serialize) |
+| Value separation       | A value ≥ `value_sep_threshold` (128 KiB) is stored in the content-addressed blob store; the log record carries only its 16-byte hash, so compaction never re-uploads the value                                    | Not applied to small values (they stay inline); not a per-key dedup of small data                                                 |
+| Blob                   | An immutable, content-addressed value file at `values/blob-{hash}`; refcounted, write-once, deduped across keys; at refcount 0 it is deleted by `collect_garbage` after the next fsync (deferred for crash-safety) | Not mutated in place; not moved by compaction; not deleted eagerly on unref                                                       |
+| Ghost Set              | MemCache tracking of recently evicted keys; a ghost hit promotes the next insert directly to the Main queue                                                                                                        | Not a tombstone or deletion marker                                                                                                |
+| Cursor `"0"`           | SCAN sentinel meaning "start from beginning" or "scan complete" — the same value signals both states                                                                                                               | Not a literal zero integer                                                                                                        |
+| `\x01`-prefixed cursor | Single-shard continuation cursor: `b"\x01"` + last_key from the previous page                                                                                                                                      | Not a user-visible value; internal to scan                                                                                        |
+| `\x02`-prefixed cursor | Multi-shard continuation cursor: `b"\x02"` + `[shard_idx: u8]` + per-shard inner cursor; only emitted when `n_shards > 1`                                                                                          | Never produced by single-shard deployments; not a user-visible value                                                              |
 
 ## Core Mechanism
 
@@ -182,7 +231,7 @@ The accept loop in `main.rs` peeks the first command's key on each new connectio
 
 ### Two-Level Storage
 
-Every read checks L1 first. L1 hits avoid all disk I/O. On L1 miss the engine looks up the key in the in-RAM hash index, then issues a single io_uring `read_at(record_offset, record_size)` against the file holding that record. The header carries `key_size`/`val_size`/`meta_size`, so the value and metadata are sliced out in-memory after the read completes.
+Every read checks L1 first. L1 hits avoid all disk I/O. On L1 miss the engine looks up the key in the in-RAM index (`BTreeMap`), then issues a single io_uring `read_at(record_offset, record_size)` against the file holding that record. The header carries `key_size`/`val_size`/`meta_size`, so the value and metadata are sliced out in-memory after the read completes. If the record's `VALUE_SEP` flag is set, the sliced "value" is a 16-byte hash and one additional blob read fetches the value — still O(1), since the hash came straight from the record. The blob is then re-hashed and checked against that content hash before being returned (parity with the CRC the inline path verifies on every read) — silent blob corruption or a blob/hash mismatch surfaces as an error instead of wrong data.
 
 Writes go to both levels in order: append + fsync to disk first (durable), then L1 (hot set).
 
@@ -208,26 +257,68 @@ Each namespace gets its own directory `{data_dir}/shard-{n}/{ns}/`. Files in tha
 | key bytes | value bytes | metadata bytes |
 ```
 
-CRC-64/NVME via `crc-fast` covers everything after the CRC field. `flags` carries `TOMBSTONE` (0x01), `NO_EXPIRY` (0x02), `TTL_UPDATE` (0x04). Tombstone and TTL-update records have `val_size = meta_size = 0`.
+CRC-64/NVME via `crc-fast` covers everything after the CRC field. `flags` carries `TOMBSTONE` (0x01), `NO_EXPIRY` (0x02), `TTL_UPDATE` (0x04), `VALUE_SEP` (0x08). Tombstone and TTL-update records have `val_size = meta_size = 0`. A `VALUE_SEP` record's "value bytes" are a 16-byte content hash, not the value — the value lives in the blob store (see [Value Separation](#value-separation)).
 
-**In-RAM index** (per namespace): `FxHashMap<Bytes, IndexEntry>`. `IndexEntry` is 24 bytes:
+**In-RAM index** (per namespace): `BTreeMap<Bytes, IndexEntry>` (ordered, so SCAN is a range walk). `IndexEntry` is 24 bytes:
 
 ```rust
 struct IndexEntry {
     record_offset: u64,
     record_size: u32,
-    file_id: u16,
+    file_id: u32, // u32 (not u16): file IDs are never reused, so a hot namespace
+    // must not exhaust them — u32 ≈ unbounded; still packs to 24 B
     tstamp_ms: u64, // revision — enables O(1) CAS checks without a disk read
 }
 ```
 
-Plus a TTL sidecar `FxHashMap<Bytes, u64>` so only TTL'd keys pay the extra 16-byte slot.
+Two FxHashMap sidecars, each paid only by the keys that need it: a TTL sidecar `FxHashMap<Bytes, u64>` (TTL'd keys) and a value-separation sidecar `FxHashMap<Bytes, [u8;16]>` mapping a large-value key to its blob hash (used to unref the old blob on overwrite/delete and to rebuild blob refcounts on recovery).
+
+**Sealed-file footer** (written when a file is sealed by reclaim): one entry per live key — `(key, record_offset, record_size, expires_at_ms, tstamp_ms, value_hash?)` — followed by a 24-byte trailer (body length + CRC + magic `0x4259_4F4E_445F_4B58`, "BYOND_KX" v3). The `value_hash` (present only for value-separated keys) is carried in the footer so recovery rebuilds both the index and the value-sep sidecar in O(1) without reading record bodies. On startup, recovery reads each sealed file's footer; if the magic doesn't match (older format or crash mid-seal), it falls back to a full record scan — which still repopulates the value-sep sidecar from each record's `VALUE_SEP` flag. The active file's tail is replayed record-by-record; first bad CRC truncates the active file at the last good boundary. After the index is rebuilt, blob refcounts are reconstructed by walking the value-sep sidecar (one `incr_ref` per live large-value key).
+
+**Reclaim (compaction)** is **size-tiered** — one strategy, no flag (`reclaim_inner`). Triggered two ways: `BGREWRITEAOF` (current namespace, synchronous from the client's perspective) or the auto-reclaim background task (every `KV_RECLAIM_INTERVAL_SECS`, default 300s) which reclaims any namespace whose sealed file count exceeds `KV_RECLAIM_SEALED_THRESHOLD` (default 4, 0 = disabled).
+
+A reclaim seals the active file as a fresh level-0 run, then repeatedly finds the lowest level holding ≥ `fanout` (`KV_COMPACTION_FANOUT`, default 8) runs and merges just those into one run at the next level, cascading upward (`reclaim_namespace` copies each live record's bytes verbatim into the merged file and unlinks its inputs). Each reclaim rewrites **one level, not the whole live set** — O(log N) amortized write amplification. On GlideFS this matters directly: a reclaim re-uploads one level's worth of bytes to S3, not the entire namespace.
+
+**Reclaim does not error writes.** A write that arrives during a reclaim _waits_ (`await_reclaim`, before it takes the in-flight count) and proceeds when the reclaim finishes — it never returns `ReclamationBusy` to the client (only a second _concurrent reclaim_ gets that). Before sealing, reclaim **drains in-flight writes** (waits for `in_flight_writes == 0`) so the footer it writes is a consistent snapshot — a write that appended to the active file but hadn't yet updated the index can't be missed from the footer and silently lost on a later footer recovery. `FLUSHDB` uses the same gate + drain (so a write can't race the file replacement). Trade-off: writes _stall_ for the reclaim's duration (standard LSM write-stall, bounded by level size / tunable via `rotate_threshold`·`fanout`), but they always succeed.
+
+`NamespaceLog::compaction_bytes` counts the bytes each reclaim rewrites, so write-amp is directly measurable. Level assignments live in an in-memory `RefCell<FxHashMap<u32,u8>>`; **a restart resets all runs to level 0** (levels are not yet persisted).
+
+> **Why not full-merge** (rewrite the entire live set into one file per reclaim, the classic compacting-log design)? On GlideFS, full-merge re-uploads the whole namespace to S3 on _every_ reclaim — O(live-set) each time. Measured on the real engine over 12 reclaims of a churning ~200-key set, size-tiered rewrote **4.6× fewer bytes** than full-merge would. Point reads don't pay for the extra runs: the in-RAM index resolves each key straight to `(file_id, offset)`, so a GET is one read regardless of run count. Full-merge was removed, not flag-gated.
+
+**Forks need no special handling.** A GlideFS fork is a copy-on-write volume: the child shares the parent's packs and only pays for what it writes. Because reclaim writes merged runs to _new_ offsets (never rewriting a parent's packs) and large values live in immutable blobs the child shares for free, a fork's amplification is bounded by its own divergence with zero fork-awareness in the engine — no "freeze the inherited base" step, no fork-vs-restart detection. (An earlier `freeze_inherited` design was removed: it required a fork hook that doesn't exist and pinned dead inherited data forever, defeating GC.)
+
+**FLUSHDB** unlinks-and-recreates the namespace's data files (does NOT truncate in place) so CoW sharing with the parent fork's blocks is preserved, and drops the namespace's blob store (`values/`).
+
+### Value Separation
+
+A value whose length ≥ `LogConfig::value_sep_threshold` (default **128 KiB = one GlideFS block**) is written to a content-addressed blob store at `{ns}/values/blob-{hash}` instead of inline in the log; the log record carries only the 16-byte BLAKE3-128 hash with the `VALUE_SEP` flag set. Small values stay inline. `value_store.rs` is the store; the wiring lives in `log/mod.rs` (`put_full`/`put_full_cond`/`put_many` separate on write, `read_value`/`bulk_read` deref on read).
+
+```
+SET big (256 KiB) ─► blob store: values/blob-<hash>  (262,144 bytes, write-once)
+                  └► log record: header + key + 16-byte hash  (≈100 bytes)
+GET big ─► index → record (the hash) → blob store get(hash) → value   (record read + ONE blob read; still O(1))
+```
+
+Behavior, observed on the running binary (256 KiB value):
+
+| event                        | what actually happens                                                                                                                                              |
+| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| SET large value              | 262,144 bytes written to `values/`; **log grows ~100 bytes** (the pointer)                                                                                         |
+| GET                          | index resolves the record → 16-byte hash → blob read; one extra read, value returned                                                                               |
+| identical content (any keys) | deduped to **one blob** (content-addressed, write-once)                                                                                                            |
+| BGREWRITEAOF / reclaim       | copies the ~100-byte pointer record; the blob is **never touched** — log stays tiny                                                                                |
+| overwrite / delete / expire  | the old blob is `unref`'d; at refcount 0 it is **queued**, then physically unlinked by `collect_garbage` after the next fsync (deferred — see durability ordering) |
+
+**Why:** on GlideFS, the cost that matters is _bytes moved by compaction_ — relocating a record to a new offset re-uploads it to S3 (dedup is offset-keyed). Inline, every compaction re-moves the value; a value surviving N reclaims is uploaded N+1 times. Separated, compaction only ever moves the 16-byte pointer; the value is uploaded once and reclaimed by deletion (unlink → whole blocks freed → GlideFS dead-pack GC), never by rewrite. **Measured on the real engine** (60 keys × 32 KiB, 10 churning reclaims): inline moved **22.5 MiB** of compaction bytes, value-separated moved **0.01 MiB** — 3337× less. The threshold is one block because below it a blob-per-value wastes the rest of the block (space-amp explodes); at/above it write-amp collapses to ~1×.
+
+Blob I/O is async on the shard's io_uring reactor — `monoio::fs` `read`/`write`/`remove_file`/`create_dir`/`sync_all` (the `mkdirat`/`unlinkat` features), never a blocking syscall on the hot path. Reads re-hash the blob and verify it against the content hash (integrity, see above). Blob refcounts are in-memory, rebuilt on open from the value-sep sidecar (which the footer/scan recovery repopulates); immediately after, `ValueStore::sweep_orphans` deletes any blob on disk that no live key references. The create and delete of a given content hash are serialized by a per-content **file-op lock** (16 stripes), so `collect_garbage` can never unlink a blob a same-content `put` is concurrently recreating (a by-construction guard against io_uring completion reordering).
 
-**Sealed-file footer** (written when the active file is rotated by reclaim): array of `(key, record_offset, record_size, expires_at_ms, tstamp_ms)` entries followed by a 24-byte trailer (body length + CRC + magic `0x4259_4F4E_445F_4B57`). On startup, recovery reads the footer of each sealed file in O(1) and rebuilds the index without scanning the file body. Sealed files with the older magic (`0x4259_4F4E_445F_4B56`, written before the `tstamp_ms` footer field was added) fall back to a full sequential scan of the file body, reading `tstamp_ms` from each record header — no explicit migration needed. The active file's tail (between its last hint checkpoint and EOF) is replayed record-by-record; first bad CRC truncates the active file at the last good boundary.
+**Durability ordering** — the pointer (in the log) and the value (in a blob file) live in _different_ files, so both edges of a blob's lifetime are ordered against the log's fsync:
 
-**Reclaim**: seal the current active file, walk live index entries, copy live records to a new sealed file, write its footer + fsync, atomic-rename, unlink old sealed files. A fresh active file is opened. Triggered two ways: `BGREWRITEAOF` (current namespace, synchronous from the client's perspective) or the auto-reclaim background task (every `KV_RECLAIM_INTERVAL_SECS`, default 300s) which reclaims any namespace whose sealed file count exceeds `KV_RECLAIM_SEALED_THRESHOLD` (default 4, 0 = disabled).
+- **Create before reference.** `put` makes the blob crash-durable _before_ it returns, before the caller writes the pointer record: `write_all_at` → `sync_all` (blob bytes) → fsync the `values/` directory (blob's name). Write-ahead ordering: a durable pointer can never reference a non-durable blob.
+- **Delete after the superseding record is durable.** `unref` only drops the refcount and _queues_ the blob; `collect_garbage` (run after each `sync`) physically deletes it. So the old blob of an overwrite/delete survives until the record that superseded it is durable. Were it deleted eagerly, a power loss that lost the superseding record would revert the key to its old value — whose blob would be gone (a dangling pointer). Deferring makes the revert safe.
 
-**FLUSHDB** unlinks-and-recreates the namespace's data files (does NOT truncate in place) so CoW sharing with the parent fork's blocks is preserved.
+The log itself is `appendfsync everysec` (≤1 s of writes lost on power loss), so the worst a crash does to a value-separated write is leave an **orphan blob** (durable blob, pointer lost, or queued-but-uncollected) — which `sweep_orphans` reclaims on the next open. **There is no dangling-pointer (durable pointer, missing blob) window.** This is verified exhaustively by the `crash_consistency` test module: `exhaustive_tail_truncation_is_consistent` truncates the un-fsynced tail at **every byte offset** (and includes a value-sep overwrite in the crash zone — the case the deferred-delete fix protects); `corruption_truncates_at_bad_record_keeping_prefix` does the same for single-byte bit-rot of durable records; `torn_footer_falls_back_to_scan_across_files` reclaims to a sealed+active multi-file layout, then cuts the sealed file's footer at every offset to exercise the `read_footer`→record-scan fallback (which rebuilds value-sep state from the `VALUE_SEP` flag). Each asserts a valid recovered prefix with zero dangling pointers and zero blob leaks. The harness has teeth: reintroducing the synchronous-delete bug makes it fail at the exact offset where the overwrite is lost.
 
 ### Command Parsing (`command.rs`)
 
@@ -262,13 +353,13 @@ A connection is pinned to one shard, but multi-key commands (MGET, MSET, DEL, EX
 
 ### SCAN Glob Matching
 
-Pattern matching uses a stack-based backtracking algorithm that handles `*` (any sequence) and `?` (single character). No heap allocation; runs inline during RocksDB iteration. See `store.rs:glob_match()`.
+Pattern matching uses a stack-based backtracking algorithm that handles `*` (any sequence), `?` (single character), and `[abc]` / `[a-z]` / `[^abc]` character classes. No heap allocation; runs inline during the `BTreeMap` range walk — each key is tested as the cursor advances. See `store.rs:glob_match()`.
 
 ### Watch / Subscribe
 
 Clients can subscribe to mutations on a key or a key prefix and receive a live stream of events. The mechanism is the same for both transports; only the framing differs.
 
-**Revision** — every log record's `tstamp_ms` field doubles as a revision ID. No separate counter. Revisions are monotonically increasing per-shard and are included in every `WatchEvent`, enabling resumable subscriptions.
+**Revision** — every log record's `tstamp_ms` field doubles as a revision ID. No separate counter. Revisions are monotonically increasing per-shard (a hybrid logical clock: `max(wall_clock_ms, last_revision + 1)`), so they advance even if two writes land in the same millisecond or the wall clock steps backward mid-run. On open, the clock is **seeded from the highest tstamp recovered**, so revisions stay monotonic across a restart too (a post-restart write can never be assigned a revision ≤ existing data, which would otherwise corrupt `scan_since` watch resumption). Revisions are included in every `WatchEvent`, enabling resumable subscriptions.
 
 **WatchRegistry** (`engine/src/watch.rs`) — one per `ShardStore`, owned behind `RefCell` (no locking needed; single-threaded per shard). Holds two tables:
 
@@ -280,7 +371,7 @@ After each successful `set`, `mset`, or `del`, the store calls `WatchRegistry::n
 **Initial state delivery** (`watch_subscribe`):
 
 - `since == 0` → call `NamespaceLog::current_entries` — reads the live index + fetches values from disk for matching keys. Delivers the current state snapshot immediately.
-- `since > 0` → call `NamespaceLog::scan_since` — scans all log files in `file_id` order to replay mutations with `tstamp_ms > since`. Used by clients that reconnect after a brief disconnection to catch up without missing writes.
+- `since > 0` → call `NamespaceLog::scan_since` — scans all log files in `file_id` order to replay mutations with `tstamp_ms > since`. Used by clients that reconnect after a brief disconnection to catch up without missing writes. Value-separated records are dereferenced (and integrity-checked) during the scan, so replayed events carry the real value, not the blob-hash pointer.
 
 **RESP3 transport** — `WATCH key [key ...] [SINCE <revision>]` and `PWATCH prefix [SINCE <revision>]` intercept in `handle_conn` before `dispatch()`. They require RESP3 (`HELLO 3` first). Initial events are sent as Push frames, followed by a `>2 watch ready` frame, then a live select loop:
 
@@ -318,7 +409,7 @@ GET /v1/watch?prefix=<p>&since=<r> → resumable prefix stream
 
 ### Compare-And-Swap (CAS)
 
-CAS enables optimistic concurrency control: a write succeeds only if the current revision of the key matches the caller's expected value. Because each shard is single-threaded, the check-then-write is atomic with no race window.
+CAS enables optimistic concurrency control: a write succeeds only if the current revision of the key matches the caller's expected value. The check and the write are atomic — `put_full_cond` holds the key's write stripe across check→append→commit, so no concurrent same-key write can interleave (even at the disk-I/O `.await`). A failed condition writes nothing: it checks before appending, so there is no record on disk for a CAS that returned "no" (this is what makes CAS crash-safe — a failed CAS can never resurrect after a crash).
 
 **RESP** — `SET key value REV <n>`:
 
@@ -331,13 +422,14 @@ CAS enables optimistic concurrency control: a write succeeds only if the current
 - Mismatch → `409 Conflict` + `{"error":"conflict","message":"revision mismatch"}`.
 - `GET` always returns `X-KV-Revision: <n>` so the caller can capture the revision before a CAS write.
 
-**Implementation** — `ShardStore::setrev()`:
+**Implementation** — `ShardStore::setrev()` → `NamespaceLog::put_full_cond(key, …, WriteCondition::Revision(n))`:
 
-1. `ensure_ns()` borrows the in-memory index.
-2. Reads `IndexEntry.tstamp_ms` for the key (O(1), no disk read).
-3. Expired keys are treated as absent (revision mismatch).
-4. On match: write via `put_full()`, notify watchers, return `Ok(Some(new_rev))`.
-5. On mismatch: return `Ok(None)` — no write, no disk I/O.
+1. Acquire the key's write stripe (`wlock`) — held across the whole operation.
+2. Check `IndexEntry.tstamp_ms == n` (O(1), no disk read; expired keys count as absent → mismatch).
+3. On mismatch: return `Ok(None)` immediately — **no append, no disk I/O, no record**.
+4. On match: encode + append + commit + notify watchers, return `Ok(Some(new_rev))`.
+
+Because the stripe is held from the check through the commit, no concurrent same-key write can land in between — the check is authoritative and the failed path leaves nothing on disk.
 
 `REV` is mutually exclusive with `NX`/`XX` at the protocol layer.
 
@@ -368,7 +460,7 @@ CAS enables optimistic concurrency control: a write succeeds only if the current
 
 ```
 absent ──SET──► live
-  live ──GET──► live  (freq bumped in L1; revision returned in X-KV-Revision / Entry.revision)
+  live ──GET──► live  (freq bumped in L1; revision in X-KV-Revision / Entry.revision)
   live ──DEL──► absent
   live ──expired────► absent  (lazy, on next access or L1 sweep)
   live ──PERSIST──► live (TTL cleared)
@@ -378,11 +470,30 @@ absent ──SET──► live
 absent ──CAS──────────────────► absent (mismatch; 409 / nil returned)
 ```
 
+| From   | Event              | To     | Guard                          | What Actually Happens                                                                                                                                           |
+| ------ | ------------------ | ------ | ------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| absent | SET                | live   | —                              | Record appended (fsync on next `everysec` tick); index entry inserted; L1 populated. Large value written to blob store first; record carries 16-byte hash.      |
+| live   | SET (overwrite)    | live   | —                              | New record appended; index entry replaced; L1 updated. Old blob `unref`'d if value-separated; new blob written (dedup: no write if identical content).          |
+| live   | SET NX             | live   | key present                    | No write, no disk I/O. Returns nil / 0.                                                                                                                         |
+| absent | SET NX             | live   | key absent                     | Same as SET.                                                                                                                                                    |
+| live   | SET XX             | live   | key present                    | Same as SET (overwrite).                                                                                                                                        |
+| absent | SET XX             | absent | key absent                     | No write. Returns nil / 0.                                                                                                                                      |
+| live   | DEL                | absent | —                              | Tombstone appended; index entry removed; L1 evicted. Blob `unref`'d if value-separated → unlinked at refcount 0.                                                |
+| live   | EXPIRE             | live   | —                              | TTL_UPDATE record appended; TTL sidecar updated. No value rewrite.                                                                                              |
+| live   | PERSIST            | live   | —                              | TTL_UPDATE record (NO_EXPIRY flag) appended; TTL sidecar entry removed.                                                                                         |
+| live   | GET (TTL elapsed)  | absent | `now_ms ≥ expires_at_ms`       | Tombstone appended; index + TTL sidecar cleared; L1 evicted. Blob `unref`'d. Caller receives nil.                                                               |
+| live   | CAS (rev matches)  | live   | `tstamp_ms == expected`        | Same as SET overwrite. New revision returned.                                                                                                                   |
+| live   | CAS (rev mismatch) | live   | `tstamp_ms != expected`        | No write, no disk I/O. 409 / nil returned.                                                                                                                      |
+| absent | CAS                | absent | key absent = revision mismatch | No write. 409 / nil returned.                                                                                                                                   |
+| live   | FLUSHDB            | absent | —                              | All data files unlinked and recreated; blob store directory removed; index and sidecars cleared. CoW sharing with parent fork preserved (unlink, not truncate). |
+
 ## Why It Behaves This Way
 
 ### Why each thread has its own engine instance
 
-Sharing storage across threads would require locking on the index and the active-file write offset. Per-thread instances eliminate that coordination entirely and keep the hot path lock-free. The tradeoff is that the routing layer must pin each client connection to a thread — a key read on thread 0 won't see a write made on thread 1.
+Sharing storage across threads would require cross-thread locking on the index and the active-file write offset. Per-thread instances eliminate that coordination entirely: **reads are lock-free**, and no storage state is shared or locked across threads (cross-shard fan-out passes requests over per-shard channels instead). The tradeoff is that the routing layer must pin each client connection to a thread — a key read on thread 0 won't see a write made on thread 1.
+
+Within a shard, writes take one **per-key stripe lock** (64 stripes per namespace, `wlock(key)`) for their check→append→commit. This is _not_ cross-thread (the shard is single-threaded; it's an async mutex serializing the cooperative tasks that interleave at `.await` points). Writes whose keys hash to different stripes proceed fully concurrently; writes to the same key — or to distinct keys that happen to share one of the 64 stripes — serialize. The lock exists so conditional writes (CAS/NX/XX) and read-modify-write (INCR) are atomic on disk — the holder checks before appending, so a failed condition or lost race writes nothing (no orphan record). Reads never take it.
 
 Connection routing is built into the server: `peek_resp_key` peeks the first bytes of a new TCP connection (without consuming them), extracts the key from the first command, and runs `FxHash(key) % n_shards` to pick a worker thread. The connection is then pinned to that thread for its lifetime. Multi-key commands (MGET, MSET, DEL, EXISTS) whose keys span shards are transparently fanned out via per-shard request channels (see "Cross-Shard Fan-Out") so the client sees a single response in original key order — no `CROSSSLOT` error.
 
@@ -408,26 +519,28 @@ Redis protocol defines SCAN to return "0" when iteration is complete. Reusing "0
 
 ### Why MSET is atomic (within one shard)
 
-Redis MSET is documented as atomic. Within a single shard this implementation builds one buffer containing every record, calls `write_at(buf, base_offset)` and `fsync()` once, then bulk-updates the index — all keys land or none do. The L1 cache is populated after the disk fsync; in the narrow window between the two, a cache miss correctly falls back to disk and sees all keys.
+Redis MSET is documented as atomic. Within a single shard this implementation builds one buffer containing every record and calls `write_all_at(buf, base_offset)` — a single OS write — then bulk-updates the index atomically. All keys are visible together or not at all: a crash before the next 1-second fsync loses all of them; a crash after leaves all of them. The L1 cache is populated after the write; a cache miss in the narrow window correctly falls back to disk and sees all keys.
 
 Across shards (when MSET keys span shard boundaries), atomicity is **not** preserved: each shard's subset commits independently, matching Redis Cluster's semantics.
 
 ## Configuration
 
-| CLI Flag / Env Var                                                     | Default              | What It Controls at Runtime                                                                       |
-| ---------------------------------------------------------------------- | -------------------- | ------------------------------------------------------------------------------------------------- |
-| `--data-dir` / `KV_DATA_DIR`                                           | `/var/lib/beyond/kv` | Root path for all shard directories (`{data_dir}/shard-{n}`)                                      |
-| `--resp-port` / `KV_RESP_PORT`                                         | `6379`               | TCP port each thread's RESP listener binds to                                                     |
-| `--http-address` / `KV_ADDRESS`                                        | `0.0.0.0:4869`       | Socket address each thread's HTTP listener binds to (full `ip:port`)                              |
-| `--threads` / `KV_THREADS`                                             | `num_cpus::get()`    | Number of OS threads (= number of shards)                                                         |
-| `--memory-bytes` / `KV_MEMORY_BYTES`                                   | `268435456` (256 MB) | Total L1 cache budget; divided evenly across threads                                              |
-| `--max-conns-per-shard` / `KV_MAX_CONNS_PER_SHARD`                     | `10000`              | Per-shard connection cap; connections beyond this are dropped immediately with a busy response    |
-| `--idle-timeout-secs` / `KV_IDLE_TIMEOUT_SECS`                         | `60`                 | Seconds of inactivity before a connection is closed                                               |
-| `--max-value-bytes` / `KV_MAX_VALUE_BYTES`                             | `67108864` (64 MB)   | Maximum accepted value size; larger bodies are rejected with HTTP 413 or RESP `ERR`               |
-| `--reclaim-sealed-threshold` / `KV_RECLAIM_SEALED_THRESHOLD`           | `4`                  | Auto-reclaim a namespace when its sealed file count exceeds this value; `0` disables auto-reclaim |
-| `--reclaim-interval-secs` / `KV_RECLAIM_INTERVAL_SECS`                 | `300`                | Seconds between auto-reclaim scans (ignored when threshold is 0)                                  |
-| `--readyz-sync-failure-threshold` / `KV_READYZ_SYNC_FAILURE_THRESHOLD` | `3`                  | Consecutive log-sync failures on any shard before `/readyz` returns 503                           |
-| `--log-level` / `LOG_LEVEL`                                            | `info`               | `tracing` filter level; set `ENVIRONMENT=development` for pretty-printed logs                     |
+| CLI Flag / Env Var                                                     | Default              | What It Controls at Runtime                                                                                                                                                |
+| ---------------------------------------------------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `--data-dir` / `KV_DATA_DIR`                                           | `/var/lib/beyond/kv` | Root path for all shard directories (`{data_dir}/shard-{n}`)                                                                                                               |
+| `--resp-port` / `KV_RESP_PORT`                                         | `6379`               | TCP port each thread's RESP listener binds to                                                                                                                              |
+| `--http-address` / `KV_ADDRESS`                                        | `0.0.0.0:4869`       | Socket address each thread's HTTP listener binds to (full `ip:port`)                                                                                                       |
+| `--threads` / `KV_THREADS`                                             | `num_cpus::get()`    | Number of OS threads (= number of shards)                                                                                                                                  |
+| `--memory-bytes` / `KV_MEMORY_BYTES`                                   | `268435456` (256 MB) | Total L1 cache budget; divided evenly across threads                                                                                                                       |
+| `--max-conns-per-shard` / `KV_MAX_CONNS_PER_SHARD`                     | `10000`              | Per-shard connection cap; connections beyond this are dropped immediately with a busy response                                                                             |
+| `--idle-timeout-secs` / `KV_IDLE_TIMEOUT_SECS`                         | `60`                 | Seconds of inactivity before a connection is closed                                                                                                                        |
+| `--max-value-bytes` / `KV_MAX_VALUE_BYTES`                             | `67108864` (64 MB)   | Maximum accepted value size; larger bodies are rejected with HTTP 413 or RESP `ERR`                                                                                        |
+| `--reclaim-sealed-threshold` / `KV_RECLAIM_SEALED_THRESHOLD`           | `4`                  | Auto-reclaim a namespace when its sealed file count exceeds this value; `0` disables auto-reclaim                                                                          |
+| `--reclaim-interval-secs` / `KV_RECLAIM_INTERVAL_SECS`                 | `300`                | Seconds between auto-reclaim scans (ignored when threshold is 0)                                                                                                           |
+| `KV_COMPACTION_FANOUT`                                                 | `8`                  | Size-tiered compaction: a level merges into the next once it holds this many runs (higher = less write-amp, more space-amp); values < 2 ignored                            |
+| `KV_VALUE_SEP_THRESHOLD`                                               | `131072` (128 KiB)   | Values ≥ this go to the content-addressed blob store instead of inline; one GlideFS block — below it a blob-per-value wastes space, at/above it write-amp collapses to ~1× |
+| `--readyz-sync-failure-threshold` / `KV_READYZ_SYNC_FAILURE_THRESHOLD` | `3`                  | Consecutive log-sync failures on any shard before `/readyz` returns 503                                                                                                    |
+| `--log-level` / `LOG_LEVEL`                                            | `info`               | `tracing` filter level; set `ENVIRONMENT=development` for pretty-printed logs                                                                                              |
 
 ## Observability
 
@@ -474,43 +587,50 @@ The server is designed to run inside a trusted network perimeter (the same Glide
 
 ## Failure Modes
 
-| Failure                          | What Actually Happens                                                                                                                           | Recovery                                                            |
-| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------- |
-| Thread panic                     | `panic = "abort"` — process terminates immediately; no unwinding                                                                                | External process supervisor restarts the process                    |
-| Disk write error                 | `EngineError::Io` propagated; RESP client receives `ERR` response; connection stays open                                                        | Client retries; underlying disk issue must be resolved externally   |
-| CRC mismatch on replay           | `EngineError::CrcMismatch` during recovery — active file truncates at the last good boundary, sealed-file footer falls back to scanning records | Automatic; the offending tail bytes are dropped                     |
-| Bad record header                | `EngineError::BadRecord`; treated as the truncation point during replay                                                                         | Affected tail records are lost; older records survive               |
-| RESP parse error                 | Connection closed; no response sent                                                                                                             | Client reconnects                                                   |
-| HTTP malformed request           | JSON error body `{"error": "...", "message": "..."}` with 4xx status                                                                            | Client fixes request                                                |
-| Expired key read                 | Tombstone appended, evicted from L1; `None` returned to caller                                                                                  | Transparent; client sees cache miss                                 |
-| Crash during MSET (single shard) | Single fsynced write — either all records land or the partial tail is truncated by recovery's CRC check                                         | No partial state; client can safely retry                           |
-| Crash during cross-shard MSET    | Each shard's subset is independent; some shards may have committed before the crash                                                             | Client retries; idempotent overwrites converge to the desired state |
-| Crash mid-reclaim                | Old sealed files are still authoritative; tmp file from the partial reclaim is removed on next reclaim                                          | Automatic; no data loss (no rename happened)                        |
-| L1 cache over capacity           | Eviction runs inline during insert; oldest Small-queue entries dropped first                                                                    | Automatic; no data loss (L2 is authoritative)                       |
+| Failure                                                                     | What Actually Happens                                                                                                                                                                                                                                                                                                   | Recovery                                                                                                                                                                                                                           |
+| --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Thread panic                                                                | `panic = "abort"` — process terminates immediately; no unwinding                                                                                                                                                                                                                                                        | External process supervisor restarts the process                                                                                                                                                                                   |
+| Process crash between writes and fsync                                      | Writes in the last ≤1 second (since the previous timer-fsync) are lost — they went to the OS page cache but not stable storage                                                                                                                                                                                          | Up to ~1 second of writes lost; recovery truncates the active file at the last fsynced CRC boundary                                                                                                                                |
+| Disk write error                                                            | `EngineError::Io` propagated; RESP client receives `ERR` response; connection stays open                                                                                                                                                                                                                                | Client retries; underlying disk issue must be resolved externally                                                                                                                                                                  |
+| CRC mismatch on replay                                                      | `EngineError::CrcMismatch` during recovery — active file truncates at the last good boundary, sealed-file footer falls back to scanning records                                                                                                                                                                         | Automatic; the offending tail bytes are dropped                                                                                                                                                                                    |
+| Bad record header                                                           | `EngineError::BadRecord`; treated as the truncation point during replay                                                                                                                                                                                                                                                 | Affected tail records are lost; older records survive                                                                                                                                                                              |
+| Value-separated blob corrupted (bit-rot, mismatch)                          | Read re-hashes the blob; on content-hash mismatch returns `EngineError::BadRecord` instead of the wrong bytes (parity with inline CRC)                                                                                                                                                                                  | Detected, not silent; the key reads as an error until the blob is restored/overwritten                                                                                                                                             |
+| RESP parse error                                                            | Connection closed; no response sent                                                                                                                                                                                                                                                                                     | Client reconnects                                                                                                                                                                                                                  |
+| HTTP malformed request                                                      | JSON error body `{"error": "...", "message": "..."}` with 4xx status                                                                                                                                                                                                                                                    | Client fixes request                                                                                                                                                                                                               |
+| Expired key read                                                            | Tombstone appended, evicted from L1; `None` returned to caller                                                                                                                                                                                                                                                          | Transparent; client sees cache miss                                                                                                                                                                                                |
+| Crash during MSET (single shard)                                            | All records are built into one buffer and written with a single `write_all_at` — they're atomically visible or not from the OS perspective, but are only on stable storage after the next 1s fsync. A crash before that fsync loses the whole MSET. Recovery truncates the active file at the last fsynced CRC boundary | The MSET either fully lands or is fully absent after recovery — no partial MSET state                                                                                                                                              |
+| Crash during cross-shard MSET                                               | Each shard's subset is independent; some shards may have committed before the crash                                                                                                                                                                                                                                     | Client retries; idempotent overwrites converge to the desired state                                                                                                                                                                |
+| Crash mid-reclaim                                                           | Old sealed files are still authoritative; tmp file from the partial reclaim is removed on next reclaim                                                                                                                                                                                                                  | Automatic; no data loss (no rename happened)                                                                                                                                                                                       |
+| Crash between blob write and log append                                     | The blob is written but no record references it — an **orphan blob** (wasted disk only, never data loss). Recovery doesn't index it (no footer/record points at it)                                                                                                                                                     | `ValueStore::sweep_orphans` at the next open deletes every `values/blob-*` not referenced by a live key. Proven on the binary across a SIGKILL restart                                                                             |
+| Power loss after a value-sep overwrite/delete, before its record is durable | The key reverts to its previous value (everysec: the un-fsynced overwrite is lost). The old blob is **still present** — its deletion was deferred until the superseding record's fsync, which didn't happen                                                                                                             | Reads return the old value correctly (no dangling pointer). If the superseding record _was_ durable, the old blob is instead a true orphan → reclaimed by `sweep_orphans`. Exhaustively verified by the crash-consistency tests    |
+| Concurrent same-key write races a conditional write (CAS/NX/XX), then crash | **Closed.** Conditional writes hold the key's write stripe and check the condition _before_ appending, so a failed condition writes **no record at all** — there is no optimistic orphan to resurrect. (Previously: an aborted optimistic CAS left a valid orphan record a crash could resurrect.)                      | N/A — the orphan-producing code path was removed, not guarded. Verified by `concurrency_tests::concurrent_mixed_writes_recover_to_runtime_state` (recovery reproduces runtime exactly under heavy same-key CAS/SET/DEL contention) |
+| L1 cache over capacity                                                      | Eviction runs inline during insert; oldest Small-queue entries dropped first                                                                                                                                                                                                                                            | Automatic; no data loss (L2 is authoritative)                                                                                                                                                                                      |
 
 ## File Map
 
-| File                               | What It Does                                                                                                                                                |
-| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `crates/proto/src/command.rs`      | Parses RESP arrays into `Command` enum; validates arity and option syntax                                                                                   |
-| `crates/proto/src/response.rs`     | Builds RESP values (ok, nil, bulk, error, array, hello reply, scan reply)                                                                                   |
-| `crates/proto/src/error.rs`        | Protocol-level error variants returned to clients                                                                                                           |
-| `crates/engine/src/store.rs`       | `ShardStore`: all storage operations; coordinates L1 + L2; expiry logic; SCAN; bulk MGET                                                                    |
-| `crates/engine/src/cache.rs`       | `MemCache`: S3-FIFO in-memory cache; eviction; ghost set; memory accounting                                                                                 |
-| `crates/engine/src/types.rs`       | `Entry`, `SetOptions`, `TtlResult`, `ScanPage`                                                                                                              |
-| `crates/engine/src/error.rs`       | Storage-level errors (I/O, CRC mismatch, bad record, invalid namespace, metadata JSON)                                                                      |
-| `crates/engine/src/log/mod.rs`     | `NamespaceLog`: index + active + sealed files; put_full / put_many / tombstone / ttl_update / bulk_read / flush / reclaim                                   |
-| `crates/engine/src/log/file.rs`    | `LogFile`: monoio io_uring file wrapper; append, read_at, write_footer, read_footer                                                                         |
-| `crates/engine/src/log/record.rs`  | Record encoding/decoding; CRC-64/NVME via `crc-fast`; flag bits                                                                                             |
-| `crates/engine/src/log/index.rs`   | `NsIndex`: hashmap + TTL sidecar + bucket-cursor SCAN                                                                                                       |
-| `crates/engine/src/log/recover.rs` | Startup: parse sealed-file footers; clean-shutdown active file has a footer (fast path), crash falls back to CRC-truncating replay                          |
-| `crates/engine/src/log/reclaim.rs` | Threshold-triggered merge of sealed files into a new sealed file; also exposed as `BGREWRITEAOF`                                                            |
-| `crates/server/src/main.rs`        | Thread spawning; per-thread Monoio runtime + ShardStore initialization                                                                                      |
-| `crates/server/src/config.rs`      | CLI arg + env var parsing into `Config`                                                                                                                     |
-| `crates/server/src/dispatch.rs`    | Maps `Command` → `ShardStore` calls → RESP response; `ConnState`; cross-shard fan-out for MGET/MSET/DEL/EXISTS                                              |
-| `crates/server/src/cross_shard.rs` | `CrossShardRequest` enum (MGet, MSet, Del, Set, Incr, DelRev, SetNx, SetXx, SetRev, GetDel, …) + per-shard receiver loop; `futures_channel::mpsc` transport |
-| `crates/engine/src/watch.rs`       | `WatchEvent`, `KeyFilter`, `WatchRegistry` — per-shard subscription registry; dead-sender lazy pruning                                                      |
-| `crates/server/src/resp.rs`        | TCP accept loop; RESP framing; connection state machine; `WATCH`/`PWATCH` streaming (RESP3 only)                                                            |
-| `crates/server/src/http.rs`        | HTTP route handlers; header/query param extraction; JSON error responses; SSE watch endpoint; batch endpoint                                                |
-| `crates/server/src/routing.rs`     | `peek_resp_key` / `peek_http_key` — peek first bytes of a new connection to extract routing key; `shard_for_key` (FxHash); percent-decode for HTTP paths    |
-| `crates/server/src/metrics.rs`     | Prometheus metric definitions (`MetricsInner` / `Metrics`); `encode()` flushes atomic cache counters into registered `CounterVec` before gathering          |
+| File                               | What It Does                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `crates/proto/src/command.rs`      | Parses RESP arrays into `Command` enum; validates arity and option syntax                                                                                                                                                                                                                                                                                                                                                                                         |
+| `crates/proto/src/response.rs`     | Builds RESP values (ok, nil, bulk, error, array, hello reply, scan reply)                                                                                                                                                                                                                                                                                                                                                                                         |
+| `crates/proto/src/error.rs`        | Protocol-level error variants returned to clients                                                                                                                                                                                                                                                                                                                                                                                                                 |
+| `crates/engine/src/store.rs`       | `ShardStore`: all storage operations; coordinates L1 + L2; expiry logic; SCAN; bulk MGET                                                                                                                                                                                                                                                                                                                                                                          |
+| `crates/engine/src/cache.rs`       | `MemCache`: S3-FIFO in-memory cache; eviction; ghost set; memory accounting                                                                                                                                                                                                                                                                                                                                                                                       |
+| `crates/engine/src/types.rs`       | `Entry`, `SetOptions`, `TtlResult`, `ScanPage`                                                                                                                                                                                                                                                                                                                                                                                                                    |
+| `crates/engine/src/error.rs`       | Storage-level errors (I/O, CRC mismatch, bad record, invalid namespace, metadata JSON)                                                                                                                                                                                                                                                                                                                                                                            |
+| `crates/engine/src/log/mod.rs`     | `NamespaceLog`: index + active + sealed files + blob store; put_full / put_many / tombstone / ttl_update / bulk_read / flush; `reclaim` → `reclaim_inner` (size-tiered); value separation on write (`maybe_separate`/`apply_valsep_insert`) and deref on read; `compaction_bytes`                                                                                                                                                                                 |
+| `crates/engine/src/value_store.rs` | `ValueStore`: content-addressed blob store (`values/blob-{hash}`), all I/O async via `monoio::fs` (io_uring); `put` (write-once + dedup, fsync data+dir before returning), `get`, `unref` (refcount-- + queue), `collect_garbage` (delete queued blobs after fsync), `incr_ref` (recovery), `sweep_orphans` (reclaim crash-orphaned blobs at open), `clear` (FLUSHDB); per-content `flock` stripes serialize create/delete; callers re-hash on read for integrity |
+| `crates/engine/src/log/config.rs`  | `LogConfig`: `rotate_threshold`, `fanout` (KV_COMPACTION_FANOUT), `value_sep_threshold`                                                                                                                                                                                                                                                                                                                                                                           |
+| `crates/engine/src/log/file.rs`    | `LogFile`: monoio io_uring file wrapper; append, read_at; `FooterEntry` (+ `value_hash`) encode/decode + footer magic v3                                                                                                                                                                                                                                                                                                                                          |
+| `crates/engine/src/log/record.rs`  | Record encoding/decoding; CRC-64/NVME via `crc-fast`; flag bits                                                                                                                                                                                                                                                                                                                                                                                                   |
+| `crates/engine/src/log/index.rs`   | `NsIndex`: `BTreeMap` + TTL sidecar + value-sep hash sidecar + range-cursor SCAN                                                                                                                                                                                                                                                                                                                                                                                  |
+| `crates/engine/src/log/recover.rs` | Startup: parse sealed-file footers (incl. `value_hash`); clean-shutdown active file has a footer (fast path), crash falls back to CRC-truncating replay; repopulates the value-sep sidecar                                                                                                                                                                                                                                                                        |
+| `crates/engine/src/log/reclaim.rs` | `reclaim_namespace`: merge a set of sealed files into one new sealed file, unlink inputs (called once per level by size-tiered reclaim); also exposed as `BGREWRITEAOF`                                                                                                                                                                                                                                                                                           |
+| `crates/server/src/main.rs`        | Thread spawning; per-thread Monoio runtime + ShardStore initialization                                                                                                                                                                                                                                                                                                                                                                                            |
+| `crates/server/src/config.rs`      | CLI arg + env var parsing into `Config`                                                                                                                                                                                                                                                                                                                                                                                                                           |
+| `crates/server/src/dispatch.rs`    | Maps `Command` → `ShardStore` calls → RESP response; `ConnState`; cross-shard fan-out for MGET/MSET/DEL/EXISTS                                                                                                                                                                                                                                                                                                                                                    |
+| `crates/server/src/cross_shard.rs` | `CrossShardRequest` enum (MGet, MSet, Del, Set, Incr, DelRev, SetNx, SetXx, SetRev, GetDel, …) + per-shard receiver loop; `futures_channel::mpsc` transport                                                                                                                                                                                                                                                                                                       |
+| `crates/engine/src/watch.rs`       | `WatchEvent`, `KeyFilter`, `WatchRegistry` — per-shard subscription registry; dead-sender lazy pruning                                                                                                                                                                                                                                                                                                                                                            |
+| `crates/server/src/resp.rs`        | TCP accept loop; RESP framing; connection state machine; `WATCH`/`PWATCH` streaming (RESP3 only)                                                                                                                                                                                                                                                                                                                                                                  |
+| `crates/server/src/http.rs`        | HTTP route handlers; header/query param extraction; JSON error responses; SSE watch endpoint; batch endpoint                                                                                                                                                                                                                                                                                                                                                      |
+| `crates/server/src/routing.rs`     | `peek_resp_key` / `peek_http_key` — peek first bytes of a new connection to extract routing key; `shard_for_key` (FxHash); percent-decode for HTTP paths                                                                                                                                                                                                                                                                                                          |
+| `crates/server/src/metrics.rs`     | Prometheus metric definitions (`MetricsInner` / `Metrics`); `encode()` flushes atomic cache counters into registered `CounterVec` before gathering                                                                                                                                                                                                                                                                                                                |
diff --git a/Cargo.lock b/Cargo.lock
index 4081369..a7aca4f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -82,6 +82,18 @@ dependencies = [
  "rustversion",
 ]
 
+[[package]]
+name = "arrayref"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
+
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
 [[package]]
 name = "asn1-rs"
 version = "0.7.1"
@@ -282,11 +294,13 @@ dependencies = [
 name = "beyond-kv-engine"
 version = "0.1.0"
 dependencies = [
+ "blake3",
  "bytes",
  "crc-fast",
  "divan",
  "futures-channel",
  "futures-util",
+ "libc",
  "memchr",
  "monoio",
  "rustc-hash",
@@ -342,6 +356,20 @@ version = "2.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
 
+[[package]]
+name = "blake3"
+version = "1.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
+dependencies = [
+ "arrayref",
+ "arrayvec",
+ "cc",
+ "cfg-if",
+ "constant_time_eq",
+ "cpufeatures",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.20.2"
@@ -475,6 +503,12 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af"
 
+[[package]]
+name = "constant_time_eq"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
+
 [[package]]
 name = "core-foundation"
 version = "0.10.1"
@@ -491,6 +525,15 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
+[[package]]
+name = "cpufeatures"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "crc-fast"
 version = "1.10.0"
diff --git a/Cargo.toml b/Cargo.toml
index 446403f..c6d5a1a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,7 @@ futures-util = { version = "0.3", features = ["sink"] }
 http = "1.0"
 memchr = "2"
 mimalloc = { version = "0.1", features = ["v2"] }
-monoio = { version = "0.2", features = ["io-uring", "unlinkat", "renameat", "sync"] }
+monoio = { version = "0.2", features = ["io-uring", "unlinkat", "renameat", "mkdirat", "sync"] }
 monoio-codec = "0.3"
 monoio-http = "0.3"
 monoio-rustls = "0.4"
diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml
index e68eac4..e076ade 100644
--- a/crates/engine/Cargo.toml
+++ b/crates/engine/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true
 rust-version.workspace = true
 
 [dependencies]
+blake3 = "1"
 bytes.workspace = true
 crc-fast.workspace = true
 futures-channel.workspace = true
@@ -20,6 +21,7 @@ tracing.workspace = true
 
 [dev-dependencies]
 divan = "0.1"
+libc = "0.2"
 tempfile = "3"
 
 [[bench]]
diff --git a/crates/engine/src/lib.rs b/crates/engine/src/lib.rs
index 487a2f8..bbdf270 100644
--- a/crates/engine/src/lib.rs
+++ b/crates/engine/src/lib.rs
@@ -3,4 +3,5 @@ pub mod error;
 pub mod log;
 pub mod store;
 pub mod types;
+pub mod value_store;
 pub mod watch;
diff --git a/crates/engine/src/log/ARCHITECTURE.md b/crates/engine/src/log/ARCHITECTURE.md
index a953993..bda1f45 100644
--- a/crates/engine/src/log/ARCHITECTURE.md
+++ b/crates/engine/src/log/ARCHITECTURE.md
@@ -56,24 +56,33 @@ futures::join_all([read_exact(), read_exact(), ...])  — concurrent io_uring op
 [Option<Bytes>, ...]
 ```
 
-### Reclaim (compaction)
+### Reclaim (size-tiered compaction)
+
+Reclaim is size-tiered, not full-merge: each merge rewrites only one level's
+runs, so write amplification is ~O(log N) instead of O(reclaims × live-set), and
+on GlideFS a reclaim re-uploads one level rather than the whole namespace.
 
 ```
-reclaim_namespace()
-  │
-  ├─ 1. Seal active file — write footer (per-key metadata + CRC64 + magic)
-  │
-  ├─ 2. Read all live index entries → read records from sealed files
-  │
-  ├─ 3. Write live records to data-{next_id}.log.tmp
+NamespaceLog::reclaim()
   │
-  ├─ 4. rename() .tmp → .log   (atomic)
+  ├─ 1. Seal active file — write footer — and insert it as a fresh level-0 run
   │
-  ├─ 5. Drop old sealed files  (unlink; logs failures but does not error)
+  ├─ 2. Cascade: while some level L holds >= `fanout` runs:
+  │       │
+  │       ├─ collect that level's live records (index entries with those file_ids)
+  │       ├─ reclaim_namespace(): read them concurrently, write one merged file
+  │       │     to data-{next_id}.log.tmp, footer + fsync, rename .tmp → .log,
+  │       │     fsync dir, unlink the input files (leak-logged, never errors)
+  │       ├─ open_ro the merged file FIRST (only fallible step), THEN swap index
+  │       │     + sealed map atomically — a failed open leaves state consistent
+  │       └─ tag the merged run at level L+1
   │
-  └─ 6. Open fresh active LogFile → return ReclaimReport
+  └─ 3. Open a fresh active LogFile → return ReclaimReport
 ```
 
+`fanout` (default 8) is the per-level run count that triggers a merge. Levels
+are in-memory only (`level: file_id → u8`); recovered runs start at level 0.
+
 ### Recovery (startup)
 
 ```
@@ -91,27 +100,33 @@ open_namespace(dir, config)
   │     footer absent  (crash)          → replay records from offset 0,
   │                                       truncate at first bad CRC
   │
-  └─ apply in order:
-       full record  → NsIndex::insert()
-       tombstone    → NsIndex::remove()
-       ttl_update   → NsIndex::update_ttl() (only if key still present)
+  ├─ apply in order:
+  │    full record  → NsIndex::insert() (+ value-sep sidecar if VALUE_SEP)
+  │    tombstone    → NsIndex::remove()
+  │    ttl_update   → NsIndex::set_ttl() (only if key still present)
+  │
+  └─ NamespaceLog::open() post-steps:
+       rebuild blob refcounts (one incr_ref per live value-separated key)
+       sweep_orphans() — unlink any blob no live key references (crash leftover)
+       seed the revision clock from the highest recovered tstamp_ms
 ```
 
 ## Concepts & Terminology
 
-| Term           | What It Controls                                                                                          | NOT                                                                              |
-| -------------- | --------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- |
-| `NamespaceLog` | All reads/writes for one key-space; owns the index and file set                                           | Not a shard — multiple namespaces can live in one shard                          |
-| `LogFile`      | One `data-{id}.log` file; tracks write offset, exposes positioned I/O                                     | Not a WAL segment; the log IS the store                                          |
-| `active` file  | The only writable file at any time; receives all new appends                                              | Not memory-mapped; accessed via io_uring                                         |
-| `sealed` files | Immutable; readable only; eligible for reclaim                                                            | Not deleted until reclaim completes the rename                                   |
-| Footer         | Per-key metadata block at the end of a file; enables fast recovery                                        | Written to the active file on clean shutdown; absence means crash or in-progress |
-| Tombstone      | A record with the `TOMBSTONE` flag; marks a key as deleted in the log                                     | Not a physical delete — the old record remains until reclaim                     |
-| TTL-update     | A tiny record with the `TTL_UPDATE` flag; updates expiry with no value copy                               | Not authoritative until replayed against the index                               |
-| `NsIndex`      | In-memory `FxHashMap` from key → `IndexEntry`; the read path                                              | Not persisted — rebuilt from log on every open                                   |
-| `IndexEntry`   | 16-byte struct: file_id + record_offset + record_size + flags                                             | Does not hold the value or the key                                               |
-| Reclaim        | GC: rewrites live keys into one new file; auto-triggered by sealed-file count threshold or `BGREWRITEAOF` | Caller must serialize with writes; cannot run concurrently with appends          |
-| `flush()`      | Unlinks and recreates all files (CoW snapshot invalidation)                                               | Not fsync — this destroys all data in the namespace                              |
+| Term           | What It Controls                                                                                          | NOT                                                                                 |
+| -------------- | --------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- |
+| `NamespaceLog` | All reads/writes for one key-space; owns the index and file set                                           | Not a shard — multiple namespaces can live in one shard                             |
+| `LogFile`      | One `data-{id}.log` file; tracks write offset, exposes positioned I/O                                     | Not a WAL segment; the log IS the store                                             |
+| `active` file  | The only writable file at any time; receives all new appends                                              | Not memory-mapped; accessed via io_uring                                            |
+| `sealed` files | Immutable; readable only; eligible for reclaim                                                            | Not deleted until reclaim completes the rename                                      |
+| Footer         | Per-key metadata block at the end of a file; enables fast recovery                                        | Written to the active file on clean shutdown; absence means crash or in-progress    |
+| Tombstone      | A record with the `TOMBSTONE` flag; marks a key as deleted in the log                                     | Not a physical delete — the old record remains until reclaim                        |
+| TTL-update     | A tiny record with the `TTL_UPDATE` flag; updates expiry with no value copy                               | Not authoritative until replayed against the index                                  |
+| `NsIndex`      | In-memory key → `IndexEntry` map (`BTreeMap`, ordered for SCAN) + TTL sidecar + value-sep sidecar         | Not persisted — rebuilt from log on every open                                      |
+| `IndexEntry`   | 24-byte struct: record_offset (u64) + record_size (u32) + file_id (u32) + tstamp_ms (u64)                 | Does not hold the value, the key, or flags; `tstamp_ms` doubles as the CAS revision |
+| `ValueStore`   | Content-addressed blob store for large (value-separated) values; refcounted, deduped, deferred-GC         | Not in the log — compaction moves only the 16-byte pointer, never the blob          |
+| Reclaim        | GC: merges one level's live keys into one new file; auto-triggered by sealed-file count or `BGREWRITEAOF` | Caller must serialize with writes; cannot run concurrently with appends             |
+| `flush()`      | Unlinks and recreates all files (CoW snapshot invalidation)                                               | Not fsync — this destroys all data in the namespace                                 |
 
 ## Core Mechanisms
 
@@ -127,7 +142,7 @@ Every record on disk is self-describing:
 Byte range   Field           Notes
 0..8         crc64-nvme      covers bytes 8..end of record
 8..16        tstamp_ms       monotonic; used for tie-breaking on recovery
-16           flags           TOMBSTONE=0x01 | NO_EXPIRY=0x02 | TTL_UPDATE=0x04
+16           flags           TOMBSTONE=0x01 | NO_EXPIRY=0x02 | TTL_UPDATE=0x04 | VALUE_SEP=0x08
 17..25       expires_at_ms   0 when NO_EXPIRY flag set
 25..29       key_size        u32
 29..33       val_size        u32
@@ -135,17 +150,34 @@ Byte range   Field           Notes
 37..         key || val || meta
 ```
 
-The CRC covers the entire record body. Any byte-level corruption causes the record to be skipped on recovery (active file is truncated to the last clean record).
+`HEADER_LEN = 37`. When `VALUE_SEP` is set the `val` field is not the value but
+a 16-byte BLAKE3-128 content hash pointing into the blob store (see Value
+Separation below); `val_size == 16` in that case.
+
+The CRC covers the entire record body. A record that fails its CRC is never
+applied, and replay does not skip past it: on recovery the active file is
+truncated to the last clean record, and on the watch catch-up path
+(`scan_since` / `scan_file_records`) the scan of that file stops at the first
+bad CRC rather than streaming a corrupt event.
 
 ### Sealed file footer (`file.rs`)
 
-When a file is sealed (by reclaim or a future rotation), a footer is appended:
+When a file is sealed (by reclaim, rotation, or clean-shutdown seal), a footer is appended:
 
 ```
-[ IndexEntry × N ][ entry_count: u64 ][ crc64: u64 ][ magic: u64 = 0x4259_4F4E_445F_4B56 ]
+[ FooterEntry × N ][ footer_body_len: u64 ][ crc64: u64 ][ magic: u64 = 0x4259_4F4E_445F_4B58 ]
 ```
 
-The magic value (`BYOND_KV` in ASCII) lets recovery distinguish a cleanly sealed file from a crashed active file. If the footer is present and CRC-valid, recovery uses it to populate the index without scanning the full file body.
+Each `FooterEntry` carries `key`, `record_offset`, `record_size`,
+`expires_at_ms` (optional), `tstamp_ms`, and the optional 16-byte value-sep hash
+— enough to rebuild the index, the TTL sidecar, and the blob refcounts without
+reading record bodies. The 24-byte trailer is `footer_body_len`, the body CRC,
+and the magic.
+
+The magic value (`BYOND_KX` in ASCII — the `X` marks the v3 format that added
+the per-entry tstamp and value-sep hash) lets recovery distinguish a cleanly
+sealed file from a crashed active file. If the footer is present and CRC-valid,
+recovery uses it to populate the index without scanning the full file body.
 
 ### In-memory index and TTL sidecar (`index.rs`)
 
@@ -157,6 +189,30 @@ The magic value (`BYOND_KV` in ASCII) lets recovery distinguish a cleanly sealed
 
 The compaction rename (`data-{id}.log.tmp` → `data-{id}.log`) is the only atomic step. If the process crashes before the rename, the `.tmp` file is abandoned and recovery ignores it. If the crash happens after the rename but before old files are unlinked, the old sealed files remain; the next reclaim will skip them because the index no longer references their entries. Dead files produce a log warning, not an error.
 
+### Value separation (`value_store.rs`)
+
+Values `>= config.value_sep_threshold` (default 128 KiB = one GlideFS block) are
+written WiscKey-style to a content-addressed blob store at `{dir}/values/`
+instead of inline in the log. The log record then carries only a 16-byte
+BLAKE3-128 content hash (the `VALUE_SEP` flag marks this). Because the pointer is
+tiny and immutable, compaction relocates pointers, never large values —
+collapsing large-value write amplification.
+
+Blobs are:
+
+- **Deduped** — identical content across keys/forks/tenants maps to one blob.
+- **Refcounted** — refcounts are in-memory, rebuilt from the live index on open.
+- **Write-once + crash-durable** — the blob's data AND its directory entry are
+  fsynced before the pointer record can become durable, so a crash can at worst
+  leave an orphan blob (reclaimed by `sweep_orphans`), never a dangling pointer.
+- **Deferred-GC** — when the last reference drops, the blob is queued and only
+  physically unlinked after the next log fsync (`collect_garbage`), so a
+  power-loss revert always finds its blob still present. A same-content `put`
+  racing the unlink is serialized by a per-hash file stripe.
+
+On read, the blob is fetched by hash and re-hashed to verify integrity — parity
+with the CRC the inline path pays on every read.
+
 ## State Machine
 
 ```
@@ -201,20 +257,28 @@ The engine runs on a single-threaded `monoio` runtime per shard. There is no cro
 
 ## Failure Modes
 
-| Failure                         | What Actually Happens                      | Recovery                                                                                          |
-| ------------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------- |
-| Crash mid-append (active file)  | Partial record at tail of active file      | Recovery replays records; stops and truncates at first bad CRC                                    |
-| Crash mid-reclaim before rename | `.tmp` file left on disk                   | Ignored on next open (no `.log` suffix); old sealed files intact                                  |
-| Crash mid-reclaim after rename  | Old sealed files not unlinked              | Next reclaim drops them; logged as warnings                                                       |
-| Sealed file footer corrupt      | Footer CRC check fails                     | Falls back to full sequential record scan                                                         |
-| Read from expired key           | Returns `None`; tombstone appended lazily  | Tombstone write is best-effort; a crash before it completes means the key re-expires on next read |
-| `flush()` called accidentally   | All namespace files unlinked and recreated | Data is gone; no recovery — `flush()` is a destructive reset                                      |
-| Clean shutdown (SIGTERM/SIGINT) | Footer written to active file before exit  | Next startup treats it as sealed; no record replay needed                                         |
+| Failure                                       | What Actually Happens                       | Recovery                                                                                                        |
+| --------------------------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------- |
+| Crash mid-append (active file)                | Partial record at tail of active file       | Recovery replays records; stops and truncates at first bad CRC                                                  |
+| Crash mid-reclaim before rename               | `.tmp` file left on disk                    | Ignored on next open (no `.log` suffix); old sealed files intact                                                |
+| Crash mid-reclaim after rename                | Old sealed files not unlinked               | Next reclaim drops them; logged as warnings                                                                     |
+| Sealed file footer corrupt                    | Footer CRC check fails                      | Falls back to full sequential record scan                                                                       |
+| Read from expired key                         | Returns `None`; tombstone appended lazily   | Tombstone write is best-effort; a crash before it completes means the key re-expires on next read               |
+| `flush()` called accidentally                 | All namespace files unlinked and recreated  | Data is gone; no recovery — `flush()` is a destructive reset                                                    |
+| Clean shutdown (SIGTERM/SIGINT)               | Footer written to active file before exit   | Next startup treats it as sealed; no record replay needed                                                       |
+| Crash after blob write, before pointer record | Orphan blob on disk, no referencing key     | `sweep_orphans` unlinks it on next open (refcounts rebuilt from the live index first)                           |
+| Corrupt record on watch replay                | CRC mismatch in `scan_file_records`         | Scan of that file stops at the bad record; no bogus event is streamed                                           |
+| `open_ro` of merged file fails mid-reclaim    | Merged file on disk, in-memory swap aborted | Index/sealed left untouched; old (unlinked-but-open) fds keep serving reads until restart finds the merged file |
 
 ## Configuration
 
 `LogConfig` (`config.rs`):
 
-| Field           | Default      | What It Controls                                                                        |
-| --------------- | ------------ | --------------------------------------------------------------------------------------- |
-| `max_file_size` | (caller-set) | Byte threshold at which the active file is rotated to sealed and a new active is opened |
+| Field                 | Default | What It Controls                                                                              |
+| --------------------- | ------- | --------------------------------------------------------------------------------------------- |
+| `rotate_threshold`    | 1 GiB   | Byte threshold at which the active file is sealed and a fresh active is opened                |
+| `fanout`              | 8       | Size-tiered compaction fanout: a level merges into the next once it holds this many runs      |
+| `value_sep_threshold` | 128 KiB | Values `>=` this go to the content-addressed blob store instead of inline (one GlideFS block) |
+
+`KV_COMPACTION_FANOUT` and `KV_VALUE_SEP_THRESHOLD` env vars override `fanout`
+and `value_sep_threshold` at `ShardStore::open` (fanout is clamped to `>= 2`).
diff --git a/crates/engine/src/log/config.rs b/crates/engine/src/log/config.rs
index c88bb57..c478f29 100644
--- a/crates/engine/src/log/config.rs
+++ b/crates/engine/src/log/config.rs
@@ -5,12 +5,23 @@ pub struct LogConfig {
     /// threshold, call `NamespaceLog::rotate_active()` to seal the active file and
     /// open a fresh one. Rotation is operator-controlled and NOT automatic.
     pub rotate_threshold: u64,
+    /// Size-tiered compaction fanout: a level is merged into the next once it
+    /// holds this many runs. Higher = less write-amp, more space-amp. Default 8
+    /// (the measured knee).
+    pub fanout: usize,
+    /// Value-separation threshold in bytes. Values >= this are stored in the
+    /// content-addressed blob store instead of inline in the log, so compaction
+    /// never re-uploads them. Default 128 KiB = one GlideFS block: below a block,
+    /// a blob-per-value wastes space; at/above it, separation collapses write-amp.
+    pub value_sep_threshold: usize,
 }
 
 impl Default for LogConfig {
     fn default() -> Self {
         Self {
             rotate_threshold: 1 << 30, // 1 GiB
+            fanout: 8,
+            value_sep_threshold: 128 * 1024, // 128 KiB = one GlideFS block
         }
     }
 }
diff --git a/crates/engine/src/log/file.rs b/crates/engine/src/log/file.rs
index dc9b0fd..cde2fcb 100644
--- a/crates/engine/src/log/file.rs
+++ b/crates/engine/src/log/file.rs
@@ -97,6 +97,10 @@ impl Deref for BufGuard {
 
 impl BufGuard {
     pub(crate) fn into_inner(mut self) -> Vec<u8> {
+        // SAFETY: `take` moves the inner Vec out of the ManuallyDrop exactly
+        // once. The immediately-following `mem::forget(self)` prevents `Drop`
+        // from running and taking it a second time, so the single-take
+        // invariant holds across both code paths (this and `drop`).
         let buf = unsafe { ManuallyDrop::take(&mut self.0) };
         std::mem::forget(self);
         buf
@@ -105,6 +109,9 @@ impl BufGuard {
 
 impl Drop for BufGuard {
     fn drop(&mut self) {
+        // SAFETY: `Drop::drop` runs at most once per value, and `into_inner`
+        // is the only other consumer — it `mem::forget`s the guard so this
+        // `drop` cannot run after it. Thus the inner Vec is taken exactly once.
         let buf = unsafe { ManuallyDrop::take(&mut self.0) };
         pool_release(buf);
     }
@@ -113,7 +120,7 @@ impl Drop for BufGuard {
 /// Magic at the very end of every sealed file. Lets recovery distinguish
 /// "sealed cleanly" from "active or crashed mid-seal" without scanning.
 /// v2: includes tstamp_ms per entry for O(1) CAS revision checks.
-pub const FOOTER_MAGIC: u64 = 0x4259_4F4E_445F_4B57; // "BYOND_KW" (v2)
+pub const FOOTER_MAGIC: u64 = 0x4259_4F4E_445F_4B58; // "BYOND_KX" (v3: + value-sep hash)
 /// Footer trailer size: footer_body_len (8) + footer_crc (8) + magic (8).
 pub const FOOTER_TRAILER_LEN: u64 = 24;
 
@@ -122,6 +129,7 @@ pub const FOOTER_TRAILER_LEN: u64 = 24;
 /// Wire layout (little-endian):
 ///   [key_size: u32][record_offset: u64][record_size: u32]
 ///   [expires_at_ms: u64 (0 if absent)][has_expiry: u8][tstamp_ms: u64]
+///   [has_valsep: u8][value_hash: 16 bytes (only if has_valsep)]
 ///   [key bytes]
 #[derive(Debug, Clone)]
 pub struct FooterEntry {
@@ -130,11 +138,15 @@ pub struct FooterEntry {
     pub record_size: u32,
     pub expires_at_ms: Option<u64>,
     pub tstamp_ms: u64,
+    /// Content hash if this key's value is value-separated (lives in the blob
+    /// store). Carried in the footer so recovery rebuilds the value-sep sidecar
+    /// and blob refcounts without reading record bodies.
+    pub value_hash: Option<[u8; 16]>,
 }
 
 impl FooterEntry {
     fn encoded_size(&self) -> usize {
-        4 + 8 + 4 + 8 + 1 + 8 + self.key.len()
+        4 + 8 + 4 + 8 + 1 + 8 + 1 + if self.value_hash.is_some() { 16 } else { 0 } + self.key.len()
     }
 
     fn encode_into(&self, buf: &mut Vec<u8>) {
@@ -148,11 +160,20 @@ impl FooterEntry {
         buf.extend_from_slice(&ms.to_le_bytes());
         buf.push(has_expiry);
         buf.extend_from_slice(&self.tstamp_ms.to_le_bytes());
+        match self.value_hash {
+            Some(h) => {
+                buf.push(1u8);
+                buf.extend_from_slice(&h);
+            }
+            None => buf.push(0u8),
+        }
         buf.extend_from_slice(&self.key);
     }
 
     fn parse(buf: &[u8]) -> Option<(Self, usize)> {
-        if buf.len() < 33 {
+        // Fixed prefix: key_size(4)+offset(8)+size(4)+expires(8)+has_expiry(1)
+        //               +tstamp(8)+has_valsep(1) = 34 bytes.
+        if buf.len() < 34 {
             return None;
         }
         let key_size = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]) as usize;
@@ -167,11 +188,25 @@ impl FooterEntry {
         let tstamp_ms = u64::from_le_bytes([
             buf[25], buf[26], buf[27], buf[28], buf[29], buf[30], buf[31], buf[32],
         ]);
-        let total = 33 + key_size;
+        let has_valsep = buf[33];
+        let mut cursor = 34usize;
+        let value_hash = if has_valsep != 0 {
+            let end = cursor + 16;
+            if buf.len() < end {
+                return None;
+            }
+            let mut h = [0u8; 16];
+            h.copy_from_slice(&buf[cursor..end]);
+            cursor = end;
+            Some(h)
+        } else {
+            None
+        };
+        let total = cursor + key_size;
         if buf.len() < total {
             return None;
         }
-        let key = bytes::Bytes::copy_from_slice(&buf[33..total]);
+        let key = bytes::Bytes::copy_from_slice(&buf[cursor..total]);
         Some((
             Self {
                 key,
@@ -183,17 +218,35 @@ impl FooterEntry {
                     None
                 },
                 tstamp_ms,
+                value_hash,
             },
             total,
         ))
     }
 }
 
-pub fn data_filename(file_id: u16) -> String {
+pub fn data_filename(file_id: u32) -> String {
     format!("data-{:010}.log", file_id)
 }
 
-pub fn reclaim_tmp_filename(file_id: u16) -> String {
+/// fsync a directory so that newly-created (or renamed) entries inside it are
+/// durable. A file's own `fsync` flushes its data + inode, but POSIX does not
+/// guarantee the *directory entry* (the name → inode link) is durable until the
+/// directory itself is fsynced. Without this, a power loss could leave a freshly
+/// created data file's bytes on disk while its name is lost — making records that
+/// were already fsynced unreachable, violating the `appendfsync everysec`
+/// contract. Called at every new-file creation / rename site (rare paths:
+/// rotate, reclaim, flush, startup), never on the per-write hot path.
+/// Best-effort: opening a directory read-only and fsyncing it is the portable
+/// way; on filesystems that reject it the link is still durable via journaling.
+pub(crate) async fn sync_dir(dir: &Path) {
+    if let Ok(d) = OpenOptions::new().read(true).open(dir).await {
+        let _ = d.sync_all().await;
+        let _ = d.close().await;
+    }
+}
+
+pub fn reclaim_tmp_filename(file_id: u32) -> String {
     format!("data-{:010}.log.tmp", file_id)
 }
 
@@ -206,15 +259,20 @@ pub fn reclaim_tmp_filename(file_id: u16) -> String {
 /// safe under single-thread (`!Sync`) access; sufficient since each shard runs
 /// on its own monoio runtime.
 pub struct LogFile {
-    pub file_id: u16,
+    pub file_id: u32,
     pub path: PathBuf,
     file: File,
     write_offset: Cell<u64>,
     poisoned: Cell<bool>,
+    /// Test-only: when set, the next `append` reserves its offset (as a real
+    /// append does) then fails with ENOSPC instead of touching the disk —
+    /// faithfully modeling a disk-full write without privileges or a real fill.
+    #[cfg(test)]
+    fail_next_write: Cell<bool>,
 }
 
 impl LogFile {
-    pub async fn open_rw(path: PathBuf, file_id: u16) -> Result<Self> {
+    pub async fn open_rw(path: PathBuf, file_id: u32) -> Result<Self> {
         let file = OpenOptions::new()
             .read(true)
             .write(true)
@@ -230,10 +288,12 @@ impl LogFile {
             file,
             write_offset: Cell::new(len),
             poisoned: Cell::new(false),
+            #[cfg(test)]
+            fail_next_write: Cell::new(false),
         })
     }
 
-    pub async fn open_ro(path: PathBuf, file_id: u16) -> Result<Self> {
+    pub async fn open_ro(path: PathBuf, file_id: u32) -> Result<Self> {
         let file = OpenOptions::new().read(true).open(&path).await?;
         let metadata = file.metadata().await?;
         let len = metadata.len();
@@ -243,9 +303,18 @@ impl LogFile {
             file,
             write_offset: Cell::new(len),
             poisoned: Cell::new(false),
+            #[cfg(test)]
+            fail_next_write: Cell::new(false),
         })
     }
 
+    /// Test-only: arm the next `append` to fail with ENOSPC after reserving its
+    /// offset, exactly as a real disk-full write would (which then poisons the file).
+    #[cfg(test)]
+    pub(crate) fn force_next_write_failure(&self) {
+        self.fail_next_write.set(true);
+    }
+
     pub async fn size(&self) -> Result<u64> {
         let metadata = self.file.metadata().await?;
         Ok(metadata.len())
@@ -289,6 +358,15 @@ impl LogFile {
         let len = buf.len() as u64;
         let offset = self.write_offset.get();
         self.write_offset.set(offset + len);
+        #[cfg(test)]
+        if self.fail_next_write.replace(false) {
+            // Model a disk-full write: offset already reserved, nothing hits disk,
+            // file poisoned so no later write can shadow this torn slot.
+            self.poisoned.set(true);
+            return Err(EngineError::Io {
+                source: std::io::Error::from_raw_os_error(28), // ENOSPC
+            });
+        }
         let (res, buf) = self.file.write_all_at(buf, offset).await;
         if let Err(e) = res {
             self.poisoned.set(true);
@@ -425,6 +503,7 @@ pub(crate) fn footer_entry_from_index(
     key: bytes::Bytes,
     entry: &IndexEntry,
     expires_at_ms: Option<u64>,
+    value_hash: Option<[u8; 16]>,
 ) -> FooterEntry {
     FooterEntry {
         key,
@@ -432,12 +511,13 @@ pub(crate) fn footer_entry_from_index(
         record_size: entry.record_size,
         expires_at_ms,
         tstamp_ms: entry.tstamp_ms,
+        value_hash,
     }
 }
 
 /// List all `data-*.log` files in `dir`, sorted ascending by file_id.
-pub fn list_data_files(dir: &Path) -> Result<Vec<(u16, PathBuf)>> {
-    let mut out: Vec<(u16, PathBuf)> = Vec::new();
+pub fn list_data_files(dir: &Path) -> Result<Vec<(u32, PathBuf)>> {
+    let mut out: Vec<(u32, PathBuf)> = Vec::new();
     let read_dir = match std::fs::read_dir(dir) {
         Ok(rd) => rd,
         Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
@@ -456,14 +536,66 @@ pub fn list_data_files(dir: &Path) -> Result<Vec<(u16, PathBuf)>> {
         let Some(num) = rest.strip_suffix(".log") else {
             continue;
         };
-        let Ok(file_id_u32) = num.parse::<u32>() else {
+        let Ok(file_id) = num.parse::<u32>() else {
             continue;
         };
-        if file_id_u32 > u16::MAX as u32 {
-            continue;
-        }
-        out.push((file_id_u32 as u16, path));
+        out.push((file_id, path));
     }
     out.sort_by_key(|(id, _)| *id);
     Ok(out)
 }
+
+#[cfg(test)]
+mod enospc_tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime")
+            .block_on(f)
+    }
+
+    /// A failed append (disk-full) poisons the file: the offset was reserved, but
+    /// every subsequent append fails immediately. This is what prevents a later
+    /// write from landing PAST the torn slot — which would survive recovery while
+    /// the records between it and the truncation point are silently lost. Without
+    /// the `poisoned` set/check in `append`, the third write below would succeed at
+    /// the advanced offset and shadow the gap on recovery; this test catches that.
+    #[test]
+    fn failed_append_poisons_file_and_blocks_later_writes() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let f = LogFile::open_rw(dir.path().join("data-0000000000.log"), 0)
+                .await
+                .unwrap();
+
+            let (off_a, _) = f.append(b"AAAA".to_vec()).await.unwrap();
+            assert_eq!(off_a, 0);
+            assert_eq!(f.size().await.unwrap(), 4, "first record on disk");
+
+            // Disk fills: this append reserves offset 4 then fails with ENOSPC.
+            f.force_next_write_failure();
+            assert!(
+                f.append(b"BBBB".to_vec()).await.is_err(),
+                "disk-full write must error"
+            );
+
+            // The file is now poisoned. A later write must NOT succeed at the
+            // advanced offset (which would leave a gap at [4,8) shadowing it).
+            let after = f.append(b"CCCC".to_vec()).await;
+            assert!(
+                after.is_err(),
+                "poisoned file must reject writes — otherwise a later write shadows the torn slot on recovery"
+            );
+            // Nothing after the first record ever reached disk.
+            assert_eq!(
+                f.size().await.unwrap(),
+                4,
+                "no bytes written past the good prefix"
+            );
+        });
+    }
+}
diff --git a/crates/engine/src/log/index.rs b/crates/engine/src/log/index.rs
index be3e454..055e36f 100644
--- a/crates/engine/src/log/index.rs
+++ b/crates/engine/src/log/index.rs
@@ -9,7 +9,7 @@ use rustc_hash::FxHashMap;
 /// (header + key + value + metadata). The header carries key/value/meta sizes so
 /// we can slice the value out in-memory.
 ///
-/// Layout: u64 + u32 + u16 + (2 pad) + u64 = 24 bytes.
+/// Layout: u64 + u32 + u32 + u64 = 24 bytes (no padding).
 ///
 /// 4 GiB single-record limit (well above Redis's 512 MiB string ceiling).
-/// 65k files per namespace × `rotate_threshold` = comfortable disk ceiling.
+/// ~4.29 billion files per namespace × `rotate_threshold` = comfortable disk ceiling.
@@ -21,12 +21,12 @@ use rustc_hash::FxHashMap;
 pub struct IndexEntry {
     pub record_offset: u64,
     pub record_size: u32,
-    pub file_id: u16,
+    pub file_id: u32,
     pub tstamp_ms: u64,
 }
 
 impl IndexEntry {
-    pub fn new(file_id: u16, record_offset: u64, record_size: u32, tstamp_ms: u64) -> Self {
+    pub fn new(file_id: u32, record_offset: u64, record_size: u32, tstamp_ms: u64) -> Self {
         Self {
             record_offset,
             record_size,
@@ -41,6 +41,10 @@ pub struct NsIndex {
     map: BTreeMap<Bytes, IndexEntry>,
     /// TTL sidecar — only TTL'd keys pay extra memory. FxHashMap for O(1) point lookups.
     ttl: FxHashMap<Bytes, u64>,
+    /// Value-separation sidecar: `key -> content hash` for keys whose value lives
+    /// in the blob store. Only large-value keys pay this. Used to unref the old
+    /// blob on overwrite/delete and to rebuild blob refcounts on recovery.
+    valsep: FxHashMap<Bytes, crate::value_store::ContentHash>,
     /// Best-effort live key count: incremented on insert, decremented on remove.
     /// Lazy-expired keys are included until tombstoned, matching Redis DBSIZE semantics.
     live_count: usize,
@@ -57,10 +61,35 @@ impl NsIndex {
         Self {
             map: BTreeMap::new(),
             ttl: FxHashMap::default(),
+            valsep: FxHashMap::default(),
             live_count: 0,
         }
     }
 
+    /// Content hash for a value-separated key, if any.
+    pub fn valsep(&self, key: &[u8]) -> Option<crate::value_store::ContentHash> {
+        self.valsep.get(key).copied()
+    }
+
+    /// Record (or clear) the blob hash for a key. `Some` marks it value-separated;
+    /// `None` clears (e.g. overwrite from a large value to a small inline one).
+    pub fn set_valsep(&mut self, key: &Bytes, hash: Option<crate::value_store::ContentHash>) {
+        match hash {
+            Some(h) => {
+                self.valsep.insert(key.clone(), h);
+            }
+            None => {
+                self.valsep.remove(key);
+            }
+        }
+    }
+
+    /// Iterate `(key, content hash)` for all value-separated keys. Used at open
+    /// to rebuild blob refcounts.
+    pub fn valsep_iter(&self) -> impl Iterator<Item = (&Bytes, &crate::value_store::ContentHash)> {
+        self.valsep.iter()
+    }
+
     pub fn len(&self) -> usize {
         self.map.len()
     }
@@ -104,6 +133,7 @@ impl NsIndex {
 
     pub fn remove(&mut self, key: &[u8]) -> Option<IndexEntry> {
         self.ttl.remove(key);
+        self.valsep.remove(key);
         let removed = self.map.remove(key);
         if removed.is_some() {
             self.live_count = self.live_count.saturating_sub(1);
@@ -114,6 +144,7 @@ impl NsIndex {
     pub fn clear(&mut self) {
         self.map.clear();
         self.ttl.clear();
+        self.valsep.clear();
         self.live_count = 0;
     }
 
diff --git a/crates/engine/src/log/mod.rs b/crates/engine/src/log/mod.rs
index f527b10..4a73c8d 100644
--- a/crates/engine/src/log/mod.rs
+++ b/crates/engine/src/log/mod.rs
@@ -35,14 +35,21 @@ use tracing::warn;
 use crate::error::{EngineError, Result};
 use crate::log::config::LogConfig;
 use crate::log::file::{
-    BufGuard, FooterEntry, LogFile, data_filename, pool_acquire_write, pool_release_write,
+    BufGuard, FooterEntry, LogFile, data_filename, pool_acquire_write, pool_release_write, sync_dir,
 };
 use crate::log::index::{IndexEntry, NsIndex};
 use crate::log::record::{HEADER_LEN, flags as rflags, parse_header, verify_crc};
+use crate::value_store::{ContentHash, ValueStore};
 
 pub fn now_ms() -> u64 {
     match SystemTime::now().duration_since(UNIX_EPOCH) {
-        Ok(d) => d.as_millis() as u64,
+        Ok(d) => u64::try_from(d.as_millis()).unwrap_or_else(|_| {
+            // ~584 million years past the epoch — not reachable in practice,
+            // but saturate explicitly rather than silently truncating, matching
+            // the checked conversion in `ShardStore::validate_ttl`.
+            warn!("millisecond timestamp exceeds u64::MAX; saturating");
+            u64::MAX
+        }),
         Err(_) => {
             warn!("system clock is before UNIX epoch; timestamps will be 0");
             0
@@ -72,13 +79,25 @@ impl WriteCondition {
 
 pub struct NamespaceLog {
     pub dir: PathBuf,
+    /// Content-addressed blob store for value-separated (large) values. Lives at
+    /// `{dir}/values/`. Values of `config.value_sep_threshold` bytes or more are stored here
+    /// (write-once, deduped, GC'd when the last referencing key drops) and the
+    /// log holds only a 16-byte hash pointer — so compaction never moves them.
+    pub values: ValueStore,
     pub index: RefCell<NsIndex>,
     /// Sealed files in file_id ascending order. `Rc<LogFile>` so readers can
     /// clone a handle and drop the `RefCell` borrow before awaiting I/O.
-    pub sealed: RefCell<FxHashMap<u16, Rc<LogFile>>>,
+    pub sealed: RefCell<FxHashMap<u32, Rc<LogFile>>>,
+    /// Size-tier level per sealed `file_id` (tiered compaction only). 0 = freshly
+    /// sealed; merging `fanout` runs at level L produces one run at level L+1.
+    level: RefCell<FxHashMap<u32, u8>>,
     /// Active (writable) file.
     pub active: RefCell<Rc<LogFile>>,
     pub config: LogConfig,
+    /// Cumulative bytes rewritten by compaction (reclaim). Instrumentation for
+    /// measuring write amplification: full-merge grows ~O(reclaims × live-set),
+    /// tiered ~O(log N).
+    pub compaction_bytes: Cell<u64>,
     unsynced_bytes: Cell<u64>,
     /// Monotonically increasing tstamp_ms — wall clock with a +1 nudge if the
     /// clock didn't advance, so duplicate-key replays always pick the latest.
@@ -100,44 +119,92 @@ pub struct NamespaceLog {
     /// Count of write methods currently between their entry check and exit.
     /// `freeze_and_drain` polls this to 0 before allowing the seal to proceed.
     in_flight_writes: Cell<u32>,
-    /// Serializes INCR/DECR within this namespace.
-    ///
-    /// Why: under contention, optimistic CAS on the same key has every
-    /// concurrent writer submit a (futile) disk append, then race to win the
-    /// post-write index update — only one wins per round, the rest become
-    /// orphans. Worse, io_uring completion order is roughly submission order,
-    /// so a late-submitting task can keep losing every round as new contenders
-    /// refill the in-flight pool, exhausting any finite retry budget. A single
-    /// async mutex collapses the herd: one INCR's read-modify-write completes
-    /// at a time, eliminating wasted disk writes and starvation.
-    pub(crate) incr_lock: futures_util::lock::Mutex<()>,
+    /// Per-key write serialization, striped. Every mutating method locks
+    /// `wlock(key)` for its check→append→commit, so two writes to the SAME key
+    /// never interleave — while writes to DIFFERENT keys hash to different
+    /// stripes and stay fully concurrent (lock-free reads are untouched). This
+    /// is what makes conditional writes (CAS/NX/XX) atomic: holding the stripe,
+    /// they check BEFORE appending, so a failed condition writes no record at
+    /// all — eliminating the optimistic-orphan that a crash could resurrect.
+    /// Collisions (distinct keys, same stripe) only cause rare, harmless extra
+    /// serialization. INCR no longer needs a dedicated lock: its optimistic
+    /// retry now appends nothing on a lost race.
+    write_stripes: Vec<futures_util::lock::Mutex<()>>,
 }
 
+/// Number of write-lock stripes per namespace. Must be a power of two because
+/// `stripe_idx` masks with `& (N-1)`; 64 keeps false collisions rare at little memory cost.
+const WRITE_STRIPES: usize = 64;
+
 impl NamespaceLog {
     pub async fn open(dir: PathBuf, config: LogConfig) -> Result<Self> {
         let opened = recover::open_namespace(dir.clone()).await?;
-        let sealed: FxHashMap<u16, Rc<LogFile>> = opened
+        let sealed: FxHashMap<u32, Rc<LogFile>> = opened
             .sealed
             .into_iter()
             .map(|f| (f.file_id, Rc::new(f)))
             .collect();
         let active = Rc::new(opened.active);
+        // Recovered sealed files start at level 0 (tiered compaction will merge
+        // them upward as new runs accumulate). Levels are in-memory only.
+        let level: FxHashMap<u32, u8> = sealed.keys().map(|&id| (id, 0u8)).collect();
+        // Rebuild blob refcounts: one per live value-separated key (the sidecar
+        // was repopulated from sealed footers + active-file replay during open).
+        let values = ValueStore::new(dir.join("values"));
+        for (_, h) in opened.index.valsep_iter() {
+            values.incr_ref(h);
+        }
+        // Reclaim any blob a crash left without a referencing record (now that
+        // refcounts reflect the live index, anything else on disk is an orphan).
+        values.sweep_orphans().await?;
+        // Seed the revision clock from the highest tstamp recovered, so revisions
+        // never regress across a restart even if the wall clock stepped back
+        // (next_tstamp already nudges within a run). This keeps CAS revisions and
+        // watch `scan_since` resumption monotonic. (A tombstone whose tstamp
+        // exceeds every live key's is not reflected here — a narrow, transient
+        // case: reclaim drops dead tombstones, and recovery resolves last-writer
+        // by physical order, not tstamp.)
+        let max_tstamp = opened
+            .index
+            .iter()
+            .map(|(_, e)| e.tstamp_ms)
+            .max()
+            .unwrap_or(0);
         Ok(Self {
             dir,
+            values,
             index: RefCell::new(opened.index),
             sealed: RefCell::new(sealed),
+            level: RefCell::new(level),
             active: RefCell::new(active),
             config,
+            compaction_bytes: Cell::new(0),
             unsynced_bytes: Cell::new(0),
-            last_tstamp: Cell::new(0),
+            last_tstamp: Cell::new(max_tstamp),
             reclaim_in_progress: Cell::new(false),
             rotate_in_progress: Cell::new(false),
             frozen: Cell::new(false),
             in_flight_writes: Cell::new(0),
-            incr_lock: futures_util::lock::Mutex::new(()),
+            write_stripes: (0..WRITE_STRIPES)
+                .map(|_| futures_util::lock::Mutex::new(()))
+                .collect(),
         })
     }
 
+    /// Stripe index for `key` (FxHash & (N-1)).
+    fn stripe_idx(key: &[u8]) -> usize {
+        use std::hash::{Hash, Hasher};
+        let mut h = rustc_hash::FxHasher::default();
+        key.hash(&mut h);
+        (h.finish() as usize) & (WRITE_STRIPES - 1)
+    }
+
+    /// The write-serialization stripe for `key`. Same key → same stripe →
+    /// serialized; different keys → (usually) different stripes → concurrent.
+    fn wlock(&self, key: &[u8]) -> &futures_util::lock::Mutex<()> {
+        &self.write_stripes[Self::stripe_idx(key)]
+    }
+
     /// Block all subsequent writes (they return [`EngineError::Frozen`]) and
     /// wait for any already-in-flight writes to complete. Used by the seal
     /// path so the footer it writes is a consistent snapshot of on-disk state.
@@ -210,10 +277,9 @@ impl NamespaceLog {
         metadata: &[u8],
         expires_at_ms: Option<u64>,
     ) -> Result<u64> {
+        self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs
         let _wg = self.begin_write()?;
-        if self.reclaim_in_progress.get() {
-            return Err(EngineError::ReclamationBusy);
-        }
+        let _w = self.wlock(&key).lock().await; // serialize writes to this key
         let tstamp = self.next_tstamp();
         let mut flags = 0u8;
         let exp = match expires_at_ms {
@@ -223,32 +289,77 @@ impl NamespaceLog {
                 0
             }
         };
-        let mut buf = pool_acquire_write(HEADER_LEN + key.len() + value.len() + metadata.len());
-        record::encode_into(&mut buf, tstamp, flags, exp, &key, value, metadata)?;
+        // Value separation: a value >= the threshold is written to the blob store
+        // (write-once, deduped) and the record carries only its 16-byte hash, so
+        // compaction never re-uploads the value.
+        let sep_hash = self.maybe_separate(value, &mut flags).await?;
+        let stored: &[u8] = sep_hash.as_ref().map_or(value, |h| &h[..]);
+        let mut buf = pool_acquire_write(HEADER_LEN + key.len() + stored.len() + metadata.len());
+        record::encode_into(&mut buf, tstamp, flags, exp, &key, stored, metadata)?;
         let record_size = buf.len() as u32;
         let active = self.active();
-        let (offset, buf) = active.append(buf).await?;
+        let (offset, buf) = match active.append(buf).await {
+            Ok(r) => r,
+            Err(e) => {
+                // Append failed: roll back the blob ref so we don't leave a phantom
+                // blob (written + ref'd, but no record references it).
+                if let Some(h) = sep_hash {
+                    self.values.unref(&h);
+                }
+                return Err(e);
+            }
+        };
         pool_release_write(buf);
         self.unsynced_bytes
             .set(self.unsynced_bytes.get() + record_size as u64);
         let entry = IndexEntry::new(active.file_id, offset, record_size, tstamp);
-        self.index.borrow_mut().insert(key, entry, expires_at_ms);
+        let old_hash = self.apply_valsep_insert(key.clone(), entry, expires_at_ms, sep_hash);
+        if let Some(oh) = old_hash {
+            self.values.unref(&oh);
+        }
         if active.write_offset() >= self.config.rotate_threshold {
             self.rotate_active().await?;
         }
         Ok(tstamp)
     }
 
+    /// If `value` is large enough to separate, write it to the blob store (the
+    /// store dedups + refcounts) and set the `VALUE_SEP` flag; return its hash.
+    /// Otherwise return `None` (value stays inline). The blob is written before
+    /// the log record so the record's hash always points at durable bytes.
+    async fn maybe_separate(&self, value: &[u8], flags: &mut u8) -> Result<Option<ContentHash>> {
+        if value.len() >= self.config.value_sep_threshold {
+            *flags |= rflags::VALUE_SEP;
+            Ok(Some(self.values.put(value).await?))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Insert the index entry and update the value-sep sidecar. Returns the key's
+    /// PREVIOUS blob hash (if it was value-separated) so the caller can unref it
+    /// after the new write commits — covering overwrite, large→small, and
+    /// same-content cases uniformly (the new blob was already ref'd by `put`).
+    fn apply_valsep_insert(
+        &self,
+        key: Bytes,
+        entry: IndexEntry,
+        expires_at_ms: Option<u64>,
+        sep_hash: Option<ContentHash>,
+    ) -> Option<ContentHash> {
+        let mut index = self.index.borrow_mut();
+        let old = index.valsep(&key);
+        index.insert(key.clone(), entry, expires_at_ms);
+        index.set_valsep(&key, sep_hash);
+        old
+    }
+
     /// Conditional write: write only if the current live state of `key` satisfies `cond`.
-    ///
-    /// Returns `Ok(Some(tstamp))` if written and indexed, `Ok(None)` if the condition was
-    /// not met. The returned tstamp is THIS write's revision — callers must use it
-    /// instead of [`last_revision`](Self::last_revision) when updating caches or
-    /// returning a revision to clients, because concurrent writes that pass pre-check
-    /// but later fail post-check still bump `last_tstamp`. A concurrent write that
-    /// lands during the disk-I/O await is detected by a post-write re-check before
-    /// the index is updated; if the race is lost the on-disk record becomes an
-    /// unreferenced orphan reclaimed during next compaction.
+    /// Atomic — the key's write stripe is held across check + append + commit, so no
+    /// concurrent write to the same key can interleave. A failed condition writes
+    /// nothing. Returns `Ok(Some(tstamp))` if written, `Ok(None)` if the condition
+    /// was not met. The returned tstamp is THIS write's revision; callers use it
+    /// instead of [`last_revision`](Self::last_revision) for caches/responses.
     pub async fn put_full_cond(
         &self,
         key: Bytes,
@@ -258,11 +369,13 @@ impl NamespaceLog {
         cond: WriteCondition,
         now: u64,
     ) -> Result<Option<u64>> {
+        self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs
         let _wg = self.begin_write()?;
-        if self.reclaim_in_progress.get() {
-            return Err(EngineError::ReclamationBusy);
-        }
-        // Pre-check: verify condition before incurring disk I/O.
+        let _w = self.wlock(&key).lock().await; // serialize writes to this key
+        // Holding the key's write stripe, no concurrent write to this key can run.
+        // So the condition check is authoritative: we check BEFORE appending, and a
+        // failed condition writes NOTHING (no record, no blob) — there is no
+        // optimistic orphan that a crash could resurrect, and no post-check.
         if !cond.check(Self::live_rev(&self.index.borrow(), &key, now)) {
             return Ok(None);
         }
@@ -275,22 +388,31 @@ impl NamespaceLog {
                 0
             }
         };
-        let mut buf = pool_acquire_write(HEADER_LEN + key.len() + value.len() + metadata.len());
-        record::encode_into(&mut buf, tstamp, flags, exp, &key, value, metadata)?;
+        let sep_hash = self.maybe_separate(value, &mut flags).await?;
+        let stored: &[u8] = sep_hash.as_ref().map_or(value, |h| &h[..]);
+        let mut buf = pool_acquire_write(HEADER_LEN + key.len() + stored.len() + metadata.len());
+        record::encode_into(&mut buf, tstamp, flags, exp, &key, stored, metadata)?;
         let record_size = buf.len() as u32;
         let active = self.active();
-        let (offset, buf) = active.append(buf).await?;
+        let (offset, buf) = match active.append(buf).await {
+            Ok(r) => r,
+            Err(e) => {
+                // Append failed: roll back the blob ref so we don't leave a phantom
+                // blob (written + ref'd, but no record references it).
+                if let Some(h) = sep_hash {
+                    self.values.unref(&h);
+                }
+                return Err(e);
+            }
+        };
         pool_release_write(buf);
         self.unsynced_bytes
             .set(self.unsynced_bytes.get() + record_size as u64);
-        // Post-check: re-verify before committing to the index. Another task that
-        // modified the same key during the disk-I/O await will have already updated
-        // the index; if that breaks our condition, abort without touching the index.
-        if !cond.check(Self::live_rev(&self.index.borrow(), &key, now)) {
-            return Ok(None);
-        }
         let entry = IndexEntry::new(active.file_id, offset, record_size, tstamp);
-        self.index.borrow_mut().insert(key, entry, expires_at_ms);
+        let old_hash = self.apply_valsep_insert(key.clone(), entry, expires_at_ms, sep_hash);
+        if let Some(oh) = old_hash {
+            self.values.unref(&oh);
+        }
         if active.write_offset() >= self.config.rotate_threshold {
             self.rotate_active().await?;
         }
@@ -310,43 +432,77 @@ impl NamespaceLog {
     /// of [`last_revision`](Self::last_revision) — concurrent writes can bump
     /// `last_tstamp` higher than any tstamp this batch produced.
     pub async fn put_many(&self, pairs: &[(Bytes, Bytes)]) -> Result<Vec<u64>> {
+        self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs
         let _wg = self.begin_write()?;
-        if self.reclaim_in_progress.get() {
-            return Err(EngineError::ReclamationBusy);
-        }
         if pairs.is_empty() {
             return Ok(Vec::new());
         }
+        // Serialize against same-key single-key writes by holding every stripe this
+        // batch touches. Acquired in sorted-distinct order so two batches (or a
+        // batch and a single write) can never deadlock on a lock-ordering cycle.
+        let mut idxs: Vec<usize> = pairs.iter().map(|(k, _)| Self::stripe_idx(k)).collect();
+        idxs.sort_unstable();
+        idxs.dedup();
+        let mut _stripe_guards = Vec::with_capacity(idxs.len());
+        for i in idxs {
+            _stripe_guards.push(self.write_stripes[i].lock().await);
+        }
         let estimated: usize = pairs
             .iter()
             .map(|(k, v)| HEADER_LEN + k.len() + v.len())
             .sum();
         let mut buf = pool_acquire_write(estimated);
         let mut layout: Vec<(usize, u32, u64)> = Vec::with_capacity(pairs.len());
+        // Per-pair blob hash for value-separated values (None = inline).
+        let mut sep_hashes: Vec<Option<ContentHash>> = Vec::with_capacity(pairs.len());
         for (k, v) in pairs {
             let tstamp = self.next_tstamp();
+            let mut flags = rflags::NO_EXPIRY;
+            let sh = self.maybe_separate(v, &mut flags).await?;
+            let stored: &[u8] = sh.as_ref().map_or(&v[..], |h| &h[..]);
             let start = buf.len();
-            record::encode_into(&mut buf, tstamp, rflags::NO_EXPIRY, 0, k, v, &[])?;
+            record::encode_into(&mut buf, tstamp, flags, 0, k, stored, &[])?;
             let record_size = (buf.len() - start) as u32;
             layout.push((start, record_size, tstamp));
+            sep_hashes.push(sh);
         }
         let active = self.active();
         let buf_len = buf.len() as u64;
-        let (base_offset, buf) = active.append(buf).await?;
+        let (base_offset, buf) = match active.append(buf).await {
+            Ok(r) => r,
+            Err(e) => {
+                // Append failed: roll back every blob ref this batch took so none
+                // are left as phantom blobs (written + ref'd, no record).
+                for h in sep_hashes.into_iter().flatten() {
+                    self.values.unref(&h);
+                }
+                return Err(e);
+            }
+        };
         pool_release_write(buf);
         self.unsynced_bytes.set(self.unsynced_bytes.get() + buf_len);
+        let mut old_hashes: Vec<ContentHash> = Vec::new();
         {
             let mut index = self.index.borrow_mut();
-            for ((k, _v), (rel_start, size, tstamp)) in pairs.iter().zip(layout.iter()) {
+            for (((k, _v), (rel_start, size, tstamp)), sh) in
+                pairs.iter().zip(layout.iter()).zip(sep_hashes.iter())
+            {
                 let entry = IndexEntry::new(
                     active.file_id,
                     base_offset + *rel_start as u64,
                     *size,
                     *tstamp,
                 );
+                if let Some(oh) = index.valsep(k) {
+                    old_hashes.push(oh);
+                }
                 index.insert(k.clone(), entry, None);
+                index.set_valsep(k, *sh);
             }
         }
+        for oh in old_hashes {
+            self.values.unref(&oh);
+        }
         if active.write_offset() >= self.config.rotate_threshold {
             self.rotate_active().await?;
         }
@@ -359,14 +515,17 @@ impl NamespaceLog {
     /// for watch events and any client-visible revision — concurrent writes
     /// can bump `last_tstamp` beyond this specific tombstone's tstamp.
     pub async fn tombstone(&self, key: &[u8]) -> Result<Option<u64>> {
+        self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs
         let _wg = self.begin_write()?;
-        if self.reclaim_in_progress.get() {
-            return Err(EngineError::ReclamationBusy);
-        }
-        let was_present = self.index.borrow_mut().remove(key).is_some();
-        if !was_present {
-            return Ok(None);
-        }
+        let _w = self.wlock(key).lock().await; // serialize writes to this key
+        let old_hash = {
+            let mut index = self.index.borrow_mut();
+            let h = index.valsep(key);
+            if index.remove(key).is_none() {
+                return Ok(None);
+            }
+            h
+        };
         let tstamp = self.next_tstamp();
         let mut buf = pool_acquire_write(HEADER_LEN + key.len());
         record::encode_into(&mut buf, tstamp, rflags::TOMBSTONE, 0, key, &[], &[])?;
@@ -375,6 +534,9 @@ impl NamespaceLog {
         let (_, buf) = active.append(buf).await?;
         pool_release_write(buf);
         self.unsynced_bytes.set(self.unsynced_bytes.get() + buf_len);
+        if let Some(h) = old_hash {
+            self.values.unref(&h);
+        }
         Ok(Some(tstamp))
     }
 
@@ -389,16 +551,20 @@ impl NamespaceLog {
         expected_rev: u64,
         now: u64,
     ) -> Result<Option<u64>> {
+        self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs
         let _wg = self.begin_write()?;
-        if self.reclaim_in_progress.get() {
-            return Err(EngineError::ReclamationBusy);
-        }
+        let _w = self.wlock(key).lock().await; // serialize writes to this key
         // Both check and removal happen without yielding — no interleaving possible.
         let current_rev = Self::live_rev(&self.index.borrow(), key, now);
         if current_rev != Some(expected_rev) {
             return Ok(None);
         }
-        self.index.borrow_mut().remove(key);
+        let old_hash = {
+            let mut index = self.index.borrow_mut();
+            let h = index.valsep(key);
+            index.remove(key);
+            h
+        };
         // Disk write (yields, but index already updated)
         let tstamp = self.next_tstamp();
         let mut buf = pool_acquire_write(HEADER_LEN + key.len());
@@ -408,6 +574,9 @@ impl NamespaceLog {
         let (_, buf) = active.append(buf).await?;
         pool_release_write(buf);
         self.unsynced_bytes.set(self.unsynced_bytes.get() + buf_len);
+        if let Some(h) = old_hash {
+            self.values.unref(&h);
+        }
         Ok(Some(tstamp))
     }
 
@@ -415,10 +584,9 @@ impl NamespaceLog {
     /// tstamp assigned to this update — callers must use it (not
     /// [`last_revision`](Self::last_revision)) for watch events.
     pub async fn ttl_update(&self, key: &[u8], expires_at_ms: Option<u64>) -> Result<u64> {
+        self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs
         let _wg = self.begin_write()?;
-        if self.reclaim_in_progress.get() {
-            return Err(EngineError::ReclamationBusy);
-        }
+        let _w = self.wlock(key).lock().await; // serialize writes to this key
         let tstamp = self.next_tstamp();
         let mut flags = rflags::TTL_UPDATE;
         let exp = match expires_at_ms {
@@ -440,7 +608,7 @@ impl NamespaceLog {
         Ok(tstamp)
     }
 
-    fn locate_file(&self, file_id: u16) -> Option<Rc<LogFile>> {
+    fn locate_file(&self, file_id: u32) -> Option<Rc<LogFile>> {
         let active = self.active.borrow().clone();
         if active.file_id == file_id {
             return Some(active);
@@ -451,11 +619,15 @@ impl NamespaceLog {
     /// Fsync the active file if any writes are pending. Called by the per-shard
     /// 1-second timer task to provide `appendfsync everysec` semantics.
     pub async fn sync(&self) -> Result<()> {
-        if self.unsynced_bytes.get() == 0 {
-            return Ok(());
+        if self.unsynced_bytes.get() > 0 {
+            self.active().sync().await?;
+            self.unsynced_bytes.set(0);
         }
-        self.active().sync().await?;
-        self.unsynced_bytes.set(0);
+        // Runs on every call, even when nothing was pending: every log record is
+        // durable by now (just fsynced, or already was), so blobs orphaned by
+        // overwrites/deletes can finally be physically removed. Deferring to here
+        // keeps a power-loss revert safe: the old blob stays until its successor is durable.
+        self.values.collect_garbage().await;
         Ok(())
     }
 
@@ -470,7 +642,9 @@ impl NamespaceLog {
             .await
     }
 
-    fn extract_value_meta(bytes: &[u8]) -> Result<(Bytes, Bytes)> {
+    /// Returns `(value_field, metadata, flags)`. For value-separated records the
+    /// `value_field` is the 16-byte content hash, not the value — call `deref`.
+    fn extract_value_meta(bytes: &[u8]) -> Result<(Bytes, Bytes, u8)> {
         let hdr = parse_header(&bytes[..HEADER_LEN.min(bytes.len())], 0)?;
         let key_end = HEADER_LEN + hdr.key_size as usize;
         let val_end = key_end + hdr.val_size as usize;
@@ -484,13 +658,43 @@ impl NamespaceLog {
         verify_crc(&hdr, &bytes[..HEADER_LEN], &bytes[HEADER_LEN..meta_end], 0)?;
         let value = Bytes::copy_from_slice(&bytes[key_end..val_end]);
         let metadata = Bytes::copy_from_slice(&bytes[val_end..meta_end]);
-        Ok((value, metadata))
+        Ok((value, metadata, hdr.flags))
+    }
+
+    /// Resolve a record's value field to the real value. Without the `VALUE_SEP`
+    /// flag the field IS the value and is returned as-is; with it, the field is the
+    /// blob's content hash and the blob is fetched from `self.values`. Errors: a
+    /// `values.get` failure, or `BadRecord` on a wrong-sized hash / hash mismatch.
+    async fn deref(&self, value: Bytes, flags: u8) -> Result<Bytes> {
+        if flags & rflags::VALUE_SEP == 0 {
+            return Ok(value);
+        }
+        if value.len() != std::mem::size_of::<ContentHash>() {
+            return Err(EngineError::BadRecord {
+                offset: 0,
+                reason: "value-separated record's value field is not a 16-byte hash",
+            });
+        }
+        let mut h: ContentHash = [0u8; 16];
+        h.copy_from_slice(&value);
+        let bytes = self.values.get(&h).await?;
+        // Integrity: re-hash the blob and confirm it matches the content hash the record
+        // points at, so silent corruption or a blob/hash mismatch is an error rather than
+        // wrong bytes. Mirrors the per-read CRC on inline values; BLAKE3 is SIMD-fast.
+        if crate::value_store::content_hash(&bytes) != h {
+            return Err(EngineError::BadRecord {
+                offset: 0,
+                reason: "value-separated blob content hash mismatch (corruption)",
+            });
+        }
+        Ok(Bytes::from(bytes))
     }
 
     /// Single-record read: one `read_at`, parse header in-memory.
     pub async fn read_value(&self, entry: IndexEntry) -> Result<(Bytes, Bytes)> {
         let bytes = self.read_record(entry).await?;
-        Self::extract_value_meta(&bytes)
+        let (value, metadata, flags) = Self::extract_value_meta(&bytes)?;
+        Ok((self.deref(value, flags).await?, metadata))
     }
 
     /// Bulk-read: submits all `read_at` futures concurrently via `join_all` so
@@ -508,11 +712,26 @@ impl NamespaceLog {
         }
         let futures: Vec<_> = misses.iter().map(|(_, e)| self.read_record(*e)).collect();
         let results: Vec<Result<BufGuard>> = join_all(futures).await;
-        let mut out: Vec<(usize, Bytes, Bytes)> = Vec::with_capacity(misses.len());
+        // Extract synchronously, then deref all value-separated blobs concurrently
+        // — the same io_uring batching the record reads above already get. Without
+        // this, a bulk_read over N large (value-separated) values fans out the
+        // record reads in parallel but then fetches the N blobs one at a time.
+        let mut extracted: Vec<(usize, Bytes, Bytes, u8)> = Vec::with_capacity(misses.len());
         for ((slot, _entry), bytes_res) in misses.into_iter().zip(results.into_iter()) {
             let bytes = bytes_res?;
-            let (value, metadata) = Self::extract_value_meta(&bytes)?;
-            out.push((slot, value, metadata));
+            let (value, metadata, flags) = Self::extract_value_meta(&bytes)?;
+            extracted.push((slot, value, metadata, flags));
+        }
+        let deref_results = join_all(
+            extracted
+                .iter()
+                .map(|(_, value, _, flags)| self.deref(value.clone(), *flags)),
+        )
+        .await;
+        let mut out: Vec<(usize, Bytes, Bytes)> = Vec::with_capacity(extracted.len());
+        for ((slot, _value, metadata, _flags), derefed) in extracted.into_iter().zip(deref_results)
+        {
+            out.push((slot, derefed?, metadata));
         }
         Ok(out)
     }
@@ -548,7 +767,7 @@ impl NamespaceLog {
 
         let mut events = Vec::with_capacity(live.len());
         for (slot, value, meta_bytes) in read_results {
-            let (key, _, expires_at_ms) = &live[slot];
+            let (key, entry, expires_at_ms) = &live[slot];
             let metadata = if meta_bytes.is_empty() {
                 None
             } else {
@@ -565,7 +784,11 @@ impl NamespaceLog {
                 value,
                 metadata,
                 expires_at_ms: *expires_at_ms,
-                revision: 0,
+                // The key's real revision (its record tstamp), NOT 0. The
+                // subscribe-then-scan window can surface a write in both `initial`
+                // and the live channel; callers dedup by revision, so an `initial`
+                // event must carry the same revision the live channel will.
+                revision: entry.tstamp_ms,
             });
         }
         Ok(events)
@@ -579,7 +802,7 @@ impl NamespaceLog {
         filter: &crate::watch::KeyFilter<'_>,
         since_revision: u64,
     ) -> Result<Vec<crate::watch::WatchEvent>> {
-        let mut files: Vec<(u16, Rc<LogFile>)> = self
+        let mut files: Vec<(u32, Rc<LogFile>)> = self
             .sealed
             .borrow()
             .iter()
@@ -591,7 +814,7 @@ impl NamespaceLog {
         let mut events = Vec::new();
         for (_, file) in &files {
             let end = file.data_end_offset().await;
-            scan_file_records(file, end, filter, since_revision, &mut events).await?;
+            scan_file_records(file, end, filter, since_revision, &self.values, &mut events).await?;
         }
         // Sort by revision so callers see a clean chronological stream.
         events.sort_by_key(|e| match e {
@@ -636,6 +859,7 @@ impl NamespaceLog {
                     record_size: e.record_size,
                     expires_at_ms: index.ttl(k),
                     tstamp_ms: e.tstamp_ms,
+                    value_hash: index.valsep(k),
                 })
                 .collect()
         };
@@ -669,6 +893,7 @@ impl NamespaceLog {
             .insert(old_active.file_id, old_active);
         let new_path = self.dir.join(data_filename(next_id));
         let new_active = Rc::new(LogFile::open_rw(new_path, next_id).await?);
+        sync_dir(&self.dir).await; // make the new file's directory entry durable
         *self.active.borrow_mut() = new_active;
         self.unsynced_bytes.set(0);
         Ok(())
@@ -703,6 +928,7 @@ impl NamespaceLog {
                     record_size: e.record_size,
                     expires_at_ms: index.ttl(k),
                     tstamp_ms: e.tstamp_ms,
+                    value_hash: index.valsep(k),
                 })
                 .collect()
         };
@@ -726,6 +952,7 @@ impl NamespaceLog {
             .insert(old_active.file_id, old_active);
         let new_path = self.dir.join(data_filename(next_id));
         let new_active = Rc::new(LogFile::open_rw(new_path, next_id).await?);
+        sync_dir(&self.dir).await; // make the new file's directory entry durable
         *self.active.borrow_mut() = new_active;
         self.unsynced_bytes.set(0);
         Ok(())
@@ -742,10 +969,18 @@ impl NamespaceLog {
     ///
     /// NOT safe under concurrent reads/writes — caller must serialize.
     pub async fn flush(&self) -> Result<u64> {
-        if self.reclaim_in_progress.get() {
+        // Wait out any reclaim, then take the exclusive flag (shared with reclaim)
+        // so neither a reclaim nor another flush can run concurrently; writes wait
+        // on the same flag. `replace` is the atomic gate against a racing op.
+        self.await_reclaim().await;
+        if self.reclaim_in_progress.replace(true) {
             return Err(EngineError::ReclamationBusy);
         }
-        self.reclaim_in_progress.set(true);
+        // Drain in-flight writes before unlinking/recreating the data files, so a
+        // write mid-append can't race the file replacement.
+        while self.in_flight_writes.get() > 0 {
+            monoio::time::sleep(std::time::Duration::from_micros(50)).await;
+        }
         // Burn a tstamp for the flush event itself. Doing this BEFORE the
         // flush guarantees the revision exceeds anything previously committed
         // (or even speculatively assigned by concurrent failed writes).
@@ -788,29 +1023,57 @@ impl NamespaceLog {
 
         let path = self.dir.join(data_filename(0));
         let new_active = Rc::new(LogFile::open_rw(path, 0).await?);
+        sync_dir(&self.dir).await; // make the recreated file's directory entry durable
         *self.active.borrow_mut() = new_active;
         self.unsynced_bytes.set(0);
         Ok(())
     }
 
-    /// Operator-triggered reclaim. Seals the current active file with a
-    /// footer, then merges all live records (across the just-sealed file plus
-    /// previously-sealed files) into a single new sealed file. Old files are
-    /// unlinked. A fresh active file is opened.
+    /// Operator-triggered reclaim (size-tiered compaction). Seals the active
+    /// file as a fresh level-0 run, then repeatedly merges the lowest level
+    /// that has reached `fanout` runs into one run at the next level. Each
+    /// merge rewrites only that level's live records (O(log N) total write
+    /// amplification) — never the whole live set, so on GlideFS a reclaim
+    /// re-uploads one level, not the entire namespace.
     ///
     /// NOT concurrent-safe with other ops on this namespace.
     pub async fn reclaim(&self) -> Result<reclaim::ReclaimReport> {
-        if self.reclaim_in_progress.get() {
+        // Check-and-set without yielding: a second concurrent reclaim gets
+        // `ReclamationBusy`; writes do NOT error — they wait in `await_reclaim`.
+        // NOTE(review): cleared only after the await below — a dropped future would leave it set.
+        if self.reclaim_in_progress.replace(true) {
             return Err(EngineError::ReclamationBusy);
         }
-        self.reclaim_in_progress.set(true);
+        // Drain writes that already passed the gate before we set the flag, so the
+        // seal's footer is a consistent snapshot — no in-flight write (appended but
+        // not yet indexed) is missed and silently lost on the next footer recovery.
+        // New writes now wait in `await_reclaim` BEFORE `begin_write`, so they don't
+        // hold `in_flight_writes` and this drain always terminates (no deadlock).
+        while self.in_flight_writes.get() > 0 {
+            monoio::time::sleep(std::time::Duration::from_micros(50)).await;
+        }
         let result = self.reclaim_inner().await;
         self.reclaim_in_progress.set(false);
         result
     }
 
+    /// Poll until `reclaim_in_progress` is clear — the flag is shared by `reclaim`
+    /// and `flush`, so this waits out either. Writes call it before `begin_write`,
+    /// so they stall instead of erroring and waiters never hold the in-flight count.
+    async fn await_reclaim(&self) {
+        // 500µs, not 50µs: a reclaim of a large namespace can take seconds, and
+        // every concurrent writer parks here for its whole duration. The coarser
+        // interval cuts timer-wheel churn ~10× across all waiting writers while
+        // adding at most ~half a millisecond to post-reclaim write latency.
+        while self.reclaim_in_progress.get() {
+            monoio::time::sleep(std::time::Duration::from_micros(500)).await;
+        }
+    }
+
     async fn reclaim_inner(&self) -> Result<reclaim::ReclaimReport> {
-        // Seal the current active.
+        use std::collections::{BTreeMap, HashSet};
+
+        // 1. Seal the active file as a fresh level-0 run.
         let old_active = self.active.borrow().clone();
         let footer: Vec<FooterEntry> = {
             let index = self.index.borrow();
@@ -823,68 +1086,136 @@ impl NamespaceLog {
                     record_size: e.record_size,
                     expires_at_ms: index.ttl(k),
                     tstamp_ms: e.tstamp_ms,
+                    value_hash: index.valsep(k),
                 })
                 .collect()
         };
         old_active.write_footer(&footer).await?;
         self.sealed
             .borrow_mut()
-            .insert(old_active.file_id, old_active.clone());
+            .insert(old_active.file_id, old_active); // level 0 (absent from map)
 
-        // Pick the next file_id as max(existing) + 1.
-        let next_id = {
+        let mut total = reclaim::ReclaimReport {
+            live_keys: 0,
+            live_bytes: 0,
+            dead_files_dropped: 0,
+            dead_files_leaked: 0,
+            new_file_id: 0,
+        };
+
+        // 2. Cascade: while some level holds >= fanout runs, merge that level
+        //    into one run at the next level.
+        loop {
+            let by_level: BTreeMap<u8, Vec<u32>> = {
+                let levels = self.level.borrow();
+                let mut m: BTreeMap<u8, Vec<u32>> = BTreeMap::new();
+                for &id in self.sealed.borrow().keys() {
+                    m.entry(levels.get(&id).copied().unwrap_or(0))
+                        .or_default()
+                        .push(id);
+                }
+                m
+            };
+            let (lvl, ids) = match by_level
+                .iter()
+                .find(|(_, ids)| ids.len() >= self.config.fanout)
+            {
+                Some((&l, ids)) => (l, ids.clone()),
+                None => break,
+            };
+
+            let files: Vec<Rc<LogFile>> = {
+                let sealed = self.sealed.borrow();
+                ids.iter()
+                    .filter_map(|id| sealed.get(id).cloned())
+                    .collect()
+            };
+            let id_set: HashSet<u32> = ids.iter().copied().collect();
+            let live: Vec<(Bytes, IndexEntry, Option<u64>)> = {
+                let index = self.index.borrow();
+                index
+                    .iter()
+                    .filter(|(_, e)| id_set.contains(&e.file_id))
+                    .map(|(k, e)| (k.clone(), *e, index.ttl(k)))
+                    .collect()
+            };
+            let next_id = {
+                let sealed = self.sealed.borrow();
+                sealed
+                    .keys()
+                    .copied()
+                    .max()
+                    .unwrap_or(0)
+                    .max(self.active.borrow().file_id)
+                    .checked_add(1)
+                    .ok_or(EngineError::CapacityExceeded {
+                        reason: "file_id overflow: namespace has too many log files",
+                    })?
+            };
+
+            // reclaim_namespace writes one merged file (next_id) and unlinks the
+            // input `files`; index borrow is not held across the await.
+            let (report, new_entries) =
+                reclaim::reclaim_namespace(self.dir.clone(), &files, next_id, &live).await?;
+            total.live_keys = report.live_keys;
+            total.live_bytes = report.live_bytes;
+            self.compaction_bytes
+                .set(self.compaction_bytes.get() + report.live_bytes);
+            total.dead_files_dropped += report.dead_files_dropped;
+            total.dead_files_leaked += report.dead_files_leaked;
+
+            // Open the merged file FIRST — it is the only fallible step left. If
+            // it fails (EMFILE, hardware error), we return with the index and
+            // sealed map untouched: they still reference the old file_ids, whose
+            // `Rc<LogFile>` handles remain open. On Linux those fds keep serving
+            // reads even though `reclaim_namespace` already unlinked the paths, so
+            // no key goes dark before the next restart (which finds the merged
+            // file on disk). Mutating in-memory state before this open could leave
+            // the index pointing at a `next_id` absent from `sealed` — reads of
+            // those keys would fail with "file_id not found" until restart.
+            let new_file =
+                Rc::new(LogFile::open_ro(self.dir.join(data_filename(next_id)), next_id).await?);
+            // From here on every step is infallible: commit the swap atomically.
+            {
+                let mut index = self.index.borrow_mut();
+                for (key, entry, ttl) in new_entries {
+                    index.insert(key, entry, ttl);
+                }
+            }
+            {
+                let mut sealed = self.sealed.borrow_mut();
+                let mut levels = self.level.borrow_mut();
+                for id in &ids {
+                    sealed.remove(id);
+                    levels.remove(id);
+                }
+                sealed.insert(next_id, new_file);
+                levels.insert(next_id, lvl.saturating_add(1));
+            }
+        }
+
+        // 3. Open a fresh active file.
+        let new_active_id = {
             let sealed = self.sealed.borrow();
             sealed
                 .keys()
                 .copied()
                 .max()
                 .unwrap_or(0)
+                .max(self.active.borrow().file_id)
                 .checked_add(1)
                 .ok_or(EngineError::CapacityExceeded {
                     reason: "file_id overflow: namespace has too many log files",
                 })?
         };
-        let new_active_id = next_id
-            .checked_add(1)
-            .ok_or(EngineError::CapacityExceeded {
-                reason: "file_id overflow: namespace has too many log files",
-            })?;
-
-        let sealed_snapshot: Vec<Rc<LogFile>> = self.sealed.borrow().values().cloned().collect();
-
-        // Snapshot live entries outside the await so the reclaim doesn't hold an index borrow.
-        let live: Vec<(Bytes, IndexEntry, Option<u64>)> = {
-            let index = self.index.borrow();
-            index
-                .iter()
-                .map(|(k, e)| (k.clone(), *e, index.ttl(k)))
-                .collect()
-        };
-
-        let (report, new_entries) =
-            reclaim::reclaim_namespace(self.dir.clone(), &sealed_snapshot, next_id, &live).await?;
-
-        // Apply new index entries.
-        {
-            let mut index = self.index.borrow_mut();
-            for (key, entry, ttl) in new_entries {
-                index.insert(key, entry, ttl);
-            }
-        }
-
-        // Drop old sealed handles & swap in the single new sealed file.
-        self.sealed.borrow_mut().clear();
-        let new_sealed_path = self.dir.join(data_filename(next_id));
-        let new_sealed = Rc::new(LogFile::open_ro(new_sealed_path, next_id).await?);
-        self.sealed.borrow_mut().insert(next_id, new_sealed);
-
-        // Open a fresh active file.
-        let new_active_path = self.dir.join(data_filename(new_active_id));
-        let new_active = Rc::new(LogFile::open_rw(new_active_path, new_active_id).await?);
+        let new_active = Rc::new(
+            LogFile::open_rw(self.dir.join(data_filename(new_active_id)), new_active_id).await?,
+        );
+        sync_dir(&self.dir).await; // make the new active file's directory entry durable
         *self.active.borrow_mut() = new_active;
         self.unsynced_bytes.set(0);
-
-        Ok(report)
+        total.new_file_id = new_active_id;
+        Ok(total)
     }
 }
 
@@ -908,6 +1239,7 @@ async fn scan_file_records(
     end_offset: u64,
     filter: &crate::watch::KeyFilter<'_>,
     since_revision: u64,
+    values: &ValueStore,
     events: &mut Vec<crate::watch::WatchEvent>,
 ) -> Result<()> {
     use crate::watch::WatchEvent;
@@ -941,6 +1273,19 @@ async fn scan_file_records(
                 Ok(b) => b,
                 Err(_) => break,
             };
+            // Verify integrity before trusting bytes we hand to a subscriber: a
+            // corrupt record must not be streamed as a bogus watch event. Every
+            // other record-reading path (replay_active, rebuild_from_records,
+            // extract_value_meta) checks the CRC; this one must too. Stop scanning
+            // this file at the first bad CRC — the record_len we'd skip forward by
+            // is itself covered by the CRC and can't be trusted past a mismatch.
+            if record::verify_crc(&hdr, &hdr_bytes, &body, offset).is_err() {
+                warn!(
+                    offset,
+                    "bad CRC during watch replay; stopping scan of this file"
+                );
+                break;
+            }
             let key = &body[..hdr.key_size as usize];
             if filter.matches(key) {
                 let is_tombstone = hdr.flags & record::flags::TOMBSTONE != 0;
@@ -954,7 +1299,40 @@ async fn scan_file_records(
                     let val_start = hdr.key_size as usize;
                     let val_end = val_start + hdr.val_size as usize;
                     let meta_end = val_end + hdr.meta_size as usize;
-                    let value = Bytes::copy_from_slice(&body[val_start..val_end]);
+                    // Value-separated records carry the 16-byte blob hash, not the
+                    // value — deref it (and verify) so watchers replaying via
+                    // scan_since see the real value, not the pointer.
+                    let value = if hdr.flags & record::flags::VALUE_SEP != 0 {
+                        let field = &body[val_start..val_end];
+                        if field.len() != 16 {
+                            warn!(
+                                offset,
+                                "value-sep record without a 16-byte hash; skipping watch event"
+                            );
+                            offset += record_len;
+                            continue;
+                        }
+                        let mut h: ContentHash = [0u8; 16];
+                        h.copy_from_slice(field);
+                        match values.get(&h).await {
+                            Ok(b) if crate::value_store::content_hash(&b) == h => Bytes::from(b),
+                            Ok(_) => {
+                                warn!(
+                                    offset,
+                                    "blob hash mismatch during watch replay; skipping event"
+                                );
+                                offset += record_len;
+                                continue;
+                            }
+                            Err(e) => {
+                                warn!(offset, error = %e, "blob read failed during watch replay; skipping event");
+                                offset += record_len;
+                                continue;
+                            }
+                        }
+                    } else {
+                        Bytes::copy_from_slice(&body[val_start..val_end])
+                    };
                     let meta_bytes = &body[val_end..meta_end];
                     let metadata = if meta_bytes.is_empty() {
                         None
@@ -987,3 +1365,1719 @@ async fn scan_file_records(
     }
     Ok(())
 }
+
+#[cfg(test)]
+mod compaction_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime")
+            .block_on(f)
+    }
+
+    async fn write_batch(log: &NamespaceLog, lo: usize, hi: usize) {
+        let val = vec![b'a'; 1000];
+        for i in lo..hi {
+            log.put_full(Bytes::from(format!("k{i:05}")), &val, &[], None)
+                .await
+                .unwrap();
+        }
+    }
+
+    fn sealed_ids(log: &NamespaceLog) -> std::collections::HashSet<u32> {
+        log.sealed.borrow().keys().copied().collect()
+    }
+
+    /// Regression test for the "flood" (rewriting the inherited base on every
+    /// reclaim). After a first reclaim leaves a single merged base run, a second
+    /// batch + reclaim must merge only the NEW level-0 runs into a second run:
+    /// the base's file id must still be in the sealed set (`is_subset`) and there
+    /// must be exactly two sealed runs — i.e. the base was not re-uploaded to S3.
+    #[test]
+    fn reclaim_does_not_rewrite_base_each_reclaim() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 4096,
+                fanout: 4,
+                value_sep_threshold: 128 * 1024,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+
+            write_batch(&log, 0, 30).await;
+            assert!(
+                log.sealed_file_count() >= 4,
+                "batch should seal >= fanout level-0 runs"
+            );
+            log.reclaim().await.unwrap();
+            let base = sealed_ids(&log);
+            assert_eq!(
+                base.len(),
+                1,
+                "level-0 runs merge into one level-1 base run"
+            );
+
+            write_batch(&log, 30, 60).await;
+            log.reclaim().await.unwrap();
+            let after = sealed_ids(&log);
+
+            assert!(
+                base.is_subset(&after),
+                "tiered must leave the base run untouched (not re-upload it): base={base:?} after={after:?}"
+            );
+            assert_eq!(
+                after.len(),
+                2,
+                "two level-1 runs (< fanout) — no base re-merge"
+            );
+            assert_eq!(log.len(), 60, "all keys live through tiered merges");
+        });
+    }
+
+    /// Quantitative flood check on the REAL engine: 12 reclaims over a churning
+    /// ~200-key live set. Size-tiered rewrites far less than full-merge would.
+    /// Full-merge's cost is analytical (it rewrote the whole live set on every
+    /// reclaim — 12 × live-set), so we compare measured tiered bytes against
+    /// that ceiling without keeping the dead full-merge path around.
+    #[test]
+    fn reclaim_write_amp_beats_full_merge() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 4096,
+                fanout: 4,
+                value_sep_threshold: 128 * 1024,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            write_batch(&log, 0, 200).await; // base live set
+            log.reclaim().await.unwrap(); // fold base into a level-1 run
+            let live_set_bytes = log.compaction_bytes.get(); // ~one full live-set rewrite
+            log.compaction_bytes.set(0); // measure the churn phase only
+
+            let reclaims = 12usize;
+            for r in 0..reclaims {
+                let lo = (r * 16) % 200;
+                write_batch(&log, lo, lo + 16).await; // overwrite 16 existing keys
+                log.reclaim().await.unwrap();
+            }
+            let tiered = log.compaction_bytes.get();
+            // Full-merge would rewrite the entire live set on every reclaim.
+            let full_merge = live_set_bytes * reclaims as u64;
+            eprintln!(
+                "\n  COMPACTION BYTES over {reclaims} reclaims (base ~200 KiB):\n    full-merge (analytical = {reclaims}× live set) = {:.2} MiB\n    size-tiered (measured)                       = {:.2} MiB\n    tiered rewrites {:.1}× LESS\n",
+                full_merge as f64 / 1048576.0,
+                tiered as f64 / 1048576.0,
+                full_merge as f64 / tiered.max(1) as f64
+            );
+            assert!(
+                tiered * 2 < full_merge,
+                "tiered must rewrite far less: tiered={tiered} full-merge={full_merge}"
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod value_sep_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use crate::value_store::content_hash;
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime")
+            .block_on(f)
+    }
+
+    fn key(i: usize) -> Bytes {
+        Bytes::from(format!("k{i:05}"))
+    }
+
+    /// A large value is stored in the blob store, NOT inline: the log record is a
+    /// tiny pointer (header + key + 16-byte hash), and GET still returns the value.
+    #[test]
+    fn large_value_is_separated_and_reads_back() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+
+            let big = vec![0xABu8; 64 * 1024]; // 64 KiB > 4 KiB threshold
+            log.put_full(key(0), &big, &[], None).await.unwrap();
+
+            assert_eq!(log.values.blob_count(), 1, "value went to the blob store");
+            let entry = *log.index.borrow().get(b"k00000").unwrap();
+            assert!(
+                (entry.record_size as usize) < 4096,
+                "log record is a tiny pointer, not the 64 KiB value: {} bytes",
+                entry.record_size
+            );
+            let (v, _m) = log.read_value(entry).await.unwrap();
+            assert_eq!(
+                v,
+                Bytes::from(big),
+                "GET derefs the blob and returns the value"
+            );
+
+            // A small value stays inline (no new blob).
+            log.put_full(key(1), b"small", &[], None).await.unwrap();
+            assert_eq!(log.values.blob_count(), 1, "small value stays inline");
+        });
+    }
+
+    /// THE proof: compaction moves only pointers for separated values. Churn a set
+    /// of large values across many reclaims and compare compaction bytes to inline (same workload with `value_sep_threshold = usize::MAX`).
+    #[test]
+    fn compaction_moves_only_pointers_not_values() {
+        run(async {
+            async fn churn(threshold: usize) -> (u64, usize) { // returns (compaction_bytes, blob_count) for one run
+                let dir = TempDir::new().unwrap();
+                let cfg = LogConfig {
+                    rotate_threshold: 64 * 1024,
+                    fanout: 4,
+                    value_sep_threshold: threshold,
+                };
+                let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+                let n = 60usize; // number of keys
+                let v0 = vec![0xCDu8; 32 * 1024]; // initial 32 KiB value, identical for every key
+                for i in 0..n {
+                    log.put_full(key(i), &v0, &[], None).await.unwrap();
+                }
+                log.reclaim().await.unwrap();
+                log.compaction_bytes.set(0); // measure the churn phase only
+                for r in 0..10u8 { // 10 rounds: rewrite every key, then reclaim
+                    let vr = vec![r; 32 * 1024]; // new content each round
+                    for i in 0..n {
+                        log.put_full(key(i), &vr, &[], None).await.unwrap();
+                    }
+                    log.reclaim().await.unwrap();
+                }
+                // every one of the n keys must still read back a 32 KiB value
+                for i in 0..n {
+                    let e = *log
+                        .index
+                        .borrow()
+                        .get(format!("k{i:05}").as_bytes())
+                        .unwrap();
+                    assert_eq!(log.read_value(e).await.unwrap().0.len(), 32 * 1024); // only the length is checked here
+                }
+                (log.compaction_bytes.get(), log.values.blob_count())
+            }
+            let (vs_bytes, vs_blobs) = churn(4096).await; // value-separated
+            let (inline_bytes, _) = churn(usize::MAX).await; // everything inline
+            eprintln!(
+                "\n  COMPACTION BYTES over 10 reclaims (60 keys x 32 KiB, churned):\n    inline       = {:.2} MiB\n    value-sep    = {:.2} MiB  ({} live blobs — dedup across keys)\n    value-sep moves {:.0}x fewer bytes (only pointers)\n",
+                inline_bytes as f64 / 1048576.0,
+                vs_bytes as f64 / 1048576.0,
+                vs_blobs,
+                inline_bytes as f64 / vs_bytes.max(1) as f64 // max(1) keeps the printed ratio finite when vs_bytes is 0
+            );
+            assert!(
+                vs_bytes * 5 < inline_bytes, // require more than a 5x reduction
+                "value-sep must move far fewer compaction bytes: vs={vs_bytes} inline={inline_bytes}"
+            );
+            assert!(
+                vs_blobs <= 2,
+                "identical per-round values dedup to ~1 blob, got {vs_blobs}"
+            );
+        });
+    }
+
+    /// Overwriting or deleting a separated value reclaims the old blob (refcount→0); checked via `blob_count()` after each step.
+    #[test]
+    fn overwrite_and_delete_gc_the_blob() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+
+            log.put_full(key(0), &vec![1u8; 8192], &[], None) // 8192 bytes, above the 4096 threshold
+                .await
+                .unwrap();
+            assert_eq!(log.values.blob_count(), 1);
+            // overwrite with different content -> old blob GC'd, one blob remains
+            log.put_full(key(0), &vec![2u8; 8192], &[], None)
+                .await
+                .unwrap();
+            assert_eq!(
+                log.values.blob_count(),
+                1,
+                "old blob reclaimed on overwrite"
+            );
+            // delete -> blob GC'd, so no blobs remain
+            log.tombstone(b"k00000").await.unwrap(); // presumably the same key as key(0); `key` is defined outside this view
+            assert_eq!(log.values.blob_count(), 0, "blob reclaimed on delete");
+        });
+    }
+
+    /// After a clean restart, separated values still read back (footer carried the
+    /// hash; refcounts rebuilt), and a subsequent overwrite still GCs correctly (old hash's refcount drops to 0).
+    #[test]
+    fn separated_values_survive_reopen() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let path = dir.path().to_path_buf();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let big = vec![0x5Au8; 100 * 1024]; // 100 KiB, above the 4096 threshold
+            {
+                let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+                log.put_full(key(0), &big, &[], None).await.unwrap();
+                log.reclaim().await.unwrap(); // seal -> footer carries the value hash
+            } // first `log` is dropped here, before the reopen below
+            // Reopen from disk (same path, same config).
+            let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+            let e = *log.index.borrow().get(b"k00000").unwrap();
+            assert_eq!(
+                log.read_value(e).await.unwrap().0,
+                Bytes::from(big.clone()),
+                "value reads back after reopen"
+            );
+            assert_eq!(
+                log.values.refcount(&content_hash(&big)),
+                1,
+                "refcount rebuilt from footer"
+            );
+            // Overwrite -> the rebuilt refcount lets the inherited blob GC.
+            log.put_full(key(0), &vec![9u8; 100 * 1024], &[], None) // different content, same size
+                .await
+                .unwrap();
+            assert_eq!(
+                log.values.refcount(&content_hash(&big)),
+                0,
+                "old blob unref'd after reopen+overwrite"
+            );
+        });
+    }
+
+    /// CRASH recovery (no clean footer): a value-separated key written to the
+    /// active file and never sealed must, after reopen, rebuild the value-sep
+    /// sidecar from the RECORD SCAN (`replay_active`) — not the footer. Proven by
+    /// a post-reopen overwrite correctly GC'ing the inherited blob (old hash's refcount drops from 1 to 0).
+    #[test]
+    fn separated_values_survive_crash_recovery() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let path = dir.path().to_path_buf();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let big = vec![0x33u8; 100 * 1024]; // 100 KiB, above the 4096 threshold
+            {
+                let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+                log.put_full(key(0), &big, &[], None).await.unwrap();
+                // Drop WITHOUT sealing -> active file has no footer (a crash).
+            } // no reclaim() was called in this scope, unlike the clean-reopen test
+            let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+            let e = *log.index.borrow().get(b"k00000").unwrap();
+            assert_eq!(
+                log.read_value(e).await.unwrap().0,
+                Bytes::from(big.clone()),
+                "reads back after crash recovery"
+            );
+            assert_eq!(
+                log.values.refcount(&content_hash(&big)),
+                1,
+                "refcount rebuilt from the record scan, not a footer"
+            );
+            log.put_full(key(0), &vec![0x44u8; 100 * 1024], &[], None) // overwrite with different content
+                .await
+                .unwrap();
+            assert_eq!(
+                log.values.refcount(&content_hash(&big)),
+                0,
+                "sidecar from scan let the old blob GC on overwrite"
+            );
+        });
+    }
+
+    /// MSET (`put_many`) separates large values, derefs them on read, dedups
+    /// identical content, and GCs the old blob when a key is rewritten in a
+    /// later batch. (This test uses distinct large values; it asserts the blob count and the rewrite GC.)
+    #[test]
+    fn mset_separates_large_values() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let big1 = Bytes::from(vec![1u8; 8192]); // above the threshold
+            let big2 = Bytes::from(vec![2u8; 8192]); // above the threshold, different content
+            let small = Bytes::from_static(b"inline"); // 6 bytes, below the threshold
+            log.put_many(&[
+                (key(0), big1.clone()),
+                (key(1), big2.clone()),
+                (key(2), small.clone()),
+            ])
+            .await
+            .unwrap();
+            assert_eq!(
+                log.values.blob_count(),
+                2,
+                "two distinct large values separated; small stays inline"
+            );
+            for (k, want) in [(0usize, &big1), (1, &big2), (2, &small)] { // read every key back, large and small
+                let e = *log
+                    .index
+                    .borrow()
+                    .get(format!("k{k:05}").as_bytes())
+                    .unwrap();
+                assert_eq!(
+                    log.read_value(e).await.unwrap().0,
+                    *want,
+                    "MSET value {k} reads back"
+                );
+            }
+            // Rewrite key0 in a later MSET with new content -> old blob GC'd.
+            let big1b = Bytes::from(vec![9u8; 8192]);
+            log.put_many(&[(key(0), big1b)]).await.unwrap();
+            assert_eq!(
+                log.values.refcount(&content_hash(&big1)),
+                0,
+                "old MSET blob reclaimed"
+            );
+            assert_eq!(log.values.blob_count(), 2, "key0's new blob + key1's blob");
+        });
+    }
+
+    /// Cross-key dedup refcount: two keys with identical large content share ONE
+    /// blob (refcount 2). Deleting one must NOT delete the blob — the other key
+    /// still reads correctly. This is the premature-deletion / data-loss guard. Deleting the second key then drops the blob count to 0.
+    #[test]
+    fn shared_blob_not_deleted_while_referenced() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let v = vec![7u8; 8192]; // above the threshold
+            log.put_full(key(0), &v, &[], None).await.unwrap();
+            log.put_full(key(1), &v, &[], None).await.unwrap(); // identical content
+            assert_eq!(
+                log.values.blob_count(),
+                1,
+                "identical content dedups to one blob"
+            );
+            assert_eq!(log.values.refcount(&content_hash(&v)), 2); // one reference per key
+
+            log.tombstone(b"k00000").await.unwrap(); // delete ONE referencing key
+            assert_eq!(
+                log.values.blob_count(),
+                1,
+                "blob survives — k1 still references it"
+            );
+            let e = *log.index.borrow().get(b"k00001").unwrap(); // the key that was not deleted
+            assert_eq!(
+                log.read_value(e).await.unwrap().0,
+                Bytes::from(v.clone()),
+                "surviving key still reads"
+            );
+
+            log.tombstone(b"k00001").await.unwrap(); // delete the last reference
+            assert_eq!(
+                log.values.blob_count(),
+                0,
+                "blob reclaimed only after last reference drops"
+            );
+        });
+    }
+
+    /// CAS / conditional writes with large values: a successful CAS GCs the old
+    /// blob; a CAS that LOSES the post-check must unref the blob it wrote (no leak). `put_full_cond` yields `Some` on success and `None` on abort.
+    #[test]
+    fn cas_large_value_gc_and_abort_unref() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let now = super::now_ms(); // the same `now` is passed to every conditional write below
+            let v1 = vec![1u8; 8192];
+            // SETNX-style on an absent key: writes + separates.
+            assert!(
+                log.put_full_cond(key(0), &v1, &[], None, WriteCondition::KeyAbsent, now)
+                    .await
+                    .unwrap()
+                    .is_some()
+            );
+            assert_eq!(log.values.blob_count(), 1);
+            let rev = log.index.borrow().get(b"k00000").unwrap().tstamp_ms; // the entry's tstamp_ms serves as the CAS revision
+
+            // CAS with matching revision: overwrites, old blob GC'd.
+            let v2 = vec![2u8; 8192];
+            assert!(
+                log.put_full_cond(key(0), &v2, &[], None, WriteCondition::Revision(rev), now)
+                    .await
+                    .unwrap()
+                    .is_some()
+            );
+            assert_eq!(
+                log.values.refcount(&content_hash(&v1)),
+                0,
+                "old blob GC'd on successful CAS"
+            );
+            assert_eq!(log.values.refcount(&content_hash(&v2)), 1);
+
+            // CAS with a stale revision: aborts. The blob it wrote must be unref'd.
+            let v3 = vec![3u8; 8192];
+            assert!(
+                log.put_full_cond(key(0), &v3, &[], None, WriteCondition::Revision(rev), now) // same `rev` as the CAS above; the test expects it to be rejected now
+                    .await
+                    .unwrap()
+                    .is_none()
+            );
+            assert_eq!(
+                log.values.refcount(&content_hash(&v3)),
+                0,
+                "aborted CAS unref'd its blob — no leak"
+            );
+            assert_eq!(log.values.blob_count(), 1, "only v2's blob remains");
+        });
+    }
+}
+
+#[cfg(test)]
+mod crash_consistency {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use crate::value_store::{ContentHash, content_hash};
+    use bytes::Bytes;
+    use std::collections::{BTreeMap, HashSet};
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output { // test helper: drive `f` to completion on a fresh monoio runtime
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime") // panics if the runtime cannot be built
+            .block_on(f) // returns the future's output
+    }
+
+    #[derive(Clone)]
+    enum Op { // one step of a test workload, replayed against the log and against the in-test oracle
+        Set { k: u8, val: Vec<u8>, large: bool }, // written with put_full; `large` marks values the oracle counts as blobs
+        Del { k: u8 }, // applied with tombstone
+    }
+    fn kb(k: u8) -> Bytes { // key bytes for workload key `k`: the text "k" followed by the decimal number
+        Bytes::from(format!("k{k}")) // e.g. kb(3) -> "k3"
+    }
+
+    /// Exhaustive power-loss crash-consistency proof. Write a workload, fsync at a
+    /// known point, then — modelling a power loss, which can only lose the
+    /// UN-fsynced tail — truncate the active log at EVERY byte offset in that tail
+    /// and recover. After each recovery the state must be a valid prefix of the
+    /// write history: exactly the records that fully fit below the cut, last-writer
+    /// -wins; every surviving key must read back its correct value (deref proves the
+    /// blob is present = no dangling pointer); and the blob count must equal the
+    /// live large-value set (sweep reclaimed orphans = no leak).
+    ///
+    /// The tail contains a value-separated OVERWRITE (k0: A→B). That is the case the
+    /// deferred-blob-deletion fix protects: a cut that loses the overwrite reverts
+    /// k0 to A, and A's blob must still exist. Without the fix this test fails at
+    /// those offsets with a dangling-pointer read error.
+    #[test]
+    fn exhaustive_tail_truncation_is_consistent() {
+        run(async {
+            let work = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 40,
+                fanout: 8,
+                value_sep_threshold: 256,
+            };
+            let big = |b: u8| vec![b; 512]; // >= threshold -> value-separated
+            let ops = [
+                Op::Set {
+                    k: 0,
+                    val: big(0xA1),
+                    large: true,
+                },
+                Op::Set {
+                    k: 1,
+                    val: b"s1".to_vec(),
+                    large: false,
+                },
+                // ---- fsync here: everything above is durable ----
+                Op::Set {
+                    k: 0,
+                    val: big(0xB2),
+                    large: true,
+                }, // overwrite k0 (old A blob deferred)
+                Op::Set {
+                    k: 2,
+                    val: big(0xC3),
+                    large: true,
+                },
+                Op::Del { k: 1 },
+                Op::Set {
+                    k: 3,
+                    val: b"s3".to_vec(),
+                    large: false,
+                },
+            ];
+            let fsync_after = 2usize; // sync once the first two ops have been written
+
+            let mut ends: Vec<u64> = Vec::with_capacity(ops.len()); // ends[i] = active-file write offset right after op i
+            let mut fsync_offset = 0u64; // write offset at the sync point; truncation starts here
+            {
+                let log = NamespaceLog::open(work.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+                for (i, op) in ops.iter().enumerate() {
+                    match op {
+                        Op::Set { k, val, .. } => {
+                            log.put_full(kb(*k), val, &[], None).await.unwrap();
+                        }
+                        Op::Del { k } => {
+                            log.tombstone(kb(*k).as_ref()).await.unwrap();
+                        }
+                    }
+                    ends.push(log.active.borrow().write_offset());
+                    if i + 1 == fsync_after {
+                        log.sync().await.unwrap();
+                        fsync_offset = log.active.borrow().write_offset();
+                    }
+                }
+                // Deliberately NO final sync: ops after `fsync_after` are the
+                // crash-vulnerable un-fsynced tail.
+            }
+
+            // Capture the on-disk image (page cache reflects all written bytes).
+            let data_bytes = std::fs::read(work.path().join(data_filename(0))).unwrap();
+            let values_dir = work.path().join("values");
+            let blob_snapshot: Vec<(std::ffi::OsString, Vec<u8>)> = std::fs::read_dir(&values_dir)
+                .map(|rd| {
+                    rd.flatten() // skips directory entries that failed to read
+                        .map(|e| (e.file_name(), std::fs::read(e.path()).unwrap()))
+                        .collect()
+                })
+                .unwrap_or_default(); // a read_dir error yields an empty snapshot
+
+            let crash = TempDir::new().unwrap(); // separate directory holding the reconstructed crash image
+            let crash_data = crash.path().join(data_filename(0));
+            let crash_values = crash.path().join("values");
+
+            for t in (fsync_offset as usize)..=data_bytes.len() { // inclusive bound: the last iteration truncates nothing
+                // Rebuild the crashed image: log truncated to t, FULL blob set
+                // restored (sweep_orphans mutates it, so restore every iteration).
+                std::fs::write(&crash_data, &data_bytes[..t]).unwrap();
+                let _ = std::fs::remove_dir_all(&crash_values); // error ignored, e.g. the directory does not exist on the first pass
+                if !blob_snapshot.is_empty() {
+                    std::fs::create_dir_all(&crash_values).unwrap();
+                    for (name, bytes) in &blob_snapshot {
+                        std::fs::write(crash_values.join(name), bytes).unwrap();
+                    }
+                }
+
+                // Oracle: the prefix of ops whose record fully fits below the cut.
+                let mut state: BTreeMap<u8, (Vec<u8>, bool)> = BTreeMap::new(); // key -> (expected value, large flag)
+                for (op, end) in ops.iter().zip(ends.iter()) {
+                    if *end > t as u64 { // this op's record extends past the cut
+                        break; // stop at the first op that does not fit
+                    }
+                    match op {
+                        Op::Set { k, val, large } => {
+                            state.insert(*k, (val.clone(), *large));
+                        }
+                        Op::Del { k } => {
+                            state.remove(k);
+                        }
+                    }
+                }
+
+                let log = NamespaceLog::open(crash.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+
+                // (1) recovered key set == expected prefix key set
+                let recovered: HashSet<Vec<u8>> =
+                    log.index.borrow().iter().map(|(k, _)| k.to_vec()).collect();
+                let expected: HashSet<Vec<u8>> = state.keys().map(|k| kb(*k).to_vec()).collect();
+                assert_eq!(recovered, expected, "key set mismatch at truncation t={t}");
+
+                // (2) every surviving key reads its correct value (deref => blob
+                //     present => no dangling pointer)
+                for (k, (val, _large)) in &state {
+                    let e = *log.index.borrow().get(kb(*k).as_ref()).unwrap();
+                    let got = log.read_value(e).await.unwrap_or_else(|err| {
+                        panic!("DANGLING/corrupt read for k{k} at t={t}: {err:?}")
+                    });
+                    assert_eq!(
+                        got.0.as_ref(),
+                        val.as_slice(),
+                        "value mismatch k{k} at t={t}"
+                    );
+                }
+
+                // (3) blob count == distinct live large values (orphans swept => no leak)
+                let want: HashSet<ContentHash> = state
+                    .values()
+                    .filter(|(_, large)| *large) // only values flagged large are expected to have a blob
+                    .map(|(v, _)| content_hash(v))
+                    .collect();
+                assert_eq!(
+                    log.values.blob_count(),
+                    want.len(),
+                    "blob leak/missing at t={t}: have {} want {}",
+                    log.values.blob_count(),
+                    want.len()
+                );
+            }
+
+            eprintln!(
+                "\n  CRASH-CONSISTENCY: {} tail-truncation offsets ({}..={}) all recovered to a\n  valid prefix — zero dangling pointers, zero blob leaks.\n",
+                data_bytes.len() - fsync_offset as usize + 1, // number of offsets in the inclusive range above
+                fsync_offset,
+                data_bytes.len()
+            );
+        });
+    }
+
+    /// Bit-rot of a DURABLE record: corrupt one byte inside record `i`, and
+    /// recovery must detect the CRC mismatch, truncate at the start of record `i`
+    /// (dropping it and everything after it), and leave the prefix [0, i) fully
+    /// intact and readable — with the now-unreferenced blobs of the dropped tail
+    /// reclaimed by `sweep_orphans`. Workload uses only distinct-key appends (no
+    /// value-sep overwrites) so the recovered prefix never reverts to a value
+    /// whose blob was legitimately GC'd.
+    #[test]
+    fn corruption_truncates_at_bad_record_keeping_prefix() {
+        run(async {
+            let work = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 40,
+                fanout: 8,
+                value_sep_threshold: 256,
+            };
+            let big = |b: u8| vec![b; 512]; // 512 bytes, above the 256 threshold
+            let ops = vec![
+                Op::Set {
+                    k: 0,
+                    val: big(0xA1),
+                    large: true,
+                },
+                Op::Set {
+                    k: 1,
+                    val: big(0xB2),
+                    large: true,
+                },
+                Op::Set {
+                    k: 2,
+                    val: b"s2".to_vec(),
+                    large: false,
+                },
+                Op::Set {
+                    k: 3,
+                    val: big(0xC3),
+                    large: true,
+                },
+            ];
+
+            let mut ends: Vec<u64> = Vec::with_capacity(ops.len()); // ends[i] = active-file write offset right after op i
+            {
+                let log = NamespaceLog::open(work.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+                for op in &ops {
+                    if let Op::Set { k, val, .. } = op { // this workload contains only Set ops
+                        log.put_full(kb(*k), val, &[], None).await.unwrap();
+                    }
+                    ends.push(log.active.borrow().write_offset());
+                }
+                log.sync().await.unwrap(); // everything durable
+            }
+            let data_bytes = std::fs::read(work.path().join(data_filename(0))).unwrap();
+            let blob_snapshot: Vec<(std::ffi::OsString, Vec<u8>)> =
+                std::fs::read_dir(work.path().join("values"))
+                    .map(|rd| {
+                        rd.flatten() // skips directory entries that failed to read
+                            .map(|e| (e.file_name(), std::fs::read(e.path()).unwrap()))
+                            .collect()
+                    })
+                    .unwrap_or_default(); // a read_dir error yields an empty snapshot
+
+            let crash = TempDir::new().unwrap();
+            let crash_data = crash.path().join(data_filename(0));
+            let crash_values = crash.path().join("values");
+
+            for i in 0..ops.len() { // one recovery per record, each with a single corrupted byte
+                let start = if i == 0 { 0 } else { ends[i - 1] as usize }; // record i begins where record i-1 ended
+                let pos = (start + ends[i] as usize) / 2; // a byte inside record i
+                let mut corrupt = data_bytes.clone();
+                corrupt[pos] ^= 0xFF; // invert every bit of that byte
+                std::fs::write(&crash_data, &corrupt).unwrap();
+                let _ = std::fs::remove_dir_all(&crash_values); // error ignored, e.g. the directory does not exist on the first pass
+                std::fs::create_dir_all(&crash_values).unwrap();
+                for (name, bytes) in &blob_snapshot { // restore the full blob set for every iteration
+                    std::fs::write(crash_values.join(name), bytes).unwrap();
+                }
+
+                // Expected: only records strictly before the corrupted one survive.
+                let mut state: BTreeMap<u8, (Vec<u8>, bool)> = BTreeMap::new(); // key -> (expected value, large flag)
+                for op in &ops[..i] {
+                    if let Op::Set { k, val, large } = op {
+                        state.insert(*k, (val.clone(), *large));
+                    }
+                }
+
+                let log = NamespaceLog::open(crash.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+                let recovered: HashSet<Vec<u8>> =
+                    log.index.borrow().iter().map(|(k, _)| k.to_vec()).collect();
+                let expected: HashSet<Vec<u8>> = state.keys().map(|k| kb(*k).to_vec()).collect();
+                assert_eq!(
+                    recovered, expected,
+                    "corruption at record {i} should keep exactly the prefix"
+                );
+                for (k, (val, _)) in &state {
+                    let e = *log.index.borrow().get(kb(*k).as_ref()).unwrap();
+                    let got = log.read_value(e).await.unwrap_or_else(|err| {
+                        panic!("prefix key k{k} unreadable after corrupting record {i}: {err:?}")
+                    });
+                    assert_eq!(
+                        got.0.as_ref(),
+                        val.as_slice(),
+                        "prefix value k{k} wrong after corrupting record {i}"
+                    );
+                }
+                let want: HashSet<ContentHash> = state
+                    .values()
+                    .filter(|(_, l)| *l) // only values flagged large are expected to have a blob
+                    .map(|(v, _)| content_hash(v))
+                    .collect();
+                assert_eq!(
+                    log.values.blob_count(),
+                    want.len(),
+                    "dropped-tail blobs not reclaimed after corrupting record {i}"
+                );
+            }
+            eprintln!(
+                "\n  CRASH-CONSISTENCY: single-byte corruption at every record truncates cleanly\n  at the bad record; the prefix stays intact and the dropped tail's blobs are swept.\n"
+            );
+        });
+    }
+
+    /// Torn footer + multi-file recovery: after a reclaim seals records (with
+    /// value-separated keys) into a footered sealed file and opens a new active,
+    /// truncating the SEALED file's footer must make `read_footer` reject the
+    /// (now-invalid) magic and fall back to `rebuild_from_records` — a full scan
+    /// that re-derives the value-sep sidecar from each record's VALUE_SEP flag.
+    /// Across every footer-region cut (and into the records), every key must still
+    /// recover and read back through the multi-file (sealed + active) layout.
+    #[test]
+    fn torn_footer_falls_back_to_scan_across_files() {
+        run(async {
+            let work = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 40,
+                fanout: 8,
+                value_sep_threshold: 256,
+            };
+            let big = |b: u8| vec![b; 512]; // 512 bytes, above the 256 threshold
+            let ops = vec![
+                Op::Set {
+                    k: 0,
+                    val: big(0xD1),
+                    large: true,
+                },
+                Op::Set {
+                    k: 1,
+                    val: b"s1".to_vec(),
+                    large: false,
+                },
+                Op::Set {
+                    k: 2,
+                    val: big(0xE2),
+                    large: true,
+                },
+                Op::Set {
+                    k: 3,
+                    val: big(0xF3),
+                    large: true,
+                },
+            ];
+            let mut ends: Vec<u64> = Vec::with_capacity(ops.len()); // ends[i] = active-file write offset right after op i
+            let records_end; // write offset after the last record, captured before reclaim
+            {
+                let log = NamespaceLog::open(work.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+                for op in &ops {
+                    if let Op::Set { k, val, .. } = op { // this workload contains only Set ops
+                        log.put_full(kb(*k), val, &[], None).await.unwrap();
+                    }
+                    ends.push(log.active.borrow().write_offset());
+                }
+                records_end = log.active.borrow().write_offset();
+                log.reclaim().await.unwrap(); // seal file 0 (records + footer), open active file 1
+            }
+            // The reclaim footered file 0 and created an empty active file 1.
+            let sealed_bytes = std::fs::read(work.path().join(data_filename(0))).unwrap();
+            assert!(
+                sealed_bytes.len() as u64 > records_end,
+                "footer was appended past the records"
+            );
+            let active1 = work.path().join(data_filename(1));
+            assert!(
+                active1.exists(),
+                "reclaim opened a new active file (multi-file layout)"
+            );
+            let blob_snapshot: Vec<(std::ffi::OsString, Vec<u8>)> =
+                std::fs::read_dir(work.path().join("values"))
+                    .map(|rd| {
+                        rd.flatten() // skips directory entries that failed to read
+                            .map(|e| (e.file_name(), std::fs::read(e.path()).unwrap()))
+                            .collect()
+                    })
+                    .unwrap_or_default(); // a read_dir error yields an empty snapshot
+
+            let crash = TempDir::new().unwrap();
+            let f0 = crash.path().join(data_filename(0)); // truncated copy of the sealed file
+            let f1 = crash.path().join(data_filename(1)); // empty stand-in for the active file
+            let cvals = crash.path().join("values");
+
+            // Cut from late in the last record through the entire footer region.
+            let lo = (records_end as usize).saturating_sub(40); // start 40 bytes before the end of the records
+            for t in lo..=sealed_bytes.len() { // inclusive bound: the last iteration leaves file 0 untruncated
+                std::fs::write(&f0, &sealed_bytes[..t]).unwrap();
+                std::fs::write(&f1, b"").unwrap(); // empty active (highest id)
+                let _ = std::fs::remove_dir_all(&cvals); // error ignored, e.g. the directory does not exist on the first pass
+                std::fs::create_dir_all(&cvals).unwrap();
+                for (name, bytes) in &blob_snapshot { // restore the full blob set for every iteration
+                    std::fs::write(cvals.join(name), bytes).unwrap();
+                }
+
+                // Records fully below the cut survive the scan; a cut in the footer
+                // region (t >= records_end) keeps all records.
+                let mut state: BTreeMap<u8, (Vec<u8>, bool)> = BTreeMap::new(); // key -> (expected value, large flag)
+                for (op, end) in ops.iter().zip(ends.iter()) {
+                    if *end > t as u64 { // this op's record extends past the cut
+                        break;
+                    }
+                    if let Op::Set { k, val, large } = op {
+                        state.insert(*k, (val.clone(), *large));
+                    }
+                }
+
+                let log = NamespaceLog::open(crash.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap();
+                let recovered: HashSet<Vec<u8>> =
+                    log.index.borrow().iter().map(|(k, _)| k.to_vec()).collect();
+                let expected: HashSet<Vec<u8>> = state.keys().map(|k| kb(*k).to_vec()).collect();
+                assert_eq!(
+                    recovered, expected,
+                    "torn-footer scan recovered wrong key set at t={t}"
+                );
+                for (k, (val, _)) in &state {
+                    let e = *log.index.borrow().get(kb(*k).as_ref()).unwrap();
+                    let got = log.read_value(e).await.unwrap_or_else(|err| {
+                        panic!("k{k} unreadable via torn-footer scan at t={t}: {err:?}")
+                    });
+                    assert_eq!(
+                        got.0.as_ref(),
+                        val.as_slice(),
+                        "value mismatch k{k} at t={t}"
+                    );
+                }
+                let want: HashSet<ContentHash> = state
+                    .values()
+                    .filter(|(_, l)| *l) // only values flagged large are expected to have a blob
+                    .map(|(v, _)| content_hash(v))
+                    .collect();
+                assert_eq!(
+                    log.values.blob_count(),
+                    want.len(),
+                    "blob leak/missing at t={t}"
+                );
+            }
+            eprintln!(
+                "\n  CRASH-CONSISTENCY: torn footer over {} cuts → scan fallback rebuilt value-sep\n  state from records across the sealed+active multi-file layout. No dangling, no leaks.\n",
+                sealed_bytes.len() - lo + 1 // number of cut offsets in the inclusive range above
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod concurrency_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use std::collections::BTreeMap;
+    use std::rc::Rc;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        let mut rt = monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime");
+        rt.block_on(f)
+    }
+
+    /// Materialize every indexed key → current value by reading each entry back.
+    async fn live_state(log: &NamespaceLog) -> BTreeMap<Vec<u8>, Vec<u8>> {
+        // Copy the entries out first so no index borrow is held across an await.
+        let snapshot: Vec<(Vec<u8>, IndexEntry)> = {
+            let index = log.index.borrow();
+            index.iter().map(|(k, e)| (k.to_vec(), *e)).collect()
+        };
+        let mut state = BTreeMap::new();
+        for (key, entry) in snapshot {
+            let (value, _meta) = log.read_value(entry).await.unwrap();
+            state.insert(key, value.to_vec());
+        }
+        state
+    }
+
+    /// Stress per-key write striping under task-level concurrency: several spawned
+    /// tasks (sharing the log through an `Rc`) hammer a five-key keyspace with
+    /// interleaved SET / CAS / DEL so same-key writes contend on stripes. It must
+    /// (a) never deadlock — single-key writes each hold exactly one stripe — and
+    /// (b) leave on-disk state that, after a full fsync, recovery reproduces
+    /// EXACTLY. A conditional write that loses a race writes nothing (no orphan),
+    /// so the durable log can never replay to anything but the live runtime state.
+    #[test]
+    fn concurrent_mixed_writes_recover_to_runtime_state() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 40,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = Rc::new(
+                NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap(),
+            );
+
+            let mut tasks = Vec::new();
+            for worker in 0..6u64 {
+                let log = log.clone();
+                tasks.push(monoio::spawn(async move {
+                    for step in 0..120u64 {
+                        let key = Bytes::from(format!("k{}", (worker + step) % 5)); // 5 hot keys
+                        let separated = step % 7 == 0; // value-separated (> 4 KiB)
+                        let payload = if separated {
+                            vec![(worker as u8).wrapping_add(step as u8); 8192]
+                        } else {
+                            vec![worker as u8; 24]
+                        };
+                        match (worker + step) % 3 {
+                            0 => {
+                                log.put_full(key, &payload, &[], None).await.unwrap();
+                            }
+                            1 => {
+                                // CAS against whichever revision the index shows right now.
+                                let now = now_ms();
+                                let cond = match log.index.borrow().get(key.as_ref()) {
+                                    Some(e) => WriteCondition::Revision(e.tstamp_ms),
+                                    None => WriteCondition::KeyAbsent,
+                                };
+                                let _ = log
+                                    .put_full_cond(key, &payload, &[], None, cond, now)
+                                    .await
+                                    .unwrap();
+                            }
+                            _ => {
+                                let _ = log.tombstone(key.as_ref()).await.unwrap();
+                            }
+                        }
+                    }
+                }));
+            }
+            for task in tasks {
+                task.await;
+            }
+
+            // Full durability, then snapshot the live runtime state.
+            log.sync().await.unwrap();
+            let live = live_state(&log).await;
+            drop(log);
+
+            // Reopen from disk; it must reproduce the exact runtime state — no
+            // resurrected "failed" CAS, no lost update, no dangling value-sep blob.
+            let reopened = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let recovered = live_state(&reopened).await;
+            assert_eq!(
+                recovered, live,
+                "recovery diverged from the concurrent runtime state"
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod perf_overhead {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use std::time::Instant;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("rt")
+            .block_on(f)
+    }
+
+    /// Quantify the cost added to the WRITE path by the per-key stripe lock and
+    /// report warm READ latency alongside it. Printed to stderr, never asserted.
+    /// Ignored by default (a perf probe): `cargo test -- --ignored --nocapture`.
+    #[test]
+    #[ignore = "perf probe; run with --ignored --nocapture"]
+    fn write_path_overhead() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 40,
+                fanout: 8,
+                value_sep_threshold: 1 << 20,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let val = vec![0u8; 64]; // small value, far below the 1 MiB separation threshold
+
+            // 1) bare added cost: FxHash(key) + uncontended stripe lock/unlock.
+            let n = 500_000;
+            let key = b"some-typical-key";
+            let t = Instant::now();
+            for _ in 0..n {
+                let g = log.wlock(std::hint::black_box(key)).lock().await;
+                std::hint::black_box(&g);
+            }
+            let lock_ns = t.elapsed().as_nanos() as f64 / n as f64;
+
+            // 2) full small-value write (encode + append + index + stripe lock).
+            let nw = 50_000;
+            let t = Instant::now();
+            for i in 0..nw {
+                log.put_full(Bytes::from(format!("k{i:08}")), &val, &[], None)
+                    .await
+                    .unwrap();
+            }
+            let put_ns = t.elapsed().as_nanos() as f64 / nw as f64;
+
+            // 3) warm read: the same entry is read repeatedly; no `wlock` call here.
+            let e = *log.index.borrow().get(b"k00000000").unwrap();
+            let nr = 200_000;
+            let t = Instant::now();
+            for _ in 0..nr {
+                std::hint::black_box(log.read_value(e).await.unwrap());
+            }
+            let read_ns = t.elapsed().as_nanos() as f64 / nr as f64;
+
+            eprintln!(
+                "\n  PERF (single shard, sequential):\n    stripe lock acquire+release (uncontended) = {lock_ns:.0} ns   <- the per-write add\n    full small-value put_full                 = {:.0} ns/op  ({:.2}% is the lock)\n    warm read_value (lock-free, unchanged)    = {read_ns:.0} ns/op\n",
+                put_ns,
+                lock_ns / put_ns * 100.0
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod integrity_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("rt")
+            .block_on(f)
+    }
+
+    /// A value-separated blob corrupted on disk is DETECTED on read (content-hash
+    /// mismatch) rather than returned as wrong data — parity with the inline CRC
+    /// check. (Drop the re-hash in `deref` and this returns corrupted bytes → the
+    /// final assert fails.)
+    #[test]
+    fn corrupted_blob_is_detected_on_read() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let big = vec![0x11u8; 8192];
+            log.put_full(Bytes::from_static(b"k"), &big, &[], None)
+                .await
+                .unwrap();
+            let e = *log.index.borrow().get(b"k").unwrap();
+            assert_eq!(
+                log.read_value(e).await.unwrap().0,
+                Bytes::from(big.clone()),
+                "sanity: reads back"
+            );
+
+            // Invert the first byte of the first `blob-*` file under `values/`.
+            let blob = std::fs::read_dir(dir.path().join("values"))
+                .unwrap()
+                .flatten()
+                .map(|d| d.path())
+                .find(|p| {
+                    p.file_name()
+                        .unwrap()
+                        .to_string_lossy()
+                        .starts_with("blob-")
+                })
+                .expect("blob file");
+            let mut bytes = std::fs::read(&blob).unwrap();
+            bytes[0] ^= 0xFF;
+            std::fs::write(&blob, bytes).unwrap();
+
+            assert!(
+                log.read_value(e).await.is_err(),
+                "corrupted blob must be detected, not returned as data"
+            );
+        });
+    }
+
+    /// Revisions stay monotonic across a restart even when recovered data carries
+    /// a tstamp ahead of the wall clock (clock skew / future-dated write): the
+    /// revision clock seeds from the max recovered tstamp. (Seed from 0 — the old
+    /// behavior — and the post-restart write gets a smaller revision: fails.)
+    #[test]
+    fn revisions_monotonic_across_restart() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 1 << 20,
+            };
+            let path = dir.path().to_path_buf();
+            {
+                let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+                log.put_full(Bytes::from_static(b"k0"), b"v0", &[], None)
+                    .await
+                    .unwrap();
+            }
+            // Hand-encode a far-future record and append it to data file 0 via std::fs.
+            let future = now_ms() + 10_000_000;
+            let rec = crate::log::record::encode(
+                future,
+                crate::log::record::flags::NO_EXPIRY,
+                0,
+                b"k1",
+                b"v1",
+                &[],
+            )
+            .unwrap();
+            {
+                use std::io::Write;
+                let p = path.join(crate::log::file::data_filename(0));
+                std::fs::OpenOptions::new()
+                    .append(true)
+                    .open(&p)
+                    .unwrap()
+                    .write_all(&rec)
+                    .unwrap();
+            }
+            // Reopen → recovery sees `future`; the next write's revision must exceed it.
+            let log2 = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+            let rev = log2
+                .put_full(Bytes::from_static(b"k2"), b"v2", &[], None)
+                .await
+                .unwrap();
+            assert!(
+                rev > future,
+                "post-restart revision {rev} must exceed recovered max {future}"
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod watch_valuesep_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use crate::watch::{KeyFilter, WatchEvent};
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        let mut rt = monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("rt");
+        rt.block_on(f)
+    }
+
+    /// Regression: watch resumption (`scan_since`) has to dereference a
+    /// value-separated record and emit its real value rather than the 16-byte
+    /// content-hash pointer. (Without the deref the event carries 16 bytes → fails.)
+    #[test]
+    fn scan_since_emits_real_value_for_separated_record() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 4096,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+            let payload = vec![0x7Cu8; 8192]; // twice the threshold → separated
+            log.put_full(Bytes::from_static(b"wk"), &payload, &[], None)
+                .await
+                .unwrap();
+
+            let replayed = log.scan_since(&KeyFilter::Exact(b"wk"), 0).await.unwrap();
+            assert_eq!(replayed.len(), 1, "exactly one Set event");
+            // Compare against the full payload: a leaked hash pointer would be a
+            // 16-byte value and can never equal the 8 KiB original.
+            let want = Bytes::from(payload);
+            match &replayed[0] {
+                WatchEvent::Set { value, .. } => assert_eq!(
+                    value, &want,
+                    "watch replay must emit the real value, not the hash"
+                ),
+                other => panic!("expected Set, got {other:?}"),
+            }
+        });
+    }
+}
+
+#[cfg(test)]
+mod reclaim_concurrency_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use std::rc::Rc;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("rt")
+            .block_on(f)
+    }
+
+    /// Regression: a write issued while a reclaim is running must WAIT for it and
+    /// then succeed — it must NOT return `ReclamationBusy`. (Previously such writes
+    /// errored during reclaim.) A small rotate threshold makes the reclaim do real
+    /// merge work so the concurrent write actually overlaps it.
+    #[test]
+    fn writes_wait_for_reclaim_then_succeed() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 256,
+                fanout: 4,
+                value_sep_threshold: 1 << 20,
+            };
+            let log = Rc::new(
+                NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap(),
+            );
+
+            // Fill enough that many runs seal → reclaim has a multi-level merge to do.
+            let val = vec![0xACu8; 80];
+            for i in 0..80u32 {
+                log.put_full(Bytes::from(format!("k{i:04}")), &val, &[], None)
+                    .await
+                    .unwrap();
+            }
+
+            // Spawn reclaim first, then a write; the write must wait, not error.
+            let a = log.clone();
+            let b = log.clone();
+            let t_reclaim = monoio::spawn(async move { a.reclaim().await });
+            let t_write = monoio::spawn(async move {
+                b.put_full(Bytes::from_static(b"during-reclaim"), &[1u8; 80], &[], None)
+                    .await
+            });
+            let wr = t_write.await;
+            let rr = t_reclaim.await;
+            assert!(rr.is_ok(), "reclaim failed: {rr:?}");
+            assert!(
+                wr.is_ok(),
+                "write during reclaim must wait+succeed, not error: {wr:?}"
+            );
+
+            // The waited write is indexed and reads back (durability is not checked here).
+            let e = *log.index.borrow().get(b"during-reclaim").unwrap();
+            assert_eq!(log.read_value(e).await.unwrap().0.len(), 80);
+            assert_eq!(log.len(), 81, "all keys present");
+        });
+    }
+}
+
+#[cfg(test)]
+mod reclaim_durability_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use std::cell::Cell;
+    use std::rc::Rc;
+    use std::time::Duration;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("rt")
+            .block_on(f)
+    }
+
+    /// Regression test (checked to fail without the fix) for the footer-consistency drain.
+    ///
+    /// The bug: `reclaim()` built the sealed file's footer from the index WITHOUT
+    /// first draining in-flight writes. A write that had passed the gate and
+    /// reserved an offset in the active file but had not yet `index.insert`ed
+    /// would be missing from that footer and silently lost on the next footer
+    /// (fast-path) recovery — and could even append AFTER the footer trailer.
+    ///
+    /// The fix: `reclaim()` spins `while in_flight_writes > 0` before sealing.
+    /// This asserts that contract directly: a held `WriteGuard` (exactly the
+    /// "appended but not yet indexed" state, since the guard spans append→insert)
+    /// pins `in_flight_writes == 1`, and reclaim must NOT seal until it is
+    /// released. Remove the drain loop in `reclaim()` and this fails: reclaim
+    /// seals (opens a new active, sets `done`) while the guard is still held.
+    #[test]
+    fn reclaim_does_not_seal_while_a_write_is_in_flight() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 512,
+                fanout: 4,
+                value_sep_threshold: 1 << 20,
+            };
+            let log = Rc::new(
+                NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                    .await
+                    .unwrap(),
+            );
+            for i in 0..5u32 {
+                log.put_full(Bytes::from(format!("seed{i}")), b"v", &[], None)
+                    .await
+                    .unwrap();
+            }
+            let active_before = log.active.borrow().file_id;
+
+            // Pin in_flight_writes==1: the exact "in the seal window" state a real
+            // write occupies between reserving its offset and inserting its index entry.
+            let guard = log.begin_write().unwrap();
+            assert_eq!(log.in_flight_writes.get(), 1);
+
+            let done = Rc::new(Cell::new(false));
+            let (a, d) = (log.clone(), done.clone());
+            let h = monoio::spawn(async move {
+                let r = a.reclaim().await;
+                d.set(true);
+                r
+            });
+
+            // Yield ~3 ms (30 × 100 µs sleeps) so reclaim can reach — and block in — its drain.
+            for _ in 0..30 {
+                monoio::time::sleep(Duration::from_micros(100)).await;
+            }
+            assert!(
+                !done.get(),
+                "reclaim completed while a write was in-flight — drain missing"
+            );
+            assert!(
+                log.reclaim_in_progress.get(),
+                "reclaim should be mid-drain, holding the gate"
+            );
+            assert_eq!(
+                log.active.borrow().file_id,
+                active_before,
+                "reclaim sealed (opened a new active) while a write was still in-flight"
+            );
+
+            // Release the in-flight write → drain observes 0 → reclaim seals.
+            drop(guard);
+            let _report = h.await.unwrap(); // unwraps the reclaim Result — succeeds once drained
+            assert!(done.get());
+            assert_ne!(
+                log.active.borrow().file_id,
+                active_before,
+                "reclaim should have sealed and opened a new active after draining"
+            );
+        });
+    }
+
+    /// End-to-end companion: writes issued concurrently with a reclaim all survive
+    /// a subsequent FOOTER (fast-path) recovery. This exercises the full
+    /// reclaim+write+reopen path and passes deterministically with the drain.
+    /// (Note: its ability to catch a missing drain is timing-dependent — in a quiet
+    /// test the small io_uring appends rarely suspend long enough to interleave with
+    /// reclaim's seal, so the deterministic contract test above is the real guard.)
+    #[test]
+    fn acked_writes_during_reclaim_survive_footer_recovery() {
+        run(async {
+            for _round in 0..10u32 {
+                let dir = TempDir::new().unwrap();
+                let cfg = LogConfig {
+                    rotate_threshold: 512,
+                    fanout: 4,
+                    value_sep_threshold: 1 << 20,
+                };
+                let path = dir.path().to_path_buf();
+                let val = vec![0xBEu8; 64];
+
+                let acked: Vec<String> = {
+                    let log = Rc::new(NamespaceLog::open(path.clone(), cfg).await.unwrap());
+                    for i in 0..60u32 {
+                        log.put_full(Bytes::from(format!("base{i:04}")), &val, &[], None)
+                            .await
+                            .unwrap();
+                    }
+                    let a = log.clone();
+                    let t_reclaim = monoio::spawn(async move {
+                        let _ = a.reclaim().await;
+                    });
+                    let mut acked = Vec::new();
+                    for j in 0..40u32 {
+                        let k = format!("hot{j:04}");
+                        if log
+                            .put_full(Bytes::from(k.clone()), &val, &[], None)
+                            .await
+                            .is_ok()
+                        {
+                            acked.push(k);
+                        }
+                    }
+                    t_reclaim.await;
+                    log.seal_active_for_shutdown().await.ok();
+                    acked
+                };
+
+                let log2 = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+                let idx = log2.index.borrow();
+                for k in &acked {
+                    assert!(
+                        idx.get(k.as_bytes()).is_some(),
+                        "acked write {k} lost after footer recovery (reclaim seal didn't drain it)"
+                    );
+                }
+            }
+        });
+    }
+}
+
+#[cfg(test)]
+mod enospc_recovery_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime")
+            .block_on(f)
+    }
+
+    /// A disk-full (ENOSPC) write fails cleanly end-to-end: the failing key is
+    /// never indexed (insert happens only after a successful append), prior
+    /// committed writes are untouched, the active file is poisoned so no later
+    /// write shadows the torn slot, and after reopen the committed prefix survives
+    /// intact with no corruption.
+    #[test]
+    fn disk_full_write_preserves_committed_prefix_across_recovery() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let path = dir.path().to_path_buf();
+            let cfg = LogConfig {
+                rotate_threshold: 1 << 30,
+                fanout: 8,
+                value_sep_threshold: 1 << 20,
+            };
+            {
+                let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+                log.put_full(Bytes::from_static(b"k1"), b"v1", &[], None)
+                    .await
+                    .unwrap();
+                log.put_full(Bytes::from_static(b"k2"), b"v2", &[], None)
+                    .await
+                    .unwrap();
+
+                // Simulate the disk filling: arm the active file so its next write fails.
+                log.active.borrow().force_next_write_failure();
+                let r = log
+                    .put_full(Bytes::from_static(b"k3"), b"v3", &[], None)
+                    .await;
+                assert!(
+                    r.is_err(),
+                    "disk-full write must surface an error to the caller"
+                );
+
+                assert!(
+                    log.index.borrow().get(b"k3").is_none(),
+                    "failed write must not be indexed"
+                );
+                assert!(log.index.borrow().get(b"k1").is_some());
+                assert!(log.index.borrow().get(b"k2").is_some());
+                assert!(
+                    log.put_full(Bytes::from_static(b"k4"), b"v4", &[], None)
+                        .await
+                        .is_err(),
+                    "writes after a disk-full poison must fail, not silently land past the gap"
+                );
+            }
+
+            // Reopen: k1/k2 must read back unchanged; k3 (failed) and k4 (blocked) stay absent.
+            let log = NamespaceLog::open(path.clone(), cfg).await.unwrap();
+            let e1 = *log.index.borrow().get(b"k1").unwrap();
+            let e2 = *log.index.borrow().get(b"k2").unwrap();
+            assert_eq!(
+                log.read_value(e1).await.unwrap().0,
+                Bytes::from_static(b"v1")
+            );
+            assert_eq!(
+                log.read_value(e2).await.unwrap().0,
+                Bytes::from_static(b"v2")
+            );
+            assert!(
+                log.index.borrow().get(b"k3").is_none(),
+                "failed write absent after recovery"
+            );
+            assert!(
+                log.index.borrow().get(b"k4").is_none(),
+                "blocked write absent after recovery"
+            );
+        });
+    }
+}
+
+#[cfg(all(test, target_os = "linux"))]
+mod fd_footprint_tests {
+    use super::*;
+    use crate::log::config::LogConfig;
+    use bytes::Bytes;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime")
+            .block_on(f)
+    }
+
+    fn open_fds() -> usize {
+        std::fs::read_dir("/proc/self/fd")
+            .expect("/proc/self/fd must be listable to count open descriptors")
+            .count()
+    }
+
+    /// A `NamespaceLog`'s open-fd footprint is one fd for the active file PLUS one
+    /// per sealed file it holds open for reads. So `MAX_NAMESPACES` bounds the
+    /// namespace *count* but NOT the descriptor count — fds scale with sealed
+    /// files per namespace. This pins that lower bound so a future change that
+    /// stops holding sealed fds fails here. `fanout` is set huge so runs
+    /// accumulate without compaction merges. Linux-only: descriptors are counted
+    /// via `/proc/self/fd`, so the whole module is compiled only on Linux.
+    #[test]
+    fn open_fds_scale_with_sealed_file_count() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let cfg = LogConfig {
+                rotate_threshold: 256,
+                fanout: 1 << 20,
+                value_sep_threshold: 1 << 20,
+            };
+            let log = NamespaceLog::open(dir.path().to_path_buf(), cfg)
+                .await
+                .unwrap();
+
+            let fds_before = open_fds();
+            let sealed_before = log.sealed_file_count();
+
+            // Each ~300-byte record exceeds rotate_threshold (256) → seals the
+            // active and opens a fresh one, accumulating sealed files.
+            for i in 0..20u32 {
+                log.put_full(Bytes::from(format!("k{i:04}")), &[0xAB; 300], &[], None)
+                    .await
+                    .unwrap();
+            }
+
+            let sealed_after = log.sealed_file_count();
+            let fds_after = open_fds();
+            let new_sealed = sealed_after - sealed_before;
+
+            assert!(
+                new_sealed >= 10,
+                "expected sealed files to accumulate, got {new_sealed}"
+            );
+            // Each retained sealed file holds an fd: descriptor growth tracks it.
+            assert!(
+                fds_after >= fds_before + new_sealed,
+                "open fds ({fds_after}) did not grow with sealed files ({fds_before} + {new_sealed}) \
+                 — fd footprint is per-sealed-file and not bounded by the namespace cap"
+            );
+        });
+    }
+}
diff --git a/crates/engine/src/log/reclaim.rs b/crates/engine/src/log/reclaim.rs
index 992651a..e80c254 100644
--- a/crates/engine/src/log/reclaim.rs
+++ b/crates/engine/src/log/reclaim.rs
@@ -11,6 +11,23 @@ use crate::log::file::{
     BufGuard, FooterEntry, LogFile, data_filename, footer_entry_from_index, reclaim_tmp_filename,
 };
 use crate::log::index::IndexEntry;
+use crate::log::record::{HEADER_LEN, flags as rflags, parse_header};
+
+/// Extract the 16-byte blob hash from a value-separated record, if it is one.
+/// The rebuilt sealed file's footer must carry this hash forward (recovery and
+/// GC read it from the footer); reclaim copies the record bytes verbatim.
+fn value_hash_of(record_bytes: &[u8]) -> Option<[u8; 16]> {
+    let hdr = parse_header(record_bytes, 0).ok()?;
+    let separated = hdr.flags & rflags::VALUE_SEP != 0;
+    if !separated || hdr.val_size as usize != 16 {
+        return None;
+    }
+    // The hash occupies the value slot: immediately after header + key.
+    let start = HEADER_LEN + hdr.key_size as usize;
+    let mut hash = [0u8; 16];
+    hash.copy_from_slice(record_bytes.get(start..start + 16)?);
+    Some(hash)
+}
 
 #[derive(Debug, Clone, Copy)]
 pub struct ReclaimReport {
@@ -20,7 +37,7 @@ pub struct ReclaimReport {
     /// Files whose unlink failed after compaction; disk space is not freed until
     /// a subsequent reclaim or manual cleanup.
     pub dead_files_leaked: u32,
-    pub new_file_id: u16,
+    pub new_file_id: u32,
 }
 
 /// Read every live entry from `sealed_files` and write them into a single new
@@ -32,7 +49,7 @@ pub struct ReclaimReport {
 pub async fn reclaim_namespace(
     dir: PathBuf,
     sealed_files: &[Rc<LogFile>],
-    next_file_id: u16,
+    next_file_id: u32,
     live: &[(Bytes, IndexEntry, Option<u64>)],
 ) -> Result<(ReclaimReport, Vec<(Bytes, IndexEntry, Option<u64>)>)> {
     let tmp_path = dir.join(reclaim_tmp_filename(next_file_id));
@@ -52,7 +69,7 @@ pub async fn reclaim_namespace(
     new_file.truncate_to(0).await?;
 
     // Build an owned-Rc map so read futures can capture file handles without borrowing.
-    let file_map: FxHashMap<u16, Rc<LogFile>> = sealed_files
+    let file_map: FxHashMap<u32, Rc<LogFile>> = sealed_files
         .iter()
         .map(|f| (f.file_id, Rc::clone(f)))
         .collect();
@@ -84,6 +101,7 @@ pub async fn reclaim_namespace(
 
     for ((key, old_entry, ttl), bytes_res) in live.iter().zip(read_results) {
         let bytes = bytes_res?.into_inner();
+        let value_hash = value_hash_of(&bytes);
         let (new_offset, _) = new_file.append(bytes).await?;
         live_bytes += old_entry.record_size as u64;
         let new_entry = IndexEntry::new(
@@ -93,7 +111,12 @@ pub async fn reclaim_namespace(
             old_entry.tstamp_ms,
         );
         new_entries.push((key.clone(), new_entry, *ttl));
-        footer.push(footer_entry_from_index(key.clone(), &new_entry, *ttl));
+        footer.push(footer_entry_from_index(
+            key.clone(),
+            &new_entry,
+            *ttl,
+            value_hash,
+        ));
     }
 
     new_file.write_footer(&footer).await?;
@@ -101,6 +124,10 @@ pub async fn reclaim_namespace(
 
     let final_path = dir.join(data_filename(next_file_id));
     monoio::fs::rename(&tmp_path, &final_path).await?;
+    // Make the rename durable: without fsyncing the directory, a power loss could
+    // leave the merged file under its tmp name (or nameless) while old inputs are
+    // already unlinked below — losing the compacted data.
+    crate::log::file::sync_dir(&dir).await;
 
     let live_keys = new_entries.len() as u64;
 
diff --git a/crates/engine/src/log/record.rs b/crates/engine/src/log/record.rs
index 0cd268c..17f34a4 100644
--- a/crates/engine/src/log/record.rs
+++ b/crates/engine/src/log/record.rs
@@ -28,6 +28,10 @@ pub mod flags {
     pub const TOMBSTONE: u8 = 0b0000_0001;
     pub const NO_EXPIRY: u8 = 0b0000_0010;
     pub const TTL_UPDATE: u8 = 0b0000_0100;
+    /// Value-separated: the record's value field is a 16-byte content hash, not
+    /// the value itself. The value lives in the content-addressed blob store
+    /// (`value_store`). Set for values >= `LogConfig::value_sep_threshold`.
+    pub const VALUE_SEP: u8 = 0b0000_1000;
 }
 
 /// Fixed header bytes preceding every record.
diff --git a/crates/engine/src/log/recover.rs b/crates/engine/src/log/recover.rs
index 19eb0d6..d19797c 100644
--- a/crates/engine/src/log/recover.rs
+++ b/crates/engine/src/log/recover.rs
@@ -39,6 +39,7 @@ pub async fn open_namespace(dir: PathBuf) -> Result<OpenedFiles> {
         // Fresh namespace — create active file id 0.
         let path = dir.join(crate::log::file::data_filename(0));
         let active = LogFile::open_rw(path, 0).await?;
+        crate::log::file::sync_dir(&dir).await; // make the new file's dir entry durable
         return Ok(OpenedFiles {
             sealed,
             active,
@@ -80,11 +81,11 @@ pub async fn open_namespace(dir: PathBuf) -> Result<OpenedFiles> {
                 offset: 0,
                 reason: "file_id overflow on clean-shutdown recovery",
             })?;
-            if next_id >= u16::MAX - 100 {
+            if next_id >= u32::MAX - 100 {
                 warn!(
                     file_id = next_id,
-                    remaining = u16::MAX - next_id,
-                    "file_id nearing u16::MAX; compact sealed files to reclaim IDs"
+                    remaining = u32::MAX - next_id,
+                    "file_id nearing u32::MAX; compact sealed files to reclaim IDs"
                 );
             }
             let new_path = active_path
@@ -94,7 +95,9 @@ pub async fn open_namespace(dir: PathBuf) -> Result<OpenedFiles> {
                     reason: "namespace data_dir has no parent; cannot compute next-file path",
                 })?
                 .join(crate::log::file::data_filename(next_id));
-            LogFile::open_rw(new_path, next_id).await?
+            let active = LogFile::open_rw(new_path, next_id).await?;
+            crate::log::file::sync_dir(&dir).await; // new active after clean-shutdown recovery
+            active
         }
         None => {
             drop(highest);
@@ -111,16 +114,19 @@ pub async fn open_namespace(dir: PathBuf) -> Result<OpenedFiles> {
     })
 }
 
-fn apply_footer_entries(index: &mut NsIndex, file_id: u16, entries: &[FooterEntry]) {
+fn apply_footer_entries(index: &mut NsIndex, file_id: u32, entries: &[FooterEntry]) {
     for e in entries {
         let entry = IndexEntry::new(file_id, e.record_offset, e.record_size, e.tstamp_ms);
         index.insert(e.key.clone(), entry, e.expires_at_ms);
+        // Value-separated keys carry their blob hash in the footer — repopulate
+        // the sidecar so overwrite/delete can unref and blob refcounts rebuild.
+        index.set_valsep(&e.key, e.value_hash);
     }
 }
 
 /// Scan a file's records from the start, populating the index. Used as a
 /// fallback when a sealed file's footer is missing/corrupt.
-async fn rebuild_from_records(file: &LogFile, file_id: u16, index: &mut NsIndex) -> Result<()> {
+async fn rebuild_from_records(file: &LogFile, file_id: u32, index: &mut NsIndex) -> Result<()> {
     let total = file.size().await?;
     let mut offset = 0u64;
     while offset < total {
@@ -156,7 +162,7 @@ async fn rebuild_from_records(file: &LogFile, file_id: u16, index: &mut NsIndex)
 
 /// Replay the active file from offset 0 to EOF. On bad CRC, truncate at the
 /// last good boundary.
-async fn replay_active(file: &LogFile, file_id: u16, index: &mut NsIndex) -> Result<()> {
+async fn replay_active(file: &LogFile, file_id: u32, index: &mut NsIndex) -> Result<()> {
     let total = file.size().await?;
     let mut offset = 0u64;
     let mut last_good = 0u64;
@@ -205,7 +211,7 @@ async fn replay_active(file: &LogFile, file_id: u16, index: &mut NsIndex) -> Res
 
 fn apply_record(
     index: &mut NsIndex,
-    file_id: u16,
+    file_id: u32,
     offset: u64,
     hdr: &crate::log::record::RecordHeader,
     body: &[u8],
@@ -251,5 +257,22 @@ fn apply_record(
     } else {
         Some(hdr.expires_at_ms)
     };
-    index.insert(Bytes::copy_from_slice(key), entry, ttl);
+    let key_bytes = Bytes::copy_from_slice(key);
+    index.insert(key_bytes.clone(), entry, ttl);
+    // Value-separated record: the value field is the 16-byte blob hash. Repopulate
+    // the sidecar so the blob can be unref'd on a later overwrite/delete.
+    if hdr.flags & rflags::VALUE_SEP != 0 {
+        let vstart = hdr.key_size as usize;
+        let vend = vstart + hdr.val_size as usize;
+        if hdr.val_size as usize == 16 && body.len() >= vend {
+            let mut h = [0u8; 16];
+            h.copy_from_slice(&body[vstart..vend]);
+            index.set_valsep(&key_bytes, Some(h));
+        } else {
+            warn!(
+                offset,
+                "value-separated record without a 16-byte hash; ignoring sidecar entry"
+            );
+        }
+    }
 }
diff --git a/crates/engine/src/store.rs b/crates/engine/src/store.rs
index b0241bd..1d39a72 100644
--- a/crates/engine/src/store.rs
+++ b/crates/engine/src/store.rs
@@ -82,7 +82,27 @@ impl ShardStore {
     /// from a hot async path after the runtime is handling requests.
     pub async fn open(data_dir: &Path, memory_bytes: usize) -> Result<Self> {
         std::fs::create_dir_all(data_dir)?;
-        let config = LogConfig::default();
+        // Compaction is size-tiered (GlideFS-friendly). `KV_COMPACTION_FANOUT`
+        // tunes the per-level fanout; `KV_VALUE_SEP_THRESHOLD` the
+        // value-separation cutoff.
+        let config = {
+            let mut c = LogConfig::default();
+            if let Ok(n) = std::env::var("KV_COMPACTION_FANOUT")
+                .unwrap_or_default()
+                .parse::<usize>()
+            {
+                if n >= 2 {
+                    c.fanout = n;
+                }
+            }
+            if let Ok(n) = std::env::var("KV_VALUE_SEP_THRESHOLD")
+                .unwrap_or_default()
+                .parse::<usize>()
+            {
+                c.value_sep_threshold = n;
+            }
+            c
+        };
         let mut namespaces: FxHashMap<String, Rc<NamespaceLog>> = FxHashMap::default();
 
         // Collect valid namespace subdirectories, then open them concurrently.
@@ -160,13 +180,23 @@ impl ShardStore {
         }
         let dir = self.data_dir.join(ns);
         let nslog = Rc::new(NamespaceLog::open(dir, self.config).await?);
-        // Re-check after the await — another spawned task may have beaten us.
-        Ok(self
-            .namespaces
-            .borrow_mut()
-            .entry(ns.to_string())
-            .or_insert(nslog)
-            .clone())
+        // Re-check after the await: a concurrent task may have inserted this same
+        // namespace (dedup — return theirs, drop ours), OR filled the map to the
+        // cap while we were opening. Without the cap re-check, N concurrent opens
+        // of distinct new namespaces could all pass the pre-await gate at
+        // `len == MAX-1` and each insert, overshooting the cap.
+        let mut ns_map = self.namespaces.borrow_mut();
+        if let Some(existing) = ns_map.get(ns) {
+            return Ok(existing.clone());
+        }
+        if ns_map.len() >= MAX_NAMESPACES {
+            // Our freshly-created (empty) namespace dir is left behind; it is
+            // harmless and reused idempotently if this namespace is opened later.
+            return Err(EngineError::CapacityExceeded {
+                reason: "namespace limit reached",
+            });
+        }
+        Ok(ns_map.entry(ns.to_string()).or_insert(nslog).clone())
     }
 
     /// Test-only accessor that bypasses `ensure_ns` validation. Do not use in production code.
@@ -353,7 +383,15 @@ impl ShardStore {
         } else {
             WriteCondition::KeyAbsent
         };
-        let expires_at_ms = opts.ttl.map(|d| Self::validate_ttl(d, now)).transpose()?;
+        // Honor KEEPTTL the same way `set` does: preserve the key's existing
+        // expiry instead of silently clearing it. Relevant to SETXX (the key
+        // exists, so it may carry a TTL); on SETNX the key is absent, so the
+        // index lookup returns None and this is a no-op.
+        let expires_at_ms = if opts.keep_ttl {
+            nslog.index.borrow().ttl(key)
+        } else {
+            opts.ttl.map(|d| Self::validate_ttl(d, now)).transpose()?
+        };
         let meta_bytes: Vec<u8> = opts
             .metadata
             .as_ref()
@@ -944,16 +982,15 @@ impl ShardStore {
     pub async fn incr(&self, ns: &str, key: &[u8], delta: i64) -> Result<i64> {
         let nslog = self.ensure_ns(ns).await?;
 
-        // Serialize INCRs within this namespace. Without the lock, every concurrent
-        // CAS attempt submits a (futile) disk append and only one wins per round —
-        // io_uring completion order is roughly submission order, so a late submitter
-        // can lose every round as new contenders refill the in-flight pool. Holding
-        // the lock makes each INCR's read-modify-write run to completion; the small
-        // retry budget below only needs to cover cross-op contention (a SET/DEL
-        // squeezing in while we're between read and write).
-        let _incr_guard = nslog.incr_lock.lock().await;
-
-        for _ in 0..8u8 {
+        // Optimistic read-modify-write. `put_full_cond` now holds the key's write
+        // stripe and checks BEFORE appending, so a lost race writes nothing (no
+        // futile append) and simply returns `None` — the retry below re-reads and
+        // tries again. No dedicated INCR lock is needed any more.
+        // 64 attempts, not 8: a hot counter incremented hundreds of times per
+        // tick can legitimately lose 8 CAS races in a row and still be making
+        // progress. The cap only exists to bound a pathological livelock, so set
+        // it well above realistic same-key contention before surfacing Conflict.
+        for _ in 0..64u16 {
             let now = now_ms();
 
             // Read current value + TTL + revision for the CAS condition.
@@ -1053,11 +1090,11 @@ impl ShardStore {
             KeyFilter::Exact(k) => self
                 .watchers
                 .borrow_mut()
-                .subscribe_key(ns_b, Bytes::copy_from_slice(k)),
+                .subscribe_key(ns_b, Bytes::copy_from_slice(k))?,
             KeyFilter::Prefix(p) => self
                 .watchers
                 .borrow_mut()
-                .subscribe_prefix(ns_b, Bytes::copy_from_slice(p)),
+                .subscribe_prefix(ns_b, Bytes::copy_from_slice(p))?,
         };
 
         let nslog = self.ensure_ns(ns).await?;
@@ -1283,6 +1320,8 @@ impl ShardStore {
         // prior write — see its doc for why this beats reading last_revision()
         // here under concurrent writers.
         let revision = nslog.flush().await?;
+        // Drop all value-separated blobs for this namespace (FLUSHDB clears everything).
+        nslog.values.clear();
 
         let mut w = self.watchers.borrow_mut();
         for key in live_keys {
@@ -2419,4 +2458,262 @@ mod tests {
             assert_eq!(final_val, CONTENDERS as i64 * PER_CONTENDER);
         });
     }
+
+    /// Regression: `set_conditional` (SETXX/SETNX) used to ignore `keep_ttl` and
+    /// silently clear the key's TTL. SETXX with KEEPTTL must preserve the
+    /// existing expiry, matching `set`'s behavior.
+    #[test]
+    fn setxx_keep_ttl_preserves_existing_expiry() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().to_path_buf();
+        run(async move {
+            let s = open_store(&path).await;
+            set_ttl(&s, b"kt", b"v1", Duration::from_secs(3600)).await;
+            let ok = s
+                .setxx(
+                    "default",
+                    b"kt",
+                    Bytes::from_static(b"v2"),
+                    SetOptions {
+                        ttl: None,
+                        metadata: None,
+                        keep_ttl: true,
+                    },
+                )
+                .await
+                .unwrap();
+            assert!(ok, "setxx on a live key must succeed");
+            assert_eq!(get_value(&s, b"kt").await.unwrap().as_ref(), b"v2");
+            match s.ttl("default", b"kt").await.unwrap() {
+                TtlResult::Remaining(secs) => {
+                    assert!(secs > 0, "KEEPTTL must preserve the existing TTL")
+                }
+                other => panic!("expected Remaining, got {other:?}"),
+            }
+        });
+    }
+
+    /// Regression: `current_entries` emitted `revision: 0` for every initial
+    /// watch event, breaking the documented "dedup by revision" contract across
+    /// the subscribe→scan window. Initial events must carry the key's real
+    /// revision — the same value a subsequent GET reports.
+    #[test]
+    fn watch_initial_event_carries_real_revision() {
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().to_path_buf();
+        run(async move {
+            let s = open_store(&path).await;
+            set(&s, b"wr", b"v").await;
+            let expected = s.get("default", b"wr").await.unwrap().unwrap().revision;
+            let (initial, _rx) = s
+                .watch_subscribe("default", KeyFilter::Exact(b"wr"), 0)
+                .await
+                .unwrap();
+            assert_eq!(initial.len(), 1);
+            match &initial[0] {
+                WatchEvent::Set { revision, .. } => {
+                    assert!(
+                        *revision > 0,
+                        "initial event must carry a real revision, not 0"
+                    );
+                    assert_eq!(
+                        *revision, expected,
+                        "initial revision must match GET revision so callers can dedup"
+                    );
+                }
+                other => panic!("expected Set, got {other:?}"),
+            }
+        });
+    }
+
+    /// Regression: `scan_file_records` (the watch catch-up / `scan_since` path)
+    /// skipped the CRC check every other record-reading path performs, so a
+    /// bit-flipped record would be streamed to subscribers as a bogus event.
+    /// With the CRC check, the corrupt record is skipped and no event is emitted.
+    #[test]
+    fn watch_replay_skips_crc_corrupted_record() {
+        use std::io::{Seek, SeekFrom, Write as _};
+        let tmp = TempDir::new().unwrap();
+        let path = tmp.path().to_path_buf();
+        run(async move {
+            let s = open_store(&path).await;
+            set(&s, b"ck", b"hello").await;
+            // Flush so the record bytes are on disk where scan_since reads them.
+            s.sync_logs().await.unwrap();
+            let active_path = s.get_ns("default").unwrap().active.borrow().path.clone();
+
+            // Corrupt the first value byte. Record at offset 0: HEADER_LEN(37) +
+            // key "ck"(2) = byte 39 is the first value byte. Flipping it breaks
+            // the record CRC without disturbing the parseable header.
+            let mut f = std::fs::OpenOptions::new()
+                .write(true)
+                .open(&active_path)
+                .unwrap();
+            f.seek(SeekFrom::Start(39)).unwrap();
+            f.write_all(&[0xFF]).unwrap();
+            f.flush().unwrap();
+
+            // since=1 (< the record's ms-epoch tstamp) routes through scan_since,
+            // which reads from disk and now hits the corrupted bytes.
+            let (initial, _rx) = s
+                .watch_subscribe("default", KeyFilter::Exact(b"ck"), 1)
+                .await
+                .unwrap();
+            assert!(
+                initial.is_empty(),
+                "a CRC-corrupt record must not be replayed as a watch event, got {initial:?}"
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod namespace_cap_tests {
+    use super::*;
+    use crate::error::EngineError;
+    use crate::types::SetOptions;
+    use bytes::Bytes;
+    use std::future::Future;
+    use std::rc::Rc;
+    use tempfile::TempDir;
+
+    fn run<F: Future>(f: F) -> F::Output {
+        monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime")
+            .block_on(f)
+    }
+
+    /// Concurrent opens of distinct new namespaces cannot overshoot the cap.
+    /// With one slot free, two tasks both pass the synchronous pre-await gate
+    /// (`len == MAX-1`), both open concurrently, then resume: the first inserts
+    /// (filling the slot) and the second's post-await re-check must reject it.
+    /// Without the cap re-check after the await, both insert → `MAX+1`, so this test has teeth.
+    #[test]
+    fn concurrent_boundary_opens_do_not_overshoot_cap() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let s = Rc::new(ShardStore::open(dir.path(), 4 << 20).await.unwrap());
+
+            // Fill to exactly one slot below the cap.
+            let mut i = 0usize;
+            while s.namespace_count() < MAX_NAMESPACES - 1 {
+                s.set(
+                    &format!("db{i}"),
+                    b"k",
+                    Bytes::from_static(b"v"),
+                    SetOptions::default(),
+                )
+                .await
+                .unwrap();
+                i += 1;
+            }
+            assert_eq!(s.namespace_count(), MAX_NAMESPACES - 1);
+
+            // Two concurrent opens of distinct new namespaces racing for the last slot.
+            let (s1, s2) = (s.clone(), s.clone());
+            let t1 = monoio::spawn(async move {
+                s1.set(
+                    "race_a",
+                    b"k",
+                    Bytes::from_static(b"v"),
+                    SetOptions::default(),
+                )
+                .await
+            });
+            let t2 = monoio::spawn(async move {
+                s2.set(
+                    "race_b",
+                    b"k",
+                    Bytes::from_static(b"v"),
+                    SetOptions::default(),
+                )
+                .await
+            });
+            let (r1, r2) = (t1.await, t2.await);
+
+            assert_eq!(
+                s.namespace_count(),
+                MAX_NAMESPACES,
+                "cap must hold exactly under concurrent boundary opens (no overshoot)"
+            );
+            let wins = [r1.is_ok(), r2.is_ok()].into_iter().filter(|b| *b).count();
+            assert_eq!(wins, 1, "exactly one concurrent open wins the last slot");
+            let rejected = [&r1, &r2]
+                .into_iter()
+                .filter(|r| matches!(r, Err(EngineError::CapacityExceeded { .. })))
+                .count();
+            assert_eq!(rejected, 1, "the loser must get a clean CapacityExceeded");
+        });
+    }
+
+    /// The per-shard namespace cap degrades gracefully: opening distinct
+    /// namespaces succeeds up to `MAX_NAMESPACES`, the next one is rejected with a
+    /// clean `CapacityExceeded` (no panic, the map does not grow), and existing
+    /// namespaces — including `default` — keep serving reads and writes at the cap.
+    /// This is the bound that keeps NamespaceLog + file-descriptor growth finite.
+    #[test]
+    fn namespace_cap_is_enforced_and_existing_namespaces_keep_working() {
+        run(async {
+            let dir = TempDir::new().unwrap();
+            let s = ShardStore::open(dir.path(), 4 << 20).await.unwrap();
+
+            // "default" pre-exists; open distinct namespaces until the map is full.
+            let mut i = 0usize;
+            while s.namespace_count() < MAX_NAMESPACES {
+                let ns = format!("db{i}");
+                s.set(&ns, b"k", Bytes::from_static(b"v"), SetOptions::default())
+                    .await
+                    .unwrap();
+                i += 1;
+            }
+            assert_eq!(s.namespace_count(), MAX_NAMESPACES);
+
+            // One namespace over the cap: clean rejection, map unchanged.
+            let over = s
+                .set(
+                    &format!("db{i}"),
+                    b"k",
+                    Bytes::from_static(b"v"),
+                    SetOptions::default(),
+                )
+                .await;
+            assert!(
+                matches!(over, Err(EngineError::CapacityExceeded { .. })),
+                "expected CapacityExceeded at the cap, got {over:?}"
+            );
+            assert_eq!(
+                s.namespace_count(),
+                MAX_NAMESPACES,
+                "rejected open must not grow the map"
+            );
+
+            // Existing namespaces are unaffected by the cap.
+            s.set(
+                "db0",
+                b"k2",
+                Bytes::from_static(b"v2"),
+                SetOptions::default(),
+            )
+            .await
+            .unwrap();
+            assert!(
+                s.get("db0", b"k2").await.unwrap().is_some(),
+                "existing ns still writable at cap"
+            );
+            s.set(
+                "default",
+                b"dk",
+                Bytes::from_static(b"dv"),
+                SetOptions::default(),
+            )
+            .await
+            .unwrap();
+            assert!(
+                s.get("default", b"dk").await.unwrap().is_some(),
+                "default ns still works at cap"
+            );
+        });
+    }
 }
diff --git a/crates/engine/src/value_store.rs b/crates/engine/src/value_store.rs
new file mode 100644
index 0000000..27cb42a
--- /dev/null
+++ b/crates/engine/src/value_store.rs
@@ -0,0 +1,431 @@
+//! Content-addressed value store (value separation, WiscKey-style) for large
+//! values — the GlideFS-friendly large-value path.
+//!
+//! A value is keyed by its BLAKE3-128 content hash and stored once; identical
+//! values across keys/forks/tenants dedup to a single blob. The main log holds
+//! only the small `(key -> hash)` pointer record, so compaction moves pointers,
+//! never large values — collapsing large-value write amplification. Blobs are
+//! immutable and refcounted; a blob is unlinked when its last reference drops.
+//!
+//! Blob I/O is async via `monoio::fs` (io_uring) — it runs on the same reactor
+//! as the log engine and never blocks the shard's event loop. Refcounts are
+//! in-memory, rebuilt from the live index on open; `sweep_orphans` reclaims any
+//! blob a crash left without a referencing record.
+
+use std::cell::RefCell;
+use std::path::PathBuf;
+
+use rustc_hash::FxHashMap;
+
+use crate::error::Result;
+
+/// BLAKE3-128 content hash (matches GlideFS's block addressing width).
+pub type ContentHash = [u8; 16];
+
+pub fn content_hash(value: &[u8]) -> ContentHash {
+    let mut out = [0u8; 16];
+    out.copy_from_slice(&blake3::hash(value).as_bytes()[..16]);
+    out
+}
+
+fn hex16(h: &ContentHash) -> String {
+    let mut s = String::with_capacity(32);
+    for b in h {
+        s.push_str(&format!("{b:02x}"));
+    }
+    s
+}
+
+/// Parse a `blob-<32 hex>` filename back into its content hash.
+fn parse_blob_name(name: &str) -> Option<ContentHash> {
+    let hex = name.strip_prefix("blob-")?;
+    if hex.len() != 32 {
+        return None;
+    }
+    let mut h = [0u8; 16];
+    for (i, b) in h.iter_mut().enumerate() {
+        *b = u8::from_str_radix(hex.get(i * 2..i * 2 + 2)?, 16).ok()?;
+    }
+    Some(h)
+}
+
+/// Content-addressed, refcounted blob store. Refcounts are in-memory (rebuilt
+/// from the index on open in the integrated engine).
+pub struct ValueStore {
+    dir: PathBuf,
+    refs: RefCell<FxHashMap<ContentHash, u32>>,
+    /// Blobs whose refcount has hit zero but whose deletion is deferred until the
+    /// next fsync (see `collect_garbage`). Deleting a superseded blob before the
+    /// superseding log record is durable would, on a power loss that loses that
+    /// record, leave the reverted-to key pointing at a deleted blob (a dangling
+    /// pointer). Deferring past the fsync makes the revert safe.
+    pending_delete: RefCell<Vec<ContentHash>>,
+    /// Striped locks serializing the file create/delete of a given blob. A blob's
+    /// `put` (write) and `collect_garbage` (unlink) for the SAME content hash hold
+    /// the same stripe, so they can never race — without it, an unlink in flight
+    /// could delete a file a concurrent same-content `put` just recreated (a
+    /// dangling pointer). Hashes that map to different stripes stay concurrent.
+    file_locks: Vec<futures_util::lock::Mutex<()>>,
+}
+
+/// Number of blob file-op stripes; keep it a power of two: `flock` picks the stripe by masking
+/// with `N - 1`. Same-stripe ops serialize; blob writes are the rare large-value path, so small.
+const FILE_LOCK_STRIPES: usize = 16;
+
+impl ValueStore {
+    pub fn new(dir: PathBuf) -> Self {
+        // Dir is created lazily on the first blob write — an all-small-value
+        // namespace never materializes a `values/` directory at all.
+        Self {
+            dir,
+            refs: RefCell::new(FxHashMap::default()),
+            pending_delete: RefCell::new(Vec::new()),
+            file_locks: (0..FILE_LOCK_STRIPES)
+                .map(|_| futures_util::lock::Mutex::new(()))
+                .collect(),
+        }
+    }
+
+    fn path(&self, h: &ContentHash) -> PathBuf {
+        self.dir.join(format!("blob-{}", hex16(h)))
+    }
+
+    /// The file-op stripe for a content hash (low bits of the first hash byte → stripe).
+    fn flock(&self, h: &ContentHash) -> &futures_util::lock::Mutex<()> {
+        &self.file_locks[(h[0] as usize) & (FILE_LOCK_STRIPES - 1)]
+    }
+
+    /// Store `value`, deduplicated by content. Returns its content hash. Writes
+    /// the blob only on first reference (immutable, write-once); subsequent puts
+    /// of identical content just bump the refcount — no rewrite, no extra bytes.
+    pub async fn put(&self, value: &[u8]) -> Result<ContentHash> {
+        let h = content_hash(value);
+        // Serialize this content's file create against a concurrent delete of the
+        // same content in `collect_garbage` (held across the refcount bump + write
+        // so the decision and the file op are atomic for this hash).
+        let _fl = self.flock(&h).lock().await;
+        let first = {
+            let mut refs = self.refs.borrow_mut();
+            let c = refs.entry(h).or_insert(0);
+            *c += 1;
+            *c == 1
+        };
+        if first {
+            // Write the blob durably BEFORE the caller writes the pointer record
+            // that references it. The log uses appendfsync-everysec, but the
+            // pointer and the value live in different files — so we must fsync the
+            // blob's data AND its directory entry here, or a power loss could
+            // leave a durable pointer aimed at a non-durable blob (a dangling
+            // pointer = corruption, worse than the everysec "lose the last 1s"
+            // contract). With this ordering, the worst a crash can do is leave an
+            // orphan blob (durable blob, lost pointer) — reclaimed by
+            // `sweep_orphans` on the next open. All I/O is io_uring (no blocking).
+            if let Err(e) = self.write_blob_durable(&h, value).await {
+                self.dec(&h); // roll back the ref; no phantom reference to a missing blob
+                return Err(e);
+            }
+        }
+        Ok(h)
+    }
+
+    /// Write `value` to its blob path and make it crash-durable: fsync the file's
+    /// data, then fsync the parent directory so the new directory entry survives a
+    /// power loss. Returns only once the blob is durable on stable storage.
+    async fn write_blob_durable(&self, h: &ContentHash, value: &[u8]) -> Result<()> {
+        // Propagate a create failure rather than swallow it: if the directory
+        // can't be made, the `open` below fails with a generic ENOENT that hides
+        // the real cause (e.g. EACCES on the parent). idempotent: Ok if it exists.
+        monoio::fs::create_dir_all(&self.dir).await?;
+        let file = monoio::fs::OpenOptions::new()
+            .create(true)
+            .write(true)
+            .truncate(true)
+            .open(self.path(h))
+            .await?;
+        let (res, _buf) = file.write_all_at(value.to_vec(), 0).await;
+        res?;
+        file.sync_all().await?; // blob bytes durable
+        let _ = file.close().await;
+        // fsync the directory so the blob's name is durable before any pointer
+        // record referencing it can become durable. A failure here weakens the
+        // crash-durability contract (the blob's directory entry may not survive a
+        // power loss, leaving a durable pointer aimed at a nameless blob), so
+        // surface it as an error rather than swallow it. The caller rolls back the
+        // refcount on Err, so a failed durability step never leaves a phantom ref.
+        let dir = monoio::fs::OpenOptions::new()
+            .read(true)
+            .open(&self.dir)
+            .await?;
+        let sync_res = dir.sync_all().await;
+        let _ = dir.close().await;
+        sync_res?;
+        Ok(())
+    }
+
+    pub async fn get(&self, h: &ContentHash) -> Result<Vec<u8>> {
+        monoio::fs::read(self.path(h)).await.map_err(Into::into) // whole blob, or the I/O error
+    }
+
+    /// Recovery hook: bump the in-memory refcount of a hash that a live index
+    /// entry points at. The blob is NOT written — it is already on disk from
+    /// before the restart. Invoke once per live value-separated key at open.
+    pub fn incr_ref(&self, h: &ContentHash) {
+        *self.refs.borrow_mut().entry(*h).or_default() += 1;
+    }
+
+    /// Lower the in-memory refcount of `h` by one. Returns `true` only when the
+    /// count hit zero; the entry is then removed from `refs` (the blob is dead).
+    fn dec(&self, h: &ContentHash) -> bool {
+        let mut table = self.refs.borrow_mut();
+        let dead = match table.get_mut(h) {
+            None => return false, // untracked hash: nothing to decrement
+            Some(count) => {
+                *count = count.saturating_sub(1);
+                *count == 0
+            }
+        };
+        if dead {
+            table.remove(h);
+        }
+        dead
+    }
+
+    /// Release one reference to `h`. If that was the last one, the blob is only
+    /// queued for `collect_garbage` (run after the next fsync), not deleted now:
+    /// a blob may be physically removed only once the log record superseding it
+    /// is durable, so a power loss that reverts the key still finds its blob.
+    pub fn unref(&self, h: &ContentHash) {
+        let last_ref_dropped = self.dec(h);
+        if last_ref_dropped {
+            self.pending_delete.borrow_mut().push(*h);
+        }
+    }
+
+    /// Physically remove the blobs queued by `unref` that are STILL at refcount
+    /// 0 — an identical-content write may have re-referenced one since it was
+    /// queued. Call only after the log has been fsynced past the records that
+    /// orphaned these blobs, i.e. right after `LogFile::sync`. Unlink errors
+    /// are ignored: a blob left behind is an orphan that `sweep_orphans`
+    /// reclaims on the next open.
+    pub async fn collect_garbage(&self) {
+        let queued = std::mem::take(&mut *self.pending_delete.borrow_mut());
+        for h in queued {
+            // The per-content file lock makes "refcount == 0?" + unlink atomic
+            // with respect to a concurrent same-content `put`: the put either
+            // re-references first (we skip) or runs after the unlink (and
+            // recreates the file). A live key never points at a missing file.
+            let _fl = self.flock(&h).lock().await;
+            if self.refcount(&h) != 0 {
+                continue; // re-referenced since it was queued — keep it
+            }
+            let _ = monoio::fs::remove_file(self.path(&h)).await;
+        }
+    }
+
+    /// FLUSHDB: forget every refcount and queued deletion, then remove the whole
+    /// `values/` tree (live, deferred-delete and orphan blobs alike), best-effort.
+    pub fn clear(&self) {
+        self.pending_delete.borrow_mut().clear();
+        self.refs.borrow_mut().clear();
+        let _ = std::fs::remove_dir_all(&self.dir); // removal errors are ignored
+    }
+
+    /// Reclaim orphan blobs: files on disk that no live key references. A crash
+    /// between writing a blob and appending its log record (or between writing a
+    /// new blob and unref'ing the old one) leaves such a file. Call once at open,
+    /// AFTER refcounts have been rebuilt from the live index — then any blob not
+    /// in `refs` is unreachable and safe to delete. Returns how many blobs were
+    /// actually unlinked: a failed removal is not counted (the file stays on disk).
+    ///
+    /// Directory listing uses `std::fs` because this runs at open, before the
+    /// shard serves traffic (same place `recover` already lists data files);
+    /// the deletions go through io_uring.
+    pub async fn sweep_orphans(&self) -> Result<usize> {
+        let entries = match std::fs::read_dir(&self.dir) {
+            Ok(e) => e,
+            Err(_) => return Ok(0), // no values dir => nothing to sweep
+        };
+        let mut orphans: Vec<PathBuf> = Vec::new();
+        for ent in entries.flatten() {
+            match parse_blob_name(&ent.file_name().to_string_lossy()) {
+                Some(h) if !self.refs.borrow().contains_key(&h) => orphans.push(ent.path()),
+                Some(_) => {} // referenced — keep
+                None => {}    // not a blob file — ignore
+            }
+        }
+        // Count only unlinks that succeeded, so the result matches what is gone.
+        let mut removed = 0;
+        for p in orphans {
+            removed += usize::from(monoio::fs::remove_file(p).await.is_ok());
+        }
+        Ok(removed)
+    }
+
+    pub fn blob_count(&self) -> usize {
+        self.refs.borrow().len() // number of hashes currently tracked in `refs`
+    }
+
+    pub fn refcount(&self, h: &ContentHash) -> u32 {
+        self.refs.borrow().get(h).map_or(0, |count| *count) // untracked hash => 0
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        let mut rt = monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime");
+        rt.block_on(f)
+    }
+
+    /// Two puts of the same large value share ONE write-once blob, and the
+    /// refcounted GC reclaims it — on real files, through the async I/O path.
+    #[test]
+    fn dedup_write_once_and_gc() {
+        run(async {
+            let tmp = TempDir::new().unwrap();
+            let store = ValueStore::new(tmp.path().join("values"));
+
+            let payload = vec![7u8; 1_000_000]; // 10^6 bytes (~0.95 MiB)
+            let first = store.put(&payload).await.unwrap();
+            let second = store.put(&payload).await.unwrap(); // identical content
+            assert_eq!(first, second, "same content → same hash");
+            assert_eq!(store.blob_count(), 1, "identical values dedup to ONE blob");
+            assert_eq!(store.refcount(&first), 2);
+            assert_eq!(store.get(&first).await.unwrap(), payload, "roundtrip");
+
+            let distinct = vec![9u8; 1_000_000];
+            store.put(&distinct).await.unwrap();
+            assert_eq!(store.blob_count(), 2, "distinct content → distinct blob");
+
+            // Release both refs on the first blob → refcount 0, deletion queued.
+            store.unref(&first);
+            store.unref(&first);
+            assert_eq!(store.refcount(&first), 0);
+            assert!(
+                store.get(&first).await.is_ok(),
+                "blob still on disk before collect (deferred delete)"
+            );
+            // The deferred delete happens here (in production: after an fsync).
+            store.collect_garbage().await;
+            assert!(
+                store.get(&first).await.is_err(),
+                "blob GC'd after collect_garbage"
+            );
+            assert_eq!(store.blob_count(), 1, "only the live blob remains");
+        });
+    }
+
+    /// A crash can strand a blob on disk that no key references. Once refcounts
+    /// are rebuilt from the live index, `sweep_orphans` removes exactly those.
+    #[test]
+    fn sweep_reclaims_orphans_only() {
+        run(async {
+            let tmp = TempDir::new().unwrap();
+            let store = ValueStore::new(tmp.path().join("values"));
+            let live = store.put(&vec![1u8; 4096]).await.unwrap();
+            let orphan = store.put(&vec![2u8; 4096]).await.unwrap();
+            // Emulate the crash: the orphan blob was written but its reference
+            // never recorded, so a fresh open would not have it in `refs` (no
+            // live key points at it). Drop it from the in-memory table.
+            store.refs.borrow_mut().remove(&orphan);
+
+            let removed = store.sweep_orphans().await.unwrap();
+            assert_eq!(removed, 1, "exactly the unreferenced blob is reclaimed");
+            assert!(store.get(&orphan).await.is_err(), "orphan blob deleted");
+            assert_eq!(
+                store.get(&live).await.unwrap().len(),
+                4096,
+                "live blob untouched"
+            );
+        });
+    }
+}
+
+#[cfg(test)]
+mod gc_race_tests {
+    use super::*;
+    use std::rc::Rc;
+    use tempfile::TempDir;
+
+    fn run<F: std::future::Future>(f: F) -> F::Output {
+        let mut rt = monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+            .enable_timer()
+            .build()
+            .expect("monoio runtime");
+        rt.block_on(f)
+    }
+
+    /// Deterministic regression (this is the test with teeth): a blob that was
+    /// queued for deletion and then re-referenced BEFORE GC runs must survive,
+    /// because `collect_garbage` re-checks the live refcount rather than deleting
+    /// whatever was queued. (Without its `refcount == 0` guard the blob is gone.)
+    #[test]
+    fn collect_garbage_skips_a_requeued_then_rereferenced_blob() {
+        run(async {
+            let tmp = TempDir::new().unwrap();
+            let store = ValueStore::new(tmp.path().join("values"));
+            let content = vec![0x5Au8; 4096];
+            let h = store.put(&content).await.unwrap(); // refcount 1, file written
+            store.unref(&h); // refcount 0 → queued; the file is still on disk
+            assert_eq!(store.refcount(&h), 0);
+            store.put(&content).await.unwrap(); // re-referenced BEFORE gc → refcount 1
+            assert_eq!(store.refcount(&h), 1);
+            store.collect_garbage().await; // must SKIP h (it is live again)
+            assert_eq!(
+                store.get(&h).await.unwrap(),
+                content,
+                "re-referenced blob must survive GC"
+            );
+        });
+    }
+
+    /// Stress: `collect_garbage` racing a same-content `put`. The per-content
+    /// file lock makes create and delete of one hash mutually exclusive, so a
+    /// re-referenced blob is never left deleted, whatever order io_uring
+    /// completions arrive in. (The bad reorder is hard to force on a given
+    /// kernel, so this passes with or without the lock; the deterministic test
+    /// above has the teeth, the lock gives correctness under any ordering.)
+    #[test]
+    fn gc_does_not_delete_a_concurrently_recreated_blob() {
+        run(async {
+            let tmp = TempDir::new().unwrap();
+            let store = Rc::new(ValueStore::new(tmp.path().join("values")));
+            let content = vec![0xABu8; 8192];
+            let h = content_hash(&content);
+
+            for _ in 0..300 {
+                // Set-up: h has refcount 0 and is queued, its file still on disk.
+                store.put(&content).await.unwrap();
+                store.unref(&h);
+                assert_eq!(store.refcount(&h), 0);
+
+                // Race: GC (wants to unlink h) vs. a put of the same content.
+                let for_gc = Rc::clone(&store);
+                let for_put = Rc::clone(&store);
+                let put_bytes = content.clone();
+                let gc_task = monoio::spawn(async move { for_gc.collect_garbage().await });
+                let put_task = monoio::spawn(async move { for_put.put(&put_bytes).await.unwrap() });
+                gc_task.await;
+                put_task.await;
+
+                // The put re-referenced h → refcount 1 → its blob MUST exist.
+                assert_eq!(store.refcount(&h), 1, "put should have re-referenced the blob");
+                assert_eq!(
+                    store.get(&h).await.expect("live blob deleted by GC/put race"),
+                    content,
+                    "blob content intact after concurrent GC + recreate"
+                );
+
+                // Back to a clean slate for the next round.
+                store.unref(&h);
+                store.collect_garbage().await;
+            }
+        });
+    }
+}
diff --git a/crates/engine/src/watch.rs b/crates/engine/src/watch.rs
index de18fe7..9bd62d7 100644
--- a/crates/engine/src/watch.rs
+++ b/crates/engine/src/watch.rs
@@ -4,11 +4,20 @@ use bytes::Bytes;
 use futures_channel::mpsc::{Receiver, Sender, channel};
 use rustc_hash::FxHashMap;
 
+use crate::error::{EngineError, Result};
+
 /// Capacity for per-subscriber watch channels. A slow subscriber that fills
 /// its buffer is pruned (same as a disconnected subscriber) rather than
 /// allowed to grow without bound.
 const WATCH_CHANNEL_CAPACITY: usize = 512;
 
+/// Hard cap on live subscriptions (exact keys + prefixes) in one registry; a
+/// subscribe beyond it fails with `EngineError::CapacityExceeded`. Dead senders
+/// are pruned lazily on `notify` and on each `subscribe_*`; the cap bounds the
+/// worst case where a client registers many distinct keys faster than pruning
+/// reclaims them, so `keys`/`prefixes` cannot grow without bound on the shard thread.
+const MAX_TOTAL_SUBSCRIPTIONS: usize = 65_536;
+
 #[derive(Debug, Clone)]
 pub enum WatchEvent {
     Set {
@@ -57,22 +66,50 @@ impl WatchRegistry {
         }
     }
 
-    pub fn subscribe_key(&mut self, ns: Bytes, key: Bytes) -> Receiver<WatchEvent> {
+    pub fn subscribe_key(&mut self, ns: Bytes, key: Bytes) -> Result<Receiver<WatchEvent>> {
         // Prune dead senders for this key before inserting the new one.
         if let Some(senders) = self.keys.get_mut(&(ns.clone(), key.clone())) {
             senders.retain(|tx| !tx.is_closed());
         }
+        self.ensure_capacity()?; // Err(CapacityExceeded) if still full after a global prune
         let (tx, rx) = channel(WATCH_CHANNEL_CAPACITY);
         self.keys.entry((ns, key)).or_default().push(tx);
-        rx
+        Ok(rx)
     }
 
-    pub fn subscribe_prefix(&mut self, ns: Bytes, prefix: Bytes) -> Receiver<WatchEvent> {
+    pub fn subscribe_prefix(&mut self, ns: Bytes, prefix: Bytes) -> Result<Receiver<WatchEvent>> {
         // Prune dead prefix senders before inserting the new one.
         self.prefixes.retain(|(_, tx)| !tx.is_closed());
+        self.ensure_capacity()?; // Err(CapacityExceeded) if still full after a global prune
         let (tx, rx) = channel(WATCH_CHANNEL_CAPACITY);
         self.prefixes.push(((ns, prefix), tx));
-        rx
+        Ok(rx)
+    }
+
+    /// Registered sender count, exact keys + prefixes (unpruned closed senders included).
+    fn total_subscriptions(&self) -> usize {
+        self.prefixes.len() + self.keys.values().map(Vec::len).sum::<usize>()
+    }
+
+    /// Admit a new subscription unless the registry is truly at its cap.
+    /// A plain count is tried first; only if that hits the cap are dead senders
+    /// pruned everywhere and the count repeated, so disconnects that have not
+    /// been pruned yet never cause a spurious capacity error.
+    fn ensure_capacity(&mut self) -> Result<()> {
+        if self.total_subscriptions() >= MAX_TOTAL_SUBSCRIPTIONS {
+            // Looks full: drop closed senders (and emptied key buckets), recount.
+            self.keys.retain(|_, bucket| {
+                bucket.retain(|tx| !tx.is_closed());
+                !bucket.is_empty()
+            });
+            self.prefixes.retain(|(_, tx)| !tx.is_closed());
+            if self.total_subscriptions() >= MAX_TOTAL_SUBSCRIPTIONS {
+                return Err(EngineError::CapacityExceeded {
+                    reason: "watch subscription limit reached",
+                });
+            }
+        }
+        Ok(())
     }
 
     pub fn notify(&mut self, ns: &str, key: &[u8], event: WatchEvent) {
@@ -123,7 +160,9 @@ mod tests {
     #[test]
     fn exact_key_receives_event() {
         let mut reg = WatchRegistry::new();
-        let mut rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"));
+        let mut rx = reg
+            .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"))
+            .unwrap();
         reg.notify("ns", b"k", set_event(b"k"));
         assert!(matches!(rx.try_recv().unwrap(), WatchEvent::Set { .. }));
     }
@@ -131,7 +170,9 @@ mod tests {
     #[test]
     fn exact_key_ignores_other_keys() {
         let mut reg = WatchRegistry::new();
-        let mut rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"));
+        let mut rx = reg
+            .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"))
+            .unwrap();
         reg.notify("ns", b"other", set_event(b"other"));
         assert!(rx.try_recv().is_err(), "channel should be empty");
     }
@@ -139,7 +180,9 @@ mod tests {
     #[test]
     fn exact_key_ignores_other_namespaces() {
         let mut reg = WatchRegistry::new();
-        let mut rx = reg.subscribe_key(Bytes::from_static(b"ns1"), Bytes::from_static(b"k"));
+        let mut rx = reg
+            .subscribe_key(Bytes::from_static(b"ns1"), Bytes::from_static(b"k"))
+            .unwrap();
         reg.notify("ns2", b"k", set_event(b"k"));
         assert!(rx.try_recv().is_err(), "channel should be empty");
     }
@@ -147,7 +190,9 @@ mod tests {
     #[test]
     fn prefix_receives_matching_keys() {
         let mut reg = WatchRegistry::new();
-        let mut rx = reg.subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/"));
+        let mut rx = reg
+            .subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/"))
+            .unwrap();
         reg.notify("ns", b"cfg/a", set_event(b"cfg/a"));
         reg.notify("ns", b"cfg/b", del_event(b"cfg/b"));
         reg.notify("ns", b"other", set_event(b"other")); // no match
@@ -160,7 +205,9 @@ mod tests {
     #[test]
     fn dead_exact_sender_pruned() {
         let mut reg = WatchRegistry::new();
-        let rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"));
+        let rx = reg
+            .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"))
+            .unwrap();
         drop(rx);
         // First notify prunes the dead sender.
         reg.notify("ns", b"k", set_event(b"k"));
@@ -174,7 +221,9 @@ mod tests {
     #[test]
     fn dead_prefix_sender_pruned() {
         let mut reg = WatchRegistry::new();
-        let rx = reg.subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/"));
+        let rx = reg
+            .subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/"))
+            .unwrap();
         drop(rx);
         reg.notify("ns", b"cfg/x", set_event(b"cfg/x"));
         assert!(reg.prefixes.is_empty());
@@ -183,10 +232,42 @@ mod tests {
     #[test]
     fn multiple_subscribers_same_key() {
         let mut reg = WatchRegistry::new();
-        let mut rx1 = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"));
-        let mut rx2 = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"));
+        let mut rx1 = reg
+            .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"))
+            .unwrap();
+        let mut rx2 = reg
+            .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k"))
+            .unwrap();
         reg.notify("ns", b"k", set_event(b"k"));
         assert!(rx1.try_recv().is_ok());
         assert!(rx2.try_recv().is_ok());
     }
+
+    #[test]
+    fn subscription_cap_rejects_when_full_but_reclaims_dead_first() {
+        let mut reg = WatchRegistry::new();
+        let ns = || Bytes::from_static(b"ns");
+        // Hold one live receiver per distinct key until the registry is at the cap.
+        let mut receivers: Vec<_> = (0..MAX_TOTAL_SUBSCRIPTIONS)
+            .map(|i| {
+                reg.subscribe_key(ns(), Bytes::from(format!("k{i}")))
+                    .expect("under cap")
+            })
+            .collect();
+        assert_eq!(reg.total_subscriptions(), MAX_TOTAL_SUBSCRIPTIONS);
+
+        // Registry full of live senders: a further subscribe is refused.
+        let overflow = reg.subscribe_key(ns(), Bytes::from_static(b"overflow"));
+        assert!(matches!(
+            overflow,
+            Err(EngineError::CapacityExceeded { .. })
+        ));
+
+        // Dropping a receiver closes its sender. The next subscribe must reclaim
+        // that slot rather than report a false capacity error.
+        drop(receivers.pop());
+        let rx = reg.subscribe_key(ns(), Bytes::from_static(b"after-drop"));
+        assert!(rx.is_ok(), "dead sender should have been reclaimed");
+        assert_eq!(reg.total_subscriptions(), MAX_TOTAL_SUBSCRIPTIONS);
+    }
 }
diff --git a/crates/engine/tests/emfile.rs b/crates/engine/tests/emfile.rs
new file mode 100644
index 0000000..cb382cf
--- /dev/null
+++ b/crates/engine/tests/emfile.rs
@@ -0,0 +1,126 @@
+//! Forced file-descriptor exhaustion (EMFILE).
+//!
+//! This lives in its OWN integration-test binary on purpose: it lowers the
+//! process-global `RLIMIT_NOFILE`, which would poison any sibling test sharing
+//! the process. As the sole test here, the clamp affects only this process.
+//!
+//! It proves that descriptor exhaustion degrades gracefully: opening a new
+//! namespace under EMFILE fails with a clean `Io` error (no panic, no
+//! corruption), an already-open namespace keeps serving reads, and the store
+//! recovers as soon as descriptors are freed.
+
+use beyond_kv_engine::error::EngineError;
+use beyond_kv_engine::store::ShardStore;
+use beyond_kv_engine::types::SetOptions;
+use bytes::Bytes;
+use tempfile::TempDir;
+
+fn open_fd_count() -> usize {
+    // Linux lists the fd table in procfs; systems without it (e.g. macOS) expose /dev/fd.
+    let fds = std::fs::read_dir("/proc/self/fd").or_else(|_| std::fs::read_dir("/dev/fd"));
+    fds.map_or(0, |d| d.count()) // 0 = could not list; caller then clamps to a tiny limit
+}
+
+/// Set the `RLIMIT_NOFILE` soft limit to `soft`, clamped to the hard limit, and
+/// return the previous soft limit so the caller can restore it afterwards.
+/// The limit is process-wide, so every thread of the test binary is affected.
+///
+/// Panics, reporting the OS error, if `getrlimit` or `setrlimit` fails.
+fn set_nofile_soft(soft: u64) -> u64 {
+    let mut rl = libc::rlimit {
+        rlim_cur: 0,
+        rlim_max: 0,
+    };
+    // SAFETY: `rl` is a valid, exclusively borrowed `rlimit` for the call to fill.
+    let rc = unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rl) };
+    assert_eq!(rc, 0, "getrlimit failed: {}", std::io::Error::last_os_error());
+
+    // Remember the current soft limit, then lower (or restore) it.
+    let old = rl.rlim_cur;
+    rl.rlim_cur = soft.min(rl.rlim_max); // never request more than the hard limit
+
+    // SAFETY: `rl` is fully initialised and is only read for the duration of the call.
+    let rc = unsafe { libc::setrlimit(libc::RLIMIT_NOFILE, &rl) };
+    assert_eq!(rc, 0, "setrlimit failed: {}", std::io::Error::last_os_error());
+    old
+}
+
+/// End-to-end EMFILE scenario: seed a namespace, clamp `RLIMIT_NOFILE` just
+/// above current usage, open namespaces until one fails, check the failure is
+/// a clean `Io` error and old data is still readable, then lift the clamp and
+/// check that opening namespaces works again.
+#[test]
+fn emfile_on_namespace_open_degrades_gracefully_and_recovers() {
+    let mut rt = monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+        .enable_timer()
+        .build()
+        .expect("monoio runtime");
+    rt.block_on(async {
+        let tmp = TempDir::new().unwrap();
+        let store = ShardStore::open(tmp.path(), 4 << 20).await.unwrap();
+
+        // Seed the namespace that must stay readable once the clamp is in place.
+        store
+            .set(
+                "keeper",
+                b"k",
+                Bytes::from_static(b"v0"),
+                SetOptions::default(),
+            )
+            .await
+            .unwrap();
+
+        // From here on only NEW opens (namespaces / log files) can hit EMFILE:
+        // the runtime's and the store's own descriptors already exist, and the
+        // soft limit is set just above what is currently open.
+        let fds_in_use = open_fd_count();
+        let saved_soft = set_nofile_soft((fds_in_use + 8) as u64);
+
+        // Keep opening fresh namespaces until one runs out of descriptors.
+        let mut hit: Option<EngineError> = None;
+        for i in 0..256 {
+            let ns = format!("new{i}");
+            let outcome = store
+                .set(&ns, b"k", Bytes::from_static(b"v"), SetOptions::default())
+                .await;
+            if let Err(e) = outcome {
+                hit = Some(e);
+                break;
+            }
+        }
+        let err = hit.expect("expected an fd-exhaustion error while opening new namespaces");
+        assert!(
+            matches!(err, EngineError::Io { .. }),
+            "EMFILE must surface as a clean Io error, got {err:?}"
+        );
+
+        // Graceful degradation: the already-open namespace still answers reads
+        // (the read path needs no new descriptor). No panic, no corruption.
+        let got = store
+            .get("keeper", b"k")
+            .await
+            .expect("get must not error under EMFILE");
+        assert_eq!(
+            got.map(|e| e.value),
+            Some(Bytes::from_static(b"v0")),
+            "existing namespace remains readable while descriptors are exhausted"
+        );
+
+        // Recovery: restore the previous soft limit, then a new namespace opens.
+        set_nofile_soft(saved_soft);
+        store
+            .set(
+                "after_recovery",
+                b"k",
+                Bytes::from_static(b"v"),
+                SetOptions::default(),
+            )
+            .await
+            .expect("opening a namespace must succeed once fds are freed");
+        let readback = store.get("after_recovery", b"k").await.unwrap();
+        assert!(
+            readback.is_some(),
+            "store recovers cleanly after fd exhaustion clears"
+        );
+    });
+}
diff --git a/crates/engine/tests/writeamp.rs b/crates/engine/tests/writeamp.rs
new file mode 100644
index 0000000..3e9c5f4
--- /dev/null
+++ b/crates/engine/tests/writeamp.rs
@@ -0,0 +1,77 @@
+//! Write-amplification measurement: value separation vs the inline baseline
+//! (pre-value-separation behavior), using the engine's real `compaction_bytes`
+//! counter (bytes relocated by compaction = the GlideFS S3 re-upload cost).
+//!
+//! Run: cargo test -p beyond-kv-engine --test writeamp -- --nocapture
+
+use beyond_kv_engine::log::NamespaceLog;
+use beyond_kv_engine::log::config::LogConfig;
+use bytes::Bytes;
+use tempfile::TempDir;
+
+fn key(i: usize) -> Bytes {
+    format!("k{i:05}").into() // zero-padded to 5 digits, e.g. `k00007`
+}
+
+/// Load 60 keys with 32 KiB values, then rewrite all of them `rounds` times,
+/// reclaiming after each pass; returns cumulative compaction bytes per round.
+/// `usize::MAX` threshold ⇒ inline values (pre-separation baseline); small ⇒ separated.
+async fn sweep(threshold: usize, rounds: usize) -> Vec<u64> {
+    const KEYS: usize = 60;
+    let tmp = TempDir::new().unwrap();
+    let cfg = LogConfig {
+        rotate_threshold: 64 * 1024,
+        fanout: 4,
+        value_sep_threshold: threshold,
+    };
+    let log = NamespaceLog::open(tmp.path().to_path_buf(), cfg)
+        .await
+        .unwrap();
+    let v0 = vec![0xCDu8; 32 * 1024];
+    for k in (0..KEYS).map(key) {
+        log.put_full(k, &v0, &[], None).await.unwrap();
+    }
+    log.reclaim().await.unwrap();
+    log.compaction_bytes.set(0); // measure only the churn phase
+    let mut series = Vec::with_capacity(rounds);
+    for r in 0..rounds {
+        let vr = vec![r as u8; 32 * 1024]; // new content each round (wraps past 255)
+        for k in (0..KEYS).map(key) {
+            log.put_full(k, &vr, &[], None).await.unwrap();
+        }
+        log.reclaim().await.unwrap();
+        series.push(log.compaction_bytes.get());
+    }
+    series
+}
+
+/// Prints the per-round cumulative compaction MiB of both configurations as CSV
+/// (between `WRITEAMP_CSV_START`/`_END` markers) plus a final inline/valuesep
+/// ratio line. Reports only: nothing is asserted about the numbers.
+#[test]
+fn writeamp_sweep_csv() {
+    const MIB: f64 = 1048576.0;
+    let mut rt = monoio::RuntimeBuilder::<monoio::FusionDriver>::new()
+        .enable_timer()
+        .build()
+        .unwrap();
+    rt.block_on(async {
+        let rounds = 15usize;
+        let inline = sweep(usize::MAX, rounds).await; // baseline: values inline
+        let vsep = sweep(4096, rounds).await; // value separation on
+        println!("WRITEAMP_CSV_START");
+        println!("round,inline_mib,valuesep_mib");
+        // One CSV row per round; both series hold exactly `rounds` samples.
+        for (r, (i, v)) in inline.iter().zip(&vsep).enumerate() {
+            println!("{},{:.4},{:.4}", r + 1, *i as f64 / MIB, *v as f64 / MIB);
+        }
+        println!("WRITEAMP_CSV_END");
+        let (ti, tv) = (inline[rounds - 1], vsep[rounds - 1]);
+        println!(
+            "TOTAL inline={:.2} MiB  valuesep={:.4} MiB  ratio={:.0}x",
+            ti as f64 / MIB,
+            tv as f64 / MIB,
+            ti as f64 / tv.max(1) as f64
+        );
+    });
+}