diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index bb3bff3..42a2b86 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -13,16 +13,23 @@ TCP Client RespCodec (beyond_resp) ← RESP2/RESP3 framing │ RESP Array → Bytes ▼ -Command::parse() ← command.rs — stack-allocated parsing, arity check +Command::parse() ← command.rs — stack-allocated parsing, arity check │ Command::Set { key, value, args } + │ bad arity / unknown option ─────────────────────────────► ERR (connection stays open) ▼ dispatch() ← dispatch.rs — NX/XX condition, TTL conversion - │ SetOptions { ttl: Duration, metadata } + │ SetOptions { ttl, metadata } + │ value > KV_MAX_VALUE_BYTES ─────────────────────────────► ERR 413 / RESP ERR ▼ ShardStore::set() ← store.rs (async) - ├─ record::encode(tstamp, flags, expires_at_ms, key, value, metadata) - ├─ NamespaceLog::put_full → active_file.append(buf) → fsync ← L2 write (io_uring) - └─ MemCache::insert(key, value, ...) ← L1 write + │ frozen (handoff seal in progress) ──────────────────────► ERR Frozen + ├─ NamespaceLog::put_full + │ ├─ value ≥ 128 KiB → ValueStore::put(value) ← blob write (io_uring, write-once) + │ │ io error ──────────────────────────────────────────► ERR propagated to client + │ ├─ record::encode(tstamp, flags, expires_at_ms, key, value-or-hash, metadata) + │ └─ active_file.append(buf) → fsync ← L2 write (io_uring) + │ io error → file poisoned ──────────────────────────► ERR; subsequent writes fail until restart + └─ MemCache::insert(key, value, ...) ← L1 write (stores full value) │ ▼ r::ok() ← response.rs @@ -41,17 +48,22 @@ Command::Get { key } │ ▼ ShardStore::get() (async) - ├─ MemCache::get(key, now_ms) ── hit? ──► check expiry ──► return Entry (L1 fast path) + ├─ MemCache::get(key, now_ms) ── hit? ──► check expiry ──► return Entry (L1 fast path; full value) │ │ expired │ ▼ │ remove from L1, append tombstone, return None │ └─ miss? 
──► NsIndex::get(key) - ├─ None ──────────────────────────────────────────► return None - ├─ expired (TTL sidecar) ──► append tombstone ────► None - └─ live ──► file.read_at(record_offset, record_size) (single io_uring SQE) - ├─ parse header → slice value/metadata - └─ MemCache::insert ──► return Entry + ├─ None ──────────────────────────────────────────────────► return None + ├─ expired (TTL sidecar) ──► append tombstone ────────────► None + └─ live ──► file.read_at(record_offset, record_size) (one io_uring SQE) + │ parse header → slice value field + ├─ VALUE_SEP flag clear: value field IS the value + │ └─ MemCache::insert(full value) ──────► return Entry + └─ VALUE_SEP flag set: value field is 16-byte hash + └─ ValueStore::get(hash) (one io_uring SQE) + ├─ blob missing ─────────────────► ERR BadRecord + └─ ok ──► MemCache::insert ──────► return Entry │ ▼ r::bulk(entry.value) or r::nil() @@ -91,6 +103,39 @@ http.rs router HTTP Client ``` +### Startup / Recovery (per shard, per namespace) + +``` +ShardStore::open() + └─ for each namespace dir found on disk: + NamespaceLog::open() + ├─ recover::open_namespace() + │ ├─ for each sealed data-*.log (ascending file_id): + │ │ ├─ read_footer() ── magic matches? ──► apply_footer_entries() (O(1), no body scan) + │ │ │ └─ rebuilds index + TTL + valsep sidecars + │ │ └─ magic mismatch / CRC fail ──► rebuild_from_records() (full body scan, fallback) + │ └─ highest file: + │ ├─ footer present (clean shutdown) ──► treat as sealed, open new active + │ └─ no footer (crash) ──► replay_active() + │ └─ scan records; bad CRC → truncate at last good boundary + ├─ for each live value-separated key: ValueStore::incr_ref(hash) + └─ ValueStore::sweep_orphans() ──► delete values/blob-* with no live key reference +``` + +### Background Durability (per shard, every 1 second) + +``` +ShardStore::sync_all() + └─ for each open namespace: + NamespaceLog::sync() + └─ unsynced_bytes > 0? 
──► active_file.fsync() (io_uring) + io error ──► kv_log_sync_failures_total++ ──► /readyz 503 after threshold +``` + +This timer IS the log's durability mechanism — the equivalent of Redis `appendfsync everysec`. Individual writes call `write_all_at` (goes to the OS page cache; not yet on stable storage) and increment `unsynced_bytes`. The 1-second timer is the only thing that calls `fsync`. A crash before the next timer fires can lose up to ~1 second of writes. A failed sync also feeds `/readyz`: each fsync failure increments `readyz_sync_failure_count`; once it exceeds `KV_READYZ_SYNC_FAILURE_THRESHOLD` the shard reports degraded and `/readyz` returns 503. + +**New-file directory durability.** Whenever a new `data-*.log` is created or renamed into place (fresh namespace, rotate, reclaim, FLUSHDB, clean-shutdown recovery), the engine fsyncs the _namespace directory_ (`file.rs:sync_dir`) so the file's directory entry is durable — not just its bytes. Without this a power loss could leave a created file's fsynced records unreachable (data present, name lost), violating the everysec contract for any file past the first. This runs only on those rare paths, never on the per-write hot path. (Residual assumption: that `fsync` is honest down through the filesystem/GlideFS/hardware stack — not verifiable in software.) 
+ ### TTL Expiry ``` @@ -142,18 +187,22 @@ SCAN iterates shards sequentially: when a shard's inner cursor returns `"0"` (ex ## Concepts & Terminology -| Term | What It Controls | NOT | -| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- | -| Namespace (`ns`) | Which `NamespaceLog` (and therefore which on-disk directory) receives reads/writes; set by `SELECT ` (RESP, any non-negative integer) or `/namespaces/{ns}/` (HTTP); max 1024 open per shard | Not an auth or tenant boundary | -| Shard / ShardStore | One independent storage unit per OS thread — lazily-opened `NamespaceLog` per namespace + L1 cache | A partition of the keyspace: a key lives on exactly one shard, picked by `FxHash(key) % n_shards` | -| L1 / MemCache | In-process S3-FIFO cache that short-circuits disk reads | Not write-through durable storage | -| L2 / NamespaceLog | Persistent on-disk store; in-RAM hash index over an append-only log file; authoritative source of truth | Not the hot path for reads after first access | -| Active file | The currently-writable log file. Records are appended, fsynced, then made visible via the index | Not modified in place; only appended | -| Sealed file | A previously-active file that has been merged through reclaim. 
Read-only, has a footer of live entries | Not deleted until reclaim runs again | -| Ghost Set | MemCache tracking of recently evicted keys; a ghost hit promotes the next insert directly to the Main queue | Not a tombstone or deletion marker | -| Cursor `"0"` | SCAN sentinel meaning "start from beginning" or "scan complete" — the same value signals both states | Not a literal zero integer | -| `\x01`-prefixed cursor | Single-shard continuation cursor: `b"\x01"` + last_key from the previous page | Not a user-visible value; internal to scan | -| `\x02`-prefixed cursor | Multi-shard continuation cursor: `b"\x02"` + `[shard_idx: u8]` + per-shard inner cursor; only emitted when `n_shards > 1` | Never produced by single-shard deployments; not a user-visible value | +| Term | What It Controls | NOT | +| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------- | +| Namespace (`ns`) | Which `NamespaceLog` (and therefore which on-disk directory) receives reads/writes; set by `SELECT ` (RESP, any non-negative integer) or `/namespaces/{ns}/` (HTTP); max 1024 open per shard | Not an auth or tenant boundary | +| Shard / ShardStore | One independent storage unit per OS thread — lazily-opened `NamespaceLog` per namespace + L1 cache | A partition of the keyspace: a key lives on exactly one shard, picked by `FxHash(key) % n_shards` | +| L1 / MemCache | In-process S3-FIFO cache that short-circuits disk reads | Not write-through durable storage | +| L2 / NamespaceLog | Persistent on-disk store; ordered in-RAM index (`BTreeMap`) over an append-only log file + a blob store; authoritative source of truth | Not the hot path for reads after first access | +| Active file | The 
currently-writable log file. Records are appended, fsynced, then made visible via the index | Not modified in place; only appended | +| Sealed file | A previously-active file that has been merged through reclaim. Read-only, has a footer of live entries | Not deleted until reclaim runs again | +| Run / Level | A run is one sealed file; its level is its size-tier. Reclaim merges `fanout` runs at level L into one run at L+1 — bounds write amplification to O(log N) | Not persisted: a restart resets every run to level 0 | +| Write stripe (`wlock`) | One of 64 per-namespace async mutexes; a write locks `stripe[FxHash(key) % 64]` for check→append→commit. Serializes same-key writes; makes CAS/INCR atomic | Not cross-thread (shard is single-threaded); not taken by reads; not per-key (stripes are shared, collisions just over-serialize) | +| Value separation | A value ≥ `value_sep_threshold` (128 KiB) is stored in the content-addressed blob store; the log record carries only its 16-byte hash, so compaction never re-uploads the value | Not applied to small values (they stay inline); not a per-key dedup of small data | +| Blob | An immutable, content-addressed value file at `values/blob-{hash}`; refcounted, write-once, deduped across keys; at refcount 0 it is deleted by `collect_garbage` after the next fsync (deferred for crash-safety) | Not mutated in place; not moved by compaction; not deleted eagerly on unref | +| Ghost Set | MemCache tracking of recently evicted keys; a ghost hit promotes the next insert directly to the Main queue | Not a tombstone or deletion marker | +| Cursor `"0"` | SCAN sentinel meaning "start from beginning" or "scan complete" — the same value signals both states | Not a literal zero integer | +| `\x01`-prefixed cursor | Single-shard continuation cursor: `b"\x01"` + last_key from the previous page | Not a user-visible value; internal to scan | +| `\x02`-prefixed cursor | Multi-shard continuation cursor: `b"\x02"` + `[shard_idx: u8]` + per-shard inner 
cursor; only emitted when `n_shards > 1` | Never produced by single-shard deployments; not a user-visible value | ## Core Mechanism @@ -182,7 +231,7 @@ The accept loop in `main.rs` peeks the first command's key on each new connectio ### Two-Level Storage -Every read checks L1 first. L1 hits avoid all disk I/O. On L1 miss the engine looks up the key in the in-RAM hash index, then issues a single io_uring `read_at(record_offset, record_size)` against the file holding that record. The header carries `key_size`/`val_size`/`meta_size`, so the value and metadata are sliced out in-memory after the read completes. +Every read checks L1 first. L1 hits avoid all disk I/O. On L1 miss the engine looks up the key in the in-RAM index (`BTreeMap`), then issues a single io_uring `read_at(record_offset, record_size)` against the file holding that record. The header carries `key_size`/`val_size`/`meta_size`, so the value and metadata are sliced out in-memory after the read completes. If the record's `VALUE_SEP` flag is set, the sliced "value" is a 16-byte hash and one additional blob read fetches the value — still O(1), since the hash came straight from the record. The blob is then re-hashed and checked against that content hash before being returned (parity with the CRC the inline path verifies on every read) — silent blob corruption or a blob/hash mismatch surfaces as an error instead of wrong data. Writes go to both levels in order: append + fsync to disk first (durable), then L1 (hot set). @@ -208,26 +257,68 @@ Each namespace gets its own directory `{data_dir}/shard-{n}/{ns}/`. Files in tha | key bytes | value bytes | metadata bytes | ``` -CRC-64/NVME via `crc-fast` covers everything after the CRC field. `flags` carries `TOMBSTONE` (0x01), `NO_EXPIRY` (0x02), `TTL_UPDATE` (0x04). Tombstone and TTL-update records have `val_size = meta_size = 0`. +CRC-64/NVME via `crc-fast` covers everything after the CRC field. 
`flags` carries `TOMBSTONE` (0x01), `NO_EXPIRY` (0x02), `TTL_UPDATE` (0x04), `VALUE_SEP` (0x08). Tombstone and TTL-update records have `val_size = meta_size = 0`. A `VALUE_SEP` record's "value bytes" are a 16-byte content hash, not the value — the value lives in the blob store (see [Value Separation](#value-separation)). -**In-RAM index** (per namespace): `FxHashMap`. `IndexEntry` is 24 bytes: +**In-RAM index** (per namespace): `BTreeMap` (ordered, so SCAN is a range walk). `IndexEntry` is 24 bytes: ```rust struct IndexEntry { record_offset: u64, record_size: u32, - file_id: u16, + file_id: u32, // u32 (not u16): file IDs are never reused, so a hot namespace + // must not exhaust them — u32 ≈ unbounded; still packs to 24 B tstamp_ms: u64, // revision — enables O(1) CAS checks without a disk read } ``` -Plus a TTL sidecar `FxHashMap` so only TTL'd keys pay the extra 16-byte slot. +Two FxHashMap sidecars, each paid only by the keys that need it: a TTL sidecar `FxHashMap` (TTL'd keys) and a value-separation sidecar `FxHashMap` mapping a large-value key to its blob hash (used to unref the old blob on overwrite/delete and to rebuild blob refcounts on recovery). + +**Sealed-file footer** (written when a file is sealed by reclaim): one entry per live key — `(key, record_offset, record_size, expires_at_ms, tstamp_ms, value_hash?)` — followed by a 24-byte trailer (body length + CRC + magic `0x4259_4F4E_445F_4B58`, "BYOND_KX" v3). The `value_hash` (present only for value-separated keys) is carried in the footer so recovery rebuilds both the index and the value-sep sidecar in O(1) without reading record bodies. On startup, recovery reads each sealed file's footer; if the magic doesn't match (older format or crash mid-seal), it falls back to a full record scan — which still repopulates the value-sep sidecar from each record's `VALUE_SEP` flag. The active file's tail is replayed record-by-record; first bad CRC truncates the active file at the last good boundary. 
After the index is rebuilt, blob refcounts are reconstructed by walking the value-sep sidecar (one `incr_ref` per live large-value key). + +**Reclaim (compaction)** is **size-tiered** — one strategy, no flag (`reclaim_inner`). Triggered two ways: `BGREWRITEAOF` (current namespace, synchronous from the client's perspective) or the auto-reclaim background task (every `KV_RECLAIM_INTERVAL_SECS`, default 300s) which reclaims any namespace whose sealed file count exceeds `KV_RECLAIM_SEALED_THRESHOLD` (default 4, 0 = disabled). + +A reclaim seals the active file as a fresh level-0 run, then repeatedly finds the lowest level holding ≥ `fanout` (`KV_COMPACTION_FANOUT`, default 8) runs and merges just those into one run at the next level, cascading upward (`reclaim_namespace` copies each live record's bytes verbatim into the merged file and unlinks its inputs). Each reclaim rewrites **one level, not the whole live set** — O(log N) amortized write amplification. On GlideFS this matters directly: a reclaim re-uploads one level's worth of bytes to S3, not the entire namespace. + +**Reclaim does not error writes.** A write that arrives during a reclaim _waits_ (`await_reclaim`, before it takes the in-flight count) and proceeds when the reclaim finishes — it never returns `ReclamationBusy` to the client (only a second _concurrent reclaim_ gets that). Before sealing, reclaim **drains in-flight writes** (waits for `in_flight_writes == 0`) so the footer it writes is a consistent snapshot — a write that appended to the active file but hadn't yet updated the index can't be missed from the footer and silently lost on a later footer recovery. `FLUSHDB` uses the same gate + drain (so a write can't race the file replacement). Trade-off: writes _stall_ for the reclaim's duration (standard LSM write-stall, bounded by level size / tunable via `rotate_threshold`·`fanout`), but they always succeed. 
+ +`NamespaceLog::compaction_bytes` counts the bytes each reclaim rewrites, so write-amp is directly measurable. Level assignments live in an in-memory `RefCell>`; **a restart resets all runs to level 0** (levels are not yet persisted). + +> **Why not full-merge** (rewrite the entire live set into one file per reclaim, the classic compacting-log design)? On GlideFS, full-merge re-uploads the whole namespace to S3 on _every_ reclaim — O(live-set) each time. Measured on the real engine over 12 reclaims of a churning ~200-key set, size-tiered rewrote **4.6× fewer bytes** than full-merge would. Point reads don't pay for the extra runs: the in-RAM index resolves each key straight to `(file_id, offset)`, so a GET is one read regardless of run count. Full-merge was removed, not flag-gated. + +**Forks need no special handling.** A GlideFS fork is a copy-on-write volume: the child shares the parent's packs and only pays for what it writes. Because reclaim writes merged runs to _new_ offsets (never rewriting a parent's packs) and large values live in immutable blobs the child shares for free, a fork's amplification is bounded by its own divergence with zero fork-awareness in the engine — no "freeze the inherited base" step, no fork-vs-restart detection. (An earlier `freeze_inherited` design was removed: it required a fork hook that doesn't exist and pinned dead inherited data forever, defeating GC.) + +**FLUSHDB** unlinks-and-recreates the namespace's data files (does NOT truncate in place) so CoW sharing with the parent fork's blocks is preserved, and drops the namespace's blob store (`values/`). + +### Value Separation + +A value whose length ≥ `LogConfig::value_sep_threshold` (default **128 KiB = one GlideFS block**) is written to a content-addressed blob store at `{ns}/values/blob-{hash}` instead of inline in the log; the log record carries only the 16-byte BLAKE3-128 hash with the `VALUE_SEP` flag set. Small values stay inline. 
`value_store.rs` is the store; the wiring lives in `log/mod.rs` (`put_full`/`put_full_cond`/`put_many` separate on write, `read_value`/`bulk_read` deref on read). + +``` +SET big (256 KiB) ─► blob store: values/blob-{hash} (262,144 bytes, write-once) + └► log record: header + key + 16-byte hash (≈100 bytes) +GET big ─► index → record (the hash) → blob store get(hash) → value (one record read + ONE blob read; still O(1)) +``` + +Behavior, observed on the running binary (256 KiB value): + +| event | what actually happens | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| SET large value | 262,144 bytes written to `values/`; **log grows ~100 bytes** (the pointer) | +| GET | index resolves the record → 16-byte hash → blob read; one extra read beyond the record, value returned | +| identical content (any keys) | deduped to **one blob** (content-addressed, write-once) | +| BGREWRITEAOF / reclaim | copies the ~100-byte pointer record; the blob is **never touched** — log stays tiny | +| overwrite / delete / expire | the old blob is `unref`'d; at refcount 0 it is **queued**, then physically unlinked by `collect_garbage` after the next fsync (deferred — see durability ordering) | + +**Why:** on GlideFS, the cost that matters is _bytes moved by compaction_ — relocating a record to a new offset re-uploads it to S3 (dedup is offset-keyed). Inline, every compaction re-moves the value; a value surviving N reclaims is uploaded N+1 times. Separated, compaction only ever moves the 16-byte pointer; the value is uploaded once and reclaimed by deletion (unlink → whole blocks freed → GlideFS dead-pack GC), never by rewrite. **Measured on the real engine** (60 keys × 32 KiB, 10 churning reclaims): inline moved **22.5 MiB** of compaction bytes, value-separated moved **0.01 MiB** — 3337× less. 
The threshold is one block because below it a blob-per-value wastes the rest of the block (space-amp explodes); at/above it write-amp collapses to ~1×. + +Blob I/O is async on the shard's io_uring reactor — `monoio::fs` `read`/`write`/`remove_file`/`create_dir`/`sync_all` (the `mkdirat`/`unlinkat` features), never a blocking syscall on the hot path. Reads re-hash the blob and verify it against the content hash (integrity, see above). Blob refcounts are in-memory, rebuilt on open from the value-sep sidecar (which the footer/scan recovery repopulates); immediately after, `ValueStore::sweep_orphans` deletes any blob on disk that no live key references. The create and delete of a given content hash are serialized by a per-content **file-op lock** (16 stripes), so `collect_garbage` can never unlink a blob a same-content `put` is concurrently recreating (a by-construction guard against io_uring completion reordering). -**Sealed-file footer** (written when the active file is rotated by reclaim): array of `(key, record_offset, record_size, expires_at_ms, tstamp_ms)` entries followed by a 24-byte trailer (body length + CRC + magic `0x4259_4F4E_445F_4B57`). On startup, recovery reads the footer of each sealed file in O(1) and rebuilds the index without scanning the file body. Sealed files with the older magic (`0x4259_4F4E_445F_4B56`, written before the `tstamp_ms` footer field was added) fall back to a full sequential scan of the file body, reading `tstamp_ms` from each record header — no explicit migration needed. The active file's tail (between its last hint checkpoint and EOF) is replayed record-by-record; first bad CRC truncates the active file at the last good boundary. 
+**Durability ordering** — the pointer (in the log) and the value (in a blob file) live in _different_ files, so both edges of a blob's lifetime are ordered against the log's fsync: -**Reclaim**: seal the current active file, walk live index entries, copy live records to a new sealed file, write its footer + fsync, atomic-rename, unlink old sealed files. A fresh active file is opened. Triggered two ways: `BGREWRITEAOF` (current namespace, synchronous from the client's perspective) or the auto-reclaim background task (every `KV_RECLAIM_INTERVAL_SECS`, default 300s) which reclaims any namespace whose sealed file count exceeds `KV_RECLAIM_SEALED_THRESHOLD` (default 4, 0 = disabled). +- **Create before reference.** `put` makes the blob crash-durable _before_ it returns, before the caller writes the pointer record: `write_all_at` → `sync_all` (blob bytes) → fsync the `values/` directory (blob's name). Write-ahead ordering: a durable pointer can never reference a non-durable blob. +- **Delete after the superseding record is durable.** `unref` only drops the refcount and _queues_ the blob; `collect_garbage` (run after each `sync`) physically deletes it. So the old blob of an overwrite/delete survives until the record that superseded it is durable. Were it deleted eagerly, a power loss that lost the superseding record would revert the key to its old value — whose blob would be gone (a dangling pointer). Deferring makes the revert safe. -**FLUSHDB** unlinks-and-recreates the namespace's data files (does NOT truncate in place) so CoW sharing with the parent fork's blocks is preserved. +The log itself is `appendfsync everysec` (≤1 s of writes lost on power loss), so the worst a crash does to a value-separated write is leave an **orphan blob** (durable blob, pointer lost, or queued-but-uncollected) — which `sweep_orphans` reclaims on the next open. 
**There is no dangling-pointer (durable pointer, missing blob) window.** This is verified exhaustively by the `crash_consistency` test module: `exhaustive_tail_truncation_is_consistent` truncates the un-fsynced tail at **every byte offset** (and includes a value-sep overwrite in the crash zone — the case the deferred-delete fix protects); `corruption_truncates_at_bad_record_keeping_prefix` does the same for single-byte bit-rot of durable records; `torn_footer_falls_back_to_scan_across_files` reclaims to a sealed+active multi-file layout, then cuts the sealed file's footer at every offset to exercise the `read_footer`→record-scan fallback (which rebuilds value-sep state from the `VALUE_SEP` flag). Each asserts a valid recovered prefix with zero dangling pointers and zero blob leaks. The harness has teeth: reintroducing the synchronous-delete bug makes it fail at the exact offset where the overwrite is lost. ### Command Parsing (`command.rs`) @@ -262,13 +353,13 @@ A connection is pinned to one shard, but multi-key commands (MGET, MSET, DEL, EX ### SCAN Glob Matching -Pattern matching uses a stack-based backtracking algorithm that handles `*` (any sequence) and `?` (single character). No heap allocation; runs inline during RocksDB iteration. See `store.rs:glob_match()`. +Pattern matching uses a stack-based backtracking algorithm that handles `*` (any sequence), `?` (single character), and `[abc]` / `[a-z]` / `[^abc]` character classes. No heap allocation; runs inline during the `BTreeMap` range walk — each key is tested as the cursor advances. See `store.rs:glob_match()`. ### Watch / Subscribe Clients can subscribe to mutations on a key or a key prefix and receive a live stream of events. The mechanism is the same for both transports; only the framing differs. -**Revision** — every log record's `tstamp_ms` field doubles as a revision ID. No separate counter. 
Revisions are monotonically increasing per-shard and are included in every `WatchEvent`, enabling resumable subscriptions. +**Revision** — every log record's `tstamp_ms` field doubles as a revision ID. No separate counter. Revisions are monotonically increasing per-shard (a hybrid logical clock: `max(wall_clock_ms, last_revision + 1)`), so they advance even if two writes land in the same millisecond or the wall clock steps backward mid-run. On open, the clock is **seeded from the highest tstamp recovered**, so revisions stay monotonic across a restart too (a post-restart write can never be assigned a revision ≤ existing data, which would otherwise corrupt `scan_since` watch resumption). Revisions are included in every `WatchEvent`, enabling resumable subscriptions. **WatchRegistry** (`engine/src/watch.rs`) — one per `ShardStore`, owned behind `RefCell` (no locking needed; single-threaded per shard). Holds two tables: @@ -280,7 +371,7 @@ After each successful `set`, `mset`, or `del`, the store calls `WatchRegistry::n **Initial state delivery** (`watch_subscribe`): - `since == 0` → call `NamespaceLog::current_entries` — reads the live index + fetches values from disk for matching keys. Delivers the current state snapshot immediately. -- `since > 0` → call `NamespaceLog::scan_since` — scans all log files in `file_id` order to replay mutations with `tstamp_ms > since`. Used by clients that reconnect after a brief disconnection to catch up without missing writes. +- `since > 0` → call `NamespaceLog::scan_since` — scans all log files in `file_id` order to replay mutations with `tstamp_ms > since`. Used by clients that reconnect after a brief disconnection to catch up without missing writes. Value-separated records are deref'd (and integrity-checked) during the scan, so replayed events carry the real value, not the blob-hash pointer. **RESP3 transport** — `WATCH key [key ...] [SINCE ]` and `PWATCH prefix [SINCE ]` intercept in `handle_conn` before `dispatch()`. 
They require RESP3 (`HELLO 3` first). Initial events are sent as Push frames, followed by a `>2 watch ready` frame, then a live select loop: @@ -318,7 +409,7 @@ GET /v1/watch?prefix=

&since= → resumable prefix stream ### Compare-And-Swap (CAS) -CAS enables optimistic concurrency control: a write succeeds only if the current revision of the key matches the caller's expected value. Because each shard is single-threaded, the check-then-write is atomic with no race window. +CAS enables optimistic concurrency control: a write succeeds only if the current revision of the key matches the caller's expected value. The check and the write are atomic — `put_full_cond` holds the key's write stripe across check→append→commit, so no concurrent same-key write can interleave (even at the disk-I/O `.await`). A failed condition writes nothing: it checks before appending, so there is no record on disk for a CAS that returned "no" (this is what makes CAS crash-safe — a failed CAS can never resurrect after a crash). **RESP** — `SET key value REV `: @@ -331,13 +422,14 @@ CAS enables optimistic concurrency control: a write succeeds only if the current - Mismatch → `409 Conflict` + `{"error":"conflict","message":"revision mismatch"}`. - `GET` always returns `X-KV-Revision: ` so the caller can capture the revision before a CAS write. -**Implementation** — `ShardStore::setrev()`: +**Implementation** — `ShardStore::setrev()` → `NamespaceLog::put_full_cond(key, …, WriteCondition::Revision(n))`: -1. `ensure_ns()` borrows the in-memory index. -2. Reads `IndexEntry.tstamp_ms` for the key (O(1), no disk read). -3. Expired keys are treated as absent (revision mismatch). -4. On match: write via `put_full()`, notify watchers, return `Ok(Some(new_rev))`. -5. On mismatch: return `Ok(None)` — no write, no disk I/O. +1. Acquire the key's write stripe (`wlock`) — held across the whole operation. +2. Check `IndexEntry.tstamp_ms == n` (O(1), no disk read; expired keys count as absent → mismatch). +3. On mismatch: return `Ok(None)` immediately — **no append, no disk I/O, no record**. +4. On match: encode + append + commit + notify watchers, return `Ok(Some(new_rev))`. 
+ +Because the stripe is held from the check through the commit, no concurrent same-key write can land in between — the check is authoritative and the failed path leaves nothing on disk. `REV` is mutually exclusive with `NX`/`XX` at the protocol layer. @@ -368,7 +460,7 @@ CAS enables optimistic concurrency control: a write succeeds only if the current ``` absent ──SET──► live - live ──GET──► live (freq bumped in L1; revision returned in X-KV-Revision / Entry.revision) + live ──GET──► live (freq bumped in L1; revision in X-KV-Revision / Entry.revision) live ──DEL──► absent live ──expired────► absent (lazy, on next access or L1 sweep) live ──PERSIST──► live (TTL cleared) @@ -378,11 +470,30 @@ absent ──SET──► live absent ──CAS──────────────────► absent (mismatch; 409 / nil returned) ``` +| From | Event | To | Guard | What Actually Happens | +| ------ | ------------------ | ------ | ------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| absent | SET | live | — | Record appended (made durable by the next 1-second fsync); index entry inserted; L1 populated. Large value written to blob store first; record carries 16-byte hash. | +| live | SET (overwrite) | live | — | New record appended; index entry replaced; L1 updated. Old blob `unref`'d if value-separated; new blob written (dedup: no write if identical content). | +| live | SET NX | live | key present | No write, no disk I/O. Returns nil / 0. | +| absent | SET NX | live | key absent | Same as SET. | +| live | SET XX | live | key present | Same as SET (overwrite). | +| absent | SET XX | absent | key absent | No write. Returns nil / 0. | +| live | DEL | absent | — | Tombstone appended; index entry removed; L1 evicted. Blob `unref`'d if value-separated → at refcount 0, queued and unlinked by `collect_garbage` after the next fsync. | +| live | EXPIRE | live | — | TTL_UPDATE record appended; TTL sidecar updated. No value rewrite. 
| +| live | PERSIST | live | — | TTL_UPDATE record (NO_EXPIRY flag) appended; TTL sidecar entry removed. | +| live | GET (TTL elapsed) | absent | `now_ms ≥ expires_at_ms` | Tombstone appended; index + TTL sidecar cleared; L1 evicted. Blob `unref`'d. Caller receives nil. | +| live | CAS (rev matches) | live | `tstamp_ms == expected` | Same as SET overwrite. New revision returned. | +| live | CAS (rev mismatch) | live | `tstamp_ms != expected` | No write, no disk I/O. 409 / nil returned. | +| absent | CAS | absent | key absent = revision mismatch | No write. 409 / nil returned. | +| live | FLUSHDB | absent | — | All data files unlinked and recreated; blob store directory removed; index and sidecars cleared. CoW sharing with parent fork preserved (unlink, not truncate). | + ## Why It Behaves This Way ### Why each thread has its own engine instance -Sharing storage across threads would require locking on the index and the active-file write offset. Per-thread instances eliminate that coordination entirely and keep the hot path lock-free. The tradeoff is that the routing layer must pin each client connection to a thread — a key read on thread 0 won't see a write made on thread 1. +Sharing storage across threads would require cross-thread locking on the index and the active-file write offset. Per-thread instances eliminate that coordination entirely: **reads are lock-free**, and there is no cross-thread synchronization anywhere. The tradeoff is that the routing layer must pin each client connection to a thread — a key read on thread 0 won't see a write made on thread 1. + +Within a shard, writes take one **write-stripe lock** selected by the key's hash (64 stripes per namespace, `wlock(key)`) for their check→append→commit. This is _not_ cross-thread (the shard is single-threaded; it's an async mutex serializing the cooperative tasks that interleave at `.await` points). Writes whose keys hash to different stripes proceed fully concurrently; same-key writes always serialize, and two different keys that happen to share a stripe merely over-serialize. 
It exists so conditional writes (CAS/NX/XX) and read-modify-write (INCR) are atomic on disk — the holder checks before appending, so a failed condition or lost race writes nothing (no orphan record). Reads never take it. Connection routing is built into the server: `peek_resp_key` peeks the first bytes of a new TCP connection (without consuming them), extracts the key from the first command, and runs `FxHash(key) % n_shards` to pick a worker thread. The connection is then pinned to that thread for its lifetime. Multi-key commands (MGET, MSET, DEL, EXISTS) whose keys span shards are transparently fanned out via per-shard request channels (see "Cross-Shard Fan-Out") so the client sees a single response in original key order — no `CROSSSLOT` error. @@ -408,26 +519,28 @@ Redis protocol defines SCAN to return "0" when iteration is complete. Reusing "0 ### Why MSET is atomic (within one shard) -Redis MSET is documented as atomic. Within a single shard this implementation builds one buffer containing every record, calls `write_at(buf, base_offset)` and `fsync()` once, then bulk-updates the index — all keys land or none do. The L1 cache is populated after the disk fsync; in the narrow window between the two, a cache miss correctly falls back to disk and sees all keys. +Redis MSET is documented as atomic. Within a single shard this implementation builds one buffer containing every record and calls `write_all_at(buf, base_offset)` — a single OS write — then bulk-updates the index atomically. All keys are visible together or not at all: a crash before the next 1-second fsync loses all of them; a crash after leaves all of them. The L1 cache is populated after the write; a cache miss in the narrow window correctly falls back to disk and sees all keys. Across shards (when MSET keys span shard boundaries), atomicity is **not** preserved: each shard's subset commits independently, matching Redis Cluster's semantics. 
## Configuration -| CLI Flag / Env Var | Default | What It Controls at Runtime | -| ---------------------------------------------------------------------- | -------------------- | ------------------------------------------------------------------------------------------------- | -| `--data-dir` / `KV_DATA_DIR` | `/var/lib/beyond/kv` | Root path for all shard directories (`{data_dir}/shard-{n}`) | -| `--resp-port` / `KV_RESP_PORT` | `6379` | TCP port each thread's RESP listener binds to | -| `--http-address` / `KV_ADDRESS` | `0.0.0.0:4869` | Socket address each thread's HTTP listener binds to (full `ip:port`) | -| `--threads` / `KV_THREADS` | `num_cpus::get()` | Number of OS threads (= number of shards) | -| `--memory-bytes` / `KV_MEMORY_BYTES` | `268435456` (256 MB) | Total L1 cache budget; divided evenly across threads | -| `--max-conns-per-shard` / `KV_MAX_CONNS_PER_SHARD` | `10000` | Per-shard connection cap; connections beyond this are dropped immediately with a busy response | -| `--idle-timeout-secs` / `KV_IDLE_TIMEOUT_SECS` | `60` | Seconds of inactivity before a connection is closed | -| `--max-value-bytes` / `KV_MAX_VALUE_BYTES` | `67108864` (64 MB) | Maximum accepted value size; larger bodies are rejected with HTTP 413 or RESP `ERR` | -| `--reclaim-sealed-threshold` / `KV_RECLAIM_SEALED_THRESHOLD` | `4` | Auto-reclaim a namespace when its sealed file count exceeds this value; `0` disables auto-reclaim | -| `--reclaim-interval-secs` / `KV_RECLAIM_INTERVAL_SECS` | `300` | Seconds between auto-reclaim scans (ignored when threshold is 0) | -| `--readyz-sync-failure-threshold` / `KV_READYZ_SYNC_FAILURE_THRESHOLD` | `3` | Consecutive log-sync failures on any shard before `/readyz` returns 503 | -| `--log-level` / `LOG_LEVEL` | `info` | `tracing` filter level; set `ENVIRONMENT=development` for pretty-printed logs | +| CLI Flag / Env Var | Default | What It Controls at Runtime | +| ---------------------------------------------------------------------- | 
-------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--data-dir` / `KV_DATA_DIR` | `/var/lib/beyond/kv` | Root path for all shard directories (`{data_dir}/shard-{n}`) | +| `--resp-port` / `KV_RESP_PORT` | `6379` | TCP port each thread's RESP listener binds to | +| `--http-address` / `KV_ADDRESS` | `0.0.0.0:4869` | Socket address each thread's HTTP listener binds to (full `ip:port`) | +| `--threads` / `KV_THREADS` | `num_cpus::get()` | Number of OS threads (= number of shards) | +| `--memory-bytes` / `KV_MEMORY_BYTES` | `268435456` (256 MB) | Total L1 cache budget; divided evenly across threads | +| `--max-conns-per-shard` / `KV_MAX_CONNS_PER_SHARD` | `10000` | Per-shard connection cap; connections beyond this are dropped immediately with a busy response | +| `--idle-timeout-secs` / `KV_IDLE_TIMEOUT_SECS` | `60` | Seconds of inactivity before a connection is closed | +| `--max-value-bytes` / `KV_MAX_VALUE_BYTES` | `67108864` (64 MB) | Maximum accepted value size; larger bodies are rejected with HTTP 413 or RESP `ERR` | +| `--reclaim-sealed-threshold` / `KV_RECLAIM_SEALED_THRESHOLD` | `4` | Auto-reclaim a namespace when its sealed file count exceeds this value; `0` disables auto-reclaim | +| `--reclaim-interval-secs` / `KV_RECLAIM_INTERVAL_SECS` | `300` | Seconds between auto-reclaim scans (ignored when threshold is 0) | +| `KV_COMPACTION_FANOUT` | `8` | Size-tiered compaction: a level merges into the next once it holds this many runs (higher = less write-amp, more space-amp); values < 2 ignored | +| `KV_VALUE_SEP_THRESHOLD` | `131072` (128 KiB) | Values ≥ this go to the content-addressed blob store instead of inline; one GlideFS block — below it a blob-per-value wastes space, at/above it write-amp collapses to ~1× | +| `--readyz-sync-failure-threshold` / `KV_READYZ_SYNC_FAILURE_THRESHOLD` | `3` | Consecutive log-sync 
failures on any shard before `/readyz` returns 503 | +| `--log-level` / `LOG_LEVEL` | `info` | `tracing` filter level; set `ENVIRONMENT=development` for pretty-printed logs | ## Observability @@ -474,43 +587,50 @@ The server is designed to run inside a trusted network perimeter (the same Glide ## Failure Modes -| Failure | What Actually Happens | Recovery | -| -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------- | -| Thread panic | `panic = "abort"` — process terminates immediately; no unwinding | External process supervisor restarts the process | -| Disk write error | `EngineError::Io` propagated; RESP client receives `ERR` response; connection stays open | Client retries; underlying disk issue must be resolved externally | -| CRC mismatch on replay | `EngineError::CrcMismatch` during recovery — active file truncates at the last good boundary, sealed-file footer falls back to scanning records | Automatic; the offending tail bytes are dropped | -| Bad record header | `EngineError::BadRecord`; treated as the truncation point during replay | Affected tail records are lost; older records survive | -| RESP parse error | Connection closed; no response sent | Client reconnects | -| HTTP malformed request | JSON error body `{"error": "...", "message": "..."}` with 4xx status | Client fixes request | -| Expired key read | Tombstone appended, evicted from L1; `None` returned to caller | Transparent; client sees cache miss | -| Crash during MSET (single shard) | Single fsynced write — either all records land or the partial tail is truncated by recovery's CRC check | No partial state; client can safely retry | -| Crash during cross-shard MSET | Each shard's subset is independent; some shards may have committed before the crash | Client retries; idempotent overwrites converge to the 
desired state | -| Crash mid-reclaim | Old sealed files are still authoritative; tmp file from the partial reclaim is removed on next reclaim | Automatic; no data loss (no rename happened) | -| L1 cache over capacity | Eviction runs inline during insert; oldest Small-queue entries dropped first | Automatic; no data loss (L2 is authoritative) | +| Failure | What Actually Happens | Recovery | +| --------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Thread panic | `panic = "abort"` — process terminates immediately; no unwinding | External process supervisor restarts the process | +| Process crash between writes and fsync | Writes in the last ≤1 second (since the previous timer-fsync) are lost — they went to the OS page cache but not stable storage | Up to ~1 second of writes lost; recovery truncates the active file at the last fsynced CRC boundary | +| Disk write error | `EngineError::Io` propagated; RESP client receives `ERR` response; connection stays open | Client retries; underlying disk issue must be resolved externally | +| CRC mismatch on replay | `EngineError::CrcMismatch` during recovery — active file truncates at the last good boundary, sealed-file footer falls back to scanning records | Automatic; the offending tail bytes are dropped | +| Bad record header | `EngineError::BadRecord`; treated as the truncation point during replay | Affected tail records are lost; older records survive | +| 
Value-separated blob corrupted (bit-rot, mismatch) | Read re-hashes the blob; on content-hash mismatch returns `EngineError::BadRecord` instead of the wrong bytes (parity with inline CRC) | Detected, not silent; the key reads as an error until the blob is restored/overwritten | +| RESP parse error | Connection closed; no response sent | Client reconnects | +| HTTP malformed request | JSON error body `{"error": "...", "message": "..."}` with 4xx status | Client fixes request | +| Expired key read | Tombstone appended, evicted from L1; `None` returned to caller | Transparent; client sees cache miss | +| Crash during MSET (single shard) | All records are built into one buffer and written with a single `write_all_at` — they're atomically visible or not from the OS perspective, but are only on stable storage after the next 1s fsync. A crash before that fsync loses the whole MSET. Recovery truncates the active file at the last fsynced CRC boundary | The MSET either fully lands or is fully absent after recovery — no partial MSET state | +| Crash during cross-shard MSET | Each shard's subset is independent; some shards may have committed before the crash | Client retries; idempotent overwrites converge to the desired state | +| Crash mid-reclaim | Old sealed files are still authoritative; tmp file from the partial reclaim is removed on next reclaim | Automatic; no data loss (no rename happened) | +| Crash between blob write and log append | The blob is written but no record references it — an **orphan blob** (wasted disk only, never data loss). Recovery doesn't index it (no footer/record points at it) | `ValueStore::sweep_orphans` at the next open deletes every `values/blob-*` not referenced by a live key. Proven on the binary across a SIGKILL restart | +| Power loss after a value-sep overwrite/delete, before its record is durable | The key reverts to its previous value (everysec: the un-fsynced overwrite is lost). 
The old blob is **still present** — its deletion was deferred until the superseding record's fsync, which didn't happen | Reads return the old value correctly (no dangling pointer). If the superseding record _was_ durable, the old blob is instead a true orphan → reclaimed by `sweep_orphans`. Exhaustively verified by the crash-consistency tests | +| Concurrent same-key write races a conditional write (CAS/NX/XX), then crash | **Closed.** Conditional writes hold the key's write stripe and check the condition _before_ appending, so a failed condition writes **no record at all** — there is no optimistic orphan to resurrect. (Previously: an aborted optimistic CAS left a valid orphan record a crash could resurrect.) | N/A — the orphan-producing code path was removed, not guarded. Verified by `concurrency_tests::concurrent_mixed_writes_recover_to_runtime_state` (recovery reproduces runtime exactly under heavy same-key CAS/SET/DEL contention) | +| L1 cache over capacity | Eviction runs inline during insert; oldest Small-queue entries dropped first | Automatic; no data loss (L2 is authoritative) | ## File Map -| File | What It Does | -| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `crates/proto/src/command.rs` | Parses RESP arrays into `Command` enum; validates arity and option syntax | -| `crates/proto/src/response.rs` | Builds RESP values (ok, nil, bulk, error, array, hello reply, scan reply) | -| `crates/proto/src/error.rs` | Protocol-level error variants returned to clients | -| `crates/engine/src/store.rs` | `ShardStore`: all storage operations; coordinates L1 + L2; expiry logic; SCAN; bulk MGET | -| `crates/engine/src/cache.rs` | `MemCache`: S3-FIFO in-memory cache; eviction; ghost set; memory accounting | -| `crates/engine/src/types.rs` | `Entry`, `SetOptions`, `TtlResult`, `ScanPage` | -| 
`crates/engine/src/error.rs` | Storage-level errors (I/O, CRC mismatch, bad record, invalid namespace, metadata JSON) | -| `crates/engine/src/log/mod.rs` | `NamespaceLog`: index + active + sealed files; put_full / put_many / tombstone / ttl_update / bulk_read / flush / reclaim | -| `crates/engine/src/log/file.rs` | `LogFile`: monoio io_uring file wrapper; append, read_at, write_footer, read_footer | -| `crates/engine/src/log/record.rs` | Record encoding/decoding; CRC-64/NVME via `crc-fast`; flag bits | -| `crates/engine/src/log/index.rs` | `NsIndex`: hashmap + TTL sidecar + bucket-cursor SCAN | -| `crates/engine/src/log/recover.rs` | Startup: parse sealed-file footers; clean-shutdown active file has a footer (fast path), crash falls back to CRC-truncating replay | -| `crates/engine/src/log/reclaim.rs` | Threshold-triggered merge of sealed files into a new sealed file; also exposed as `BGREWRITEAOF` | -| `crates/server/src/main.rs` | Thread spawning; per-thread Monoio runtime + ShardStore initialization | -| `crates/server/src/config.rs` | CLI arg + env var parsing into `Config` | -| `crates/server/src/dispatch.rs` | Maps `Command` → `ShardStore` calls → RESP response; `ConnState`; cross-shard fan-out for MGET/MSET/DEL/EXISTS | -| `crates/server/src/cross_shard.rs` | `CrossShardRequest` enum (MGet, MSet, Del, Set, Incr, DelRev, SetNx, SetXx, SetRev, GetDel, …) + per-shard receiver loop; `futures_channel::mpsc` transport | -| `crates/engine/src/watch.rs` | `WatchEvent`, `KeyFilter`, `WatchRegistry` — per-shard subscription registry; dead-sender lazy pruning | -| `crates/server/src/resp.rs` | TCP accept loop; RESP framing; connection state machine; `WATCH`/`PWATCH` streaming (RESP3 only) | -| `crates/server/src/http.rs` | HTTP route handlers; header/query param extraction; JSON error responses; SSE watch endpoint; batch endpoint | -| `crates/server/src/routing.rs` | `peek_resp_key` / `peek_http_key` — peek first bytes of a new connection to extract routing key; 
`shard_for_key` (FxHash); percent-decode for HTTP paths | -| `crates/server/src/metrics.rs` | Prometheus metric definitions (`MetricsInner` / `Metrics`); `encode()` flushes atomic cache counters into registered `CounterVec` before gathering | +| File | What It Does | +| ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `crates/proto/src/command.rs` | Parses RESP arrays into `Command` enum; validates arity and option syntax | +| `crates/proto/src/response.rs` | Builds RESP values (ok, nil, bulk, error, array, hello reply, scan reply) | +| `crates/proto/src/error.rs` | Protocol-level error variants returned to clients | +| `crates/engine/src/store.rs` | `ShardStore`: all storage operations; coordinates L1 + L2; expiry logic; SCAN; bulk MGET | +| `crates/engine/src/cache.rs` | `MemCache`: S3-FIFO in-memory cache; eviction; ghost set; memory accounting | +| `crates/engine/src/types.rs` | `Entry`, `SetOptions`, `TtlResult`, `ScanPage` | +| `crates/engine/src/error.rs` | Storage-level errors (I/O, CRC mismatch, bad record, invalid namespace, metadata JSON) | +| `crates/engine/src/log/mod.rs` | `NamespaceLog`: index + active + sealed files + blob store; put_full / put_many / tombstone / ttl_update / bulk_read / flush; `reclaim` → `reclaim_inner` (size-tiered); value separation on write (`maybe_separate`/`apply_valsep_insert`) and deref on read; `compaction_bytes` | +| `crates/engine/src/value_store.rs` | `ValueStore`: content-addressed blob store (`values/blob-{hash}`), all I/O async via `monoio::fs` (io_uring); `put` (write-once + dedup, 
fsync data+dir before returning), `get`, `unref` (refcount-- + queue), `collect_garbage` (delete queued blobs after fsync), `incr_ref` (recovery), `sweep_orphans` (reclaim crash-orphaned blobs at open), `clear` (FLUSHDB); per-content `flock` stripes serialize create/delete; callers re-hash on read for integrity | +| `crates/engine/src/log/config.rs` | `LogConfig`: `rotate_threshold`, `fanout` (KV_COMPACTION_FANOUT), `value_sep_threshold` | +| `crates/engine/src/log/file.rs` | `LogFile`: monoio io_uring file wrapper; append, read_at; `FooterEntry` (+ `value_hash`) encode/decode + footer magic v3 | +| `crates/engine/src/log/record.rs` | Record encoding/decoding; CRC-64/NVME via `crc-fast`; flag bits | +| `crates/engine/src/log/index.rs` | `NsIndex`: `BTreeMap` + TTL sidecar + value-sep hash sidecar + range-cursor SCAN | +| `crates/engine/src/log/recover.rs` | Startup: parse sealed-file footers (incl. `value_hash`); clean-shutdown active file has a footer (fast path), crash falls back to CRC-truncating replay; repopulates the value-sep sidecar | +| `crates/engine/src/log/reclaim.rs` | `reclaim_namespace`: merge a set of sealed files into one new sealed file, unlink inputs (called once per level by size-tiered reclaim); also exposed as `BGREWRITEAOF` | +| `crates/server/src/main.rs` | Thread spawning; per-thread Monoio runtime + ShardStore initialization | +| `crates/server/src/config.rs` | CLI arg + env var parsing into `Config` | +| `crates/server/src/dispatch.rs` | Maps `Command` → `ShardStore` calls → RESP response; `ConnState`; cross-shard fan-out for MGET/MSET/DEL/EXISTS | +| `crates/server/src/cross_shard.rs` | `CrossShardRequest` enum (MGet, MSet, Del, Set, Incr, DelRev, SetNx, SetXx, SetRev, GetDel, …) + per-shard receiver loop; `futures_channel::mpsc` transport | +| `crates/engine/src/watch.rs` | `WatchEvent`, `KeyFilter`, `WatchRegistry` — per-shard subscription registry; dead-sender lazy pruning | +| `crates/server/src/resp.rs` | TCP accept loop; RESP 
framing; connection state machine; `WATCH`/`PWATCH` streaming (RESP3 only) | +| `crates/server/src/http.rs` | HTTP route handlers; header/query param extraction; JSON error responses; SSE watch endpoint; batch endpoint | +| `crates/server/src/routing.rs` | `peek_resp_key` / `peek_http_key` — peek first bytes of a new connection to extract routing key; `shard_for_key` (FxHash); percent-decode for HTTP paths | +| `crates/server/src/metrics.rs` | Prometheus metric definitions (`MetricsInner` / `Metrics`); `encode()` flushes atomic cache counters into registered `CounterVec` before gathering | diff --git a/Cargo.lock b/Cargo.lock index 4081369..a7aca4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,6 +82,18 @@ dependencies = [ "rustversion", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "asn1-rs" version = "0.7.1" @@ -282,11 +294,13 @@ dependencies = [ name = "beyond-kv-engine" version = "0.1.0" dependencies = [ + "blake3", "bytes", "crc-fast", "divan", "futures-channel", "futures-util", + "libc", "memchr", "monoio", "rustc-hash", @@ -342,6 +356,20 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -475,6 +503,12 @@ version = "1.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "baf0a07a401f374238ab8e2f11a104d2851bf9ce711ec69804834de8af45c7af" +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "core-foundation" version = "0.10.1" @@ -491,6 +525,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc-fast" version = "1.10.0" diff --git a/Cargo.toml b/Cargo.toml index 446403f..c6d5a1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ futures-util = { version = "0.3", features = ["sink"] } http = "1.0" memchr = "2" mimalloc = { version = "0.1", features = ["v2"] } -monoio = { version = "0.2", features = ["io-uring", "unlinkat", "renameat", "sync"] } +monoio = { version = "0.2", features = ["io-uring", "unlinkat", "renameat", "mkdirat", "sync"] } monoio-codec = "0.3" monoio-http = "0.3" monoio-rustls = "0.4" diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index e68eac4..e076ade 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true rust-version.workspace = true [dependencies] +blake3 = "1" bytes.workspace = true crc-fast.workspace = true futures-channel.workspace = true @@ -20,6 +21,7 @@ tracing.workspace = true [dev-dependencies] divan = "0.1" +libc = "0.2" tempfile = "3" [[bench]] diff --git a/crates/engine/src/lib.rs b/crates/engine/src/lib.rs index 487a2f8..bbdf270 100644 --- a/crates/engine/src/lib.rs +++ b/crates/engine/src/lib.rs @@ -3,4 +3,5 @@ pub mod error; 
pub mod log; pub mod store; pub mod types; +pub mod value_store; pub mod watch; diff --git a/crates/engine/src/log/ARCHITECTURE.md b/crates/engine/src/log/ARCHITECTURE.md index a953993..bda1f45 100644 --- a/crates/engine/src/log/ARCHITECTURE.md +++ b/crates/engine/src/log/ARCHITECTURE.md @@ -56,24 +56,33 @@ futures::join_all([read_exact(), read_exact(), ...]) — concurrent io_uring op [Option, ...] ``` -### Reclaim (compaction) +### Reclaim (size-tiered compaction) + +Reclaim is size-tiered, not full-merge: each merge rewrites only one level's +runs, so write amplification is ~O(log N) instead of O(reclaims × live-set), and +on GlideFS a reclaim re-uploads one level rather than the whole namespace. ``` -reclaim_namespace() - │ - ├─ 1. Seal active file — write footer (per-key metadata + CRC64 + magic) - │ - ├─ 2. Read all live index entries → read records from sealed files - │ - ├─ 3. Write live records to data-{next_id}.log.tmp +NamespaceLog::reclaim() │ - ├─ 4. rename() .tmp → .log (atomic) + ├─ 1. Seal active file — write footer — and insert it as a fresh level-0 run │ - ├─ 5. Drop old sealed files (unlink; logs failures but does not error) + ├─ 2. Cascade: while some level L holds >= `fanout` runs: + │ │ + │ ├─ collect that level's live records (index entries with those file_ids) + │ ├─ reclaim_namespace(): read them concurrently, write one merged file + │ │ to data-{next_id}.log.tmp, footer + fsync, rename .tmp → .log, + │ │ fsync dir, unlink the input files (leak-logged, never errors) + │ ├─ open_ro the merged file FIRST (only fallible step), THEN swap index + │ │ + sealed map atomically — a failed open leaves state consistent + │ └─ tag the merged run at level L+1 │ - └─ 6. Open fresh active LogFile → return ReclaimReport + └─ 3. Open a fresh active LogFile → return ReclaimReport ``` +`fanout` (default 8) is the per-level run count that triggers a merge. Levels +are in-memory only (`level: file_id → u8`); recovered runs start at level 0. 
+ ### Recovery (startup) ``` @@ -91,27 +100,33 @@ open_namespace(dir, config) │ footer absent (crash) → replay records from offset 0, │ truncate at first bad CRC │ - └─ apply in order: - full record → NsIndex::insert() - tombstone → NsIndex::remove() - ttl_update → NsIndex::update_ttl() (only if key still present) + ├─ apply in order: + │ full record → NsIndex::insert() (+ value-sep sidecar if VALUE_SEP) + │ tombstone → NsIndex::remove() + │ ttl_update → NsIndex::set_ttl() (only if key still present) + │ + └─ NamespaceLog::open() post-steps: + rebuild blob refcounts (one incr_ref per live value-separated key) + sweep_orphans() — unlink any blob no live key references (crash leftover) + seed the revision clock from the highest recovered tstamp_ms ``` ## Concepts & Terminology -| Term | What It Controls | NOT | -| -------------- | --------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- | -| `NamespaceLog` | All reads/writes for one key-space; owns the index and file set | Not a shard — multiple namespaces can live in one shard | -| `LogFile` | One `data-{id}.log` file; tracks write offset, exposes positioned I/O | Not a WAL segment; the log IS the store | -| `active` file | The only writable file at any time; receives all new appends | Not memory-mapped; accessed via io_uring | -| `sealed` files | Immutable; readable only; eligible for reclaim | Not deleted until reclaim completes the rename | -| Footer | Per-key metadata block at the end of a file; enables fast recovery | Written to the active file on clean shutdown; absence means crash or in-progress | -| Tombstone | A record with the `TOMBSTONE` flag; marks a key as deleted in the log | Not a physical delete — the old record remains until reclaim | -| TTL-update | A tiny record with the `TTL_UPDATE` flag; updates expiry with no value copy | Not authoritative until replayed against 
the index | -| `NsIndex` | In-memory `FxHashMap` from key → `IndexEntry`; the read path | Not persisted — rebuilt from log on every open | -| `IndexEntry` | 16-byte struct: file_id + record_offset + record_size + flags | Does not hold the value or the key | -| Reclaim | GC: rewrites live keys into one new file; auto-triggered by sealed-file count threshold or `BGREWRITEAOF` | Caller must serialize with writes; cannot run concurrently with appends | -| `flush()` | Unlinks and recreates all files (CoW snapshot invalidation) | Not fsync — this destroys all data in the namespace | +| Term | What It Controls | NOT | +| -------------- | --------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | +| `NamespaceLog` | All reads/writes for one key-space; owns the index and file set | Not a shard — multiple namespaces can live in one shard | +| `LogFile` | One `data-{id}.log` file; tracks write offset, exposes positioned I/O | Not a WAL segment; the log IS the store | +| `active` file | The only writable file at any time; receives all new appends | Not memory-mapped; accessed via io_uring | +| `sealed` files | Immutable; readable only; eligible for reclaim | Not deleted until reclaim completes the rename | +| Footer | Per-key metadata block at the end of a file; enables fast recovery | Written to the active file on clean shutdown; absence means crash or in-progress | +| Tombstone | A record with the `TOMBSTONE` flag; marks a key as deleted in the log | Not a physical delete — the old record remains until reclaim | +| TTL-update | A tiny record with the `TTL_UPDATE` flag; updates expiry with no value copy | Not authoritative until replayed against the index | +| `NsIndex` | In-memory key → `IndexEntry` map (`BTreeMap`, ordered for SCAN) + TTL sidecar + value-sep sidecar | Not persisted — rebuilt from log on every open | +| `IndexEntry` | 24-byte 
struct: record_offset (u64) + record_size (u32) + file_id (u32) + tstamp_ms (u64) | Does not hold the value, the key, or flags; `tstamp_ms` doubles as the CAS revision | +| `ValueStore` | Content-addressed blob store for large (value-separated) values; refcounted, deduped, deferred-GC | Not in the log — compaction moves only the 16-byte pointer, never the blob | +| Reclaim | GC: rewrites live keys into one new file; auto-triggered by sealed-file count threshold or `BGREWRITEAOF` | Caller must serialize with writes; cannot run concurrently with appends | +| `flush()` | Unlinks and recreates all files (CoW snapshot invalidation) | Not fsync — this destroys all data in the namespace | ## Core Mechanisms @@ -127,7 +142,7 @@ Every record on disk is self-describing: Byte range Field Notes 0..8 crc64-nvme covers bytes 8..end of record 8..16 tstamp_ms monotonic; used for tie-breaking on recovery -16 flags TOMBSTONE=0x01 | NO_EXPIRY=0x02 | TTL_UPDATE=0x04 +16 flags TOMBSTONE=0x01 | NO_EXPIRY=0x02 | TTL_UPDATE=0x04 | VALUE_SEP=0x08 17..25 expires_at_ms 0 when NO_EXPIRY flag set 25..29 key_size u32 29..33 val_size u32 @@ -135,17 +150,34 @@ Byte range Field Notes 37.. key || val || meta ``` -The CRC covers the entire record body. Any byte-level corruption causes the record to be skipped on recovery (active file is truncated to the last clean record). +`HEADER_LEN = 37`. When `VALUE_SEP` is set the `val` field is not the value but +a 16-byte BLAKE3-128 content hash pointing into the blob store (see Value +Separation below); `val_size == 16` in that case. + +The CRC covers the entire record body. Any byte-level corruption causes the +record to be skipped: on recovery the active file is truncated to the last clean +record, and on the watch catch-up path (`scan_since` / `scan_file_records`) the +scan of that file stops at the first bad CRC rather than streaming a corrupt +event. 
### Sealed file footer (`file.rs`) -When a file is sealed (by reclaim or a future rotation), a footer is appended: +When a file is sealed (by reclaim, rotation, or clean-shutdown seal), a footer is appended: ``` -[ IndexEntry × N ][ entry_count: u64 ][ crc64: u64 ][ magic: u64 = 0x4259_4F4E_445F_4B56 ] +[ FooterEntry × N ][ footer_body_len: u64 ][ crc64: u64 ][ magic: u64 = 0x4259_4F4E_445F_4B58 ] ``` -The magic value (`BYOND_KV` in ASCII) lets recovery distinguish a cleanly sealed file from a crashed active file. If the footer is present and CRC-valid, recovery uses it to populate the index without scanning the full file body. +Each `FooterEntry` carries `key`, `record_offset`, `record_size`, +`expires_at_ms` (optional), `tstamp_ms`, and the optional 16-byte value-sep hash +— enough to rebuild the index, the TTL sidecar, and the blob refcounts without +reading record bodies. The 24-byte trailer is `footer_body_len`, the body CRC, +and the magic. + +The magic value (`BYOND_KX` in ASCII — the `X` marks the v3 format that added +the per-entry tstamp and value-sep hash) lets recovery distinguish a cleanly +sealed file from a crashed active file. If the footer is present and CRC-valid, +recovery uses it to populate the index without scanning the full file body. ### In-memory index and TTL sidecar (`index.rs`) @@ -157,6 +189,30 @@ The magic value (`BYOND_KV` in ASCII) lets recovery distinguish a cleanly sealed The compaction rename (`data-{id}.log.tmp` → `data-{id}.log`) is the only atomic step. If the process crashes before the rename, the `.tmp` file is abandoned and recovery ignores it. If the crash happens after the rename but before old files are unlinked, the old sealed files remain; the next reclaim will skip them because the index no longer references their entries. Dead files produce a log warning, not an error. 
+### Value separation (`value_store.rs`) + +Values `>= config.value_sep_threshold` (default 128 KiB = one GlideFS block) are +written WiscKey-style to a content-addressed blob store at `{dir}/values/` +instead of inline in the log. The log record then carries only a 16-byte +BLAKE3-128 content hash (the `VALUE_SEP` flag marks this). Because the pointer is +tiny and immutable, compaction relocates pointers, never large values — +collapsing large-value write amplification. + +Blobs are: + +- **Deduped** — identical content across keys/forks/tenants maps to one blob. +- **Refcounted** — refcounts are in-memory, rebuilt from the live index on open. +- **Write-once + crash-durable** — the blob's data AND its directory entry are + fsynced before the pointer record can become durable, so a crash can at worst + leave an orphan blob (reclaimed by `sweep_orphans`), never a dangling pointer. +- **Deferred-GC** — when the last reference drops, the blob is queued and only + physically unlinked after the next log fsync (`collect_garbage`), so a + power-loss revert always finds its blob still present. A same-content `put` + racing the unlink is serialized by a per-hash file stripe. + +On read, the blob is fetched by hash and re-hashed to verify integrity — parity +with the CRC the inline path pays on every read. + ## State Machine ``` @@ -201,20 +257,28 @@ The engine runs on a single-threaded `monoio` runtime per shard. 
There is no cro ## Failure Modes -| Failure | What Actually Happens | Recovery | -| ------------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------- | -| Crash mid-append (active file) | Partial record at tail of active file | Recovery replays records; stops and truncates at first bad CRC | -| Crash mid-reclaim before rename | `.tmp` file left on disk | Ignored on next open (no `.log` suffix); old sealed files intact | -| Crash mid-reclaim after rename | Old sealed files not unlinked | Next reclaim drops them; logged as warnings | -| Sealed file footer corrupt | Footer CRC check fails | Falls back to full sequential record scan | -| Read from expired key | Returns `None`; tombstone appended lazily | Tombstone write is best-effort; a crash before it completes means the key re-expires on next read | -| `flush()` called accidentally | All namespace files unlinked and recreated | Data is gone; no recovery — `flush()` is a destructive reset | -| Clean shutdown (SIGTERM/SIGINT) | Footer written to active file before exit | Next startup treats it as sealed; no record replay needed | +| Failure | What Actually Happens | Recovery | +| --------------------------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------- | +| Crash mid-append (active file) | Partial record at tail of active file | Recovery replays records; stops and truncates at first bad CRC | +| Crash mid-reclaim before rename | `.tmp` file left on disk | Ignored on next open (no `.log` suffix); old sealed files intact | +| Crash mid-reclaim after rename | Old sealed files not unlinked | Next reclaim drops them; logged as warnings | +| Sealed file footer corrupt | Footer CRC check fails | Falls back to full sequential record scan | +| Read from expired key | Returns `None`; 
tombstone appended lazily | Tombstone write is best-effort; a crash before it completes means the key re-expires on next read | +| `flush()` called accidentally | All namespace files unlinked and recreated | Data is gone; no recovery — `flush()` is a destructive reset | +| Clean shutdown (SIGTERM/SIGINT) | Footer written to active file before exit | Next startup treats it as sealed; no record replay needed | +| Crash after blob write, before pointer record | Orphan blob on disk, no referencing key | `sweep_orphans` unlinks it on next open (refcounts rebuilt from the live index first) | +| Corrupt record on watch replay | CRC mismatch in `scan_file_records` | Scan of that file stops at the bad record; no bogus event is streamed | +| `open_ro` of merged file fails mid-reclaim | Merged file on disk, in-memory swap aborted | Index/sealed left untouched; old (unlinked-but-open) fds keep serving reads until restart finds the merged file | ## Configuration `LogConfig` (`config.rs`): -| Field | Default | What It Controls | -| --------------- | ------------ | --------------------------------------------------------------------------------------- | -| `max_file_size` | (caller-set) | Byte threshold at which the active file is rotated to sealed and a new active is opened | +| Field | Default | What It Controls | +| --------------------- | ------- | --------------------------------------------------------------------------------------------- | +| `rotate_threshold` | 1 GiB | Byte threshold at which the active file is sealed and a fresh active is opened | +| `fanout` | 8 | Size-tiered compaction fanout: a level merges into the next once it holds this many runs | +| `value_sep_threshold` | 128 KiB | Values `>=` this go to the content-addressed blob store instead of inline (one GlideFS block) | + +`KV_COMPACTION_FANOUT` and `KV_VALUE_SEP_THRESHOLD` env vars override `fanout` +and `value_sep_threshold` at `ShardStore::open` (fanout is clamped to `>= 2`). 
diff --git a/crates/engine/src/log/config.rs b/crates/engine/src/log/config.rs index c88bb57..c478f29 100644 --- a/crates/engine/src/log/config.rs +++ b/crates/engine/src/log/config.rs @@ -5,12 +5,23 @@ pub struct LogConfig { /// threshold, call `NamespaceLog::rotate_active()` to seal the active file and /// open a fresh one. Rotation is operator-controlled and NOT automatic. pub rotate_threshold: u64, + /// Size-tiered compaction fanout: a level is merged into the next once it + /// holds this many runs. Higher = less write-amp, more space-amp. Default 8 + /// (the measured knee). + pub fanout: usize, + /// Value-separation threshold in bytes. Values >= this are stored in the + /// content-addressed blob store instead of inline in the log, so compaction + /// never re-uploads them. Default 128 KiB = one GlideFS block: below a block, + /// a blob-per-value wastes space; at/above it, separation collapses write-amp. + pub value_sep_threshold: usize, } impl Default for LogConfig { fn default() -> Self { Self { rotate_threshold: 1 << 30, // 1 GiB + fanout: 8, + value_sep_threshold: 128 * 1024, // 128 KiB = one GlideFS block } } } diff --git a/crates/engine/src/log/file.rs b/crates/engine/src/log/file.rs index dc9b0fd..cde2fcb 100644 --- a/crates/engine/src/log/file.rs +++ b/crates/engine/src/log/file.rs @@ -97,6 +97,10 @@ impl Deref for BufGuard { impl BufGuard { pub(crate) fn into_inner(mut self) -> Vec { + // SAFETY: `take` moves the inner Vec out of the ManuallyDrop exactly + // once. The immediately-following `mem::forget(self)` prevents `Drop` + // from running and taking it a second time, so the single-take + // invariant holds across both code paths (this and `drop`). 
let buf = unsafe { ManuallyDrop::take(&mut self.0) }; std::mem::forget(self); buf @@ -105,6 +109,9 @@ impl BufGuard { impl Drop for BufGuard { fn drop(&mut self) { + // SAFETY: `Drop::drop` runs at most once per value, and `into_inner` + // is the only other consumer — it `mem::forget`s the guard so this + // `drop` cannot run after it. Thus the inner Vec is taken exactly once. let buf = unsafe { ManuallyDrop::take(&mut self.0) }; pool_release(buf); } @@ -113,7 +120,7 @@ impl Drop for BufGuard { /// Magic at the very end of every sealed file. Lets recovery distinguish /// "sealed cleanly" from "active or crashed mid-seal" without scanning. /// v2: includes tstamp_ms per entry for O(1) CAS revision checks. -pub const FOOTER_MAGIC: u64 = 0x4259_4F4E_445F_4B57; // "BYOND_KW" (v2) +pub const FOOTER_MAGIC: u64 = 0x4259_4F4E_445F_4B58; // "BYOND_KX" (v3: + value-sep hash) /// Footer trailer size: footer_body_len (8) + footer_crc (8) + magic (8). pub const FOOTER_TRAILER_LEN: u64 = 24; @@ -122,6 +129,7 @@ pub const FOOTER_TRAILER_LEN: u64 = 24; /// Wire layout (little-endian): /// [key_size: u32][record_offset: u64][record_size: u32] /// [expires_at_ms: u64 (0 if absent)][has_expiry: u8][tstamp_ms: u64] +/// [has_valsep: u8][value_hash: 16 bytes (only if has_valsep)] /// [key bytes] #[derive(Debug, Clone)] pub struct FooterEntry { @@ -130,11 +138,15 @@ pub struct FooterEntry { pub record_size: u32, pub expires_at_ms: Option, pub tstamp_ms: u64, + /// Content hash if this key's value is value-separated (lives in the blob + /// store). Carried in the footer so recovery rebuilds the value-sep sidecar + /// and blob refcounts without reading record bodies. 
+ pub value_hash: Option<[u8; 16]>, } impl FooterEntry { fn encoded_size(&self) -> usize { - 4 + 8 + 4 + 8 + 1 + 8 + self.key.len() + 4 + 8 + 4 + 8 + 1 + 8 + 1 + if self.value_hash.is_some() { 16 } else { 0 } + self.key.len() } fn encode_into(&self, buf: &mut Vec) { @@ -148,11 +160,20 @@ impl FooterEntry { buf.extend_from_slice(&ms.to_le_bytes()); buf.push(has_expiry); buf.extend_from_slice(&self.tstamp_ms.to_le_bytes()); + match self.value_hash { + Some(h) => { + buf.push(1u8); + buf.extend_from_slice(&h); + } + None => buf.push(0u8), + } buf.extend_from_slice(&self.key); } fn parse(buf: &[u8]) -> Option<(Self, usize)> { - if buf.len() < 33 { + // Fixed prefix: key_size(4)+offset(8)+size(4)+expires(8)+has_expiry(1) + // +tstamp(8)+has_valsep(1) = 34 bytes. + if buf.len() < 34 { return None; } let key_size = u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]) as usize; @@ -167,11 +188,25 @@ impl FooterEntry { let tstamp_ms = u64::from_le_bytes([ buf[25], buf[26], buf[27], buf[28], buf[29], buf[30], buf[31], buf[32], ]); - let total = 33 + key_size; + let has_valsep = buf[33]; + let mut cursor = 34usize; + let value_hash = if has_valsep != 0 { + let end = cursor + 16; + if buf.len() < end { + return None; + } + let mut h = [0u8; 16]; + h.copy_from_slice(&buf[cursor..end]); + cursor = end; + Some(h) + } else { + None + }; + let total = cursor + key_size; if buf.len() < total { return None; } - let key = bytes::Bytes::copy_from_slice(&buf[33..total]); + let key = bytes::Bytes::copy_from_slice(&buf[cursor..total]); Some(( Self { key, @@ -183,17 +218,35 @@ impl FooterEntry { None }, tstamp_ms, + value_hash, }, total, )) } } -pub fn data_filename(file_id: u16) -> String { +pub fn data_filename(file_id: u32) -> String { format!("data-{:010}.log", file_id) } -pub fn reclaim_tmp_filename(file_id: u16) -> String { +/// fsync a directory so that newly-created (or renamed) entries inside it are +/// durable. 
A file's own `fsync` flushes its data + inode, but POSIX does not +/// guarantee the *directory entry* (the name → inode link) is durable until the +/// directory itself is fsynced. Without this, a power loss could leave a freshly +/// created data file's bytes on disk while its name is lost — making records that +/// were already fsynced unreachable, violating the `appendfsync everysec` +/// contract. Called at every new-file creation / rename site (rare paths: +/// rotate, reclaim, flush, startup), never on the per-write hot path. +/// Best-effort: opening a directory read-only and fsyncing it is the portable +/// way; on filesystems that reject it the link is still durable via journaling. +pub(crate) async fn sync_dir(dir: &Path) { + if let Ok(d) = OpenOptions::new().read(true).open(dir).await { + let _ = d.sync_all().await; + let _ = d.close().await; + } +} + +pub fn reclaim_tmp_filename(file_id: u32) -> String { format!("data-{:010}.log.tmp", file_id) } @@ -206,15 +259,20 @@ pub fn reclaim_tmp_filename(file_id: u16) -> String { /// safe under single-thread (`!Sync`) access; sufficient since each shard runs /// on its own monoio runtime. pub struct LogFile { - pub file_id: u16, + pub file_id: u32, pub path: PathBuf, file: File, write_offset: Cell, poisoned: Cell, + /// Test-only: when set, the next `append` reserves its offset (as a real + /// append does) then fails with ENOSPC instead of touching the disk — + /// faithfully modeling a disk-full write without privileges or a real fill. 
+ #[cfg(test)] + fail_next_write: Cell, } impl LogFile { - pub async fn open_rw(path: PathBuf, file_id: u16) -> Result { + pub async fn open_rw(path: PathBuf, file_id: u32) -> Result { let file = OpenOptions::new() .read(true) .write(true) @@ -230,10 +288,12 @@ impl LogFile { file, write_offset: Cell::new(len), poisoned: Cell::new(false), + #[cfg(test)] + fail_next_write: Cell::new(false), }) } - pub async fn open_ro(path: PathBuf, file_id: u16) -> Result { + pub async fn open_ro(path: PathBuf, file_id: u32) -> Result { let file = OpenOptions::new().read(true).open(&path).await?; let metadata = file.metadata().await?; let len = metadata.len(); @@ -243,9 +303,18 @@ impl LogFile { file, write_offset: Cell::new(len), poisoned: Cell::new(false), + #[cfg(test)] + fail_next_write: Cell::new(false), }) } + /// Test-only: arm the next `append` to fail with ENOSPC after reserving its + /// offset, exactly as a real disk-full write would (which then poisons the file). + #[cfg(test)] + pub(crate) fn force_next_write_failure(&self) { + self.fail_next_write.set(true); + } + pub async fn size(&self) -> Result { let metadata = self.file.metadata().await?; Ok(metadata.len()) @@ -289,6 +358,15 @@ impl LogFile { let len = buf.len() as u64; let offset = self.write_offset.get(); self.write_offset.set(offset + len); + #[cfg(test)] + if self.fail_next_write.replace(false) { + // Model a disk-full write: offset already reserved, nothing hits disk, + // file poisoned so no later write can shadow this torn slot. 
+ self.poisoned.set(true); + return Err(EngineError::Io { + source: std::io::Error::from_raw_os_error(28), // ENOSPC + }); + } let (res, buf) = self.file.write_all_at(buf, offset).await; if let Err(e) = res { self.poisoned.set(true); @@ -425,6 +503,7 @@ pub(crate) fn footer_entry_from_index( key: bytes::Bytes, entry: &IndexEntry, expires_at_ms: Option, + value_hash: Option<[u8; 16]>, ) -> FooterEntry { FooterEntry { key, @@ -432,12 +511,13 @@ pub(crate) fn footer_entry_from_index( record_size: entry.record_size, expires_at_ms, tstamp_ms: entry.tstamp_ms, + value_hash, } } /// List all `data-*.log` files in `dir`, sorted ascending by file_id. -pub fn list_data_files(dir: &Path) -> Result> { - let mut out: Vec<(u16, PathBuf)> = Vec::new(); +pub fn list_data_files(dir: &Path) -> Result> { + let mut out: Vec<(u32, PathBuf)> = Vec::new(); let read_dir = match std::fs::read_dir(dir) { Ok(rd) => rd, Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()), @@ -456,14 +536,66 @@ pub fn list_data_files(dir: &Path) -> Result> { let Some(num) = rest.strip_suffix(".log") else { continue; }; - let Ok(file_id_u32) = num.parse::() else { + let Ok(file_id) = num.parse::() else { continue; }; - if file_id_u32 > u16::MAX as u32 { - continue; - } - out.push((file_id_u32 as u16, path)); + out.push((file_id, path)); } out.sort_by_key(|(id, _)| *id); Ok(out) } + +#[cfg(test)] +mod enospc_tests { + use super::*; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + /// A failed append (disk-full) poisons the file: the offset was reserved, but + /// every subsequent append fails immediately. This is what prevents a later + /// write from landing PAST the torn slot — which would survive recovery while + /// the records between it and the truncation point are silently lost. 
Remove + /// the `poisoned` set/check in `append` and the third write below succeeds at + /// the advanced offset, shadowing the gap on the next recovery: teeth. + #[test] + fn failed_append_poisons_file_and_blocks_later_writes() { + run(async { + let dir = TempDir::new().unwrap(); + let f = LogFile::open_rw(dir.path().join("data-0000000000.log"), 0) + .await + .unwrap(); + + let (off_a, _) = f.append(b"AAAA".to_vec()).await.unwrap(); + assert_eq!(off_a, 0); + assert_eq!(f.size().await.unwrap(), 4, "first record on disk"); + + // Disk fills: this append reserves offset 4 then fails with ENOSPC. + f.force_next_write_failure(); + assert!( + f.append(b"BBBB".to_vec()).await.is_err(), + "disk-full write must error" + ); + + // The file is now poisoned. A later write must NOT succeed at the + // advanced offset (which would leave a gap at [4,8) shadowing it). + let after = f.append(b"CCCC".to_vec()).await; + assert!( + after.is_err(), + "poisoned file must reject writes — otherwise a later write shadows the torn slot on recovery" + ); + // Nothing after the first record ever reached disk. + assert_eq!( + f.size().await.unwrap(), + 4, + "no bytes written past the good prefix" + ); + }); + } +} diff --git a/crates/engine/src/log/index.rs b/crates/engine/src/log/index.rs index be3e454..055e36f 100644 --- a/crates/engine/src/log/index.rs +++ b/crates/engine/src/log/index.rs @@ -9,7 +9,7 @@ use rustc_hash::FxHashMap; /// (header + key + value + metadata). The header carries key/value/meta sizes so /// we can slice the value out in-memory. /// -/// Layout: u64 + u32 + u16 + (2 pad) + u64 = 24 bytes. +/// Layout: u64 + u32 + u32 + u64 = 24 bytes (no padding). /// /// 4 GiB single-record limit (well above Redis's 512 MiB string ceiling). /// 65k files per namespace × `rotate_threshold` = comfortable disk ceiling. 
@@ -21,12 +21,12 @@ use rustc_hash::FxHashMap; pub struct IndexEntry { pub record_offset: u64, pub record_size: u32, - pub file_id: u16, + pub file_id: u32, pub tstamp_ms: u64, } impl IndexEntry { - pub fn new(file_id: u16, record_offset: u64, record_size: u32, tstamp_ms: u64) -> Self { + pub fn new(file_id: u32, record_offset: u64, record_size: u32, tstamp_ms: u64) -> Self { Self { record_offset, record_size, @@ -41,6 +41,10 @@ pub struct NsIndex { map: BTreeMap, /// TTL sidecar — only TTL'd keys pay extra memory. FxHashMap for O(1) point lookups. ttl: FxHashMap, + /// Value-separation sidecar: `key -> content hash` for keys whose value lives + /// in the blob store. Only large-value keys pay this. Used to unref the old + /// blob on overwrite/delete and to rebuild blob refcounts on recovery. + valsep: FxHashMap, /// Best-effort live key count: incremented on insert, decremented on remove. /// Lazy-expired keys are included until tombstoned, matching Redis DBSIZE semantics. live_count: usize, @@ -57,10 +61,35 @@ impl NsIndex { Self { map: BTreeMap::new(), ttl: FxHashMap::default(), + valsep: FxHashMap::default(), live_count: 0, } } + /// Content hash for a value-separated key, if any. + pub fn valsep(&self, key: &[u8]) -> Option { + self.valsep.get(key).copied() + } + + /// Record (or clear) the blob hash for a key. `Some` marks it value-separated; + /// `None` clears (e.g. overwrite from a large value to a small inline one). + pub fn set_valsep(&mut self, key: &Bytes, hash: Option) { + match hash { + Some(h) => { + self.valsep.insert(key.clone(), h); + } + None => { + self.valsep.remove(key); + } + } + } + + /// Iterate `(key, content hash)` for all value-separated keys. Used at open + /// to rebuild blob refcounts. 
+ pub fn valsep_iter(&self) -> impl Iterator { + self.valsep.iter() + } + pub fn len(&self) -> usize { self.map.len() } @@ -104,6 +133,7 @@ impl NsIndex { pub fn remove(&mut self, key: &[u8]) -> Option { self.ttl.remove(key); + self.valsep.remove(key); let removed = self.map.remove(key); if removed.is_some() { self.live_count = self.live_count.saturating_sub(1); @@ -114,6 +144,7 @@ impl NsIndex { pub fn clear(&mut self) { self.map.clear(); self.ttl.clear(); + self.valsep.clear(); self.live_count = 0; } diff --git a/crates/engine/src/log/mod.rs b/crates/engine/src/log/mod.rs index f527b10..4a73c8d 100644 --- a/crates/engine/src/log/mod.rs +++ b/crates/engine/src/log/mod.rs @@ -35,14 +35,21 @@ use tracing::warn; use crate::error::{EngineError, Result}; use crate::log::config::LogConfig; use crate::log::file::{ - BufGuard, FooterEntry, LogFile, data_filename, pool_acquire_write, pool_release_write, + BufGuard, FooterEntry, LogFile, data_filename, pool_acquire_write, pool_release_write, sync_dir, }; use crate::log::index::{IndexEntry, NsIndex}; use crate::log::record::{HEADER_LEN, flags as rflags, parse_header, verify_crc}; +use crate::value_store::{ContentHash, ValueStore}; pub fn now_ms() -> u64 { match SystemTime::now().duration_since(UNIX_EPOCH) { - Ok(d) => d.as_millis() as u64, + Ok(d) => u64::try_from(d.as_millis()).unwrap_or_else(|_| { + // ~584 million years past the epoch — not reachable in practice, + // but saturate explicitly rather than silently truncating, matching + // the checked conversion in `ShardStore::validate_ttl`. + warn!("millisecond timestamp exceeds u64::MAX; saturating"); + u64::MAX + }), Err(_) => { warn!("system clock is before UNIX epoch; timestamps will be 0"); 0 @@ -72,13 +79,25 @@ impl WriteCondition { pub struct NamespaceLog { pub dir: PathBuf, + /// Content-addressed blob store for value-separated (large) values. Lives at + /// `{dir}/values/`. 
Values >= `config.value_sep_threshold` are stored here + /// (write-once, deduped, GC'd when the last referencing key drops) and the + /// log holds only a 16-byte hash pointer — so compaction never moves them. + pub values: ValueStore, pub index: RefCell, /// Sealed files in file_id ascending order. `Rc` so readers can /// clone a handle and drop the `RefCell` borrow before awaiting I/O. - pub sealed: RefCell>>, + pub sealed: RefCell>>, + /// Size-tier level per sealed `file_id` (tiered compaction only). 0 = freshly + /// sealed; merging `fanout` runs at level L produces one run at level L+1. + level: RefCell>, /// Active (writable) file. pub active: RefCell>, pub config: LogConfig, + /// Cumulative bytes rewritten by compaction (reclaim). Instrumentation for + /// measuring write amplification: full-merge grows ~O(reclaims × live-set), + /// tiered ~O(log N). + pub compaction_bytes: Cell, unsynced_bytes: Cell, /// Monotonically increasing tstamp_ms — wall clock with a +1 nudge if the /// clock didn't advance, so duplicate-key replays always pick the latest. @@ -100,44 +119,92 @@ pub struct NamespaceLog { /// Count of write methods currently between their entry check and exit. /// `freeze_and_drain` polls this to 0 before allowing the seal to proceed. in_flight_writes: Cell, - /// Serializes INCR/DECR within this namespace. - /// - /// Why: under contention, optimistic CAS on the same key has every - /// concurrent writer submit a (futile) disk append, then race to win the - /// post-write index update — only one wins per round, the rest become - /// orphans. Worse, io_uring completion order is roughly submission order, - /// so a late-submitting task can keep losing every round as new contenders - /// refill the in-flight pool, exhausting any finite retry budget. A single - /// async mutex collapses the herd: one INCR's read-modify-write completes - /// at a time, eliminating wasted disk writes and starvation. 
- pub(crate) incr_lock: futures_util::lock::Mutex<()>, + /// Per-key write serialization, striped. Every mutating method locks + /// `wlock(key)` for its check→append→commit, so two writes to the SAME key + /// never interleave — while writes to DIFFERENT keys hash to different + /// stripes and stay fully concurrent (lock-free reads are untouched). This + /// is what makes conditional writes (CAS/NX/XX) atomic: holding the stripe, + /// they check BEFORE appending, so a failed condition writes no record at + /// all — eliminating the optimistic-orphan that a crash could resurrect. + /// Collisions (distinct keys, same stripe) only cause rare, harmless extra + /// serialization. INCR no longer needs a dedicated lock: its optimistic + /// retry now appends nothing on a lost race. + write_stripes: Vec>, } +/// Number of write-lock stripes per namespace. Must be a power of two for the +/// `& (N-1)` mask. 64 keeps per-key false-collisions rare without much memory. +const WRITE_STRIPES: usize = 64; + impl NamespaceLog { pub async fn open(dir: PathBuf, config: LogConfig) -> Result { let opened = recover::open_namespace(dir.clone()).await?; - let sealed: FxHashMap> = opened + let sealed: FxHashMap> = opened .sealed .into_iter() .map(|f| (f.file_id, Rc::new(f))) .collect(); let active = Rc::new(opened.active); + // Recovered sealed files start at level 0 (tiered compaction will merge + // them upward as new runs accumulate). Levels are in-memory only. + let level: FxHashMap = sealed.keys().map(|&id| (id, 0u8)).collect(); + // Rebuild blob refcounts: one per live value-separated key (the sidecar + // was repopulated from sealed footers + active-file replay during open). + let values = ValueStore::new(dir.join("values")); + for (_, h) in opened.index.valsep_iter() { + values.incr_ref(h); + } + // Reclaim any blob a crash left without a referencing record (now that + // refcounts reflect the live index, anything else on disk is an orphan). 
+ values.sweep_orphans().await?; + // Seed the revision clock from the highest tstamp recovered, so revisions + // never regress across a restart even if the wall clock stepped back + // (next_tstamp already nudges within a run). This keeps CAS revisions and + // watch `scan_since` resumption monotonic. (A tombstone whose tstamp + // exceeds every live key's is not reflected here — a narrow, transient + // case: reclaim drops dead tombstones, and recovery resolves last-writer + // by physical order, not tstamp.) + let max_tstamp = opened + .index + .iter() + .map(|(_, e)| e.tstamp_ms) + .max() + .unwrap_or(0); Ok(Self { dir, + values, index: RefCell::new(opened.index), sealed: RefCell::new(sealed), + level: RefCell::new(level), active: RefCell::new(active), config, + compaction_bytes: Cell::new(0), unsynced_bytes: Cell::new(0), - last_tstamp: Cell::new(0), + last_tstamp: Cell::new(max_tstamp), reclaim_in_progress: Cell::new(false), rotate_in_progress: Cell::new(false), frozen: Cell::new(false), in_flight_writes: Cell::new(0), - incr_lock: futures_util::lock::Mutex::new(()), + write_stripes: (0..WRITE_STRIPES) + .map(|_| futures_util::lock::Mutex::new(())) + .collect(), }) } + /// Stripe index for `key` (FxHash & (N-1)). + fn stripe_idx(key: &[u8]) -> usize { + use std::hash::{Hash, Hasher}; + let mut h = rustc_hash::FxHasher::default(); + key.hash(&mut h); + (h.finish() as usize) & (WRITE_STRIPES - 1) + } + + /// The write-serialization stripe for `key`. Same key → same stripe → + /// serialized; different keys → (usually) different stripes → concurrent. + fn wlock(&self, key: &[u8]) -> &futures_util::lock::Mutex<()> { + &self.write_stripes[Self::stripe_idx(key)] + } + /// Block all subsequent writes (they return [`EngineError::Frozen`]) and /// wait for any already-in-flight writes to complete. Used by the seal /// path so the footer it writes is a consistent snapshot of on-disk state. 
@@ -210,10 +277,9 @@ impl NamespaceLog { metadata: &[u8], expires_at_ms: Option, ) -> Result { + self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs let _wg = self.begin_write()?; - if self.reclaim_in_progress.get() { - return Err(EngineError::ReclamationBusy); - } + let _w = self.wlock(&key).lock().await; // serialize writes to this key let tstamp = self.next_tstamp(); let mut flags = 0u8; let exp = match expires_at_ms { @@ -223,32 +289,77 @@ impl NamespaceLog { 0 } }; - let mut buf = pool_acquire_write(HEADER_LEN + key.len() + value.len() + metadata.len()); - record::encode_into(&mut buf, tstamp, flags, exp, &key, value, metadata)?; + // Value separation: a value >= the threshold is written to the blob store + // (write-once, deduped) and the record carries only its 16-byte hash, so + // compaction never re-uploads the value. + let sep_hash = self.maybe_separate(value, &mut flags).await?; + let stored: &[u8] = sep_hash.as_ref().map_or(value, |h| &h[..]); + let mut buf = pool_acquire_write(HEADER_LEN + key.len() + stored.len() + metadata.len()); + record::encode_into(&mut buf, tstamp, flags, exp, &key, stored, metadata)?; let record_size = buf.len() as u32; let active = self.active(); - let (offset, buf) = active.append(buf).await?; + let (offset, buf) = match active.append(buf).await { + Ok(r) => r, + Err(e) => { + // Append failed: roll back the blob ref so we don't leave a phantom + // blob (written + ref'd, but no record references it). 
+ if let Some(h) = sep_hash { + self.values.unref(&h); + } + return Err(e); + } + }; pool_release_write(buf); self.unsynced_bytes .set(self.unsynced_bytes.get() + record_size as u64); let entry = IndexEntry::new(active.file_id, offset, record_size, tstamp); - self.index.borrow_mut().insert(key, entry, expires_at_ms); + let old_hash = self.apply_valsep_insert(key.clone(), entry, expires_at_ms, sep_hash); + if let Some(oh) = old_hash { + self.values.unref(&oh); + } if active.write_offset() >= self.config.rotate_threshold { self.rotate_active().await?; } Ok(tstamp) } + /// If `value` is large enough to separate, write it to the blob store (the + /// store dedups + refcounts) and set the `VALUE_SEP` flag; return its hash. + /// Otherwise return `None` (value stays inline). The blob is written before + /// the log record so the record's hash always points at durable bytes. + async fn maybe_separate(&self, value: &[u8], flags: &mut u8) -> Result> { + if value.len() >= self.config.value_sep_threshold { + *flags |= rflags::VALUE_SEP; + Ok(Some(self.values.put(value).await?)) + } else { + Ok(None) + } + } + + /// Insert the index entry and update the value-sep sidecar. Returns the key's + /// PREVIOUS blob hash (if it was value-separated) so the caller can unref it + /// after the new write commits — covering overwrite, large→small, and + /// same-content cases uniformly (the new blob was already ref'd by `put`). + fn apply_valsep_insert( + &self, + key: Bytes, + entry: IndexEntry, + expires_at_ms: Option, + sep_hash: Option, + ) -> Option { + let mut index = self.index.borrow_mut(); + let old = index.valsep(&key); + index.insert(key.clone(), entry, expires_at_ms); + index.set_valsep(&key, sep_hash); + old + } + /// Conditional write: write only if the current live state of `key` satisfies `cond`. - /// - /// Returns `Ok(Some(tstamp))` if written and indexed, `Ok(None)` if the condition was - /// not met. 
The returned tstamp is THIS write's revision — callers must use it - /// instead of [`last_revision`](Self::last_revision) when updating caches or - /// returning a revision to clients, because concurrent writes that pass pre-check - /// but later fail post-check still bump `last_tstamp`. A concurrent write that - /// lands during the disk-I/O await is detected by a post-write re-check before - /// the index is updated; if the race is lost the on-disk record becomes an - /// unreferenced orphan reclaimed during next compaction. + /// Atomic — the key's write stripe is held across check + append + commit, so no + /// concurrent write to the same key can interleave. A failed condition writes + /// nothing. Returns `Ok(Some(tstamp))` if written, `Ok(None)` if the condition + /// was not met. The returned tstamp is THIS write's revision; callers use it + /// instead of [`last_revision`](Self::last_revision) for caches/responses. pub async fn put_full_cond( &self, key: Bytes, @@ -258,11 +369,13 @@ impl NamespaceLog { cond: WriteCondition, now: u64, ) -> Result> { + self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs let _wg = self.begin_write()?; - if self.reclaim_in_progress.get() { - return Err(EngineError::ReclamationBusy); - } - // Pre-check: verify condition before incurring disk I/O. + let _w = self.wlock(&key).lock().await; // serialize writes to this key + // Holding the key's write stripe, no concurrent write to this key can run. + // So the condition check is authoritative: we check BEFORE appending, and a + // failed condition writes NOTHING (no record, no blob) — there is no + // optimistic orphan that a crash could resurrect, and no post-check. 
if !cond.check(Self::live_rev(&self.index.borrow(), &key, now)) { return Ok(None); } @@ -275,22 +388,31 @@ impl NamespaceLog { 0 } }; - let mut buf = pool_acquire_write(HEADER_LEN + key.len() + value.len() + metadata.len()); - record::encode_into(&mut buf, tstamp, flags, exp, &key, value, metadata)?; + let sep_hash = self.maybe_separate(value, &mut flags).await?; + let stored: &[u8] = sep_hash.as_ref().map_or(value, |h| &h[..]); + let mut buf = pool_acquire_write(HEADER_LEN + key.len() + stored.len() + metadata.len()); + record::encode_into(&mut buf, tstamp, flags, exp, &key, stored, metadata)?; let record_size = buf.len() as u32; let active = self.active(); - let (offset, buf) = active.append(buf).await?; + let (offset, buf) = match active.append(buf).await { + Ok(r) => r, + Err(e) => { + // Append failed: roll back the blob ref so we don't leave a phantom + // blob (written + ref'd, but no record references it). + if let Some(h) = sep_hash { + self.values.unref(&h); + } + return Err(e); + } + }; pool_release_write(buf); self.unsynced_bytes .set(self.unsynced_bytes.get() + record_size as u64); - // Post-check: re-verify before committing to the index. Another task that - // modified the same key during the disk-I/O await will have already updated - // the index; if that breaks our condition, abort without touching the index. - if !cond.check(Self::live_rev(&self.index.borrow(), &key, now)) { - return Ok(None); - } let entry = IndexEntry::new(active.file_id, offset, record_size, tstamp); - self.index.borrow_mut().insert(key, entry, expires_at_ms); + let old_hash = self.apply_valsep_insert(key.clone(), entry, expires_at_ms, sep_hash); + if let Some(oh) = old_hash { + self.values.unref(&oh); + } if active.write_offset() >= self.config.rotate_threshold { self.rotate_active().await?; } @@ -310,43 +432,77 @@ impl NamespaceLog { /// of [`last_revision`](Self::last_revision) — concurrent writes can bump /// `last_tstamp` higher than any tstamp this batch produced. 
pub async fn put_many(&self, pairs: &[(Bytes, Bytes)]) -> Result> { + self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs let _wg = self.begin_write()?; - if self.reclaim_in_progress.get() { - return Err(EngineError::ReclamationBusy); - } if pairs.is_empty() { return Ok(Vec::new()); } + // Serialize against same-key single-key writes by holding every stripe this + // batch touches. Acquired in sorted-distinct order so two batches (or a + // batch and a single write) can never deadlock on a lock-ordering cycle. + let mut idxs: Vec = pairs.iter().map(|(k, _)| Self::stripe_idx(k)).collect(); + idxs.sort_unstable(); + idxs.dedup(); + let mut _stripe_guards = Vec::with_capacity(idxs.len()); + for i in idxs { + _stripe_guards.push(self.write_stripes[i].lock().await); + } let estimated: usize = pairs .iter() .map(|(k, v)| HEADER_LEN + k.len() + v.len()) .sum(); let mut buf = pool_acquire_write(estimated); let mut layout: Vec<(usize, u32, u64)> = Vec::with_capacity(pairs.len()); + // Per-pair blob hash for value-separated values (None = inline). + let mut sep_hashes: Vec> = Vec::with_capacity(pairs.len()); for (k, v) in pairs { let tstamp = self.next_tstamp(); + let mut flags = rflags::NO_EXPIRY; + let sh = self.maybe_separate(v, &mut flags).await?; + let stored: &[u8] = sh.as_ref().map_or(&v[..], |h| &h[..]); let start = buf.len(); - record::encode_into(&mut buf, tstamp, rflags::NO_EXPIRY, 0, k, v, &[])?; + record::encode_into(&mut buf, tstamp, flags, 0, k, stored, &[])?; let record_size = (buf.len() - start) as u32; layout.push((start, record_size, tstamp)); + sep_hashes.push(sh); } let active = self.active(); let buf_len = buf.len() as u64; - let (base_offset, buf) = active.append(buf).await?; + let (base_offset, buf) = match active.append(buf).await { + Ok(r) => r, + Err(e) => { + // Append failed: roll back every blob ref this batch took so none + // are left as phantom blobs (written + ref'd, no record). 
+ for h in sep_hashes.into_iter().flatten() { + self.values.unref(&h); + } + return Err(e); + } + }; pool_release_write(buf); self.unsynced_bytes.set(self.unsynced_bytes.get() + buf_len); + let mut old_hashes: Vec = Vec::new(); { let mut index = self.index.borrow_mut(); - for ((k, _v), (rel_start, size, tstamp)) in pairs.iter().zip(layout.iter()) { + for (((k, _v), (rel_start, size, tstamp)), sh) in + pairs.iter().zip(layout.iter()).zip(sep_hashes.iter()) + { let entry = IndexEntry::new( active.file_id, base_offset + *rel_start as u64, *size, *tstamp, ); + if let Some(oh) = index.valsep(k) { + old_hashes.push(oh); + } index.insert(k.clone(), entry, None); + index.set_valsep(k, *sh); } } + for oh in old_hashes { + self.values.unref(&oh); + } if active.write_offset() >= self.config.rotate_threshold { self.rotate_active().await?; } @@ -359,14 +515,17 @@ impl NamespaceLog { /// for watch events and any client-visible revision — concurrent writes /// can bump `last_tstamp` beyond this specific tombstone's tstamp. 
pub async fn tombstone(&self, key: &[u8]) -> Result> { + self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs let _wg = self.begin_write()?; - if self.reclaim_in_progress.get() { - return Err(EngineError::ReclamationBusy); - } - let was_present = self.index.borrow_mut().remove(key).is_some(); - if !was_present { - return Ok(None); - } + let _w = self.wlock(key).lock().await; // serialize writes to this key + let old_hash = { + let mut index = self.index.borrow_mut(); + let h = index.valsep(key); + if index.remove(key).is_none() { + return Ok(None); + } + h + }; let tstamp = self.next_tstamp(); let mut buf = pool_acquire_write(HEADER_LEN + key.len()); record::encode_into(&mut buf, tstamp, rflags::TOMBSTONE, 0, key, &[], &[])?; @@ -375,6 +534,9 @@ impl NamespaceLog { let (_, buf) = active.append(buf).await?; pool_release_write(buf); self.unsynced_bytes.set(self.unsynced_bytes.get() + buf_len); + if let Some(h) = old_hash { + self.values.unref(&h); + } Ok(Some(tstamp)) } @@ -389,16 +551,20 @@ impl NamespaceLog { expected_rev: u64, now: u64, ) -> Result> { + self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs let _wg = self.begin_write()?; - if self.reclaim_in_progress.get() { - return Err(EngineError::ReclamationBusy); - } + let _w = self.wlock(key).lock().await; // serialize writes to this key // Both check and removal happen without yielding — no interleaving possible. 
let current_rev = Self::live_rev(&self.index.borrow(), key, now); if current_rev != Some(expected_rev) { return Ok(None); } - self.index.borrow_mut().remove(key); + let old_hash = { + let mut index = self.index.borrow_mut(); + let h = index.valsep(key); + index.remove(key); + h + }; // Disk write (yields, but index already updated) let tstamp = self.next_tstamp(); let mut buf = pool_acquire_write(HEADER_LEN + key.len()); @@ -408,6 +574,9 @@ impl NamespaceLog { let (_, buf) = active.append(buf).await?; pool_release_write(buf); self.unsynced_bytes.set(self.unsynced_bytes.get() + buf_len); + if let Some(h) = old_hash { + self.values.unref(&h); + } Ok(Some(tstamp)) } @@ -415,10 +584,9 @@ impl NamespaceLog { /// tstamp assigned to this update — callers must use it (not /// [`last_revision`](Self::last_revision)) for watch events. pub async fn ttl_update(&self, key: &[u8], expires_at_ms: Option) -> Result { + self.await_reclaim().await; // stall (don't error) while a reclaim/flush runs let _wg = self.begin_write()?; - if self.reclaim_in_progress.get() { - return Err(EngineError::ReclamationBusy); - } + let _w = self.wlock(key).lock().await; // serialize writes to this key let tstamp = self.next_tstamp(); let mut flags = rflags::TTL_UPDATE; let exp = match expires_at_ms { @@ -440,7 +608,7 @@ impl NamespaceLog { Ok(tstamp) } - fn locate_file(&self, file_id: u16) -> Option> { + fn locate_file(&self, file_id: u32) -> Option> { let active = self.active.borrow().clone(); if active.file_id == file_id { return Some(active); @@ -451,11 +619,15 @@ impl NamespaceLog { /// Fsync the active file if any writes are pending. Called by the per-shard /// 1-second timer task to provide `appendfsync everysec` semantics. 
pub async fn sync(&self) -> Result<()> { - if self.unsynced_bytes.get() == 0 { - return Ok(()); + if self.unsynced_bytes.get() > 0 { + self.active().sync().await?; + self.unsynced_bytes.set(0); } - self.active().sync().await?; - self.unsynced_bytes.set(0); + // Every log record is durable now (just fsynced, or already was), so the + // blobs orphaned by overwrites/deletes can finally be physically removed. + // Deferring to here is what makes a power-loss revert safe: until the + // superseding record is durable, the old blob stays on disk. + self.values.collect_garbage().await; Ok(()) } @@ -470,7 +642,9 @@ impl NamespaceLog { .await } - fn extract_value_meta(bytes: &[u8]) -> Result<(Bytes, Bytes)> { + /// Returns `(value_field, metadata, flags)`. For value-separated records the + /// `value_field` is the 16-byte content hash, not the value — call `deref`. + fn extract_value_meta(bytes: &[u8]) -> Result<(Bytes, Bytes, u8)> { let hdr = parse_header(&bytes[..HEADER_LEN.min(bytes.len())], 0)?; let key_end = HEADER_LEN + hdr.key_size as usize; let val_end = key_end + hdr.val_size as usize; @@ -484,13 +658,43 @@ impl NamespaceLog { verify_crc(&hdr, &bytes[..HEADER_LEN], &bytes[HEADER_LEN..meta_end], 0)?; let value = Bytes::copy_from_slice(&bytes[key_end..val_end]); let metadata = Bytes::copy_from_slice(&bytes[val_end..meta_end]); - Ok((value, metadata)) + Ok((value, metadata, hdr.flags)) + } + + /// Resolve a record's value field to the real value: if value-separated, + /// fetch the blob by its hash; otherwise the field IS the value. 
+ async fn deref(&self, value: Bytes, flags: u8) -> Result { + if flags & rflags::VALUE_SEP == 0 { + return Ok(value); + } + if value.len() != std::mem::size_of::() { + return Err(EngineError::BadRecord { + offset: 0, + reason: "value-separated record's value field is not a 16-byte hash", + }); + } + let mut h: ContentHash = [0u8; 16]; + h.copy_from_slice(&value); + let bytes = self.values.get(&h).await?; + // Integrity: re-hash the blob and confirm it matches the content hash the + // record points at — parity with the CRC check inline values get on every + // read. Catches silent blob corruption AND a blob/hash mismatch, instead + // of returning wrong bytes. BLAKE3 is SIMD-fast; this mirrors the per-read + // CRC the inline path already pays over the value. + if crate::value_store::content_hash(&bytes) != h { + return Err(EngineError::BadRecord { + offset: 0, + reason: "value-separated blob content hash mismatch (corruption)", + }); + } + Ok(Bytes::from(bytes)) } /// Single-record read: one `read_at`, parse header in-memory. pub async fn read_value(&self, entry: IndexEntry) -> Result<(Bytes, Bytes)> { let bytes = self.read_record(entry).await?; - Self::extract_value_meta(&bytes) + let (value, metadata, flags) = Self::extract_value_meta(&bytes)?; + Ok((self.deref(value, flags).await?, metadata)) } /// Bulk-read: submits all `read_at` futures concurrently via `join_all` so @@ -508,11 +712,26 @@ impl NamespaceLog { } let futures: Vec<_> = misses.iter().map(|(_, e)| self.read_record(*e)).collect(); let results: Vec> = join_all(futures).await; - let mut out: Vec<(usize, Bytes, Bytes)> = Vec::with_capacity(misses.len()); + // Extract synchronously, then deref all value-separated blobs concurrently + // — the same io_uring batching the record reads above already get. Without + // this, a bulk_read over N large (value-separated) values fans out the + // record reads in parallel but then fetches the N blobs one at a time. 
+ let mut extracted: Vec<(usize, Bytes, Bytes, u8)> = Vec::with_capacity(misses.len()); for ((slot, _entry), bytes_res) in misses.into_iter().zip(results.into_iter()) { let bytes = bytes_res?; - let (value, metadata) = Self::extract_value_meta(&bytes)?; - out.push((slot, value, metadata)); + let (value, metadata, flags) = Self::extract_value_meta(&bytes)?; + extracted.push((slot, value, metadata, flags)); + } + let deref_results = join_all( + extracted + .iter() + .map(|(_, value, _, flags)| self.deref(value.clone(), *flags)), + ) + .await; + let mut out: Vec<(usize, Bytes, Bytes)> = Vec::with_capacity(extracted.len()); + for ((slot, _value, metadata, _flags), derefed) in extracted.into_iter().zip(deref_results) + { + out.push((slot, derefed?, metadata)); } Ok(out) } @@ -548,7 +767,7 @@ impl NamespaceLog { let mut events = Vec::with_capacity(live.len()); for (slot, value, meta_bytes) in read_results { - let (key, _, expires_at_ms) = &live[slot]; + let (key, entry, expires_at_ms) = &live[slot]; let metadata = if meta_bytes.is_empty() { None } else { @@ -565,7 +784,11 @@ impl NamespaceLog { value, metadata, expires_at_ms: *expires_at_ms, - revision: 0, + // The key's real revision (its record tstamp), NOT 0. The + // subscribe-then-scan window can surface a write in both `initial` + // and the live channel; callers dedup by revision, so an `initial` + // event must carry the same revision the live channel will. 
+ revision: entry.tstamp_ms, }); } Ok(events) @@ -579,7 +802,7 @@ impl NamespaceLog { filter: &crate::watch::KeyFilter<'_>, since_revision: u64, ) -> Result> { - let mut files: Vec<(u16, Rc)> = self + let mut files: Vec<(u32, Rc)> = self .sealed .borrow() .iter() @@ -591,7 +814,7 @@ impl NamespaceLog { let mut events = Vec::new(); for (_, file) in &files { let end = file.data_end_offset().await; - scan_file_records(file, end, filter, since_revision, &mut events).await?; + scan_file_records(file, end, filter, since_revision, &self.values, &mut events).await?; } // Sort by revision so callers see a clean chronological stream. events.sort_by_key(|e| match e { @@ -636,6 +859,7 @@ impl NamespaceLog { record_size: e.record_size, expires_at_ms: index.ttl(k), tstamp_ms: e.tstamp_ms, + value_hash: index.valsep(k), }) .collect() }; @@ -669,6 +893,7 @@ impl NamespaceLog { .insert(old_active.file_id, old_active); let new_path = self.dir.join(data_filename(next_id)); let new_active = Rc::new(LogFile::open_rw(new_path, next_id).await?); + sync_dir(&self.dir).await; // make the new file's directory entry durable *self.active.borrow_mut() = new_active; self.unsynced_bytes.set(0); Ok(()) @@ -703,6 +928,7 @@ impl NamespaceLog { record_size: e.record_size, expires_at_ms: index.ttl(k), tstamp_ms: e.tstamp_ms, + value_hash: index.valsep(k), }) .collect() }; @@ -726,6 +952,7 @@ impl NamespaceLog { .insert(old_active.file_id, old_active); let new_path = self.dir.join(data_filename(next_id)); let new_active = Rc::new(LogFile::open_rw(new_path, next_id).await?); + sync_dir(&self.dir).await; // make the new file's directory entry durable *self.active.borrow_mut() = new_active; self.unsynced_bytes.set(0); Ok(()) @@ -742,10 +969,18 @@ impl NamespaceLog { /// /// NOT safe under concurrent reads/writes — caller must serialize. 
pub async fn flush(&self) -> Result { - if self.reclaim_in_progress.get() { + // Wait out any reclaim, then take the exclusive flag (shared with reclaim) + // so neither a reclaim nor another flush can run concurrently; writes wait + // on the same flag. `replace` is the atomic gate against a racing op. + self.await_reclaim().await; + if self.reclaim_in_progress.replace(true) { return Err(EngineError::ReclamationBusy); } - self.reclaim_in_progress.set(true); + // Drain in-flight writes before unlinking/recreating the data files, so a + // write mid-append can't race the file replacement. + while self.in_flight_writes.get() > 0 { + monoio::time::sleep(std::time::Duration::from_micros(50)).await; + } // Burn a tstamp for the flush event itself. Doing this BEFORE the // flush guarantees the revision exceeds anything previously committed // (or even speculatively assigned by concurrent failed writes). @@ -788,29 +1023,57 @@ impl NamespaceLog { let path = self.dir.join(data_filename(0)); let new_active = Rc::new(LogFile::open_rw(path, 0).await?); + sync_dir(&self.dir).await; // make the recreated file's directory entry durable *self.active.borrow_mut() = new_active; self.unsynced_bytes.set(0); Ok(()) } - /// Operator-triggered reclaim. Seals the current active file with a - /// footer, then merges all live records (across the just-sealed file plus - /// previously-sealed files) into a single new sealed file. Old files are - /// unlinked. A fresh active file is opened. + /// Operator-triggered reclaim (size-tiered compaction). Seals the active + /// file as a fresh level-0 run, then repeatedly merges the lowest level + /// that has reached `fanout` runs into one run at the next level. Each + /// merge rewrites only that level's live records (O(log N) total write + /// amplification) — never the whole live set, so on GlideFS a reclaim + /// re-uploads one level, not the entire namespace. /// /// NOT concurrent-safe with other ops on this namespace. 
pub async fn reclaim(&self) -> Result { - if self.reclaim_in_progress.get() { + // Atomic check-and-set: a second concurrent reclaim on this namespace is a + // no-op error (only one reclaim at a time). Writes do NOT error — they wait + // on `await_reclaim` and proceed once this finishes. + if self.reclaim_in_progress.replace(true) { return Err(EngineError::ReclamationBusy); } - self.reclaim_in_progress.set(true); + // Drain writes that already passed the gate before we set the flag, so the + // seal's footer is a consistent snapshot — no in-flight write (appended but + // not yet indexed) is missed and silently lost on the next footer recovery. + // New writes now wait in `await_reclaim` BEFORE `begin_write`, so they don't + // hold `in_flight_writes` and this drain always terminates (no deadlock). + while self.in_flight_writes.get() > 0 { + monoio::time::sleep(std::time::Duration::from_micros(50)).await; + } let result = self.reclaim_inner().await; self.reclaim_in_progress.set(false); result } + /// Block until no reclaim is in progress on this namespace. Called at the very + /// start of every write (before `begin_write`), so writes stall during a + /// reclaim instead of erroring, and waiters never hold the in-flight count. + async fn await_reclaim(&self) { + // 500µs, not 50µs: a reclaim of a large namespace can take seconds, and + // every concurrent writer parks here for its whole duration. The coarser + // interval cuts timer-wheel churn ~10× across all waiting writers while + // adding at most ~half a millisecond to post-reclaim write latency. + while self.reclaim_in_progress.get() { + monoio::time::sleep(std::time::Duration::from_micros(500)).await; + } + } + async fn reclaim_inner(&self) -> Result { - // Seal the current active. + use std::collections::{BTreeMap, HashSet}; + + // 1. Seal the active file as a fresh level-0 run. 
let old_active = self.active.borrow().clone(); let footer: Vec = { let index = self.index.borrow(); @@ -823,68 +1086,136 @@ impl NamespaceLog { record_size: e.record_size, expires_at_ms: index.ttl(k), tstamp_ms: e.tstamp_ms, + value_hash: index.valsep(k), }) .collect() }; old_active.write_footer(&footer).await?; self.sealed .borrow_mut() - .insert(old_active.file_id, old_active.clone()); + .insert(old_active.file_id, old_active); // level 0 (absent from map) - // Pick the next file_id as max(existing) + 1. - let next_id = { + let mut total = reclaim::ReclaimReport { + live_keys: 0, + live_bytes: 0, + dead_files_dropped: 0, + dead_files_leaked: 0, + new_file_id: 0, + }; + + // 2. Cascade: while some level holds >= fanout runs, merge that level + // into one run at the next level. + loop { + let by_level: BTreeMap> = { + let levels = self.level.borrow(); + let mut m: BTreeMap> = BTreeMap::new(); + for &id in self.sealed.borrow().keys() { + m.entry(levels.get(&id).copied().unwrap_or(0)) + .or_default() + .push(id); + } + m + }; + let (lvl, ids) = match by_level + .iter() + .find(|(_, ids)| ids.len() >= self.config.fanout) + { + Some((&l, ids)) => (l, ids.clone()), + None => break, + }; + + let files: Vec> = { + let sealed = self.sealed.borrow(); + ids.iter() + .filter_map(|id| sealed.get(id).cloned()) + .collect() + }; + let id_set: HashSet = ids.iter().copied().collect(); + let live: Vec<(Bytes, IndexEntry, Option)> = { + let index = self.index.borrow(); + index + .iter() + .filter(|(_, e)| id_set.contains(&e.file_id)) + .map(|(k, e)| (k.clone(), *e, index.ttl(k))) + .collect() + }; + let next_id = { + let sealed = self.sealed.borrow(); + sealed + .keys() + .copied() + .max() + .unwrap_or(0) + .max(self.active.borrow().file_id) + .checked_add(1) + .ok_or(EngineError::CapacityExceeded { + reason: "file_id overflow: namespace has too many log files", + })? 
+ }; + + // reclaim_namespace writes one merged file (next_id) and unlinks the + // input `files`; index borrow is not held across the await. + let (report, new_entries) = + reclaim::reclaim_namespace(self.dir.clone(), &files, next_id, &live).await?; + total.live_keys = report.live_keys; + total.live_bytes = report.live_bytes; + self.compaction_bytes + .set(self.compaction_bytes.get() + report.live_bytes); + total.dead_files_dropped += report.dead_files_dropped; + total.dead_files_leaked += report.dead_files_leaked; + + // Open the merged file FIRST — it is the only fallible step left. If + // it fails (EMFILE, hardware error), we return with the index and + // sealed map untouched: they still reference the old file_ids, whose + // `Rc` handles remain open. On Linux those fds keep serving + // reads even though `reclaim_namespace` already unlinked the paths, so + // no key goes dark before the next restart (which finds the merged + // file on disk). Mutating in-memory state before this open could leave + // the index pointing at a `next_id` absent from `sealed` — reads of + // those keys would fail with "file_id not found" until restart. + let new_file = + Rc::new(LogFile::open_ro(self.dir.join(data_filename(next_id)), next_id).await?); + // From here on every step is infallible: commit the swap atomically. + { + let mut index = self.index.borrow_mut(); + for (key, entry, ttl) in new_entries { + index.insert(key, entry, ttl); + } + } + { + let mut sealed = self.sealed.borrow_mut(); + let mut levels = self.level.borrow_mut(); + for id in &ids { + sealed.remove(id); + levels.remove(id); + } + sealed.insert(next_id, new_file); + levels.insert(next_id, lvl.saturating_add(1)); + } + } + + // 3. Open a fresh active file. 
+ let new_active_id = { let sealed = self.sealed.borrow(); sealed .keys() .copied() .max() .unwrap_or(0) + .max(self.active.borrow().file_id) .checked_add(1) .ok_or(EngineError::CapacityExceeded { reason: "file_id overflow: namespace has too many log files", })? }; - let new_active_id = next_id - .checked_add(1) - .ok_or(EngineError::CapacityExceeded { - reason: "file_id overflow: namespace has too many log files", - })?; - - let sealed_snapshot: Vec> = self.sealed.borrow().values().cloned().collect(); - - // Snapshot live entries outside the await so the reclaim doesn't hold an index borrow. - let live: Vec<(Bytes, IndexEntry, Option)> = { - let index = self.index.borrow(); - index - .iter() - .map(|(k, e)| (k.clone(), *e, index.ttl(k))) - .collect() - }; - - let (report, new_entries) = - reclaim::reclaim_namespace(self.dir.clone(), &sealed_snapshot, next_id, &live).await?; - - // Apply new index entries. - { - let mut index = self.index.borrow_mut(); - for (key, entry, ttl) in new_entries { - index.insert(key, entry, ttl); - } - } - - // Drop old sealed handles & swap in the single new sealed file. - self.sealed.borrow_mut().clear(); - let new_sealed_path = self.dir.join(data_filename(next_id)); - let new_sealed = Rc::new(LogFile::open_ro(new_sealed_path, next_id).await?); - self.sealed.borrow_mut().insert(next_id, new_sealed); - - // Open a fresh active file. 
- let new_active_path = self.dir.join(data_filename(new_active_id)); - let new_active = Rc::new(LogFile::open_rw(new_active_path, new_active_id).await?); + let new_active = Rc::new( + LogFile::open_rw(self.dir.join(data_filename(new_active_id)), new_active_id).await?, + ); + sync_dir(&self.dir).await; // make the new active file's directory entry durable *self.active.borrow_mut() = new_active; self.unsynced_bytes.set(0); - - Ok(report) + total.new_file_id = new_active_id; + Ok(total) } } @@ -908,6 +1239,7 @@ async fn scan_file_records( end_offset: u64, filter: &crate::watch::KeyFilter<'_>, since_revision: u64, + values: &ValueStore, events: &mut Vec, ) -> Result<()> { use crate::watch::WatchEvent; @@ -941,6 +1273,19 @@ async fn scan_file_records( Ok(b) => b, Err(_) => break, }; + // Verify integrity before trusting bytes we hand to a subscriber: a + // corrupt record must not be streamed as a bogus watch event. Every + // other record-reading path (replay_active, rebuild_from_records, + // extract_value_meta) checks the CRC; this one must too. Stop scanning + // this file at the first bad CRC — the record_len we'd skip forward by + // is itself covered by the CRC and can't be trusted past a mismatch. + if record::verify_crc(&hdr, &hdr_bytes, &body, offset).is_err() { + warn!( + offset, + "bad CRC during watch replay; stopping scan of this file" + ); + break; + } let key = &body[..hdr.key_size as usize]; if filter.matches(key) { let is_tombstone = hdr.flags & record::flags::TOMBSTONE != 0; @@ -954,7 +1299,40 @@ async fn scan_file_records( let val_start = hdr.key_size as usize; let val_end = val_start + hdr.val_size as usize; let meta_end = val_end + hdr.meta_size as usize; - let value = Bytes::copy_from_slice(&body[val_start..val_end]); + // Value-separated records carry the 16-byte blob hash, not the + // value — deref it (and verify) so watchers replaying via + // scan_since see the real value, not the pointer. 
+ let value = if hdr.flags & record::flags::VALUE_SEP != 0 { + let field = &body[val_start..val_end]; + if field.len() != 16 { + warn!( + offset, + "value-sep record without a 16-byte hash; skipping watch event" + ); + offset += record_len; + continue; + } + let mut h: ContentHash = [0u8; 16]; + h.copy_from_slice(field); + match values.get(&h).await { + Ok(b) if crate::value_store::content_hash(&b) == h => Bytes::from(b), + Ok(_) => { + warn!( + offset, + "blob hash mismatch during watch replay; skipping event" + ); + offset += record_len; + continue; + } + Err(e) => { + warn!(offset, error = %e, "blob read failed during watch replay; skipping event"); + offset += record_len; + continue; + } + } + } else { + Bytes::copy_from_slice(&body[val_start..val_end]) + }; let meta_bytes = &body[val_end..meta_end]; let metadata = if meta_bytes.is_empty() { None @@ -987,3 +1365,1719 @@ async fn scan_file_records( } Ok(()) } + +#[cfg(test)] +mod compaction_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + async fn write_batch(log: &NamespaceLog, lo: usize, hi: usize) { + let val = vec![b'a'; 1000]; + for i in lo..hi { + log.put_full(Bytes::from(format!("k{i:05}")), &val, &[], None) + .await + .unwrap(); + } + } + + fn sealed_ids(log: &NamespaceLog) -> std::collections::HashSet { + log.sealed.borrow().keys().copied().collect() + } + + /// The flood fix: reclaim must NOT rewrite the inherited base on every + /// reclaim. After a first reclaim produces a level-1 base run, a second + /// batch + reclaim should merge only the NEW level-0 runs into a second + /// level-1 run — leaving the original base untouched (still on disk, not + /// re-uploaded to S3). 
+ #[test] + fn reclaim_does_not_rewrite_base_each_reclaim() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 4096, + fanout: 4, + value_sep_threshold: 128 * 1024, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + + write_batch(&log, 0, 30).await; + assert!( + log.sealed_file_count() >= 4, + "batch should seal >= fanout level-0 runs" + ); + log.reclaim().await.unwrap(); + let base = sealed_ids(&log); + assert_eq!( + base.len(), + 1, + "level-0 runs merge into one level-1 base run" + ); + + write_batch(&log, 30, 60).await; + log.reclaim().await.unwrap(); + let after = sealed_ids(&log); + + assert!( + base.is_subset(&after), + "tiered must leave the base run untouched (not re-upload it): base={base:?} after={after:?}" + ); + assert_eq!( + after.len(), + 2, + "two level-1 runs (< fanout) — no base re-merge" + ); + assert_eq!(log.len(), 60, "all keys live through tiered merges"); + }); + } + + /// Quantitative flood check on the REAL engine: 12 reclaims over a churning + /// ~200-key live set. Size-tiered rewrites far less than full-merge would. + /// Full-merge's cost is analytical (it rewrote the whole live set on every + /// reclaim — 12 × live-set), so we compare measured tiered bytes against + /// that ceiling without keeping the dead full-merge path around. 
+ #[test] + fn reclaim_write_amp_beats_full_merge() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 4096, + fanout: 4, + value_sep_threshold: 128 * 1024, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + write_batch(&log, 0, 200).await; // base live set + log.reclaim().await.unwrap(); // fold base into a level-1 run + let live_set_bytes = log.compaction_bytes.get(); // ~one full live-set rewrite + log.compaction_bytes.set(0); // measure the churn phase only + + let reclaims = 12usize; + for r in 0..reclaims { + let lo = (r * 16) % 200; + write_batch(&log, lo, lo + 16).await; // overwrite 16 existing keys + log.reclaim().await.unwrap(); + } + let tiered = log.compaction_bytes.get(); + // Full-merge would rewrite the entire live set on every reclaim. + let full_merge = live_set_bytes * reclaims as u64; + eprintln!( + "\n COMPACTION BYTES over {reclaims} reclaims (base ~200 KiB):\n full-merge (analytical = {reclaims}× live set) = {:.2} MiB\n size-tiered (measured) = {:.2} MiB\n tiered rewrites {:.1}× LESS\n", + full_merge as f64 / 1048576.0, + tiered as f64 / 1048576.0, + full_merge as f64 / tiered.max(1) as f64 + ); + assert!( + tiered * 2 < full_merge, + "tiered must rewrite far less: tiered={tiered} full-merge={full_merge}" + ); + }); + } +} + +#[cfg(test)] +mod value_sep_tests { + use super::*; + use crate::log::config::LogConfig; + use crate::value_store::content_hash; + use bytes::Bytes; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + fn key(i: usize) -> Bytes { + Bytes::from(format!("k{i:05}")) + } + + /// A large value is stored in the blob store, NOT inline: the log record is a + /// tiny pointer (header + key + 16-byte hash), and GET still returns the value. 
+ #[test] + fn large_value_is_separated_and_reads_back() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + + let big = vec![0xABu8; 64 * 1024]; // 64 KiB > 4 KiB threshold + log.put_full(key(0), &big, &[], None).await.unwrap(); + + assert_eq!(log.values.blob_count(), 1, "value went to the blob store"); + let entry = *log.index.borrow().get(b"k00000").unwrap(); + assert!( + (entry.record_size as usize) < 4096, + "log record is a tiny pointer, not the 64 KiB value: {} bytes", + entry.record_size + ); + let (v, _m) = log.read_value(entry).await.unwrap(); + assert_eq!( + v, + Bytes::from(big), + "GET derefs the blob and returns the value" + ); + + // A small value stays inline (no new blob). + log.put_full(key(1), b"small", &[], None).await.unwrap(); + assert_eq!(log.values.blob_count(), 1, "small value stays inline"); + }); + } + + /// THE proof: compaction moves only pointers for separated values. Churn a set + /// of large values across many reclaims and compare compaction bytes to inline. 
+ #[test] + fn compaction_moves_only_pointers_not_values() { + run(async { + async fn churn(threshold: usize) -> (u64, usize) { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 64 * 1024, + fanout: 4, + value_sep_threshold: threshold, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let n = 60usize; + let v0 = vec![0xCDu8; 32 * 1024]; + for i in 0..n { + log.put_full(key(i), &v0, &[], None).await.unwrap(); + } + log.reclaim().await.unwrap(); + log.compaction_bytes.set(0); // measure the churn phase only + for r in 0..10u8 { + let vr = vec![r; 32 * 1024]; // new content each round + for i in 0..n { + log.put_full(key(i), &vr, &[], None).await.unwrap(); + } + log.reclaim().await.unwrap(); + } + // all n keys still readable through the blob deref + for i in 0..n { + let e = *log + .index + .borrow() + .get(format!("k{i:05}").as_bytes()) + .unwrap(); + assert_eq!(log.read_value(e).await.unwrap().0.len(), 32 * 1024); + } + (log.compaction_bytes.get(), log.values.blob_count()) + } + let (vs_bytes, vs_blobs) = churn(4096).await; // value-separated + let (inline_bytes, _) = churn(usize::MAX).await; // everything inline + eprintln!( + "\n COMPACTION BYTES over 10 reclaims (60 keys x 32 KiB, churned):\n inline = {:.2} MiB\n value-sep = {:.2} MiB ({} live blobs — dedup across keys)\n value-sep moves {:.0}x fewer bytes (only pointers)\n", + inline_bytes as f64 / 1048576.0, + vs_bytes as f64 / 1048576.0, + vs_blobs, + inline_bytes as f64 / vs_bytes.max(1) as f64 + ); + assert!( + vs_bytes * 5 < inline_bytes, + "value-sep must move far fewer compaction bytes: vs={vs_bytes} inline={inline_bytes}" + ); + assert!( + vs_blobs <= 2, + "identical per-round values dedup to ~1 blob, got {vs_blobs}" + ); + }); + } + + /// Overwriting or deleting a separated value reclaims the old blob (refcount→0). 
+ #[test] + fn overwrite_and_delete_gc_the_blob() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + + log.put_full(key(0), &vec![1u8; 8192], &[], None) + .await + .unwrap(); + assert_eq!(log.values.blob_count(), 1); + // overwrite with different content -> old blob GC'd, one blob remains + log.put_full(key(0), &vec![2u8; 8192], &[], None) + .await + .unwrap(); + assert_eq!( + log.values.blob_count(), + 1, + "old blob reclaimed on overwrite" + ); + // delete -> blob GC'd + log.tombstone(b"k00000").await.unwrap(); + assert_eq!(log.values.blob_count(), 0, "blob reclaimed on delete"); + }); + } + + /// After a clean restart, separated values still read back (footer carried the + /// hash; refcounts rebuilt), and a subsequent overwrite still GCs correctly. + #[test] + fn separated_values_survive_reopen() { + run(async { + let dir = TempDir::new().unwrap(); + let path = dir.path().to_path_buf(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let big = vec![0x5Au8; 100 * 1024]; + { + let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + log.put_full(key(0), &big, &[], None).await.unwrap(); + log.reclaim().await.unwrap(); // seal -> footer carries the value hash + } + // Reopen from disk. + let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + let e = *log.index.borrow().get(b"k00000").unwrap(); + assert_eq!( + log.read_value(e).await.unwrap().0, + Bytes::from(big.clone()), + "value reads back after reopen" + ); + assert_eq!( + log.values.refcount(&content_hash(&big)), + 1, + "refcount rebuilt from footer" + ); + // Overwrite -> the rebuilt refcount lets the inherited blob GC. 
+ log.put_full(key(0), &vec![9u8; 100 * 1024], &[], None) + .await + .unwrap(); + assert_eq!( + log.values.refcount(&content_hash(&big)), + 0, + "old blob unref'd after reopen+overwrite" + ); + }); + } + + /// CRASH recovery (no clean footer): a value-separated key written to the + /// active file and never sealed must, after reopen, rebuild the value-sep + /// sidecar from the RECORD SCAN (`replay_active`) — not the footer. Proven by + /// a post-reopen overwrite correctly GC'ing the inherited blob. + #[test] + fn separated_values_survive_crash_recovery() { + run(async { + let dir = TempDir::new().unwrap(); + let path = dir.path().to_path_buf(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let big = vec![0x33u8; 100 * 1024]; + { + let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + log.put_full(key(0), &big, &[], None).await.unwrap(); + // Drop WITHOUT sealing -> active file has no footer (a crash). + } + let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + let e = *log.index.borrow().get(b"k00000").unwrap(); + assert_eq!( + log.read_value(e).await.unwrap().0, + Bytes::from(big.clone()), + "reads back after crash recovery" + ); + assert_eq!( + log.values.refcount(&content_hash(&big)), + 1, + "refcount rebuilt from the record scan, not a footer" + ); + log.put_full(key(0), &vec![0x44u8; 100 * 1024], &[], None) + .await + .unwrap(); + assert_eq!( + log.values.refcount(&content_hash(&big)), + 0, + "sidecar from scan let the old blob GC on overwrite" + ); + }); + } + + /// MSET (`put_many`) separates large values, derefs them on read, dedups + /// identical content, and GCs the old blob when a key is rewritten in a + /// later batch. 
+ #[test] + fn mset_separates_large_values() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let big1 = Bytes::from(vec![1u8; 8192]); + let big2 = Bytes::from(vec![2u8; 8192]); + let small = Bytes::from_static(b"inline"); + log.put_many(&[ + (key(0), big1.clone()), + (key(1), big2.clone()), + (key(2), small.clone()), + ]) + .await + .unwrap(); + assert_eq!( + log.values.blob_count(), + 2, + "two distinct large values separated; small stays inline" + ); + for (k, want) in [(0usize, &big1), (1, &big2), (2, &small)] { + let e = *log + .index + .borrow() + .get(format!("k{k:05}").as_bytes()) + .unwrap(); + assert_eq!( + log.read_value(e).await.unwrap().0, + *want, + "MSET value {k} reads back" + ); + } + // Rewrite key0 in a later MSET with new content -> old blob GC'd. + let big1b = Bytes::from(vec![9u8; 8192]); + log.put_many(&[(key(0), big1b)]).await.unwrap(); + assert_eq!( + log.values.refcount(&content_hash(&big1)), + 0, + "old MSET blob reclaimed" + ); + assert_eq!(log.values.blob_count(), 2, "key0's new blob + key1's blob"); + }); + } + + /// Cross-key dedup refcount: two keys with identical large content share ONE + /// blob (refcount 2). Deleting one must NOT delete the blob — the other key + /// still reads correctly. This is the premature-deletion / data-loss guard. 
+ #[test] + fn shared_blob_not_deleted_while_referenced() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let v = vec![7u8; 8192]; + log.put_full(key(0), &v, &[], None).await.unwrap(); + log.put_full(key(1), &v, &[], None).await.unwrap(); // identical content + assert_eq!( + log.values.blob_count(), + 1, + "identical content dedups to one blob" + ); + assert_eq!(log.values.refcount(&content_hash(&v)), 2); + + log.tombstone(b"k00000").await.unwrap(); // delete ONE referencing key + assert_eq!( + log.values.blob_count(), + 1, + "blob survives — k1 still references it" + ); + let e = *log.index.borrow().get(b"k00001").unwrap(); + assert_eq!( + log.read_value(e).await.unwrap().0, + Bytes::from(v.clone()), + "surviving key still reads" + ); + + log.tombstone(b"k00001").await.unwrap(); // delete the last reference + assert_eq!( + log.values.blob_count(), + 0, + "blob reclaimed only after last reference drops" + ); + }); + } + + /// CAS / conditional writes with large values: a successful CAS GCs the old + /// blob; a CAS that LOSES the post-check must unref the blob it wrote (no leak). + #[test] + fn cas_large_value_gc_and_abort_unref() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let now = super::now_ms(); + let v1 = vec![1u8; 8192]; + // SETNX-style on an absent key: writes + separates. + assert!( + log.put_full_cond(key(0), &v1, &[], None, WriteCondition::KeyAbsent, now) + .await + .unwrap() + .is_some() + ); + assert_eq!(log.values.blob_count(), 1); + let rev = log.index.borrow().get(b"k00000").unwrap().tstamp_ms; + + // CAS with matching revision: overwrites, old blob GC'd. 
+ let v2 = vec![2u8; 8192]; + assert!( + log.put_full_cond(key(0), &v2, &[], None, WriteCondition::Revision(rev), now) + .await + .unwrap() + .is_some() + ); + assert_eq!( + log.values.refcount(&content_hash(&v1)), + 0, + "old blob GC'd on successful CAS" + ); + assert_eq!(log.values.refcount(&content_hash(&v2)), 1); + + // CAS with a stale revision: aborts. The blob it wrote must be unref'd. + let v3 = vec![3u8; 8192]; + assert!( + log.put_full_cond(key(0), &v3, &[], None, WriteCondition::Revision(rev), now) + .await + .unwrap() + .is_none() + ); + assert_eq!( + log.values.refcount(&content_hash(&v3)), + 0, + "aborted CAS unref'd its blob — no leak" + ); + assert_eq!(log.values.blob_count(), 1, "only v2's blob remains"); + }); + } +} + +#[cfg(test)] +mod crash_consistency { + use super::*; + use crate::log::config::LogConfig; + use crate::value_store::{ContentHash, content_hash}; + use bytes::Bytes; + use std::collections::{BTreeMap, HashSet}; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + #[derive(Clone)] + enum Op { + Set { k: u8, val: Vec, large: bool }, + Del { k: u8 }, + } + fn kb(k: u8) -> Bytes { + Bytes::from(format!("k{k}")) + } + + /// Exhaustive power-loss crash-consistency proof. Write a workload, fsync at a + /// known point, then — modelling a power loss, which can only lose the + /// UN-fsynced tail — truncate the active log at EVERY byte offset in that tail + /// and recover. After each recovery the state must be a valid prefix of the + /// write history: exactly the records that fully fit below the cut, last-writer + /// -wins; every surviving key must read back its correct value (deref proves the + /// blob is present = no dangling pointer); and the blob count must equal the + /// live large-value set (sweep reclaimed orphans = no leak). + /// + /// The tail contains a value-separated OVERWRITE (k0: A→B). 
That is the case the + /// deferred-blob-deletion fix protects: a cut that loses the overwrite reverts + /// k0 to A, and A's blob must still exist. Without the fix this test fails at + /// those offsets with a dangling-pointer read error. + #[test] + fn exhaustive_tail_truncation_is_consistent() { + run(async { + let work = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 40, + fanout: 8, + value_sep_threshold: 256, + }; + let big = |b: u8| vec![b; 512]; // >= threshold -> value-separated + let ops = [ + Op::Set { + k: 0, + val: big(0xA1), + large: true, + }, + Op::Set { + k: 1, + val: b"s1".to_vec(), + large: false, + }, + // ---- fsync here: everything above is durable ---- + Op::Set { + k: 0, + val: big(0xB2), + large: true, + }, // overwrite k0 (old A blob deferred) + Op::Set { + k: 2, + val: big(0xC3), + large: true, + }, + Op::Del { k: 1 }, + Op::Set { + k: 3, + val: b"s3".to_vec(), + large: false, + }, + ]; + let fsync_after = 2usize; + + let mut ends: Vec = Vec::with_capacity(ops.len()); + let mut fsync_offset = 0u64; + { + let log = NamespaceLog::open(work.path().to_path_buf(), cfg) + .await + .unwrap(); + for (i, op) in ops.iter().enumerate() { + match op { + Op::Set { k, val, .. } => { + log.put_full(kb(*k), val, &[], None).await.unwrap(); + } + Op::Del { k } => { + log.tombstone(kb(*k).as_ref()).await.unwrap(); + } + } + ends.push(log.active.borrow().write_offset()); + if i + 1 == fsync_after { + log.sync().await.unwrap(); + fsync_offset = log.active.borrow().write_offset(); + } + } + // Deliberately NO final sync: ops after `fsync_after` are the + // crash-vulnerable un-fsynced tail. + } + + // Capture the on-disk image (page cache reflects all written bytes). 
+ let data_bytes = std::fs::read(work.path().join(data_filename(0))).unwrap(); + let values_dir = work.path().join("values"); + let blob_snapshot: Vec<(std::ffi::OsString, Vec)> = std::fs::read_dir(&values_dir) + .map(|rd| { + rd.flatten() + .map(|e| (e.file_name(), std::fs::read(e.path()).unwrap())) + .collect() + }) + .unwrap_or_default(); + + let crash = TempDir::new().unwrap(); + let crash_data = crash.path().join(data_filename(0)); + let crash_values = crash.path().join("values"); + + for t in (fsync_offset as usize)..=data_bytes.len() { + // Rebuild the crashed image: log truncated to t, FULL blob set + // restored (sweep_orphans mutates it, so restore every iteration). + std::fs::write(&crash_data, &data_bytes[..t]).unwrap(); + let _ = std::fs::remove_dir_all(&crash_values); + if !blob_snapshot.is_empty() { + std::fs::create_dir_all(&crash_values).unwrap(); + for (name, bytes) in &blob_snapshot { + std::fs::write(crash_values.join(name), bytes).unwrap(); + } + } + + // Oracle: the prefix of ops whose record fully fits below the cut. 
+ let mut state: BTreeMap, bool)> = BTreeMap::new(); + for (op, end) in ops.iter().zip(ends.iter()) { + if *end > t as u64 { + break; + } + match op { + Op::Set { k, val, large } => { + state.insert(*k, (val.clone(), *large)); + } + Op::Del { k } => { + state.remove(k); + } + } + } + + let log = NamespaceLog::open(crash.path().to_path_buf(), cfg) + .await + .unwrap(); + + // (1) recovered key set == expected prefix key set + let recovered: HashSet> = + log.index.borrow().iter().map(|(k, _)| k.to_vec()).collect(); + let expected: HashSet> = state.keys().map(|k| kb(*k).to_vec()).collect(); + assert_eq!(recovered, expected, "key set mismatch at truncation t={t}"); + + // (2) every surviving key reads its correct value (deref => blob + // present => no dangling pointer) + for (k, (val, _large)) in &state { + let e = *log.index.borrow().get(kb(*k).as_ref()).unwrap(); + let got = log.read_value(e).await.unwrap_or_else(|err| { + panic!("DANGLING/corrupt read for k{k} at t={t}: {err:?}") + }); + assert_eq!( + got.0.as_ref(), + val.as_slice(), + "value mismatch k{k} at t={t}" + ); + } + + // (3) blob count == distinct live large values (orphans swept => no leak) + let want: HashSet = state + .values() + .filter(|(_, large)| *large) + .map(|(v, _)| content_hash(v)) + .collect(); + assert_eq!( + log.values.blob_count(), + want.len(), + "blob leak/missing at t={t}: have {} want {}", + log.values.blob_count(), + want.len() + ); + } + + eprintln!( + "\n CRASH-CONSISTENCY: {} tail-truncation offsets ({}..={}) all recovered to a\n valid prefix — zero dangling pointers, zero blob leaks.\n", + data_bytes.len() - fsync_offset as usize + 1, + fsync_offset, + data_bytes.len() + ); + }); + } + + /// Bit-rot of a DURABLE record: corrupt one byte inside record `i`, and + /// recovery must detect the CRC mismatch, truncate at the start of record `i` + /// (dropping it and everything after it), and leave the prefix [0, i) fully + /// intact and readable — with the now-unreferenced blobs of 
the dropped tail + /// reclaimed by `sweep_orphans`. Workload uses only distinct-key appends (no + /// value-sep overwrites) so the recovered prefix never reverts to a value + /// whose blob was legitimately GC'd. + #[test] + fn corruption_truncates_at_bad_record_keeping_prefix() { + run(async { + let work = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 40, + fanout: 8, + value_sep_threshold: 256, + }; + let big = |b: u8| vec![b; 512]; + let ops = vec![ + Op::Set { + k: 0, + val: big(0xA1), + large: true, + }, + Op::Set { + k: 1, + val: big(0xB2), + large: true, + }, + Op::Set { + k: 2, + val: b"s2".to_vec(), + large: false, + }, + Op::Set { + k: 3, + val: big(0xC3), + large: true, + }, + ]; + + let mut ends: Vec = Vec::with_capacity(ops.len()); + { + let log = NamespaceLog::open(work.path().to_path_buf(), cfg) + .await + .unwrap(); + for op in &ops { + if let Op::Set { k, val, .. } = op { + log.put_full(kb(*k), val, &[], None).await.unwrap(); + } + ends.push(log.active.borrow().write_offset()); + } + log.sync().await.unwrap(); // everything durable + } + let data_bytes = std::fs::read(work.path().join(data_filename(0))).unwrap(); + let blob_snapshot: Vec<(std::ffi::OsString, Vec)> = + std::fs::read_dir(work.path().join("values")) + .map(|rd| { + rd.flatten() + .map(|e| (e.file_name(), std::fs::read(e.path()).unwrap())) + .collect() + }) + .unwrap_or_default(); + + let crash = TempDir::new().unwrap(); + let crash_data = crash.path().join(data_filename(0)); + let crash_values = crash.path().join("values"); + + for i in 0..ops.len() { + let start = if i == 0 { 0 } else { ends[i - 1] as usize }; + let pos = (start + ends[i] as usize) / 2; // a byte inside record i + let mut corrupt = data_bytes.clone(); + corrupt[pos] ^= 0xFF; + std::fs::write(&crash_data, &corrupt).unwrap(); + let _ = std::fs::remove_dir_all(&crash_values); + std::fs::create_dir_all(&crash_values).unwrap(); + for (name, bytes) in &blob_snapshot { + 
std::fs::write(crash_values.join(name), bytes).unwrap(); + } + + // Expected: only records strictly before the corrupted one survive. + let mut state: BTreeMap, bool)> = BTreeMap::new(); + for op in &ops[..i] { + if let Op::Set { k, val, large } = op { + state.insert(*k, (val.clone(), *large)); + } + } + + let log = NamespaceLog::open(crash.path().to_path_buf(), cfg) + .await + .unwrap(); + let recovered: HashSet> = + log.index.borrow().iter().map(|(k, _)| k.to_vec()).collect(); + let expected: HashSet> = state.keys().map(|k| kb(*k).to_vec()).collect(); + assert_eq!( + recovered, expected, + "corruption at record {i} should keep exactly the prefix" + ); + for (k, (val, _)) in &state { + let e = *log.index.borrow().get(kb(*k).as_ref()).unwrap(); + let got = log.read_value(e).await.unwrap_or_else(|err| { + panic!("prefix key k{k} unreadable after corrupting record {i}: {err:?}") + }); + assert_eq!( + got.0.as_ref(), + val.as_slice(), + "prefix value k{k} wrong after corrupting record {i}" + ); + } + let want: HashSet = state + .values() + .filter(|(_, l)| *l) + .map(|(v, _)| content_hash(v)) + .collect(); + assert_eq!( + log.values.blob_count(), + want.len(), + "dropped-tail blobs not reclaimed after corrupting record {i}" + ); + } + eprintln!( + "\n CRASH-CONSISTENCY: single-byte corruption at every record truncates cleanly\n at the bad record; the prefix stays intact and the dropped tail's blobs are swept.\n" + ); + }); + } + + /// Torn footer + multi-file recovery: after a reclaim seals records (with + /// value-separated keys) into a footered sealed file and opens a new active, + /// truncating the SEALED file's footer must make `read_footer` reject the + /// (now-invalid) magic and fall back to `rebuild_from_records` — a full scan + /// that re-derives the value-sep sidecar from each record's VALUE_SEP flag. 
+ /// Across every footer-region cut (and into the records), every key must still + /// recover and read back through the multi-file (sealed + active) layout. + #[test] + fn torn_footer_falls_back_to_scan_across_files() { + run(async { + let work = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 40, + fanout: 8, + value_sep_threshold: 256, + }; + let big = |b: u8| vec![b; 512]; + let ops = vec![ + Op::Set { + k: 0, + val: big(0xD1), + large: true, + }, + Op::Set { + k: 1, + val: b"s1".to_vec(), + large: false, + }, + Op::Set { + k: 2, + val: big(0xE2), + large: true, + }, + Op::Set { + k: 3, + val: big(0xF3), + large: true, + }, + ]; + let mut ends: Vec = Vec::with_capacity(ops.len()); + let records_end; + { + let log = NamespaceLog::open(work.path().to_path_buf(), cfg) + .await + .unwrap(); + for op in &ops { + if let Op::Set { k, val, .. } = op { + log.put_full(kb(*k), val, &[], None).await.unwrap(); + } + ends.push(log.active.borrow().write_offset()); + } + records_end = log.active.borrow().write_offset(); + log.reclaim().await.unwrap(); // seal file 0 (records + footer), open active file 1 + } + // The reclaim footered file 0 and created an empty active file 1. 
+ let sealed_bytes = std::fs::read(work.path().join(data_filename(0))).unwrap(); + assert!( + sealed_bytes.len() as u64 > records_end, + "footer was appended past the records" + ); + let active1 = work.path().join(data_filename(1)); + assert!( + active1.exists(), + "reclaim opened a new active file (multi-file layout)" + ); + let blob_snapshot: Vec<(std::ffi::OsString, Vec)> = + std::fs::read_dir(work.path().join("values")) + .map(|rd| { + rd.flatten() + .map(|e| (e.file_name(), std::fs::read(e.path()).unwrap())) + .collect() + }) + .unwrap_or_default(); + + let crash = TempDir::new().unwrap(); + let f0 = crash.path().join(data_filename(0)); + let f1 = crash.path().join(data_filename(1)); + let cvals = crash.path().join("values"); + + // Cut from late in the last record through the entire footer region. + let lo = (records_end as usize).saturating_sub(40); + for t in lo..=sealed_bytes.len() { + std::fs::write(&f0, &sealed_bytes[..t]).unwrap(); + std::fs::write(&f1, b"").unwrap(); // empty active (highest id) + let _ = std::fs::remove_dir_all(&cvals); + std::fs::create_dir_all(&cvals).unwrap(); + for (name, bytes) in &blob_snapshot { + std::fs::write(cvals.join(name), bytes).unwrap(); + } + + // Records fully below the cut survive the scan; a cut in the footer + // region (t >= records_end) keeps all records. 
+ let mut state: BTreeMap, bool)> = BTreeMap::new(); + for (op, end) in ops.iter().zip(ends.iter()) { + if *end > t as u64 { + break; + } + if let Op::Set { k, val, large } = op { + state.insert(*k, (val.clone(), *large)); + } + } + + let log = NamespaceLog::open(crash.path().to_path_buf(), cfg) + .await + .unwrap(); + let recovered: HashSet> = + log.index.borrow().iter().map(|(k, _)| k.to_vec()).collect(); + let expected: HashSet> = state.keys().map(|k| kb(*k).to_vec()).collect(); + assert_eq!( + recovered, expected, + "torn-footer scan recovered wrong key set at t={t}" + ); + for (k, (val, _)) in &state { + let e = *log.index.borrow().get(kb(*k).as_ref()).unwrap(); + let got = log.read_value(e).await.unwrap_or_else(|err| { + panic!("k{k} unreadable via torn-footer scan at t={t}: {err:?}") + }); + assert_eq!( + got.0.as_ref(), + val.as_slice(), + "value mismatch k{k} at t={t}" + ); + } + let want: HashSet = state + .values() + .filter(|(_, l)| *l) + .map(|(v, _)| content_hash(v)) + .collect(); + assert_eq!( + log.values.blob_count(), + want.len(), + "blob leak/missing at t={t}" + ); + } + eprintln!( + "\n CRASH-CONSISTENCY: torn footer over {} cuts → scan fallback rebuilt value-sep\n state from records across the sealed+active multi-file layout. 
No dangling, no leaks.\n", + sealed_bytes.len() - lo + 1 + ); + }); + } +} + +#[cfg(test)] +mod concurrency_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use std::collections::BTreeMap; + use std::rc::Rc; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + async fn live_state(log: &NamespaceLog) -> BTreeMap, Vec> { + let entries: Vec<(Vec, IndexEntry)> = log + .index + .borrow() + .iter() + .map(|(k, e)| (k.to_vec(), *e)) + .collect(); + let mut out = BTreeMap::new(); + for (k, e) in entries { + let (v, _m) = log.read_value(e).await.unwrap(); + out.insert(k, v.to_vec()); + } + out + } + + /// Stress the new per-key write striping under real concurrency: many spawned + /// tasks hammer a small shared keyspace with interleaved SET / CAS / DEL (so + /// same-key writes actually contend on stripes). This must (a) never deadlock + /// — single-key writes each hold exactly one stripe — and (b) leave on-disk + /// state that, after a full fsync, recovery reproduces EXACTLY. A conditional + /// write that loses a race writes nothing (no orphan), so the durable log can + /// never replay to anything other than the live runtime state. 
+ #[test] + fn concurrent_mixed_writes_recover_to_runtime_state() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 40, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = Rc::new( + NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(), + ); + + let mut handles = Vec::new(); + for t in 0..6u64 { + let log = log.clone(); + handles.push(monoio::spawn(async move { + for i in 0..120u64 { + let k = Bytes::from(format!("k{}", (t + i) % 5)); // 5 hot keys, heavy same-key contention + let big = i % 7 == 0; // mix in value-separated (>4 KiB) writes + let val = if big { + vec![(t as u8).wrapping_add(i as u8); 8192] + } else { + vec![t as u8; 24] + }; + match (t + i) % 3 { + 0 => { + log.put_full(k, &val, &[], None).await.unwrap(); + } + 1 => { + // CAS against whatever revision we just observed. + let now = now_ms(); + let cond = match log.index.borrow().get(k.as_ref()) { + Some(e) => WriteCondition::Revision(e.tstamp_ms), + None => WriteCondition::KeyAbsent, + }; + let _ = log + .put_full_cond(k, &val, &[], None, cond, now) + .await + .unwrap(); + } + _ => { + let _ = log.tombstone(k.as_ref()).await.unwrap(); + } + } + } + })); + } + for h in handles { + h.await; + } + + // Full durability, then snapshot the live runtime state. + log.sync().await.unwrap(); + let runtime = live_state(&log).await; + drop(log); + + // Recover from disk; it must reproduce the exact runtime state — no + // resurrected "failed" CAS, no lost update, no dangling value-sep blob. 
+ let log2 = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let recovered = live_state(&log2).await; + assert_eq!( + recovered, runtime, + "recovery diverged from the concurrent runtime state" + ); + }); + } +} + +#[cfg(test)] +mod perf_overhead { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use std::time::Instant; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("rt") + .block_on(f) + } + + /// Quantify the cost added to the WRITE path by the per-key stripe lock, and + /// confirm the READ path is untouched. Reported, not asserted. Ignored by + /// default (a perf probe, not a regression test): `cargo test -- --ignored`. + #[test] + #[ignore = "perf probe; run with --ignored --nocapture"] + fn write_path_overhead() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 40, + fanout: 8, + value_sep_threshold: 1 << 20, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let val = vec![0u8; 64]; // small inline value (the common case) + + // 1) bare added cost: FxHash(key) + uncontended stripe lock/unlock. + let n = 500_000; + let key = b"some-typical-key"; + let t = Instant::now(); + for _ in 0..n { + let g = log.wlock(std::hint::black_box(key)).lock().await; + std::hint::black_box(&g); + } + let lock_ns = t.elapsed().as_nanos() as f64 / n as f64; + + // 2) full small-value write (encode + append + index + stripe lock). + let nw = 50_000; + let t = Instant::now(); + for i in 0..nw { + log.put_full(Bytes::from(format!("k{i:08}")), &val, &[], None) + .await + .unwrap(); + } + let put_ns = t.elapsed().as_nanos() as f64 / nw as f64; + + // 3) warm read (lock-free path; the stripe lock is never taken). 
+ let e = *log.index.borrow().get(b"k00000000").unwrap(); + let nr = 200_000; + let t = Instant::now(); + for _ in 0..nr { + std::hint::black_box(log.read_value(e).await.unwrap()); + } + let read_ns = t.elapsed().as_nanos() as f64 / nr as f64; + + eprintln!( + "\n PERF (single shard, sequential):\n stripe lock acquire+release (uncontended) = {lock_ns:.0} ns <- the per-write add\n full small-value put_full = {:.0} ns/op ({:.2}% is the lock)\n warm read_value (lock-free, unchanged) = {read_ns:.0} ns/op\n", + put_ns, + lock_ns / put_ns * 100.0 + ); + }); + } +} + +#[cfg(test)] +mod integrity_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("rt") + .block_on(f) + } + + /// #2: a value-separated blob corrupted on disk is DETECTED on read (content + /// hash mismatch), not returned as wrong data — parity with the inline CRC + /// check. (Drop the re-hash in `deref` and this returns corrupted bytes → the + /// final assert fails.) + #[test] + fn corrupted_blob_is_detected_on_read() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let big = vec![0x11u8; 8192]; + log.put_full(Bytes::from_static(b"k"), &big, &[], None) + .await + .unwrap(); + let e = *log.index.borrow().get(b"k").unwrap(); + assert_eq!( + log.read_value(e).await.unwrap().0, + Bytes::from(big.clone()), + "sanity: reads back" + ); + + // Flip a byte in the blob file on disk. 
+ let blob = std::fs::read_dir(dir.path().join("values")) + .unwrap() + .flatten() + .map(|d| d.path()) + .find(|p| { + p.file_name() + .unwrap() + .to_string_lossy() + .starts_with("blob-") + }) + .expect("blob file"); + let mut bytes = std::fs::read(&blob).unwrap(); + bytes[0] ^= 0xFF; + std::fs::write(&blob, bytes).unwrap(); + + assert!( + log.read_value(e).await.is_err(), + "corrupted blob must be detected, not returned as data" + ); + }); + } + + /// #3: revisions stay monotonic across a restart even when recovered data + /// carries a tstamp ahead of the wall clock (clock skew / future-dated write). + /// The revision clock seeds from the max recovered tstamp. (Seed from 0 — the + /// old behavior — and the post-restart write gets a smaller revision: fails.) + #[test] + fn revisions_monotonic_across_restart() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 1 << 20, + }; + let path = dir.path().to_path_buf(); + { + let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + log.put_full(Bytes::from_static(b"k0"), b"v0", &[], None) + .await + .unwrap(); + } + // Append a record with a far-future tstamp directly to the active file. + let future = now_ms() + 10_000_000; + let rec = crate::log::record::encode( + future, + crate::log::record::flags::NO_EXPIRY, + 0, + b"k1", + b"v1", + &[], + ) + .unwrap(); + { + use std::io::Write; + let p = path.join(crate::log::file::data_filename(0)); + std::fs::OpenOptions::new() + .append(true) + .open(&p) + .unwrap() + .write_all(&rec) + .unwrap(); + } + // Reopen → recovery sees `future`; the revision clock must seed from it. 
+ let log2 = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + let rev = log2 + .put_full(Bytes::from_static(b"k2"), b"v2", &[], None) + .await + .unwrap(); + assert!( + rev > future, + "post-restart revision {rev} must exceed recovered max {future}" + ); + }); + } +} + +#[cfg(test)] +mod watch_valuesep_tests { + use super::*; + use crate::log::config::LogConfig; + use crate::watch::{KeyFilter, WatchEvent}; + use bytes::Bytes; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("rt") + .block_on(f) + } + + /// Regression: watch resumption (`scan_since`) must deref a value-separated + /// record to its real value, not emit the 16-byte content-hash pointer. (Skip + /// the deref and the event value is 16 bytes, not the 8 KiB value → fails.) + #[test] + fn scan_since_emits_real_value_for_separated_record() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 4096, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let big = vec![0x7Cu8; 8192]; // > threshold → separated + log.put_full(Bytes::from_static(b"wk"), &big, &[], None) + .await + .unwrap(); + + let events = log.scan_since(&KeyFilter::Exact(b"wk"), 0).await.unwrap(); + assert_eq!(events.len(), 1, "exactly one Set event"); + match &events[0] { + WatchEvent::Set { value, .. 
} => { + assert_eq!( + value, + &Bytes::from(big.clone()), + "watch replay must emit the real value, not the hash" + ); + } + other => panic!("expected Set, got {other:?}"), + } + }); + } +} + +#[cfg(test)] +mod reclaim_concurrency_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use std::rc::Rc; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("rt") + .block_on(f) + } + + /// Regression: a write issued while a reclaim is running must WAIT for it and + /// then succeed — it must NOT return `ReclamationBusy`. (Before: writes errored + /// during reclaim.) A small rotate threshold makes the reclaim do real merge + /// work so the concurrent write actually overlaps it. + #[test] + fn writes_wait_for_reclaim_then_succeed() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 256, + fanout: 4, + value_sep_threshold: 1 << 20, + }; + let log = Rc::new( + NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(), + ); + + // Fill enough that many runs seal → reclaim has a multi-level merge to do. + let val = vec![0xACu8; 80]; + for i in 0..80u32 { + log.put_full(Bytes::from(format!("k{i:04}")), &val, &[], None) + .await + .unwrap(); + } + + // Run reclaim and a write concurrently; the write must wait, not error. + let a = log.clone(); + let b = log.clone(); + let t_reclaim = monoio::spawn(async move { a.reclaim().await }); + let t_write = monoio::spawn(async move { + b.put_full(Bytes::from_static(b"during-reclaim"), &[1u8; 80], &[], None) + .await + }); + let wr = t_write.await; + let rr = t_reclaim.await; + assert!(rr.is_ok(), "reclaim failed: {rr:?}"); + assert!( + wr.is_ok(), + "write during reclaim must wait+succeed, not error: {wr:?}" + ); + + // The waited write is durable and reads back. 
+ let e = *log.index.borrow().get(b"during-reclaim").unwrap(); + assert_eq!(log.read_value(e).await.unwrap().0.len(), 80); + assert_eq!(log.len(), 81, "all keys present"); + }); + } +} + +#[cfg(test)] +mod reclaim_durability_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use std::cell::Cell; + use std::rc::Rc; + use std::time::Duration; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("rt") + .block_on(f) + } + + /// Teeth-verified regression for the footer-consistency drain. + /// + /// The bug: `reclaim()` built the sealed file's footer from the index WITHOUT + /// first draining in-flight writes. A write that had passed the gate and + /// reserved an offset in the active file but had not yet `index.insert`ed + /// would be missing from that footer and silently lost on the next footer + /// (fast-path) recovery — and could even append AFTER the footer trailer. + /// + /// The fix: `reclaim()` spins `while in_flight_writes > 0` before sealing. + /// This asserts that contract directly: a held `WriteGuard` (exactly the + /// "appended but not yet indexed" state, since the guard spans append→insert) + /// pins `in_flight_writes == 1`, and reclaim must NOT seal until it is + /// released. Remove the drain loop in `reclaim()` and this fails: reclaim + /// seals (opens a new active, sets `done`) while the guard is still held. 
+ #[test] + fn reclaim_does_not_seal_while_a_write_is_in_flight() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 512, + fanout: 4, + value_sep_threshold: 1 << 20, + }; + let log = Rc::new( + NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(), + ); + for i in 0..5u32 { + log.put_full(Bytes::from(format!("seed{i}")), b"v", &[], None) + .await + .unwrap(); + } + let active_before = log.active.borrow().file_id; + + // Pin in_flight_writes==1: the exact "in the seal window" state a real + // write occupies between reserving its offset and inserting its index entry. + let guard = log.begin_write().unwrap(); + assert_eq!(log.in_flight_writes.get(), 1); + + let done = Rc::new(Cell::new(false)); + let (a, d) = (log.clone(), done.clone()); + let h = monoio::spawn(async move { + let r = a.reclaim().await; + d.set(true); + r + }); + + // Give reclaim ample scheduling to reach — and block in — its drain. + for _ in 0..30 { + monoio::time::sleep(Duration::from_micros(100)).await; + } + assert!( + !done.get(), + "reclaim completed while a write was in-flight — drain missing" + ); + assert!( + log.reclaim_in_progress.get(), + "reclaim should be mid-drain, holding the gate" + ); + assert_eq!( + log.active.borrow().file_id, + active_before, + "reclaim sealed (opened a new active) while a write was still in-flight" + ); + + // Release the in-flight write → drain observes 0 → reclaim seals. + drop(guard); + let _report = h.await.unwrap(); // unwraps the reclaim Result — succeeds once drained + assert!(done.get()); + assert_ne!( + log.active.borrow().file_id, + active_before, + "reclaim should have sealed and opened a new active after draining" + ); + }); + } + + /// End-to-end companion: writes issued concurrently with a reclaim all survive + /// a subsequent FOOTER (fast-path) recovery. This exercises the full + /// reclaim+write+reopen path and passes deterministically with the drain. 
+ /// (Note: its *teeth* are timing-dependent — in a quiet test the small-write + /// io_uring appends rarely suspend long enough to interleave reclaim's seal, + /// so the deterministic contract test above is what actually guards the fix.) + #[test] + fn acked_writes_during_reclaim_survive_footer_recovery() { + run(async { + for _round in 0..10u32 { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 512, + fanout: 4, + value_sep_threshold: 1 << 20, + }; + let path = dir.path().to_path_buf(); + let val = vec![0xBEu8; 64]; + + let acked: Vec = { + let log = Rc::new(NamespaceLog::open(path.clone(), cfg).await.unwrap()); + for i in 0..60u32 { + log.put_full(Bytes::from(format!("base{i:04}")), &val, &[], None) + .await + .unwrap(); + } + let a = log.clone(); + let t_reclaim = monoio::spawn(async move { + let _ = a.reclaim().await; + }); + let mut acked = Vec::new(); + for j in 0..40u32 { + let k = format!("hot{j:04}"); + if log + .put_full(Bytes::from(k.clone()), &val, &[], None) + .await + .is_ok() + { + acked.push(k); + } + } + t_reclaim.await; + log.seal_active_for_shutdown().await.ok(); + acked + }; + + let log2 = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + let idx = log2.index.borrow(); + for k in &acked { + assert!( + idx.get(k.as_bytes()).is_some(), + "acked write {k} lost after footer recovery (reclaim seal didn't drain it)" + ); + } + } + }); + } +} + +#[cfg(test)] +mod enospc_recovery_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + /// A disk-full (ENOSPC) write fails cleanly end-to-end: the failing key is + /// never indexed (insert happens only after a successful append), prior + /// committed writes are untouched, the active file is poisoned so no later + /// write shadows the torn slot, and after reopen the 
committed prefix survives + /// intact with no corruption. + #[test] + fn disk_full_write_preserves_committed_prefix_across_recovery() { + run(async { + let dir = TempDir::new().unwrap(); + let path = dir.path().to_path_buf(); + let cfg = LogConfig { + rotate_threshold: 1 << 30, + fanout: 8, + value_sep_threshold: 1 << 20, + }; + { + let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + log.put_full(Bytes::from_static(b"k1"), b"v1", &[], None) + .await + .unwrap(); + log.put_full(Bytes::from_static(b"k2"), b"v2", &[], None) + .await + .unwrap(); + + // Disk fills on the next record's append. + log.active.borrow().force_next_write_failure(); + let r = log + .put_full(Bytes::from_static(b"k3"), b"v3", &[], None) + .await; + assert!( + r.is_err(), + "disk-full write must surface an error to the caller" + ); + + assert!( + log.index.borrow().get(b"k3").is_none(), + "failed write must not be indexed" + ); + assert!(log.index.borrow().get(b"k1").is_some()); + assert!(log.index.borrow().get(b"k2").is_some()); + assert!( + log.put_full(Bytes::from_static(b"k4"), b"v4", &[], None) + .await + .is_err(), + "writes after a disk-full poison must fail, not silently land past the gap" + ); + } + + // Reopen: committed prefix survives, failed/blocked writes absent, clean replay. 
+ let log = NamespaceLog::open(path.clone(), cfg).await.unwrap(); + let e1 = *log.index.borrow().get(b"k1").unwrap(); + let e2 = *log.index.borrow().get(b"k2").unwrap(); + assert_eq!( + log.read_value(e1).await.unwrap().0, + Bytes::from_static(b"v1") + ); + assert_eq!( + log.read_value(e2).await.unwrap().0, + Bytes::from_static(b"v2") + ); + assert!( + log.index.borrow().get(b"k3").is_none(), + "failed write absent after recovery" + ); + assert!( + log.index.borrow().get(b"k4").is_none(), + "blocked write absent after recovery" + ); + }); + } +} + +#[cfg(test)] +mod fd_footprint_tests { + use super::*; + use crate::log::config::LogConfig; + use bytes::Bytes; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + fn open_fds() -> usize { + std::fs::read_dir("/proc/self/fd") + .map(|d| d.count()) + .unwrap_or(0) + } + + /// A `NamespaceLog`'s open-fd footprint is one fd for the active file PLUS one + /// per sealed file it holds open for reads. So `MAX_NAMESPACES` bounds the + /// namespace *count* but NOT the descriptor count — fds scale with sealed + /// files per namespace, which is what actually binds before the namespace cap + /// in a many-namespaces deployment. This pins that relationship so a future + /// change that, say, stops holding sealed fds (or starts leaking them) is + /// visible. `fanout` is set huge so runs accumulate without compaction merges. 
+ #[test] + fn open_fds_scale_with_sealed_file_count() { + run(async { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 256, + fanout: 1 << 20, + value_sep_threshold: 1 << 20, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + + let fds_before = open_fds(); + let sealed_before = log.sealed_file_count(); + + // Each ~300-byte record exceeds rotate_threshold (256) → seals the + // active and opens a fresh one, accumulating sealed files. + for i in 0..20u32 { + log.put_full(Bytes::from(format!("k{i:04}")), &[0xAB; 300], &[], None) + .await + .unwrap(); + } + + let sealed_after = log.sealed_file_count(); + let fds_after = open_fds(); + let new_sealed = sealed_after - sealed_before; + + assert!( + new_sealed >= 10, + "expected sealed files to accumulate, got {new_sealed}" + ); + // Each retained sealed file holds an fd: descriptor growth tracks it. + assert!( + fds_after >= fds_before + new_sealed, + "open fds ({fds_after}) did not grow with sealed files ({fds_before} + {new_sealed}) \ + — fd footprint is per-sealed-file and not bounded by the namespace cap" + ); + }); + } +} diff --git a/crates/engine/src/log/reclaim.rs b/crates/engine/src/log/reclaim.rs index 992651a..e80c254 100644 --- a/crates/engine/src/log/reclaim.rs +++ b/crates/engine/src/log/reclaim.rs @@ -11,6 +11,23 @@ use crate::log::file::{ BufGuard, FooterEntry, LogFile, data_filename, footer_entry_from_index, reclaim_tmp_filename, }; use crate::log::index::IndexEntry; +use crate::log::record::{HEADER_LEN, flags as rflags, parse_header}; + +/// If `record_bytes` is a value-separated record, return its 16-byte blob hash +/// so the new sealed file's footer carries it forward (recovery + GC depend on +/// the footer hash). The record bytes themselves are copied verbatim by reclaim. 
+fn value_hash_of(record_bytes: &[u8]) -> Option<[u8; 16]> { + let hdr = parse_header(record_bytes, 0).ok()?; + if hdr.flags & rflags::VALUE_SEP == 0 || hdr.val_size as usize != 16 { + return None; + } + let start = HEADER_LEN + hdr.key_size as usize; + let end = start + 16; + let slice = record_bytes.get(start..end)?; + let mut h = [0u8; 16]; + h.copy_from_slice(slice); + Some(h) +} #[derive(Debug, Clone, Copy)] pub struct ReclaimReport { @@ -20,7 +37,7 @@ pub struct ReclaimReport { /// Files whose unlink failed after compaction; disk space is not freed until /// a subsequent reclaim or manual cleanup. pub dead_files_leaked: u32, - pub new_file_id: u16, + pub new_file_id: u32, } /// Read every live entry from `sealed_files` and write them into a single new @@ -32,7 +49,7 @@ pub struct ReclaimReport { pub async fn reclaim_namespace( dir: PathBuf, sealed_files: &[Rc], - next_file_id: u16, + next_file_id: u32, live: &[(Bytes, IndexEntry, Option)], ) -> Result<(ReclaimReport, Vec<(Bytes, IndexEntry, Option)>)> { let tmp_path = dir.join(reclaim_tmp_filename(next_file_id)); @@ -52,7 +69,7 @@ pub async fn reclaim_namespace( new_file.truncate_to(0).await?; // Build an owned-Rc map so read futures can capture file handles without borrowing. 
- let file_map: FxHashMap> = sealed_files + let file_map: FxHashMap> = sealed_files .iter() .map(|f| (f.file_id, Rc::clone(f))) .collect(); @@ -84,6 +101,7 @@ pub async fn reclaim_namespace( for ((key, old_entry, ttl), bytes_res) in live.iter().zip(read_results) { let bytes = bytes_res?.into_inner(); + let value_hash = value_hash_of(&bytes); let (new_offset, _) = new_file.append(bytes).await?; live_bytes += old_entry.record_size as u64; let new_entry = IndexEntry::new( @@ -93,7 +111,12 @@ pub async fn reclaim_namespace( old_entry.tstamp_ms, ); new_entries.push((key.clone(), new_entry, *ttl)); - footer.push(footer_entry_from_index(key.clone(), &new_entry, *ttl)); + footer.push(footer_entry_from_index( + key.clone(), + &new_entry, + *ttl, + value_hash, + )); } new_file.write_footer(&footer).await?; @@ -101,6 +124,10 @@ pub async fn reclaim_namespace( let final_path = dir.join(data_filename(next_file_id)); monoio::fs::rename(&tmp_path, &final_path).await?; + // Make the rename durable: without fsyncing the directory, a power loss could + // leave the merged file under its tmp name (or nameless) while old inputs are + // already unlinked below — losing the compacted data. + crate::log::file::sync_dir(&dir).await; let live_keys = new_entries.len() as u64; diff --git a/crates/engine/src/log/record.rs b/crates/engine/src/log/record.rs index 0cd268c..17f34a4 100644 --- a/crates/engine/src/log/record.rs +++ b/crates/engine/src/log/record.rs @@ -28,6 +28,10 @@ pub mod flags { pub const TOMBSTONE: u8 = 0b0000_0001; pub const NO_EXPIRY: u8 = 0b0000_0010; pub const TTL_UPDATE: u8 = 0b0000_0100; + /// Value-separated: the record's value field is a 16-byte content hash, not + /// the value itself. The value lives in the content-addressed blob store + /// (`value_store`). Set for values >= `LogConfig::value_sep_threshold`. + pub const VALUE_SEP: u8 = 0b0000_1000; } /// Fixed header bytes preceding every record. 
diff --git a/crates/engine/src/log/recover.rs b/crates/engine/src/log/recover.rs index 19eb0d6..d19797c 100644 --- a/crates/engine/src/log/recover.rs +++ b/crates/engine/src/log/recover.rs @@ -39,6 +39,7 @@ pub async fn open_namespace(dir: PathBuf) -> Result { // Fresh namespace — create active file id 0. let path = dir.join(crate::log::file::data_filename(0)); let active = LogFile::open_rw(path, 0).await?; + crate::log::file::sync_dir(&dir).await; // make the new file's dir entry durable return Ok(OpenedFiles { sealed, active, @@ -80,11 +81,11 @@ pub async fn open_namespace(dir: PathBuf) -> Result { offset: 0, reason: "file_id overflow on clean-shutdown recovery", })?; - if next_id >= u16::MAX - 100 { + if next_id >= u32::MAX - 100 { warn!( file_id = next_id, - remaining = u16::MAX - next_id, - "file_id nearing u16::MAX; compact sealed files to reclaim IDs" + remaining = u32::MAX - next_id, + "file_id nearing u32::MAX; compact sealed files to reclaim IDs" ); } let new_path = active_path @@ -94,7 +95,9 @@ pub async fn open_namespace(dir: PathBuf) -> Result { reason: "namespace data_dir has no parent; cannot compute next-file path", })? .join(crate::log::file::data_filename(next_id)); - LogFile::open_rw(new_path, next_id).await? + let active = LogFile::open_rw(new_path, next_id).await?; + crate::log::file::sync_dir(&dir).await; // new active after clean-shutdown recovery + active } None => { drop(highest); @@ -111,16 +114,19 @@ pub async fn open_namespace(dir: PathBuf) -> Result { }) } -fn apply_footer_entries(index: &mut NsIndex, file_id: u16, entries: &[FooterEntry]) { +fn apply_footer_entries(index: &mut NsIndex, file_id: u32, entries: &[FooterEntry]) { for e in entries { let entry = IndexEntry::new(file_id, e.record_offset, e.record_size, e.tstamp_ms); index.insert(e.key.clone(), entry, e.expires_at_ms); + // Value-separated keys carry their blob hash in the footer — repopulate + // the sidecar so overwrite/delete can unref and blob refcounts rebuild. 
+ index.set_valsep(&e.key, e.value_hash); } } /// Scan a file's records from the start, populating the index. Used as a /// fallback when a sealed file's footer is missing/corrupt. -async fn rebuild_from_records(file: &LogFile, file_id: u16, index: &mut NsIndex) -> Result<()> { +async fn rebuild_from_records(file: &LogFile, file_id: u32, index: &mut NsIndex) -> Result<()> { let total = file.size().await?; let mut offset = 0u64; while offset < total { @@ -156,7 +162,7 @@ async fn rebuild_from_records(file: &LogFile, file_id: u16, index: &mut NsIndex) /// Replay the active file from offset 0 to EOF. On bad CRC, truncate at the /// last good boundary. -async fn replay_active(file: &LogFile, file_id: u16, index: &mut NsIndex) -> Result<()> { +async fn replay_active(file: &LogFile, file_id: u32, index: &mut NsIndex) -> Result<()> { let total = file.size().await?; let mut offset = 0u64; let mut last_good = 0u64; @@ -205,7 +211,7 @@ async fn replay_active(file: &LogFile, file_id: u16, index: &mut NsIndex) -> Res fn apply_record( index: &mut NsIndex, - file_id: u16, + file_id: u32, offset: u64, hdr: &crate::log::record::RecordHeader, body: &[u8], @@ -251,5 +257,22 @@ fn apply_record( } else { Some(hdr.expires_at_ms) }; - index.insert(Bytes::copy_from_slice(key), entry, ttl); + let key_bytes = Bytes::copy_from_slice(key); + index.insert(key_bytes.clone(), entry, ttl); + // Value-separated record: the value field is the 16-byte blob hash. Repopulate + // the sidecar so the blob can be unref'd on a later overwrite/delete. 
+ if hdr.flags & rflags::VALUE_SEP != 0 { + let vstart = hdr.key_size as usize; + let vend = vstart + hdr.val_size as usize; + if hdr.val_size as usize == 16 && body.len() >= vend { + let mut h = [0u8; 16]; + h.copy_from_slice(&body[vstart..vend]); + index.set_valsep(&key_bytes, Some(h)); + } else { + warn!( + offset, + "value-separated record without a 16-byte hash; ignoring sidecar entry" + ); + } + } } diff --git a/crates/engine/src/store.rs b/crates/engine/src/store.rs index b0241bd..1d39a72 100644 --- a/crates/engine/src/store.rs +++ b/crates/engine/src/store.rs @@ -82,7 +82,27 @@ impl ShardStore { /// from a hot async path after the runtime is handling requests. pub async fn open(data_dir: &Path, memory_bytes: usize) -> Result { std::fs::create_dir_all(data_dir)?; - let config = LogConfig::default(); + // Compaction is size-tiered (GlideFS-friendly). `KV_COMPACTION_FANOUT` + // tunes the per-level fanout; `KV_VALUE_SEP_THRESHOLD` the + // value-separation cutoff. + let config = { + let mut c = LogConfig::default(); + if let Ok(n) = std::env::var("KV_COMPACTION_FANOUT") + .unwrap_or_default() + .parse::() + { + if n >= 2 { + c.fanout = n; + } + } + if let Ok(n) = std::env::var("KV_VALUE_SEP_THRESHOLD") + .unwrap_or_default() + .parse::() + { + c.value_sep_threshold = n; + } + c + }; let mut namespaces: FxHashMap> = FxHashMap::default(); // Collect valid namespace subdirectories, then open them concurrently. @@ -160,13 +180,23 @@ impl ShardStore { } let dir = self.data_dir.join(ns); let nslog = Rc::new(NamespaceLog::open(dir, self.config).await?); - // Re-check after the await — another spawned task may have beaten us. - Ok(self - .namespaces - .borrow_mut() - .entry(ns.to_string()) - .or_insert(nslog) - .clone()) + // Re-check after the await: a concurrent task may have inserted this same + // namespace (dedup — return theirs, drop ours), OR filled the map to the + // cap while we were opening. 
Without the cap re-check, N concurrent opens + // of distinct new namespaces could all pass the pre-await gate at + // `len == MAX-1` and each insert, overshooting the cap. + let mut ns_map = self.namespaces.borrow_mut(); + if let Some(existing) = ns_map.get(ns) { + return Ok(existing.clone()); + } + if ns_map.len() >= MAX_NAMESPACES { + // Our freshly-created (empty) namespace dir is left behind; it is + // harmless and reused idempotently if this namespace is opened later. + return Err(EngineError::CapacityExceeded { + reason: "namespace limit reached", + }); + } + Ok(ns_map.entry(ns.to_string()).or_insert(nslog).clone()) } /// Test-only accessor that bypasses `ensure_ns` validation. Do not use in production code. @@ -353,7 +383,15 @@ impl ShardStore { } else { WriteCondition::KeyAbsent }; - let expires_at_ms = opts.ttl.map(|d| Self::validate_ttl(d, now)).transpose()?; + // Honor KEEPTTL the same way `set` does: preserve the key's existing + // expiry instead of silently clearing it. Relevant to SETXX (the key + // exists, so it may carry a TTL); on SETNX the key is absent, so the + // index lookup returns None and this is a no-op. + let expires_at_ms = if opts.keep_ttl { + nslog.index.borrow().ttl(key) + } else { + opts.ttl.map(|d| Self::validate_ttl(d, now)).transpose()? + }; let meta_bytes: Vec = opts .metadata .as_ref() @@ -944,16 +982,15 @@ impl ShardStore { pub async fn incr(&self, ns: &str, key: &[u8], delta: i64) -> Result { let nslog = self.ensure_ns(ns).await?; - // Serialize INCRs within this namespace. Without the lock, every concurrent - // CAS attempt submits a (futile) disk append and only one wins per round — - // io_uring completion order is roughly submission order, so a late submitter - // can lose every round as new contenders refill the in-flight pool. 
Holding - // the lock makes each INCR's read-modify-write run to completion; the small - // retry budget below only needs to cover cross-op contention (a SET/DEL - // squeezing in while we're between read and write). - let _incr_guard = nslog.incr_lock.lock().await; - - for _ in 0..8u8 { + // Optimistic read-modify-write. `put_full_cond` now holds the key's write + // stripe and checks BEFORE appending, so a lost race writes nothing (no + // futile append) and simply returns `None` — the retry below re-reads and + // tries again. No dedicated INCR lock is needed any more. + // 64 attempts, not 8: a hot counter incremented hundreds of times per + // tick can legitimately lose 8 CAS races in a row and still be making + // progress. The cap only exists to bound a pathological livelock, so set + // it well above realistic same-key contention before surfacing Conflict. + for _ in 0..64u16 { let now = now_ms(); // Read current value + TTL + revision for the CAS condition. @@ -1053,11 +1090,11 @@ impl ShardStore { KeyFilter::Exact(k) => self .watchers .borrow_mut() - .subscribe_key(ns_b, Bytes::copy_from_slice(k)), + .subscribe_key(ns_b, Bytes::copy_from_slice(k))?, KeyFilter::Prefix(p) => self .watchers .borrow_mut() - .subscribe_prefix(ns_b, Bytes::copy_from_slice(p)), + .subscribe_prefix(ns_b, Bytes::copy_from_slice(p))?, }; let nslog = self.ensure_ns(ns).await?; @@ -1283,6 +1320,8 @@ impl ShardStore { // prior write — see its doc for why this beats reading last_revision() // here under concurrent writers. let revision = nslog.flush().await?; + // Drop all value-separated blobs for this namespace (FLUSHDB clears everything). + nslog.values.clear(); let mut w = self.watchers.borrow_mut(); for key in live_keys { @@ -2419,4 +2458,262 @@ mod tests { assert_eq!(final_val, CONTENDERS as i64 * PER_CONTENDER); }); } + + /// Regression: `set_conditional` (SETXX/SETNX) used to ignore `keep_ttl` and + /// silently clear the key's TTL. 
SETXX with KEEPTTL must preserve the + /// existing expiry, matching `set`'s behavior. + #[test] + fn setxx_keep_ttl_preserves_existing_expiry() { + let tmp = TempDir::new().unwrap(); + let path = tmp.path().to_path_buf(); + run(async move { + let s = open_store(&path).await; + set_ttl(&s, b"kt", b"v1", Duration::from_secs(3600)).await; + let ok = s + .setxx( + "default", + b"kt", + Bytes::from_static(b"v2"), + SetOptions { + ttl: None, + metadata: None, + keep_ttl: true, + }, + ) + .await + .unwrap(); + assert!(ok, "setxx on a live key must succeed"); + assert_eq!(get_value(&s, b"kt").await.unwrap().as_ref(), b"v2"); + match s.ttl("default", b"kt").await.unwrap() { + TtlResult::Remaining(secs) => { + assert!(secs > 0, "KEEPTTL must preserve the existing TTL") + } + other => panic!("expected Remaining, got {other:?}"), + } + }); + } + + /// Regression: `current_entries` emitted `revision: 0` for every initial + /// watch event, breaking the documented "dedup by revision" contract across + /// the subscribe→scan window. Initial events must carry the key's real + /// revision — the same value a subsequent GET reports. + #[test] + fn watch_initial_event_carries_real_revision() { + let tmp = TempDir::new().unwrap(); + let path = tmp.path().to_path_buf(); + run(async move { + let s = open_store(&path).await; + set(&s, b"wr", b"v").await; + let expected = s.get("default", b"wr").await.unwrap().unwrap().revision; + let (initial, _rx) = s + .watch_subscribe("default", KeyFilter::Exact(b"wr"), 0) + .await + .unwrap(); + assert_eq!(initial.len(), 1); + match &initial[0] { + WatchEvent::Set { revision, .. 
} => { + assert!( + *revision > 0, + "initial event must carry a real revision, not 0" + ); + assert_eq!( + *revision, expected, + "initial revision must match GET revision so callers can dedup" + ); + } + other => panic!("expected Set, got {other:?}"), + } + }); + } + + /// Regression: `scan_file_records` (the watch catch-up / `scan_since` path) + /// skipped the CRC check every other record-reading path performs, so a + /// bit-flipped record would be streamed to subscribers as a bogus event. + /// With the CRC check, the corrupt record is skipped and no event is emitted. + #[test] + fn watch_replay_skips_crc_corrupted_record() { + use std::io::{Seek, SeekFrom, Write as _}; + let tmp = TempDir::new().unwrap(); + let path = tmp.path().to_path_buf(); + run(async move { + let s = open_store(&path).await; + set(&s, b"ck", b"hello").await; + // Flush so the record bytes are on disk where scan_since reads them. + s.sync_logs().await.unwrap(); + let active_path = s.get_ns("default").unwrap().active.borrow().path.clone(); + + // Corrupt the first value byte. Record at offset 0: HEADER_LEN(37) + + // key "ck"(2) = byte 39 is the first value byte. Flipping it breaks + // the record CRC without disturbing the parseable header. + let mut f = std::fs::OpenOptions::new() + .write(true) + .open(&active_path) + .unwrap(); + f.seek(SeekFrom::Start(39)).unwrap(); + f.write_all(&[0xFF]).unwrap(); + f.flush().unwrap(); + + // since=1 (< the record's ms-epoch tstamp) routes through scan_since, + // which reads from disk and now hits the corrupted bytes. 
+ let (initial, _rx) = s + .watch_subscribe("default", KeyFilter::Exact(b"ck"), 1) + .await + .unwrap(); + assert!( + initial.is_empty(), + "a CRC-corrupt record must not be replayed as a watch event, got {initial:?}" + ); + }); + } +} + +#[cfg(test)] +mod namespace_cap_tests { + use super::*; + use crate::error::EngineError; + use crate::types::SetOptions; + use bytes::Bytes; + use std::future::Future; + use std::rc::Rc; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + /// Concurrent opens of distinct new namespaces cannot overshoot the cap. + /// With one slot free, two tasks both pass the synchronous pre-await gate + /// (`len == MAX-1`), both open concurrently, then resume: the first inserts + /// (filling the slot) and the second's post-await re-check must reject it. + /// Without the cap re-check after the await, both insert → `MAX+1`: teeth. + #[test] + fn concurrent_boundary_opens_do_not_overshoot_cap() { + run(async { + let dir = TempDir::new().unwrap(); + let s = Rc::new(ShardStore::open(dir.path(), 4 << 20).await.unwrap()); + + // Fill to exactly one slot below the cap. + let mut i = 0usize; + while s.namespace_count() < MAX_NAMESPACES - 1 { + s.set( + &format!("db{i}"), + b"k", + Bytes::from_static(b"v"), + SetOptions::default(), + ) + .await + .unwrap(); + i += 1; + } + assert_eq!(s.namespace_count(), MAX_NAMESPACES - 1); + + // Two concurrent opens of distinct new namespaces racing for the last slot. 
+ let (s1, s2) = (s.clone(), s.clone()); + let t1 = monoio::spawn(async move { + s1.set( + "race_a", + b"k", + Bytes::from_static(b"v"), + SetOptions::default(), + ) + .await + }); + let t2 = monoio::spawn(async move { + s2.set( + "race_b", + b"k", + Bytes::from_static(b"v"), + SetOptions::default(), + ) + .await + }); + let (r1, r2) = (t1.await, t2.await); + + assert_eq!( + s.namespace_count(), + MAX_NAMESPACES, + "cap must hold exactly under concurrent boundary opens (no overshoot)" + ); + let wins = [r1.is_ok(), r2.is_ok()].into_iter().filter(|b| *b).count(); + assert_eq!(wins, 1, "exactly one concurrent open wins the last slot"); + let rejected = [&r1, &r2] + .into_iter() + .filter(|r| matches!(r, Err(EngineError::CapacityExceeded { .. }))) + .count(); + assert_eq!(rejected, 1, "the loser must get a clean CapacityExceeded"); + }); + } + + /// The per-shard namespace cap degrades gracefully: opening distinct + /// namespaces succeeds up to `MAX_NAMESPACES`, the next one is rejected with a + /// clean `CapacityExceeded` (no panic, the map does not grow), and existing + /// namespaces — including `default` — keep serving reads and writes at the cap. + /// This is the bound that keeps NamespaceLog + file-descriptor growth finite. + #[test] + fn namespace_cap_is_enforced_and_existing_namespaces_keep_working() { + run(async { + let dir = TempDir::new().unwrap(); + let s = ShardStore::open(dir.path(), 4 << 20).await.unwrap(); + + // "default" pre-exists; open distinct namespaces until the map is full. + let mut i = 0usize; + while s.namespace_count() < MAX_NAMESPACES { + let ns = format!("db{i}"); + s.set(&ns, b"k", Bytes::from_static(b"v"), SetOptions::default()) + .await + .unwrap(); + i += 1; + } + assert_eq!(s.namespace_count(), MAX_NAMESPACES); + + // One namespace over the cap: clean rejection, map unchanged. 
+ let over = s + .set( + &format!("db{i}"), + b"k", + Bytes::from_static(b"v"), + SetOptions::default(), + ) + .await; + assert!( + matches!(over, Err(EngineError::CapacityExceeded { .. })), + "expected CapacityExceeded at the cap, got {over:?}" + ); + assert_eq!( + s.namespace_count(), + MAX_NAMESPACES, + "rejected open must not grow the map" + ); + + // Existing namespaces are unaffected by the cap. + s.set( + "db0", + b"k2", + Bytes::from_static(b"v2"), + SetOptions::default(), + ) + .await + .unwrap(); + assert!( + s.get("db0", b"k2").await.unwrap().is_some(), + "existing ns still writable at cap" + ); + s.set( + "default", + b"dk", + Bytes::from_static(b"dv"), + SetOptions::default(), + ) + .await + .unwrap(); + assert!( + s.get("default", b"dk").await.unwrap().is_some(), + "default ns still works at cap" + ); + }); + } } diff --git a/crates/engine/src/value_store.rs b/crates/engine/src/value_store.rs new file mode 100644 index 0000000..27cb42a --- /dev/null +++ b/crates/engine/src/value_store.rs @@ -0,0 +1,431 @@ +//! Content-addressed value store (value separation, WiscKey-style) for large +//! values — the GlideFS-friendly large-value path. +//! +//! A value is keyed by its BLAKE3-128 content hash and stored once; identical +//! values across keys/forks/tenants dedup to a single blob. The main log holds +//! only the small `(key -> hash)` pointer record, so compaction moves pointers, +//! never large values — collapsing large-value write amplification. Blobs are +//! immutable and refcounted; a blob is unlinked when its last reference drops. +//! +//! Blob I/O is async via `monoio::fs` (io_uring) — it runs on the same reactor +//! as the log engine and never blocks the shard's event loop. Refcounts are +//! in-memory, rebuilt from the live index on open; `sweep_orphans` reclaims any +//! blob a crash left without a referencing record. 
+ +use std::cell::RefCell; +use std::path::PathBuf; + +use rustc_hash::FxHashMap; + +use crate::error::Result; + +/// BLAKE3-128 content hash (matches GlideFS's block addressing width). +pub type ContentHash = [u8; 16]; + +pub fn content_hash(value: &[u8]) -> ContentHash { + let mut out = [0u8; 16]; + out.copy_from_slice(&blake3::hash(value).as_bytes()[..16]); + out +} + +fn hex16(h: &ContentHash) -> String { + let mut s = String::with_capacity(32); + for b in h { + s.push_str(&format!("{b:02x}")); + } + s +} + +/// Parse a `blob-<32 hex>` filename back into its content hash. +fn parse_blob_name(name: &str) -> Option { + let hex = name.strip_prefix("blob-")?; + if hex.len() != 32 { + return None; + } + let mut h = [0u8; 16]; + for (i, b) in h.iter_mut().enumerate() { + *b = u8::from_str_radix(hex.get(i * 2..i * 2 + 2)?, 16).ok()?; + } + Some(h) +} + +/// Content-addressed, refcounted blob store. Refcounts are in-memory (rebuilt +/// from the index on open in the integrated engine). +pub struct ValueStore { + dir: PathBuf, + refs: RefCell>, + /// Blobs whose refcount has hit zero but whose deletion is deferred until the + /// next fsync (see `collect_garbage`). Deleting a superseded blob before the + /// superseding log record is durable would, on a power loss that loses that + /// record, leave the reverted-to key pointing at a deleted blob (a dangling + /// pointer). Deferring past the fsync makes the revert safe. + pending_delete: RefCell>, + /// Striped locks serializing the file create/delete of a given blob. A blob's + /// `put` (write) and `collect_garbage` (unlink) for the SAME content hash hold + /// the same stripe, so they can never race — without it, an unlink in flight + /// could delete a file a concurrent same-content `put` just recreated (a + /// dangling pointer). Different content → different stripe → still concurrent. + file_locks: Vec>, +} + +/// Number of blob file-op stripes. Same-hash ops serialize; different hashes +/// stay concurrent. 
Blob writes are the rare large-value path, so this is small. +const FILE_LOCK_STRIPES: usize = 16; + +impl ValueStore { + pub fn new(dir: PathBuf) -> Self { + // Dir is created lazily on the first blob write — an all-small-value + // namespace never materializes a `values/` directory at all. + Self { + dir, + refs: RefCell::new(FxHashMap::default()), + pending_delete: RefCell::new(Vec::new()), + file_locks: (0..FILE_LOCK_STRIPES) + .map(|_| futures_util::lock::Mutex::new(())) + .collect(), + } + } + + fn path(&self, h: &ContentHash) -> PathBuf { + self.dir.join(format!("blob-{}", hex16(h))) + } + + /// The file-op stripe for a content hash (first bytes of the hash → stripe). + fn flock(&self, h: &ContentHash) -> &futures_util::lock::Mutex<()> { + &self.file_locks[(h[0] as usize) & (FILE_LOCK_STRIPES - 1)] + } + + /// Store `value`, deduplicated by content. Returns its content hash. Writes + /// the blob only on first reference (immutable, write-once); subsequent puts + /// of identical content just bump the refcount — no rewrite, no extra bytes. + pub async fn put(&self, value: &[u8]) -> Result { + let h = content_hash(value); + // Serialize this content's file create against a concurrent delete of the + // same content in `collect_garbage` (held across the refcount bump + write + // so the decision and the file op are atomic for this hash). + let _fl = self.flock(&h).lock().await; + let first = { + let mut refs = self.refs.borrow_mut(); + let c = refs.entry(h).or_insert(0); + *c += 1; + *c == 1 + }; + if first { + // Write the blob durably BEFORE the caller writes the pointer record + // that references it. The log uses appendfsync-everysec, but the + // pointer and the value live in different files — so we must fsync the + // blob's data AND its directory entry here, or a power loss could + // leave a durable pointer aimed at a non-durable blob (a dangling + // pointer = corruption, worse than the everysec "lose the last 1s" + // contract). 
With this ordering, the worst a crash can do is leave an + // orphan blob (durable blob, lost pointer) — reclaimed by + // `sweep_orphans` on the next open. All I/O is io_uring (no blocking). + if let Err(e) = self.write_blob_durable(&h, value).await { + self.dec(&h); // roll back the ref; no phantom reference to a missing blob + return Err(e); + } + } + Ok(h) + } + + /// Write `value` to its blob path and make it crash-durable: fsync the file's + /// data, then fsync the parent directory so the new directory entry survives a + /// power loss. Returns only once the blob is durable on stable storage. + async fn write_blob_durable(&self, h: &ContentHash, value: &[u8]) -> Result<()> { + // Propagate a create failure rather than swallow it: if the directory + // can't be made, the `open` below fails with a generic ENOENT that hides + // the real cause (e.g. EACCES on the parent). idempotent: Ok if it exists. + monoio::fs::create_dir_all(&self.dir).await?; + let file = monoio::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(self.path(h)) + .await?; + let (res, _buf) = file.write_all_at(value.to_vec(), 0).await; + res?; + file.sync_all().await?; // blob bytes durable + let _ = file.close().await; + // fsync the directory so the blob's name is durable before any pointer + // record referencing it can become durable. A failure here weakens the + // crash-durability contract (the blob's directory entry may not survive a + // power loss, leaving a durable pointer aimed at a nameless blob), so + // surface it as an error rather than swallow it. The caller rolls back the + // refcount on Err, so a failed durability step never leaves a phantom ref. + let dir = monoio::fs::OpenOptions::new() + .read(true) + .open(&self.dir) + .await?; + let sync_res = dir.sync_all().await; + let _ = dir.close().await; + sync_res?; + Ok(()) + } + + pub async fn get(&self, h: &ContentHash) -> Result> { + Ok(monoio::fs::read(self.path(h)).await?) 
+ } + + /// Recovery: rebuild the in-memory refcount for a hash referenced by a live + /// index entry, WITHOUT writing the blob (it already exists on disk from + /// before the restart). Called once per live value-separated key at open. + pub fn incr_ref(&self, h: &ContentHash) { + *self.refs.borrow_mut().entry(*h).or_insert(0) += 1; + } + + /// Decrement the in-memory refcount; return true if it hit zero (blob dead). + fn dec(&self, h: &ContentHash) -> bool { + let mut refs = self.refs.borrow_mut(); + match refs.get_mut(h) { + Some(c) => { + *c = c.saturating_sub(1); + if *c == 0 { + refs.remove(h); + true + } else { + false + } + } + None => false, + } + } + + /// Drop one reference. When the last reference goes away the blob is NOT + /// deleted immediately — it is queued for `collect_garbage`, which runs after + /// the next fsync. This preserves the crash-consistency invariant: a blob is + /// only physically removed once the log record that superseded it is durable, + /// so a power loss that reverts the key always finds its blob still present. + pub fn unref(&self, h: &ContentHash) { + if self.dec(h) { + self.pending_delete.borrow_mut().push(*h); + } + } + + /// Delete the blobs queued by `unref`, but only those still at refcount 0 + /// (a queued blob may have been re-referenced by an identical-content write + /// in the meantime). MUST be called only after the log has been fsynced past + /// the records that orphaned these blobs — i.e. right after `LogFile::sync`. + /// Blobs whose deletion is skipped here for any reason are still reachable as + /// orphans and reclaimed by `sweep_orphans` on the next open. + pub async fn collect_garbage(&self) { + let pending: Vec = std::mem::take(&mut *self.pending_delete.borrow_mut()); + for h in pending { + // Hold this content's file stripe so the refcount==0 check and the + // unlink are atomic w.r.t. 
a concurrent same-content `put`: either the + // put re-references it first (refcount>0 → we skip) or we delete first + // (and the put then recreates it). The file can't be left missing while + // a live key references it. + let _fl = self.flock(&h).lock().await; + if self.refcount(&h) == 0 { + let _ = monoio::fs::remove_file(self.path(&h)).await; + } + } + } + + /// Drop all blobs and refcounts (FLUSHDB). Nukes the whole `values/` tree — + /// including any deferred-delete or orphan blobs — and resets all state. + pub fn clear(&self) { + self.refs.borrow_mut().clear(); + self.pending_delete.borrow_mut().clear(); + let _ = std::fs::remove_dir_all(&self.dir); + } + + /// Reclaim orphan blobs: files on disk that no live key references. A crash + /// between writing a blob and appending its log record (or between writing a + /// new blob and unref'ing the old one) leaves such a file. Call once at open, + /// AFTER refcounts have been rebuilt from the live index — then any blob not + /// in `refs` is unreachable and safe to delete. Returns the count removed. + /// + /// Directory listing uses `std::fs` because this runs at open, before the + /// shard serves traffic (same place `recover` already lists data files); + /// the deletions go through io_uring. 
+ pub async fn sweep_orphans(&self) -> Result { + let entries = match std::fs::read_dir(&self.dir) { + Ok(e) => e, + Err(_) => return Ok(0), // no values dir => nothing to sweep + }; + let mut orphans: Vec = Vec::new(); + for ent in entries.flatten() { + let name = ent.file_name(); + let name = name.to_string_lossy(); + match parse_blob_name(&name) { + Some(h) if !self.refs.borrow().contains_key(&h) => orphans.push(ent.path()), + Some(_) => {} // referenced — keep + None => {} // not a blob file — ignore + } + } + let removed = orphans.len(); + for p in orphans { + let _ = monoio::fs::remove_file(p).await; + } + Ok(removed) + } + + pub fn blob_count(&self) -> usize { + self.refs.borrow().len() + } + + pub fn refcount(&self, h: &ContentHash) -> u32 { + self.refs.borrow().get(h).copied().unwrap_or(0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + /// Identical large values DEDUP to one blob (write-once), and refcounted GC + /// reclaims it — on real files, through the async io_uring path. + #[test] + fn dedup_write_once_and_gc() { + run(async { + let dir = TempDir::new().unwrap(); + let vs = ValueStore::new(dir.path().join("values")); + + let big = vec![7u8; 1_000_000]; // 1 MiB + let h1 = vs.put(&big).await.unwrap(); + let h2 = vs.put(&big).await.unwrap(); // identical content + assert_eq!(h1, h2, "same content → same hash"); + assert_eq!(vs.blob_count(), 1, "identical values dedup to ONE blob"); + assert_eq!(vs.refcount(&h1), 2); + assert_eq!(vs.get(&h1).await.unwrap(), big, "roundtrip"); + + let other = vec![9u8; 1_000_000]; + vs.put(&other).await.unwrap(); + assert_eq!(vs.blob_count(), 2, "distinct content → distinct blob"); + + // Drop both refs to the first blob → refcount 0, queued for deletion. 
+ vs.unref(&h1); + vs.unref(&h1); + assert_eq!(vs.refcount(&h1), 0); + assert!( + vs.get(&h1).await.is_ok(), + "blob still on disk before collect (deferred delete)" + ); + // collect_garbage runs after an fsync → now the blob is physically gone. + vs.collect_garbage().await; + assert!( + vs.get(&h1).await.is_err(), + "blob GC'd after collect_garbage" + ); + assert_eq!(vs.blob_count(), 1, "only the live blob remains"); + }); + } + + /// A crash can leave a blob on disk with no referencing key. After refcounts + /// are rebuilt from the live index, `sweep_orphans` reclaims exactly those. + #[test] + fn sweep_reclaims_orphans_only() { + run(async { + let dir = TempDir::new().unwrap(); + let vs = ValueStore::new(dir.path().join("values")); + let live = vs.put(&vec![1u8; 4096]).await.unwrap(); + let orphan = vs.put(&vec![2u8; 4096]).await.unwrap(); + // Simulate a crash that wrote the orphan blob but never recorded its + // reference: forget it from the in-memory refs (as a fresh open would, + // since no live key points at it). + vs.refs.borrow_mut().remove(&orphan); + + let removed = vs.sweep_orphans().await.unwrap(); + assert_eq!(removed, 1, "exactly the unreferenced blob is reclaimed"); + assert!(vs.get(&orphan).await.is_err(), "orphan blob deleted"); + assert_eq!( + vs.get(&live).await.unwrap().len(), + 4096, + "live blob untouched" + ); + }); + } +} + +#[cfg(test)] +mod gc_race_tests { + use super::*; + use std::rc::Rc; + use tempfile::TempDir; + + fn run(f: F) -> F::Output { + monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .expect("monoio runtime") + .block_on(f) + } + + /// Deterministic regression (real teeth): a blob queued for deletion that is + /// re-referenced BEFORE GC runs must survive — `collect_garbage` re-checks the + /// live refcount instead of deleting everything it queued. (Remove the + /// `refcount == 0` guard in `collect_garbage` and this fails: the blob is gone.) 
+ #[test] + fn collect_garbage_skips_a_requeued_then_rereferenced_blob() { + run(async { + let dir = TempDir::new().unwrap(); + let vs = ValueStore::new(dir.path().join("values")); + let v = vec![0x5Au8; 4096]; + let h = vs.put(&v).await.unwrap(); // refcount 1, file written + vs.unref(&h); // refcount 0, queued for delete, file still present + assert_eq!(vs.refcount(&h), 0); + vs.put(&v).await.unwrap(); // re-reference BEFORE gc → refcount 1 + assert_eq!(vs.refcount(&h), 1); + vs.collect_garbage().await; // must SKIP h (live again) + assert_eq!( + vs.get(&h).await.unwrap(), + v, + "re-referenced blob must survive GC" + ); + }); + } + + /// Stress: `collect_garbage` racing a same-content `put`. The per-content file + /// lock makes create/delete of one hash mutually exclusive, so a re-referenced + /// blob is never left deleted — a by-construction guarantee against io_uring + /// completion reordering. (The bad reorder is hard to force on a given kernel, + /// so this passes with or without the lock; the deterministic test above has + /// the teeth, the lock provides correctness under any ordering.) + #[test] + fn gc_does_not_delete_a_concurrently_recreated_blob() { + run(async { + let dir = TempDir::new().unwrap(); + let vs = Rc::new(ValueStore::new(dir.path().join("values"))); + let v = vec![0xABu8; 8192]; + let h = content_hash(&v); + + for _ in 0..300 { + // Make h a queued (refcount 0) deletion with its file still on disk. + vs.put(&v).await.unwrap(); + vs.unref(&h); + assert_eq!(vs.refcount(&h), 0); + + // Race GC (wants to delete h) against a put re-referencing the same content. + let a = vs.clone(); + let b = vs.clone(); + let vb = v.clone(); + let t_gc = monoio::spawn(async move { a.collect_garbage().await }); + let t_put = monoio::spawn(async move { b.put(&vb).await.unwrap() }); + t_gc.await; + t_put.await; + + // The put re-referenced it → refcount 1 → the blob MUST still exist. 
+ assert_eq!(vs.refcount(&h), 1, "put should have re-referenced the blob"); + assert_eq!( + vs.get(&h).await.expect("live blob deleted by GC/put race"), + v, + "blob content intact after concurrent GC + recreate" + ); + + // Reset for the next round. + vs.unref(&h); + vs.collect_garbage().await; + } + }); + } +} diff --git a/crates/engine/src/watch.rs b/crates/engine/src/watch.rs index de18fe7..9bd62d7 100644 --- a/crates/engine/src/watch.rs +++ b/crates/engine/src/watch.rs @@ -4,11 +4,20 @@ use bytes::Bytes; use futures_channel::mpsc::{Receiver, Sender, channel}; use rustc_hash::FxHashMap; +use crate::error::{EngineError, Result}; + /// Capacity for per-subscriber watch channels. A slow subscriber that fills /// its buffer is pruned (same as a disconnected subscriber) rather than /// allowed to grow without bound. const WATCH_CHANNEL_CAPACITY: usize = 512; +/// Hard cap on the number of live subscriptions (exact keys + prefixes) a +/// single registry will hold. Dead senders are pruned lazily on `notify` and +/// on each `subscribe_*`; this cap bounds the worst case where a client +/// registers many distinct keys faster than pruning reclaims them, preventing +/// unbounded growth of the `keys`/`prefixes` collections on the shard thread. +const MAX_TOTAL_SUBSCRIPTIONS: usize = 65_536; + #[derive(Debug, Clone)] pub enum WatchEvent { Set { @@ -57,22 +66,50 @@ impl WatchRegistry { } } - pub fn subscribe_key(&mut self, ns: Bytes, key: Bytes) -> Receiver { + pub fn subscribe_key(&mut self, ns: Bytes, key: Bytes) -> Result> { // Prune dead senders for this key before inserting the new one. 
if let Some(senders) = self.keys.get_mut(&(ns.clone(), key.clone())) { senders.retain(|tx| !tx.is_closed()); } + self.ensure_capacity()?; let (tx, rx) = channel(WATCH_CHANNEL_CAPACITY); self.keys.entry((ns, key)).or_default().push(tx); - rx + Ok(rx) } - pub fn subscribe_prefix(&mut self, ns: Bytes, prefix: Bytes) -> Receiver { + pub fn subscribe_prefix(&mut self, ns: Bytes, prefix: Bytes) -> Result> { // Prune dead prefix senders before inserting the new one. self.prefixes.retain(|(_, tx)| !tx.is_closed()); + self.ensure_capacity()?; let (tx, rx) = channel(WATCH_CHANNEL_CAPACITY); self.prefixes.push(((ns, prefix), tx)); - rx + Ok(rx) + } + + /// Total live subscriptions across exact keys and prefixes. + fn total_subscriptions(&self) -> usize { + self.keys.values().map(Vec::len).sum::() + self.prefixes.len() + } + + /// Reject a new subscription only if the registry is genuinely full. The + /// cheap count runs first; only when it trips do we pay for a full prune of + /// dead senders and re-count, so ordinary subscriber churn (disconnects that + /// haven't been pruned yet) never produces a false capacity error. 
+ fn ensure_capacity(&mut self) -> Result<()> { + if self.total_subscriptions() < MAX_TOTAL_SUBSCRIPTIONS { + return Ok(()); + } + self.keys.retain(|_, senders| { + senders.retain(|tx| !tx.is_closed()); + !senders.is_empty() + }); + self.prefixes.retain(|(_, tx)| !tx.is_closed()); + if self.total_subscriptions() >= MAX_TOTAL_SUBSCRIPTIONS { + return Err(EngineError::CapacityExceeded { + reason: "watch subscription limit reached", + }); + } + Ok(()) } pub fn notify(&mut self, ns: &str, key: &[u8], event: WatchEvent) { @@ -123,7 +160,9 @@ mod tests { #[test] fn exact_key_receives_event() { let mut reg = WatchRegistry::new(); - let mut rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")); + let mut rx = reg + .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")) + .unwrap(); reg.notify("ns", b"k", set_event(b"k")); assert!(matches!(rx.try_recv().unwrap(), WatchEvent::Set { .. })); } @@ -131,7 +170,9 @@ mod tests { #[test] fn exact_key_ignores_other_keys() { let mut reg = WatchRegistry::new(); - let mut rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")); + let mut rx = reg + .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")) + .unwrap(); reg.notify("ns", b"other", set_event(b"other")); assert!(rx.try_recv().is_err(), "channel should be empty"); } @@ -139,7 +180,9 @@ mod tests { #[test] fn exact_key_ignores_other_namespaces() { let mut reg = WatchRegistry::new(); - let mut rx = reg.subscribe_key(Bytes::from_static(b"ns1"), Bytes::from_static(b"k")); + let mut rx = reg + .subscribe_key(Bytes::from_static(b"ns1"), Bytes::from_static(b"k")) + .unwrap(); reg.notify("ns2", b"k", set_event(b"k")); assert!(rx.try_recv().is_err(), "channel should be empty"); } @@ -147,7 +190,9 @@ mod tests { #[test] fn prefix_receives_matching_keys() { let mut reg = WatchRegistry::new(); - let mut rx = reg.subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/")); + let mut rx = reg + 
.subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/")) + .unwrap(); reg.notify("ns", b"cfg/a", set_event(b"cfg/a")); reg.notify("ns", b"cfg/b", del_event(b"cfg/b")); reg.notify("ns", b"other", set_event(b"other")); // no match @@ -160,7 +205,9 @@ mod tests { #[test] fn dead_exact_sender_pruned() { let mut reg = WatchRegistry::new(); - let rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")); + let rx = reg + .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")) + .unwrap(); drop(rx); // First notify prunes the dead sender. reg.notify("ns", b"k", set_event(b"k")); @@ -174,7 +221,9 @@ mod tests { #[test] fn dead_prefix_sender_pruned() { let mut reg = WatchRegistry::new(); - let rx = reg.subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/")); + let rx = reg + .subscribe_prefix(Bytes::from_static(b"ns"), Bytes::from_static(b"cfg/")) + .unwrap(); drop(rx); reg.notify("ns", b"cfg/x", set_event(b"cfg/x")); assert!(reg.prefixes.is_empty()); @@ -183,10 +232,42 @@ mod tests { #[test] fn multiple_subscribers_same_key() { let mut reg = WatchRegistry::new(); - let mut rx1 = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")); - let mut rx2 = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")); + let mut rx1 = reg + .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")) + .unwrap(); + let mut rx2 = reg + .subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"k")) + .unwrap(); reg.notify("ns", b"k", set_event(b"k")); assert!(rx1.try_recv().is_ok()); assert!(rx2.try_recv().is_ok()); } + + #[test] + fn subscription_cap_rejects_when_full_but_reclaims_dead_first() { + let mut reg = WatchRegistry::new(); + // Fill the registry to the cap with distinct live keys. 
+ let mut live = Vec::with_capacity(MAX_TOTAL_SUBSCRIPTIONS); + for i in 0..MAX_TOTAL_SUBSCRIPTIONS { + let key = Bytes::from(format!("k{i}")); + live.push( + reg.subscribe_key(Bytes::from_static(b"ns"), key) + .expect("under cap"), + ); + } + assert_eq!(reg.total_subscriptions(), MAX_TOTAL_SUBSCRIPTIONS); + + // At the cap, one more is rejected. + assert!(matches!( + reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"overflow")), + Err(EngineError::CapacityExceeded { .. }) + )); + + // Drop one receiver: its sender is now dead. The next subscribe must + // reclaim it instead of falsely rejecting. + live.pop(); + let rx = reg.subscribe_key(Bytes::from_static(b"ns"), Bytes::from_static(b"after-drop")); + assert!(rx.is_ok(), "dead sender should have been reclaimed"); + assert_eq!(reg.total_subscriptions(), MAX_TOTAL_SUBSCRIPTIONS); + } } diff --git a/crates/engine/tests/emfile.rs b/crates/engine/tests/emfile.rs new file mode 100644 index 0000000..cb382cf --- /dev/null +++ b/crates/engine/tests/emfile.rs @@ -0,0 +1,126 @@ +//! Forced file-descriptor exhaustion (EMFILE). +//! +//! This lives in its OWN integration-test binary on purpose: it lowers the +//! process-global `RLIMIT_NOFILE`, which would poison any sibling test sharing +//! the process. As the sole test here, the clamp affects only this process. +//! +//! It proves the descriptor-exhaustion gap we characterized degrades gracefully: +//! opening a new namespace under EMFILE fails with a clean `Io` error (no panic, +//! no corruption), an already-open namespace keeps serving reads, and the store +//! recovers as soon as descriptors are freed. 
/// Count the entries under `/proc/self/fd`, i.e. the descriptors this process
/// currently has open. The count is taken while the directory itself is being
/// listed. Returns 0 when the directory cannot be read.
fn open_fd_count() -> usize {
    match std::fs::read_dir("/proc/self/fd") {
        Ok(entries) => entries.count(),
        Err(_) => 0,
    }
}
+ let mut hit: Option = None; + for i in 0..256 { + match s + .set( + &format!("new{i}"), + b"k", + Bytes::from_static(b"v"), + SetOptions::default(), + ) + .await + { + Ok(()) => {} + Err(e) => { + hit = Some(e); + break; + } + } + } + let err = hit.expect("expected an fd-exhaustion error while opening new namespaces"); + assert!( + matches!(err, EngineError::Io { .. }), + "EMFILE must surface as a clean Io error, got {err:?}" + ); + + // Graceful degradation: an already-open namespace still serves reads + // (the read path needs no new descriptor). No panic, no corruption. + let got = s + .get("keeper", b"k") + .await + .expect("get must not error under EMFILE"); + assert_eq!( + got.map(|e| e.value), + Some(Bytes::from_static(b"v0")), + "existing namespace remains readable while descriptors are exhausted" + ); + + // Recovery: once descriptors are available again, opening namespaces works. + set_nofile_soft(old); + s.set( + "after_recovery", + b"k", + Bytes::from_static(b"v"), + SetOptions::default(), + ) + .await + .expect("opening a namespace must succeed once fds are freed"); + assert!( + s.get("after_recovery", b"k").await.unwrap().is_some(), + "store recovers cleanly after fd exhaustion clears" + ); + }); +} diff --git a/crates/engine/tests/writeamp.rs b/crates/engine/tests/writeamp.rs new file mode 100644 index 0000000..3e9c5f4 --- /dev/null +++ b/crates/engine/tests/writeamp.rs @@ -0,0 +1,77 @@ +//! Write-amplification measurement: value separation vs the inline baseline +//! (pre-value-separation behavior), using the engine's real `compaction_bytes` +//! counter (bytes relocated by compaction = the GlideFS S3 re-upload cost). +//! +//! 
Run: cargo test -p beyond-kv-engine --test writeamp -- --nocapture + +use beyond_kv_engine::log::NamespaceLog; +use beyond_kv_engine::log::config::LogConfig; +use bytes::Bytes; +use tempfile::TempDir; + +fn key(i: usize) -> Bytes { + Bytes::from(format!("k{i:05}")) +} + +/// Churn `n` 32 KiB values across `rounds` reclaims; return cumulative +/// compaction bytes after each round. `threshold = usize::MAX` ⇒ values stay +/// inline (the pre-value-separation baseline); a small threshold ⇒ separated. +async fn sweep(threshold: usize, rounds: usize) -> Vec { + let dir = TempDir::new().unwrap(); + let cfg = LogConfig { + rotate_threshold: 64 * 1024, + fanout: 4, + value_sep_threshold: threshold, + }; + let log = NamespaceLog::open(dir.path().to_path_buf(), cfg) + .await + .unwrap(); + let n = 60usize; + let v0 = vec![0xCDu8; 32 * 1024]; + for i in 0..n { + log.put_full(key(i), &v0, &[], None).await.unwrap(); + } + log.reclaim().await.unwrap(); + log.compaction_bytes.set(0); // measure only the churn phase + let mut series = Vec::with_capacity(rounds); + for r in 0..rounds { + let vr = vec![r as u8; 32 * 1024]; // new content each round + for i in 0..n { + log.put_full(key(i), &vr, &[], None).await.unwrap(); + } + log.reclaim().await.unwrap(); + series.push(log.compaction_bytes.get()); + } + series +} + +#[test] +fn writeamp_sweep_csv() { + let mut rt = monoio::RuntimeBuilder::::new() + .enable_timer() + .build() + .unwrap(); + rt.block_on(async { + let rounds = 15usize; + let inline = sweep(usize::MAX, rounds).await; // baseline: values inline + let vsep = sweep(4096, rounds).await; // value separation on + println!("WRITEAMP_CSV_START"); + println!("round,inline_mib,valuesep_mib"); + for r in 0..rounds { + println!( + "{},{:.4},{:.4}", + r + 1, + inline[r] as f64 / 1048576.0, + vsep[r] as f64 / 1048576.0 + ); + } + println!("WRITEAMP_CSV_END"); + let (ti, tv) = (inline[rounds - 1], vsep[rounds - 1]); + println!( + "TOTAL inline={:.2} MiB valuesep={:.4} MiB 
ratio={:.0}x", + ti as f64 / 1048576.0, + tv as f64 / 1048576.0, + ti as f64 / tv.max(1) as f64 + ); + }); +}