From e687abc1d35c101de2a0fff23f42cc29b5d9eadc Mon Sep 17 00:00:00 2001 From: Rehber Moin Date: Sun, 1 Mar 2026 20:13:08 +0530 Subject: [PATCH 1/3] Cleanup of Previous Sessions --- crates/engine/src/compaction.rs | 10 +++++----- crates/sstable/src/format.rs | 3 +-- crates/sstable/src/reader.rs | 13 +++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/crates/engine/src/compaction.rs b/crates/engine/src/compaction.rs index 4941604..1f605cf 100644 --- a/crates/engine/src/compaction.rs +++ b/crates/engine/src/compaction.rs @@ -55,7 +55,7 @@ impl Engine { let mut merge = MergeIterator::new(&all_sstables); - // Stram directly from MergeIterator -> SSTableWriter without + // Stream directly from MergeIterator -> SSTableWriter without // materializing the entire dataset in RAM. Memory usage is bounded // by the bloom filter + index, not the data volume. let ts = SystemTime::now().duration_since(UNIX_EPOCH)?.as_millis(); @@ -64,9 +64,9 @@ impl Engine { // Tombstone GC: since this is a full compaction (all L0 + L1 -> single // L1), there are no older SSTables that could contain shadowed values. - // Tombstones are therefore safe to drop — they have no older data to - // shadow. Also check if the memtable contains the key: if so, the - // tombstone must be preserved to shadow the memtable entry on recovery. + // Tombstones are therefore safe to drop — unless the memtable still + // references the key, in which case we can keep the tombstone so it + // continues to shadow the memtable's entry after a subsequent flush. // // Build a streaming iterator adapter from MergeIterator. // MergeIterator::next() returns Result>, so we collect @@ -80,7 +80,7 @@ impl Engine { // Drop tombstones unless the memtable still references // this key (the memtable is not part of compaction, so // we must keep tombstones that shadow memtable data). 
- if entry.value.is_none() && mem_ref.contains_key(&key) { + if entry.value.is_none() && !mem_ref.contains_key(&key) { continue; // GC this tombstone } return Some((key, entry)); diff --git a/crates/sstable/src/format.rs b/crates/sstable/src/format.rs index 4c84322..e4d173e 100644 --- a/crates/sstable/src/format.rs +++ b/crates/sstable/src/format.rs @@ -24,6 +24,7 @@ //! then seeking back to read the appropriate footer size. use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use std::io; use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; /// Magic number identifying SSTable v1 files (ASCII "SST1") Can be Removed Later. @@ -253,5 +254,3 @@ pub fn read_footer(r: &mut R) -> IoResult<(u64, u32)> { let magic = r.read_u32::()?; Ok((index_offset, magic)) } - -use std::io; diff --git a/crates/sstable/src/reader.rs b/crates/sstable/src/reader.rs index bab1cd8..4e9194c 100644 --- a/crates/sstable/src/reader.rs +++ b/crates/sstable/src/reader.rs @@ -182,7 +182,8 @@ impl SSTableReader { let seq = f.read_u64::()?; let present = f.read_u8()?; - let (value, val_bytes) = if present == 1 { + // let (value, val_bytes) = if present == 1 { + let value = if present == 1 { let val_len = f.read_u32::()? as usize; if val_len > MAX_VALUE_BYTES { bail!( @@ -193,9 +194,9 @@ impl SSTableReader { } let mut val = vec![0u8; val_len]; f.read_exact(&mut val)?; - (Some(val.clone()), Some(val)) + Some(val.clone()) } else { - (None, None) + None }; // Verify CRC32 for v3 SSTables. 
@@ -206,9 +207,9 @@ impl SSTableReader { hasher.update(&key_buf); hasher.update(&seq.to_le_bytes()); hasher.update(&[present]); - if let Some(ref vb) = val_bytes { - hasher.update(&(vb.len() as u32).to_le_bytes()); - hasher.update(vb); + if let Some(ref v) = value { + hasher.update(&(v.len() as u32).to_le_bytes()); + hasher.update(v); } let actual_crc = hasher.finalize(); if actual_crc != expected_crc { From fe80a978135ffa955c25eb312b3d9eb388e04b32 Mon Sep 17 00:00:00 2001 From: Rehber Moin Date: Sun, 1 Mar 2026 21:39:36 +0530 Subject: [PATCH 2/3] ADDED: Config Unification + Concurrent Engine Boilerplate --- Cargo.lock | 6 + Cargo.toml | 1 + crates/cli/Cargo.toml | 1 + crates/cli/src/main.rs | 35 +-- crates/config/Cargo.toml | 6 + crates/config/src/lib.rs | 293 ++++++++++++++++++++ crates/config/src/tests.rs | 76 +++++ crates/engine/Cargo.toml | 1 + crates/engine/src/concurrent.rs | 202 ++++++++++++++ crates/engine/src/lib.rs | 64 +++-- crates/engine/src/tests/compaction_tests.rs | 27 +- crates/engine/src/tests/concurrent_tests.rs | 128 +++++++++ crates/engine/src/tests/mod.rs | 1 + crates/engine/src/tests/read_tests.rs | 12 +- crates/engine/src/tests/recovery_tests.rs | 32 +-- crates/engine/src/tests/write_tests.rs | 46 +-- 16 files changed, 829 insertions(+), 102 deletions(-) create mode 100644 crates/config/Cargo.toml create mode 100644 crates/config/src/lib.rs create mode 100644 crates/config/src/tests.rs create mode 100644 crates/engine/src/concurrent.rs create mode 100644 crates/engine/src/tests/concurrent_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 4d5d71f..26ef799 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -127,6 +127,7 @@ version = "0.1.0" dependencies = [ "anyhow", "byteorder", + "config", "criterion", "engine", "memtable", @@ -135,6 +136,10 @@ dependencies = [ "wal", ] +[[package]] +name = "config" +version = "0.1.0" + [[package]] name = "crc32fast" version = "1.5.0" @@ -222,6 +227,7 @@ name = "engine" version = "0.1.0" dependencies = [ 
"anyhow", + "config", "memtable", "sstable", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 144b6c7..303863f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "crates/bloom", + "crates/config", "crates/engine", "crates/cli", "crates/wal", diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 48beddb..d861944 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +config = { path = "../config" } engine = { path = "../engine" } memtable = { path = "../memtable" } sstable = { path = "../sstable" } diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 6d21bf1..253b518 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -46,37 +46,28 @@ //! bye //! ``` use anyhow::Result; +use config::EngineConfig; use engine::Engine; use std::io::{self, BufRead, Write}; -/// Reads a configuration value from the environment, falling back to `default`. -fn env_or(key: &str, default: &str) -> String { - std::env::var(key).unwrap_or_else(|_| default.to_string()) -} - fn main() -> Result<()> { - // Configuration via environment variables with sensible defaults. 
- // - // RIPTIDE_WAL_PATH - WAL file path (default: "wal.log") - // RIPTIDE_SST_DIR - SSTable directory (default: "data/sst") - // RIPTIDE_FLUSH_KB - flush threshold in KiB (default: 1024 = 1 MiB) - // RIPTIDE_WAL_SYNC - fsync every WAL append (default: "true") - // RIPTIDE_L0_TRIGGER - L0 compaction trigger (default: 4, 0 = disabled) - let wal_path = env_or("RIPTIDE_WAL_PATH", "wal.log"); - let sst_dir = env_or("RIPTIDE_SST_DIR", "data/sst"); - let flush_kb: usize = env_or("RIPTIDE_FLUSH_KB", "1024").parse().unwrap_or(1024); - let flush_threshold = flush_kb * 1024; - let wal_sync: bool = env_or("RIPTIDE_WAL_SYNC", "true").parse().unwrap_or(true); - let l0_trigger: usize = env_or("RIPTIDE_L0_TRIGGER", "4").parse().unwrap_or(4); + // Load all configuration from environment variables (single source of truth). + // See `config::EngineConfig::from_env()` for the full list of supported + // variables and their defaults. + let cfg = EngineConfig::from_env(); + + let flush_kb = cfg.flush_threshold_bytes / 1024; + let wal_path_display = cfg.wal_path.display().to_string(); + let sst_path_display = cfg.sst_dir.display().to_string(); + let l0_trigger = cfg.l0_compaction_trigger; - let mut engine = Engine::new(&wal_path, &sst_dir, flush_threshold, wal_sync)?; - engine.set_l0_compaction_trigger(l0_trigger); + let mut engine = Engine::new(cfg)?; println!( "RiptideKV started (seq={}, wal={}, sst_dir={}, flush={}KiB, l0_trigger={})", engine.seq(), - wal_path, - sst_dir, + wal_path_display, + sst_path_display, flush_kb, l0_trigger ); diff --git a/crates/config/Cargo.toml b/crates/config/Cargo.toml new file mode 100644 index 0000000..8db0789 --- /dev/null +++ b/crates/config/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "config" +version = "0.1.0" +edition = "2021" + +[dependencies] diff --git a/crates/config/src/lib.rs b/crates/config/src/lib.rs new file mode 100644 index 0000000..4eeadc5 --- /dev/null +++ b/crates/config/src/lib.rs @@ -0,0 +1,293 @@ +//! 
# Config – Unified Configuration for RiptideKV +//! +//! This crate provides a single source of truth for all RiptideKV configuration. +//! Every component — the CLI, the RESP server, tests — constructs an [`EngineConfig`] +//! and passes it to the engine. This eliminates duplicated defaults, scattered +//! env-var parsing, and positional-argument constructors. +//! +//! ## Design Decisions +//! +//! - **Single struct, not scattered args**: `Engine::new()` previously took 4 +//! positional arguments (`wal_path`, `sst_dir`, `flush_threshold`, `wal_sync`) +//! plus a post-construction setter for `l0_compaction_trigger`. This was fragile +//! and easy to misconfigure. A single config struct is self-documenting. +//! +//! - **Builder pattern**: Tests need custom configs (tiny flush thresholds, sync +//! disabled, etc.) while production uses env vars. The builder pattern supports +//! both ergonomically. +//! +//! - **from_env() factory**: Both the CLI and the RESP server need to load +//! config from environment variables. Centralizing this avoids duplication. +//! +//! - **Defaults defined once**: Constants like `DEFAULT_FLUSH_THRESHOLD_KB` and +//! `DEFAULT_L0_COMPACTION_TRIGGER` live here, not scattered across crates. +//! +//! ## Example +//! +//! ```rust +//! use config::EngineConfig; +//! +//! // Production: load from environment variables +//! let cfg = EngineConfig::from_env(); +//! +//! // Tests: use builder for precise control +//! let cfg = EngineConfig::builder() +//! .wal_path("/tmp/test/wal.log") +//! .sst_dir("/tmp/test/sst") +//! .flush_threshold_bytes(64) +//! .wal_sync(false) +//! .l0_compaction_trigger(0) +//! .build(); +//! ``` + +use std::path::PathBuf; + +// ─── Default constants (single source of truth) ───────────────────────────── + +/// Default WAL file path. +pub const DEFAULT_WAL_PATH: &str = "wal.log"; + +/// Default SSTable directory. +pub const DEFAULT_SST_DIR: &str = "data/sst"; + +/// Default flush threshold in KiB. 
The memtable is flushed to an SSTable +/// when its approximate byte size reaches `flush_threshold_kb * 1024`. +pub const DEFAULT_FLUSH_THRESHOLD_KB: usize = 1024; + +/// Default number of L0 SSTables that triggers automatic compaction. +/// Set to `0` to disable auto-compaction. +pub const DEFAULT_L0_COMPACTION_TRIGGER: usize = 4; + +/// Default WAL sync mode. When `true`, every WAL append is followed by +/// `fsync` for maximum durability. +pub const DEFAULT_WAL_SYNC: bool = true; + +/// Default host address for the RESP server. +pub const DEFAULT_SERVER_HOST: &str = "127.0.0.1"; + +/// Default port for the RESP server. +pub const DEFAULT_SERVER_PORT: u16 = 6379; + +// ─── EngineConfig ─────────────────────────────────────────────────────────── + +/// Unified configuration for the RiptideKV storage engine. +/// +/// This struct is the **single source of truth** for all engine parameters. +/// It is consumed by `Engine::new()`, the CLI, the RESP server, and tests. +/// +/// # Fields +/// +/// | Field | Default | Env Variable | +/// |-------|----------|--------------| +/// | `wal_path` | `wal.log` | `RIPTIDE_WAL_PATH` | +/// | `sst_dir` | `data/sst` | `RIPTIDE_SST_DIR` | +/// | `flush_threshold_bytes` | `1048576` (1 MiB) | `RIPTIDE_FLUSH_KB` (in KiB) | +/// | `wal_sync` | `true` | `RIPTIDE_WAL_SYNC` | +/// | `l0_compaction_trigger` | `4` | `RIPTIDE_L0_TRIGGER` | +/// | `server_host` | `127.0.0.1` | `RIPTIDE_HOST` | +/// | `server_port` | `6379` | `RIPTIDE_PORT` | +#[derive(Debug, Clone)] +pub struct EngineConfig { + /// Path to the write-ahead log file. + pub wal_path: PathBuf, + + /// Directory where SSTable files are stored. + pub sst_dir: PathBuf, + + /// Memtable byte-size threshold that triggers a flush to SSTable. + /// When `memtable.approx_size() >= flush_threshold_bytes`, the memtable + /// is flushed to a new SSTable on disk. + pub flush_threshold_bytes: usize, + + /// If `true`, every WAL append is followed by `fsync` for durability. 
+ /// Setting this to `false` batches writes for better throughput at the + /// cost of losing the most recent writes on crash. + pub wal_sync: bool, + + /// Number of L0 SSTables that triggers automatic compaction after a flush. + /// Set to `0` to disable auto-compaction (caller must invoke `compact()` + /// manually). + pub l0_compaction_trigger: usize, + + /// Host address for the RESP server to bind to. + pub server_host: String, + + /// Port for the RESP server to listen on. + pub server_port: u16, +} + +impl Default for EngineConfig { + /// Returns the default configuration with sensible production values. + /// + /// These defaults match the values previously hard-coded across the CLI + /// and engine crates. + fn default() -> Self { + Self { + wal_path: PathBuf::from(DEFAULT_WAL_PATH), + sst_dir: PathBuf::from(DEFAULT_SST_DIR), + flush_threshold_bytes: DEFAULT_FLUSH_THRESHOLD_KB * 1024, + wal_sync: DEFAULT_WAL_SYNC, + l0_compaction_trigger: DEFAULT_L0_COMPACTION_TRIGGER, + server_host: DEFAULT_SERVER_HOST.to_string(), + server_port: DEFAULT_SERVER_PORT, + } + } +} + +impl EngineConfig { + /// Creates a configuration by reading environment variables, falling back + /// to defaults for any variable that is not set or cannot be parsed. 
+ /// + /// # Environment Variables + /// + /// | Variable | Type | Default | + /// |----------|------|---------| + /// | `RIPTIDE_WAL_PATH` | `String` | `wal.log` | + /// | `RIPTIDE_SST_DIR` | `String` | `data/sst` | + /// | `RIPTIDE_FLUSH_KB` | `usize` (KiB) | `1024` | + /// | `RIPTIDE_WAL_SYNC` | `bool` | `true` | + /// | `RIPTIDE_L0_TRIGGER` | `usize` | `4` | + /// | `RIPTIDE_HOST` | `String` | `127.0.0.1` | + /// | `RIPTIDE_PORT` | `u16` | `6379` | + pub fn from_env() -> Self { + let defaults = Self::default(); + + let wal_path = env_or("RIPTIDE_WAL_PATH", DEFAULT_WAL_PATH); + let sst_dir = env_or("RIPTIDE_SST_DIR", DEFAULT_SST_DIR); + + let flush_kb: usize = env_or("RIPTIDE_FLUSH_KB", &DEFAULT_FLUSH_THRESHOLD_KB.to_string()) + .parse() + .unwrap_or(DEFAULT_FLUSH_THRESHOLD_KB); + + let wal_sync: bool = env_or("RIPTIDE_WAL_SYNC", &DEFAULT_WAL_SYNC.to_string()) + .parse() + .unwrap_or(DEFAULT_WAL_SYNC); + + let l0_trigger: usize = env_or( + "RIPTIDE_L0_TRIGGER", + &DEFAULT_L0_COMPACTION_TRIGGER.to_string(), + ) + .parse() + .unwrap_or(DEFAULT_L0_COMPACTION_TRIGGER); + + let server_host = env_or("RIPTIDE_HOST", DEFAULT_SERVER_HOST); + + let server_port: u16 = env_or("RIPTIDE_PORT", &DEFAULT_SERVER_PORT.to_string()) + .parse() + .unwrap_or(defaults.server_port); + + Self { + wal_path: PathBuf::from(wal_path), + sst_dir: PathBuf::from(sst_dir), + flush_threshold_bytes: flush_kb * 1024, + wal_sync, + l0_compaction_trigger: l0_trigger, + server_host, + server_port, + } + } + + /// Returns a [`ConfigBuilder`] for ergonomic construction in tests and + /// programmatic usage. + /// + /// All fields start with their default values. Override only what you need. + pub fn builder() -> ConfigBuilder { + ConfigBuilder { + config: Self::default(), + } + } + + /// Returns the `host:port` address string for the RESP server. 
+ pub fn server_addr(&self) -> String { + format!("{}:{}", self.server_host, self.server_port) + } +} + +// ─── ConfigBuilder ────────────────────────────────────────────────────────── + +/// Builder for [`EngineConfig`] with a fluent API. +/// +/// All fields start with their default values. Call setters to override, then +/// call [`build()`](ConfigBuilder::build) to produce the final config. +/// +/// # Example +/// +/// ```rust +/// use config::EngineConfig; +/// +/// let cfg = EngineConfig::builder() +/// .wal_path("/tmp/wal.log") +/// .sst_dir("/tmp/sst") +/// .flush_threshold_bytes(256) +/// .wal_sync(false) +/// .build(); +/// +/// assert_eq!(cfg.flush_threshold_bytes, 256); +/// assert!(!cfg.wal_sync); +/// ``` +pub struct ConfigBuilder { + config: EngineConfig, +} + +impl ConfigBuilder { + /// Sets the WAL file path. + pub fn wal_path(mut self, path: impl Into) -> Self { + self.config.wal_path = path.into(); + self + } + + /// Sets the SSTable directory. + pub fn sst_dir(mut self, dir: impl Into) -> Self { + self.config.sst_dir = dir.into(); + self + } + + /// Sets the flush threshold in bytes (not KiB). + /// + /// This is the raw byte value. For KiB, multiply by 1024 before passing. + pub fn flush_threshold_bytes(mut self, bytes: usize) -> Self { + self.config.flush_threshold_bytes = bytes; + self + } + + /// Sets the WAL sync mode. + pub fn wal_sync(mut self, sync: bool) -> Self { + self.config.wal_sync = sync; + self + } + + /// Sets the L0 compaction trigger. Pass `0` to disable auto-compaction. + pub fn l0_compaction_trigger(mut self, trigger: usize) -> Self { + self.config.l0_compaction_trigger = trigger; + self + } + + /// Sets the RESP server host address. + pub fn server_host(mut self, host: impl Into) -> Self { + self.config.server_host = host.into(); + self + } + + /// Sets the RESP server port. 
+ pub fn server_port(mut self, port: u16) -> Self { + self.config.server_port = port; + self + } + + /// Consumes the builder and returns the final [`EngineConfig`]. + pub fn build(self) -> EngineConfig { + self.config + } +} + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +/// Reads an environment variable, falling back to `default` if not set. +fn env_or(key: &str, default: &str) -> String { + std::env::var(key).unwrap_or_else(|_| default.to_string()) +} + +// ─── Tests ─────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests; diff --git a/crates/config/src/tests.rs b/crates/config/src/tests.rs new file mode 100644 index 0000000..a8f60cb --- /dev/null +++ b/crates/config/src/tests.rs @@ -0,0 +1,76 @@ +use super::*; + +#[test] +fn default_config_has_expected_values() { + let cfg = EngineConfig::default(); + assert_eq!(cfg.wal_path, PathBuf::from("wal.log")); + assert_eq!(cfg.sst_dir, PathBuf::from("data/sst")); + assert_eq!(cfg.flush_threshold_bytes, 1024 * 1024); + assert!(cfg.wal_sync); + assert_eq!(cfg.l0_compaction_trigger, 4); + assert_eq!(cfg.server_host, "127.0.0.1"); + assert_eq!(cfg.server_port, 6379); +} + +#[test] +fn builder_overrides_defaults() { + let cfg = EngineConfig::builder() + .wal_path("/custom/wal.log") + .sst_dir("/custom/sst") + .flush_threshold_bytes(256) + .wal_sync(false) + .l0_compaction_trigger(8) + .server_host("0.0.0.0") + .server_port(7379) + .build(); + + assert_eq!(cfg.wal_path, PathBuf::from("/custom/wal.log")); + assert_eq!(cfg.sst_dir, PathBuf::from("/custom/sst")); + assert_eq!(cfg.flush_threshold_bytes, 256); + assert!(!cfg.wal_sync); + assert_eq!(cfg.l0_compaction_trigger, 8); + assert_eq!(cfg.server_host, "0.0.0.0"); + assert_eq!(cfg.server_port, 7379); +} + +#[test] +fn clone_produces_independent_copy() { + let cfg1 = EngineConfig::builder().flush_threshold_bytes(100).build(); + let mut cfg2 = cfg1.clone(); + cfg2.flush_threshold_bytes = 200; + + 
assert_eq!(cfg1.flush_threshold_bytes, 100); + assert_eq!(cfg2.flush_threshold_bytes, 200); +} + +#[test] +fn builder_partial_override() { + let cfg = EngineConfig::builder().flush_threshold_bytes(64).build(); + + // Only flush_threshold changed; rest are defaults + assert_eq!(cfg.flush_threshold_bytes, 64); + assert_eq!(cfg.wal_path, PathBuf::from("wal.log")); + assert!(cfg.wal_sync); + assert_eq!(cfg.l0_compaction_trigger, 4); +} + +#[test] +fn server_addr_format() { + let cfg = EngineConfig::builder() + .server_host("0.0.0.0") + .server_port(8080) + .build(); + assert_eq!(cfg.server_addr(), "0.0.0.0:8080"); +} + +#[test] +fn from_env_uses_defaults_when_no_vars_set() { + // Smoke test: RIPTIDE_* variables may or may not be set in the CI/test + // environment, so this does not pin the default (fallback) values. + let cfg = EngineConfig::from_env(); + // We can't assert exact values because env vars might be set in CI, + // but we can assert the config is valid. + assert!(!cfg.wal_path.as_os_str().is_empty()); + assert!(!cfg.sst_dir.as_os_str().is_empty()); + assert!(cfg.flush_threshold_bytes > 0); +} diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index 5cb4000..1013df4 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] +config = { path = "../config" } memtable = { path = "../memtable" } sstable = { path = "../sstable" } wal = { path = "../wal" } diff --git a/crates/engine/src/concurrent.rs b/crates/engine/src/concurrent.rs new file mode 100644 index 0000000..2b3ba1d --- /dev/null +++ b/crates/engine/src/concurrent.rs @@ -0,0 +1,202 @@ +//! # ConcurrentEngine – Thread-Safe Wrapper for Fearless Concurrency +//! +//! Wraps [`Engine`] in `Arc<RwLock<Engine>>` so multiple threads (or Tokio +//! tasks via `spawn_blocking`) can safely share a single engine instance. +//! +//! ## Design Decisions +//! +//! | Decision | Rationale | +//! |----------|-----------| +//! 
| `std::sync::RwLock` (not Tokio) | Engine performs synchronous file I/O; an async lock would block the Tokio runtime. | +//! | `Arc` wrapper | Enables cheap cloning for distribution across tasks/threads. | +//! | Read lock for `get`/`scan` | Multiple readers can proceed concurrently (no I/O mutation). | +//! | Write lock for `set`/`del`/… | Mutations must be serialized to protect WAL + Memtable state. | +//! | `Clone` derives cheaply | `Arc::clone` is an atomic ref-count bump — negligible cost. | +//! +//! ## Usage with Tokio +//! +//! ```rust,ignore +//! let engine = ConcurrentEngine::new(cfg)?; +//! let engine_clone = engine.clone(); +//! +//! tokio::task::spawn_blocking(move || { +//! engine_clone.set(b"key".to_vec(), b"value".to_vec()) +//! }).await??; +//! ``` + +use std::sync::{Arc, RwLock}; + +use anyhow::{Context, Result}; +use config::EngineConfig; + +use crate::Engine; + +/// A thread-safe handle to a shared [`Engine`]. +/// +/// ALL methods acquire the appropriate lock internally so callers never need +/// to manage locking themselves. The handle is cheaply cloneable (`Arc`). +#[derive(Clone)] +pub struct ConcurrentEngine { + inner: Arc>, +} + +impl ConcurrentEngine { + // ─── Construction ─────────────────────────────────────────────────────── + + /// Creates a new `ConcurrentEngine` from an [`EngineConfig`]. + pub fn new(cfg: EngineConfig) -> Result { + let engine = Engine::new(cfg)?; + Ok(Self { + inner: Arc::new(RwLock::new(engine)), + }) + } + + /// Wraps an already-constructed [`Engine`] for concurrent access. + pub fn from_engine(engine: Engine) -> Self { + Self { + inner: Arc::new(RwLock::new(engine)), + } + } + + // ─── Write operations (exclusive lock) ───────────────────────────────── + + /// Inserts a key-value pair. Acquires a **write** lock. + pub fn set(&self, key: Vec, value: Vec) -> Result<()> { + self.inner + .write() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire write lock for set")? 
+ .set(key, value) + } + + /// Deletes a key (writes a tombstone). Acquires a **write** lock. + pub fn del(&self, key: Vec) -> Result<()> { + self.inner + .write() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire write lock for del")? + .del(key) + } + + /// Forces a memtable flush to SSTable. Acquires a **write** lock. + pub fn force_flush(&self) -> Result<()> { + self.inner + .write() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire write lock for force_flush")? + .force_flush() + } + + /// Runs compaction (merges all L0 + L1 into a single L1 SSTable). + /// Acquires a **write** lock. + pub fn compact(&self) -> Result<()> { + self.inner + .write() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire write lock for compact")? + .compact() + } + + /// Updates the L0 compaction trigger. Acquires a **write** lock. + pub fn set_l0_compaction_trigger(&self, trigger: usize) -> Result<()> { + self.inner + .write() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire write lock for set_l0_compaction_trigger")? + .set_l0_compaction_trigger(trigger); + Ok(()) + } + + /// Updates the flush threshold. Acquires a **write** lock. + pub fn set_flush_threshold(&self, threshold: usize) -> Result<()> { + self.inner + .write() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire write lock for set_flush_threshold")? + .set_flush_threshold(threshold); + Ok(()) + } + + // ─── Read operations (shared lock) ───────────────────────────────────── + + /// Looks up a key. Acquires a **read** lock. + pub fn get(&self, key: &[u8]) -> Result)>> { + self.inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire read lock for get")? + .get(key) + } + + /// Range scan. Acquires a **read** lock. 
+ pub fn scan(&self, start: &[u8], end: &[u8]) -> Result, Vec)>> { + self.inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) + .context("failed to acquire read lock for scan")? + .scan(start, end) + } + + /// Returns the current sequence number. Acquires a **read** lock. + pub fn seq(&self) -> Result { + Ok(self + .inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? + .seq()) + } + + /// Returns the total SSTable count. Acquires a **read** lock. + pub fn sstable_count(&self) -> Result { + Ok(self + .inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? + .sstable_count()) + } + + /// Returns the L0 SSTable count. Acquires a **read** lock. + pub fn l0_sstable_count(&self) -> Result { + Ok(self + .inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? + .l0_sstable_count()) + } + + /// Returns the L1 SSTable count. Acquires a **read** lock. + pub fn l1_sstable_count(&self) -> Result { + Ok(self + .inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? + .l1_sstable_count()) + } + + /// Returns the flush threshold in bytes. Acquires a **read** lock. + pub fn flush_threshold(&self) -> Result { + Ok(self + .inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? + .flush_threshold()) + } + + /// Returns the L0 compaction trigger. Acquires a **read** lock. + pub fn l0_compaction_trigger(&self) -> Result { + Ok(self + .inner + .read() + .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? 
+ .l0_compaction_trigger()) + } +} + +impl std::fmt::Debug for ConcurrentEngine { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.inner.read() { + Ok(engine) => write!(f, "ConcurrentEngine({:?})", &*engine), + Err(_) => write!(f, "ConcurrentEngine()"), + } + } +} diff --git a/crates/engine/src/lib.rs b/crates/engine/src/lib.rs index 5571f6b..a2e52c8 100644 --- a/crates/engine/src/lib.rs +++ b/crates/engine/src/lib.rs @@ -59,12 +59,15 @@ //! are written atomically via temp file + rename. The manifest uses the same //! atomic write pattern. See [`ARCHITECTURE.md`] for the full crash matrix. mod compaction; +pub mod concurrent; mod manifest; mod read; mod recovery; mod write; use anyhow::Result; +pub use concurrent::ConcurrentEngine; +pub use config::EngineConfig; use manifest::Manifest; use memtable::Memtable; pub use recovery::replay_wal_and_build; @@ -77,13 +80,6 @@ pub const MAX_KEY_SIZE: usize = 64 * 1024; /// Maximum allowed value size in bytes (10 MiB). pub const MAX_VALUE_SIZE: usize = 10 * 1024 * 1024; -/// Default number of L0 SSTables that triggers automatic compaction. -/// -/// When the L0 count reaches this threshold after a flush, the engine -/// automatically runs compaction to merge L0 + L1 into a single L1 SSTable. -/// Set to `0` to disable auto-compaction. -pub const DEFAULT_L0_COMPACTION_TRIGGER: usize = 4; - /// The central storage engine orchestrating Memtable, WAL, and SSTables. /// /// # Write Path @@ -152,15 +148,12 @@ impl std::fmt::Debug for Engine { } impl Engine { - /// Creates a new engine, performing full recovery from the WAL and existing - /// SSTable files. + /// Creates a new engine from an [`EngineConfig`], performing full recovery + /// from the WAL and existing SSTable files. /// /// # Arguments /// - /// * `wal_path` — path to the write-ahead log file. - /// * `sst_dir` — directory where SSTable files are stored. - /// * `flush_threshold` — memtable byte-size threshold that triggers flush. 
- /// * `wal_sync` — if `true`, every WAL append calls `fsync`. + /// * `cfg` — unified configuration struct containing all engine parameters. /// /// # Recovery Steps /// @@ -170,14 +163,12 @@ impl Engine { /// 4. Open the WAL writer in append mode. /// 5. Load SSTables from the manifest (or scan directory for legacy DBs). /// 6. Determine the highest sequence number across WAL and SSTables. - pub fn new, P2: AsRef>( - wal_path: P1, - sst_dir: P2, - flush_threshold: usize, - wal_sync: bool, - ) -> Result { - let wal_path = wal_path.as_ref().to_path_buf(); - let sst_dir = sst_dir.as_ref().to_path_buf(); + pub fn new(cfg: EngineConfig) -> Result { + let wal_path = cfg.wal_path; + let sst_dir = cfg.sst_dir; + let flush_threshold = cfg.flush_threshold_bytes; + let wal_sync = cfg.wal_sync; + let l0_compaction_trigger = cfg.l0_compaction_trigger; // ensure sst dir exists std::fs::create_dir_all(&sst_dir)?; @@ -263,11 +254,40 @@ impl Engine { manifest, seq, flush_threshold, - l0_compaction_trigger: DEFAULT_L0_COMPACTION_TRIGGER, + l0_compaction_trigger, wal_sync, }) } + /// Convenience constructor that accepts individual parameters instead of + /// an [`EngineConfig`]. + /// + /// This is equivalent to building an `EngineConfig` and calling + /// [`Engine::new`]. Primarily useful in tests where constructing a full + /// config struct is verbose. + /// + /// # Arguments + /// + /// * `wal_path` – path to the write-ahead log file. + /// * `sst_dir` – directory where SSTable files are stored. + /// * `flush_threshold` – memtable byte-size threshold that triggers flush. + /// * `wal_sync` – if `true`, every WAL append calls `fsync`. 
+ pub fn from_parts, P2: AsRef>( + wal_path: P1, + sst_dir: P2, + flush_threshold: usize, + wal_sync: bool, + ) -> Result { + let cfg = EngineConfig::builder() + .wal_path(wal_path.as_ref()) + .sst_dir(sst_dir.as_ref()) + .flush_threshold_bytes(flush_threshold) + .wal_sync(wal_sync) + .build(); + + Self::new(cfg) + } + /// Returns the current monotonic sequence number. #[must_use] pub fn seq(&self) -> u64 { diff --git a/crates/engine/src/tests/compaction_tests.rs b/crates/engine/src/tests/compaction_tests.rs index a81374e..7120024 100644 --- a/crates/engine/src/tests/compaction_tests.rs +++ b/crates/engine/src/tests/compaction_tests.rs @@ -10,7 +10,7 @@ use tempfile::tempdir; #[test] fn flush_goes_to_l0() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -34,7 +34,7 @@ fn flush_goes_to_l0() -> Result<()> { #[test] fn compact_moves_l0_to_l1() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -76,7 +76,7 @@ fn compact_moves_l0_to_l1() -> Result<()> { #[test] fn compact_preserves_newest_value() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 32, @@ -99,7 +99,7 @@ fn compact_preserves_newest_value() -> Result<()> { #[test] fn many_keys_with_flushes() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 4096, // 4 KB threshold @@ -150,7 +150,7 @@ fn many_keys_with_flushes() -> Result<()> { #[test] fn auto_compaction_triggers_at_l0_threshold() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1, // threshold=1 -> 
every set triggers a flush @@ -178,7 +178,8 @@ fn auto_compaction_triggers_at_l0_threshold() -> Result<()> { #[test] fn auto_compaction_disabled_when_trigger_is_zero() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new(dir.path().join("wal.log"), dir.path().join("sst"), 1, false)?; + let mut engine = + Engine::from_parts(dir.path().join("wal.log"), dir.path().join("sst"), 1, false)?; engine.set_l0_compaction_trigger(0); for i in 0..5u64 { @@ -197,7 +198,7 @@ fn auto_compaction_disabled_when_trigger_is_zero() -> Result<()> { #[test] fn tombstone_gc_removes_dead_keys_during_compaction() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 32, @@ -228,7 +229,7 @@ fn tombstone_gc_removes_dead_keys_during_compaction() -> Result<()> { fn compact_reduces_sst_file_count() -> Result<()> { let dir = tempdir()?; let sst_dir = dir.path().join("sst"); - let mut engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 64, false)?; + let mut engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 64, false)?; engine.set_l0_compaction_trigger(0); for i in 0..50u64 { @@ -258,7 +259,7 @@ fn compact_reduces_sst_file_count() -> Result<()> { #[test] fn l0_flush_then_compact_then_more_flushes() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -311,7 +312,7 @@ fn l0_flush_then_compact_then_more_flushes() -> Result<()> { #[test] fn compact_preserves_tombstones() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 32, @@ -335,7 +336,7 @@ fn compact_preserves_tombstones() -> Result<()> { #[test] fn compact_single_sstable_is_noop() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = 
Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -366,7 +367,7 @@ fn compact_then_recovery_works() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::new(&wal, &sst, 64, false)?; + let mut engine = Engine::from_parts(&wal, &sst, 64, false)?; engine.set_l0_compaction_trigger(0); for i in 0..30u64 { engine.set(format!("k{:04}", i).into_bytes(), b"val".to_vec())?; @@ -379,7 +380,7 @@ fn compact_then_recovery_works() -> Result<()> { } // Reopen engine – should recover from the single compacted SSTable - let engine = Engine::new(&wal, &sst, 64, false)?; + let engine = Engine::from_parts(&wal, &sst, 64, false)?; assert_eq!(engine.sstable_count(), 1); for i in 0..30u64 { diff --git a/crates/engine/src/tests/concurrent_tests.rs b/crates/engine/src/tests/concurrent_tests.rs new file mode 100644 index 0000000..e3738af --- /dev/null +++ b/crates/engine/src/tests/concurrent_tests.rs @@ -0,0 +1,128 @@ +use anyhow::Result; +use config::EngineConfig; +use std::thread; +use tempfile::tempdir; + +use crate::ConcurrentEngine; + +fn test_config(dir: &std::path::Path) -> EngineConfig { + EngineConfig::builder() + .wal_path(dir.join("wal.log")) + .sst_dir(dir.join("sst")) + .flush_threshold_bytes(1024 * 1024) + .wal_sync(false) + .build() +} + +#[test] +fn concurrent_reads_do_not_block_each_other() -> Result<()> { + let dir = tempdir()?; + let engine = ConcurrentEngine::new(test_config(dir.path()))?; + engine.set(b"k".to_vec(), b"v".to_vec())?; + + let handles: Vec<_> = (0..4) + .map(|_| { + let e = engine.clone(); + thread::spawn(move || e.get(b"k")) + }) + .collect(); + + for h in handles { + let result = h.join().expect("thread panicked")?; + assert_eq!(result.unwrap().1, b"v"); + } + Ok(()) +} + +#[test] +fn concurrent_writes_are_serialized() -> Result<()> { + let dir = tempdir()?; + let engine = ConcurrentEngine::new(test_config(dir.path()))?; + + let handles: Vec<_> = (0..10) + .map(|i| { + let e = engine.clone(); + 
thread::spawn(move || e.set(format!("k{}", i).into_bytes(), b"v".to_vec())) + }) + .collect(); + + for h in handles { + h.join().expect("thread panicked")?; + } + + // All 10 keys should exist + for i in 0..10 { + let key = format!("k{}", i).into_bytes(); + assert!(engine.get(&key)?.is_some(), "key k{} missing", i); + } + + assert_eq!(engine.seq()?, 10); + Ok(()) +} + +#[test] +fn mixed_reads_and_writes() -> Result<()> { + let dir = tempdir()?; + let engine = ConcurrentEngine::new(test_config(dir.path()))?; + + // Pre-populate + for i in 0..5 { + engine.set(format!("k{}", i).into_bytes(), b"init".to_vec())?; + } + + let mut handles = Vec::new(); + + // Spawn readers + for i in 0..5 { + let e = engine.clone(); + handles.push(thread::spawn(move || { + let key = format!("k{}", i).into_bytes(); + for _ in 0..100 { + e.get(&key).expect("get failed"); + } + })); + } + + // Spawn writers + for i in 5..10 { + let e = engine.clone(); + handles.push(thread::spawn(move || { + e.set(format!("k{}", i).into_bytes(), b"new".to_vec()) + .expect("set failed"); + })); + } + + for h in handles { + h.join().expect("thread panicked"); + } + + // All 10 keys shoudl exist + for i in 0..10 { + assert!( + engine.get(format!("k{}", i).as_bytes())?.is_some(), + "k{} missing", + i + ) + } + Ok(()) +} + +#[test] +fn clone_shares_same_engine() -> Result<()> { + let dir = tempdir()?; + let e1 = ConcurrentEngine::new(test_config(dir.path()))?; + let e2 = e1.clone(); + + e1.set(b"shared".to_vec(), b"data".to_vec())?; + assert_eq!(e2.get(b"shared")?.unwrap().1, b"data"); + Ok(()) +} + +#[test] +fn debug_output_works() -> Result<()> { + let dir = tempdir()?; + let engine = ConcurrentEngine::new(test_config(dir.path()))?; + let debug_str = format!("{:?}", engine); + assert!(debug_str.contains("ConcurrentEngine")); + Ok(()) +} diff --git a/crates/engine/src/tests/mod.rs b/crates/engine/src/tests/mod.rs index a984623..c879e8a 100644 --- a/crates/engine/src/tests/mod.rs +++ 
b/crates/engine/src/tests/mod.rs @@ -1,6 +1,7 @@ mod helpers; mod compaction_tests; +mod concurrent_tests; mod manifest_tests; mod read_tests; mod recovery_tests; diff --git a/crates/engine/src/tests/read_tests.rs b/crates/engine/src/tests/read_tests.rs index b99772d..1c12646 100644 --- a/crates/engine/src/tests/read_tests.rs +++ b/crates/engine/src/tests/read_tests.rs @@ -7,7 +7,7 @@ use tempfile::tempdir; #[test] fn scan_full_range() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -29,7 +29,7 @@ fn scan_full_range() -> Result<()> { #[test] fn scan_bounded_range() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -51,7 +51,7 @@ fn scan_bounded_range() -> Result<()> { #[test] fn scan_across_memtable_and_sstables() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -77,7 +77,7 @@ fn scan_across_memtable_and_sstables() -> Result<()> { #[test] fn scan_respects_tombstones() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -99,7 +99,7 @@ fn scan_respects_tombstones() -> Result<()> { #[test] fn scan_empty_range() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -119,7 +119,7 @@ fn scan_empty_range() -> Result<()> { #[test] fn read_path_prefers_l0_over_l1() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 64, diff --git 
a/crates/engine/src/tests/recovery_tests.rs b/crates/engine/src/tests/recovery_tests.rs index e8b778f..dc60793 100644 --- a/crates/engine/src/tests/recovery_tests.rs +++ b/crates/engine/src/tests/recovery_tests.rs @@ -16,14 +16,14 @@ fn recovery_from_wal() -> Result<()> { // Write some data, then drop engine (simulates crash) { - let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; engine.set(b"a".to_vec(), b"1".to_vec())?; engine.set(b"b".to_vec(), b"2".to_vec())?; engine.del(b"a".to_vec())?; } // Reopen engine - should replay WAL - let engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; + let engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; assert!(engine.get(b"a")?.is_none()); // deleted assert_eq!(engine.get(b"b")?.unwrap().1, b"2".to_vec()); assert_eq!(engine.seq(), 3); // 3 operations @@ -38,13 +38,13 @@ fn recovery_from_sstables() -> Result<()> { // Write data and force flush { - let mut engine = Engine::new(&wal_path, &sst_dir, 1, true)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, true)?; engine.set(b"k".to_vec(), b"v".to_vec())?; // Flush happened due to threshold=1 } // Reopen - WAL is empty but SSTable has the data - let engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; + let engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; assert_eq!(engine.get(b"k")?.unwrap().1, b"v".to_vec()); Ok(()) } @@ -57,19 +57,19 @@ fn recovery_combines_wal_and_sstables() -> Result<()> { // Create an engine that flushes immediately { - let mut engine = Engine::new(&wal_path, &sst_dir, 1, true)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, true)?; // This triggers flush (threshold=1) engine.set(b"flushed".to_vec(), b"in_sst".to_vec())?; } { // Reopen with high threshold so next writes stay in WAL - let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; + let mut 
engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; engine.set(b"in_wal".to_vec(), b"pending".to_vec())?; } // Final reopen - should have both - let engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; + let engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; assert_eq!(engine.get(b"flushed")?.unwrap().1, b"in_sst".to_vec()); assert_eq!(engine.get(b"in_wal")?.unwrap().1, b"pending".to_vec()); Ok(()) @@ -84,7 +84,7 @@ fn manifest_preserves_l0_l1_across_restart() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::new(&wal, &sst, 64, false)?; + let mut engine = Engine::from_parts(&wal, &sst, 64, false)?; engine.set_l0_compaction_trigger(0); // Create some L0 SSTables @@ -110,7 +110,7 @@ fn manifest_preserves_l0_l1_across_restart() -> Result<()> { } // Reopen - manifest should preserve L0/L1 assignments - let engine = Engine::new(&wal, &sst, 64, false)?; + let engine = Engine::from_parts(&wal, &sst, 64, false)?; assert!(engine.l0_sstable_count() > 0, "L0 should be preserved"); assert_eq!(engine.l1_sstable_count(), 1, "L1 should be preserved"); @@ -131,7 +131,7 @@ fn sst_sort_order_is_correct_across_many_flushes() -> Result<()> { let sst_dir = dir.path().join("sst"); // Use threshold=1 so every set triggers a flush - let mut engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 1, false)?; + let mut engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 1, false)?; // Write 15 keys - produces seq 1..15, so filenames span single and // double digits. Without zero-padding this breaks. 
@@ -145,7 +145,7 @@ fn sst_sort_order_is_correct_across_many_flushes() -> Result<()> { // Drop and reopen - recovery must load SSTables in correct order drop(engine); - let engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; + let engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; // All keys must be readable with correct values for i in 0..15u64 { @@ -164,7 +164,7 @@ fn sst_sort_order_is_correct_across_many_flushes() -> Result<()> { fn sst_overwrite_across_flushes_returns_newest() -> Result<()> { // Write same key across multiple flushes; newest SSTable must win. let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1, // Flush every write @@ -178,7 +178,7 @@ fn sst_overwrite_across_flushes_returns_newest() -> Result<()> { // Drop and reopen drop(engine); - let engine = Engine::new( + let engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -205,7 +205,7 @@ fn recovery_cleans_up_tmp_files() -> Result<()> { assert!(tmp_file.exists()); // Opening the engine should clean it up - let _engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; + let _engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; assert!( !tmp_file.exists(), @@ -224,7 +224,7 @@ fn seq_recovered_from_sstables_after_wal_truncation() -> Result<()> { // Write data and flush (WAL gets truncated) { - let mut engine = Engine::new(&wal_path, &sst_dir, 1, false)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, false)?; engine.set(b"a".to_vec(), b"1".to_vec())?; thread::sleep(Duration::from_millis(2)); engine.set(b"b".to_vec(), b"2".to_vec())?; @@ -234,7 +234,7 @@ fn seq_recovered_from_sstables_after_wal_truncation() -> Result<()> { } // Reopen - WAL is empty, seq must be recovered from SSTables - let mut engine = Engine::new(&wal_path, 
&sst_dir, 1024 * 1024, false)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, false)?; assert!( engine.seq() >= 3, "seq should be >= 3 from SSTable scan, got {}", diff --git a/crates/engine/src/tests/write_tests.rs b/crates/engine/src/tests/write_tests.rs index 6e5f073..b70646c 100644 --- a/crates/engine/src/tests/write_tests.rs +++ b/crates/engine/src/tests/write_tests.rs @@ -9,7 +9,7 @@ use tempfile::tempdir; #[test] fn set_and_get() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -26,7 +26,7 @@ fn set_and_get() -> Result<()> { #[test] fn get_missing_key() -> Result<()> { let dir = tempdir()?; - let engine = Engine::new( + let engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -40,7 +40,7 @@ fn get_missing_key() -> Result<()> { #[test] fn del_removes_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -58,7 +58,7 @@ fn del_removes_key() -> Result<()> { #[test] fn overwrite_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -74,7 +74,7 @@ fn overwrite_key() -> Result<()> { #[test] fn set_after_del_resurrects() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -94,7 +94,7 @@ fn newest_sstable_wins_on_read() -> Result<()> { let wal_path = dir.path().join("wal.log"); let sst_dir = dir.path().join("sst"); - let mut engine = Engine::new(&wal_path, &sst_dir, 1, false)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, false)?; // Write k=v1, flush engine.set(b"k".to_vec(), 
b"v1".to_vec())?; @@ -113,7 +113,7 @@ fn newest_sstable_wins_on_read() -> Result<()> { #[test] fn force_flush_empty_memtable_is_noop() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -137,14 +137,14 @@ fn force_flush_persists_memtable_data() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; + let mut engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; engine.set(b"key".to_vec(), b"value".to_vec())?; engine.force_flush()?; assert_eq!(engine.l0_sstable_count(), 1); } // Reopen - data should be in SSTable, not WAL - let engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; + let engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; let (_, val) = engine.get(b"key")?.expect("key should survive"); assert_eq!(val, b"value"); Ok(()) @@ -159,13 +159,13 @@ fn drop_flushes_memtable_to_sstable() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; + let mut engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; engine.set(b"drop_key".to_vec(), b"drop_val".to_vec())?; // Engine drops here - should flush memtable } // Reopen - data should be in SSTable from the Drop flush - let engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; + let engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; let (_, val) = engine.get(b"drop_key")?.expect("key should survive drop"); assert_eq!(val, b"drop_val"); assert!(engine.sstable_count() >= 1); @@ -175,7 +175,7 @@ fn drop_flushes_memtable_to_sstable() -> Result<()> { #[test] fn set_rejects_oversized_value() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -193,7 +193,7 @@ fn set_rejects_oversized_value() -> Result<()> { #[test] 
fn set_accepts_max_key_size() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024 * 1024, // huge threshold to avoid flush @@ -212,7 +212,7 @@ fn set_accepts_max_key_size() -> Result<()> { #[test] fn del_rejects_oversized_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -235,7 +235,7 @@ fn multiple_flushes_create_multiple_sstables() -> Result<()> { let wal_path = dir.path().join("wal.log"); let sst_dir = dir.path().join("sst"); - let mut engine = Engine::new(&wal_path, &sst_dir, 1, false)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, false)?; // Disable auto-compaction so all L0 SSTables remain on disk. engine.set_l0_compaction_trigger(0); @@ -266,7 +266,7 @@ fn multiple_flushes_create_multiple_sstables() -> Result<()> { #[test] fn seq_increments_on_every_operation() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -288,7 +288,7 @@ fn seq_increments_on_every_operation() -> Result<()> { #[test] fn set_rejects_empty_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -306,7 +306,7 @@ fn set_rejects_empty_key() -> Result<()> { #[test] fn del_rejects_empty_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -323,7 +323,7 @@ fn del_rejects_empty_key() -> Result<()> { #[test] fn set_rejects_oversized_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( 
dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -346,7 +346,7 @@ fn flush_writes_sstable_and_truncates_wal() -> Result<()> { let wal_path = dir.path().join("wal.log"); let sst_dir = dir.path().join("sst"); - let mut engine = Engine::new(&wal_path, &sst_dir, 1, true)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, true)?; engine.set(b"key1".to_vec(), b"value1".to_vec())?; assert!( @@ -366,7 +366,7 @@ fn flush_triggers_at_threshold() -> Result<()> { let sst_dir = dir.path().join("sst"); let threshold = 4 * 1024; // 4 KB for fast test - let mut engine = Engine::new(&wal_path, &sst_dir, threshold, false)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, threshold, false)?; let value = vec![b'x'; 512]; let writes = (threshold / value.len()) + 5; for i in 0..writes { @@ -385,7 +385,7 @@ fn flush_triggers_at_threshold() -> Result<()> { #[test] fn get_reads_from_sstable_after_flush() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::new( + let mut engine = Engine::from_parts( dir.path().join("wal.log"), dir.path().join("sst"), 1, // tiny threshold - every set triggers flush @@ -405,7 +405,7 @@ fn tombstone_in_sstable_shadows_older_value() -> Result<()> { let sst_dir = dir.path().join("sst"); // Large threshold so we control flushes manually - let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, false)?; + let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, false)?; // Write k=v, then force flush by lowering threshold temporarily engine.set(b"k".to_vec(), b"old_value".to_vec())?; From 4f9902a796be907c20c55dd3a46f43723ed9260c Mon Sep 17 00:00:00 2001 From: Rehber Moin Date: Sat, 18 Apr 2026 11:48:54 +0530 Subject: [PATCH 3/3] feat: Phase 4 --- .gitignore | 3 +- ARCHITECTURE.md | 16 +- Cargo.lock | 280 +++- Cargo.toml | 4 +- README.md | 372 ++-- crates/cli/Cargo.toml | 1 - crates/cli/src/main.rs | 35 +- crates/config/Cargo.toml | 6 - crates/config/src/lib.rs | 293 ---- 
crates/config/src/tests.rs | 76 - crates/engine/Cargo.toml | 1 - crates/engine/src/compaction.rs | 52 +- crates/engine/src/concurrent.rs | 202 --- crates/engine/src/lib.rs | 64 +- crates/engine/src/manifest.rs | 6 +- crates/engine/src/read.rs | 4 +- crates/engine/src/tests/compaction_tests.rs | 27 +- crates/engine/src/tests/concurrent_tests.rs | 128 -- crates/engine/src/tests/mod.rs | 1 - crates/engine/src/tests/read_tests.rs | 12 +- crates/engine/src/tests/recovery_tests.rs | 32 +- crates/engine/src/tests/write_tests.rs | 46 +- crates/engine/src/write.rs | 13 +- crates/server/Cargo.toml | 29 + crates/server/benches/server_bench.rs | 208 +++ crates/server/src/db.rs | 109 ++ crates/server/src/handler.rs | 1492 +++++++++++++++++ crates/server/src/lib.rs | 41 + crates/server/src/main.rs | 71 + crates/server/src/resp.rs | 180 ++ crates/server/tests/integration.rs | 1103 ++++++++++++ crates/sstable/src/format.rs | 3 +- crates/sstable/src/reader.rs | 3 +- crates/wal/src/lib.rs | 6 +- docs/ARCHITECTURE.md | 618 +++++++ docs/GUIDE.md | 782 +++++++++ docs/HOWTORUN.md | 601 +++++++ java/.gitignore | 6 + java/pom.xml | 120 ++ .../java/io/riptidekv/RiptideKVConfig.java | 153 ++ .../java/io/riptidekv/RiptideKVServer.java | 305 ++++ .../test/java/io/riptidekv/RespClient.java | 150 ++ .../java/io/riptidekv/RespCommandsTest.java | 1064 ++++++++++++ .../io/riptidekv/RiptideKVConfigTest.java | 155 ++ .../io/riptidekv/RiptideKVServerTest.java | 179 ++ 45 files changed, 8043 insertions(+), 1009 deletions(-) delete mode 100644 crates/config/Cargo.toml delete mode 100644 crates/config/src/lib.rs delete mode 100644 crates/config/src/tests.rs delete mode 100644 crates/engine/src/concurrent.rs delete mode 100644 crates/engine/src/tests/concurrent_tests.rs create mode 100644 crates/server/Cargo.toml create mode 100644 crates/server/benches/server_bench.rs create mode 100644 crates/server/src/db.rs create mode 100644 crates/server/src/handler.rs create mode 100644 crates/server/src/lib.rs 
create mode 100644 crates/server/src/main.rs create mode 100644 crates/server/src/resp.rs create mode 100644 crates/server/tests/integration.rs create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/GUIDE.md create mode 100644 docs/HOWTORUN.md create mode 100644 java/.gitignore create mode 100644 java/pom.xml create mode 100644 java/src/main/java/io/riptidekv/RiptideKVConfig.java create mode 100644 java/src/main/java/io/riptidekv/RiptideKVServer.java create mode 100644 java/src/test/java/io/riptidekv/RespClient.java create mode 100644 java/src/test/java/io/riptidekv/RespCommandsTest.java create mode 100644 java/src/test/java/io/riptidekv/RiptideKVConfigTest.java create mode 100644 java/src/test/java/io/riptidekv/RiptideKVServerTest.java diff --git a/.gitignore b/.gitignore index 09832df..d4a416b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target wal.log /data -.DS_Store \ No newline at end of file +.DS_Store +.env \ No newline at end of file diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 06c258d..94e19d2 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,7 +1,19 @@ # RiptideKV — Architecture -This document describes the complete internal architecture of RiptideKV, a -**Log-Structured Merge (LSM) tree** key-value store written in Rust. +> **This file is superseded.** +> The full, up-to-date architecture reference (covering all phases including +> the RESP2 server) is at **[docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)**. + +--- + +The content below is an older Phase 3 snapshot retained for reference. +For anything current, please read `docs/ARCHITECTURE.md`. + +--- + +This document describes the internal architecture of RiptideKV up to Phase 3 +(storage engine only — no RESP server). See `docs/ARCHITECTURE.md` for the +complete picture including Phase 4 (Tokio server, RESP2, SharedDb, TTL). 
--- diff --git a/Cargo.lock b/Cargo.lock index 26ef799..a45cb52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,6 +57,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + [[package]] name = "cast" version = "0.3.0" @@ -127,7 +133,6 @@ version = "0.1.0" dependencies = [ "anyhow", "byteorder", - "config", "criterion", "engine", "memtable", @@ -136,10 +141,6 @@ dependencies = [ "wal", ] -[[package]] -name = "config" -version = "0.1.0" - [[package]] name = "crc32fast" version = "1.5.0" @@ -160,6 +161,7 @@ dependencies = [ "ciborium", "clap", "criterion-plot", + "futures", "is-terminal", "itertools", "num-traits", @@ -172,6 +174,7 @@ dependencies = [ "serde_derive", "serde_json", "tinytemplate", + "tokio", "walkdir", ] @@ -227,7 +230,6 @@ name = "engine" version = "0.1.0" dependencies = [ "anyhow", - "config", "memtable", "sstable", "tempfile", @@ -250,6 +252,66 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-core", + "futures-sink", + "futures-task", + "pin-project-lite", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -315,6 +377,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.180" @@ -327,6 +395,21 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + 
"regex-automata", +] + [[package]] name = "memchr" version = "2.8.0" @@ -337,6 +420,26 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" name = "memtable" version = "0.1.0" +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -358,6 +461,12 @@ version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + [[package]] name = "plotters" version = "0.3.7" @@ -530,6 +639,56 @@ dependencies = [ "zmij", ] +[[package]] +name = "server" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "criterion", + "engine", + "tempfile", + "thiserror", + "tokio", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "sstable" version = "0.1.0" @@ -586,6 +745,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -596,12 +764,106 @@ dependencies = [ "serde_json", ] +[[package]] +name = "tokio" +version = "1.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = 
"tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + [[package]] name = "unicode-ident" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "wal" version = "0.1.0" @@ -622,6 +884,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + [[package]] name = "wasip2" version = "1.0.2+wasi-0.2.9" diff --git a/Cargo.toml b/Cargo.toml index 303863f..1afb605 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,11 @@ [workspace] members = [ "crates/bloom", - "crates/config", "crates/engine", "crates/cli", "crates/wal", "crates/memtable", - "crates/sstable" + "crates/sstable", + "crates/server" ] resolver = "2" diff --git a/README.md 
b/README.md index e33117c..c1aab56 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,33 @@ # RiptideKV -**RiptideKV** is a learning project to build a **Log-Structured Merge (LSM) key-value store** in Rust. -The goal is to understand storage engine internals by implementing them incrementally and correctly, not to ship a production database. +**RiptideKV** is a learning project that builds a **Log-Structured Merge (LSM) key-value store** in Rust — and exposes it as a **Redis-compatible TCP server**. The goal is to understand storage engine internals by implementing them incrementally and correctly. ``` - Client ──► CLI ──► Engine ──┬── Memtable (in-memory sorted buffer) - ├── WAL (crash-safe append-only log) - └── SSTables (immutable on-disk sorted files) - └── Bloom Filters (fast negative lookups) + redis-cli / Jedis / redis-py + │ TCP (RESP2) + ▼ + ┌──────────────────────────────────┐ + │ crates/server (Tokio async) │ + │ RESP2 parser · 55+ commands │ + └───────────────┬──────────────────┘ + │ engine API + ▼ + ┌──────────────────────────────────┐ + │ crates/engine (LSM tree) │ + │ Memtable · WAL · SSTables │ + │ Bloom Filters · Compaction │ + └──────────────────────────────────┘ ``` -> **For the full architecture with ASCII diagrams, data flow, and per-crate -> deep dives, see [`ARCHITECTURE.md`](ARCHITECTURE.md).** +--- + +## Documentation + +| Document | What it covers | +|----------|----------------| +| **[docs/HOWTORUN.md](docs/HOWTORUN.md)** | Build, run CLI, run server, connect clients (redis-cli / Java / Python), benchmarks, troubleshooting | +| **[docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)** | Write/read/recovery data flows, file formats, RESP2 server design, concurrency model, trade-offs | +| **[docs/GUIDE.md](docs/GUIDE.md)** | Learning guide — WAL, Memtable, SSTables, Bloom Filters, Compaction, RESP2, Tokio, with pitfalls | --- @@ -21,41 +37,100 @@ The goal is to understand storage engine internals by implementing them incremen # Build everything 
cargo build --workspace -# Run the interactive CLI +# ── Option A: Interactive CLI (no network) cargo run -p cli -# Run all 161 tests +# ── Option B: RESP2 TCP Server (Redis-compatible) +cargo run -p server --bin riptidekv-server +# → RiptideKV listening on 0.0.0.0:6379 + +# Connect with redis-cli (in another terminal) +redis-cli PING # PONG +redis-cli SET foo bar # OK +redis-cli GET foo # "bar" + +# Run all Rust tests (245) cargo test --workspace +# Run all Java tests (150) +mvn test -f java/pom.xml + # Run benchmarks -cargo bench -p cli +cargo bench -p cli # engine-level benchmarks +cargo bench -p server # TCP server benchmarks ``` -### CLI Usage +--- +## Embedding in a Java / Maven project + +The `riptidekv-server` JAR bundles the native server binary for all supported +platforms. Your code starts the server as a subprocess and connects to it +with any Redis client. + +### Add the dependency + +```xml +<!-- GitHub Packages repository --> +<repositories> + <repository> + <id>github</id> + <url>https://maven.pkg.github.com/YOUR_GITHUB_USERNAME/RiptideKV</url> + </repository> +</repositories> + +<!-- RiptideKV embedded server --> +<dependency> + <groupId>io.github.YOUR_GITHUB_USERNAME</groupId> + <artifactId>riptidekv-server</artifactId> + <version>1.0.0</version> +</dependency> ``` -RiptideKV started (seq=0, wal=wal.log, sst_dir=data/sst, flush=1024KiB, l0_trigger=4) -Commands: SET key value | GET key | DEL key | SCAN [start] [end] - COMPACT | FLUSH | STATS | EXIT -> SET name Alice -OK -> GET name -Alice -> DEL name -OK -> GET name -(nil) + +> **GitHub Packages authentication** — add to `~/.m2/settings.xml`: +> ```xml +> <server> +> <id>github</id> +> <username>YOUR_GITHUB_USERNAME</username> +> <password>YOUR_GITHUB_PAT</password> +> </server> +> ``` + +### Start the embedded server + +```java +import io.riptidekv.RiptideKVConfig; +import io.riptidekv.RiptideKVServer; +import redis.clients.jedis.Jedis; +import java.nio.file.Paths; + +RiptideKVConfig config = RiptideKVConfig.builder() + .bind("127.0.0.1:6379") + .dataDir(Paths.get("/var/lib/myapp/rkv")) // WAL + SSTables stored here + .flushKb(4096) // flush at 4 MiB + .walSync(true) // durable writes + .build(); + +try (RiptideKVServer server = new RiptideKVServer(config)) { + server.start(); // extracts 
binary, starts process, blocks until ready + + try (Jedis jedis = new Jedis("127.0.0.1", server.getPort())) { + jedis.set("hello", "world"); + System.out.println(jedis.get("hello")); // world + + jedis.setex("session:abc", 3600, "user_data"); + System.out.println(jedis.ttl("session:abc")); // ~3600 + } +} // server.close() sends SIGTERM, flushes memtable, exits cleanly ``` -### Configuration (Environment Variables) +### Supported platforms -| Variable | Default | Description | -|----------|---------|-------------| -| `RIPTIDE_WAL_PATH` | `wal.log` | WAL file path | -| `RIPTIDE_SST_DIR` | `data/sst` | SSTable directory | -| `RIPTIDE_FLUSH_KB` | `1024` | Flush threshold in KiB (1024 = 1 MiB) | -| `RIPTIDE_WAL_SYNC` | `true` | fsync every WAL append | -| `RIPTIDE_L0_TRIGGER` | `4` | Auto-compaction trigger (0 = disabled) | +| Platform | Architecture | +|----------|-------------| +| Linux | x86_64, aarch64 | +| macOS | x86_64 (Intel), aarch64 (Apple Silicon) | +| Windows | x86_64 | --- @@ -63,29 +138,70 @@ OK ``` RiptideKV/ -├── ARCHITECTURE.md # Detailed architecture documentation -├── Cargo.toml # Workspace root +├── ARCHITECTURE.md # Legacy architecture overview (see docs/ for full version) +├── Cargo.toml # Workspace root (resolver = "2") +├── docs/ +│ ├── ARCHITECTURE.md # Full system design — data flows, file formats, trade-offs +│ ├── GUIDE.md # Linear learning guide — concepts, code refs, pitfalls +│ └── HOWTORUN.md # Build, CLI, server, clients, benchmarks, troubleshooting +├── java/ # Maven module — Java embedding library +│ ├── pom.xml # Published to GitHub Packages as riptidekv-server +│ └── src/ +│ ├── main/java/io/riptidekv/ +│ │ ├── RiptideKVConfig.java # Fluent config builder (bind, dataDir, flushKb, walSync) +│ │ └── RiptideKVServer.java # Extracts native binary + manages server subprocess +│ └── test/java/io/riptidekv/ +│ ├── RespClient.java # Minimal RESP2 client for tests +│ ├── RiptideKVConfigTest.java # 20 config unit tests +│ ├── 
RiptideKVServerTest.java # 14 lifecycle tests +│ └── RespCommandsTest.java # 116 end-to-end command tests └── crates/ - ├── bloom/ # Probabilistic set membership (17 tests) - ├── memtable/ # In-memory sorted write buffer (43 tests) - ├── wal/ # Write-Ahead Log for durability (22 tests) - ├── sstable/ # Immutable on-disk sorted tables (21 tests) - │ ├── reader.rs # Read + bloom check + CRC verify - │ ├── writer.rs # Atomic write (tmp + rename) - │ ├── merge.rs # Min-heap merge iterator - │ └── format.rs # Magic numbers, footer sizes - ├── engine/ # Storage engine orchestrator (55 tests) - │ ├── lib.rs # Engine struct, constructor, accessors - │ ├── write.rs # set(), del(), flush() - │ ├── read.rs # get(), scan() - │ ├── compaction.rs # compact(), tombstone GC - │ ├── recovery.rs # WAL replay, SSTable loading - │ ├── manifest.rs # Persistent L0/L1 level tracking - │ └── tests/ # Split into 4 focused test modules - └── cli/ # Interactive REPL + benchmarks -``` - -**Dependency graph**: `cli → engine → {memtable, wal, sstable → bloom}` + ├── bloom/ # Bloom filter (17 tests) + │ └── src/lib.rs # BloomFilter, FNV-1a double-hashing, serialization + ├── memtable/ # In-memory sorted write buffer (43 tests) + │ └── src/lib.rs # Memtable (BTreeMap), sequence-gated writes, tombstones + ├── wal/ # Write-Ahead Log (22 tests) + │ └── src/lib.rs # WalWriter, WalReader, CRC32 per record + ├── sstable/ # Immutable on-disk sorted tables (21 tests) + │ └── src/ + │ ├── format.rs # v1/v2/v3 footer layout, magic numbers + │ ├── writer.rs # Atomic write (tmp → fsync → rename) + │ ├── reader.rs # Bloom-filtered point lookup + CRC32 verify + │ └── merge.rs # MergeIterator (min-heap k-way merge) + ├── engine/ # Storage engine orchestrator (55 tests) + │ └── src/ + │ ├── lib.rs # Engine struct, constructor, public accessors + │ ├── write.rs # set(), del(), flush(), auto-compaction trigger + │ ├── read.rs # get(), scan() + │ ├── compaction.rs# compact(), tombstone GC + │ ├── recovery.rs # WAL 
replay, SSTable loading, tmp cleanup + │ └── manifest.rs # Persistent L0/L1 level tracking (atomic writes) + ├── server/ # Async RESP2 TCP server (84 integration tests) + │ ├── src/ + │ │ ├── lib.rs # serve() — public library API (testable without subprocess) + │ │ ├── main.rs # Binary entry point — env-var config + graceful shutdown + │ │ ├── resp.rs # RESP2 parser (non-recursive) + response serializer + │ │ ├── db.rs # SharedDb: Arc> + volatile TTL map + │ │ └── handler.rs # 55+ command dispatcher, per-connection state + │ ├── benches/ + │ │ └── server_bench.rs # Criterion: PING, SET, GET, pipeline, MSET throughput + │ └── tests/ + │ └── integration.rs # 84 end-to-end tests over real TCP sockets + └── cli/ # Interactive REPL + engine-level benchmarks + ├── src/main.rs # SET/GET/DEL/SCAN/COMPACT/FLUSH/STATS REPL + ├── benches/ # Criterion: memtable, sstable, wal, engine benchmarks + └── tests/ # CLI integration tests +``` + +**Dependency graph** (arrows = "depends on"): + +``` +cli ──────────────────────────────────────► engine +server ───────────────────────────────────► engine +engine ──► memtable +engine ──► wal +engine ──► sstable ──► bloom +``` --- @@ -93,42 +209,49 @@ RiptideKV/ ### Write Path -1. Increment monotonic sequence number -2. Append record to WAL (durability) -3. Insert into Memtable (fast reads) -4. If Memtable exceeds threshold → flush to SSTable, truncate WAL +``` +Client SET k v + │ + ├─ 1. seq += 1 + ├─ 2. WAL.append(Put{seq, k, v}) — durable on disk + ├─ 3. memtable.put(k, v, seq) — fast in-memory + └─ 4. if memtable.size >= threshold: + flush to SSTable → truncate WAL → maybe compact +``` ### Read Path -1. Check **Memtable** (freshest data) -2. Check **L0 SSTables** newest-first (bloom filter → index → disk read) -3. Check **L1 SSTables** newest-first -4. First match wins; tombstones shadow older values - -### Compaction - -Merges all L0 + L1 SSTables into a single L1 SSTable using a streaming -min-heap merge. 
Tombstones for keys with no older references are garbage -collected. Auto-triggers when L0 count reaches the configured threshold. +``` +Client GET k + │ + ├─ 1. memtable.get(k) — newest, no disk I/O + ├─ 2. L0 SSTables, newest first — bloom → index → disk read + └─ 3. L1 SSTable — bloom → index → disk read + First hit (value or tombstone) wins. +``` -### Recovery +### Recovery (on startup) -On startup: replay WAL → rebuild Memtable, load MANIFEST → assign SSTables -to L0/L1, recover sequence number from v3 footer (`max_seq`). +``` +cleanup .sst.tmp → replay WAL → load Manifest → open SSTables → ready +``` --- -## Goals - -- Learn Rust fundamentals in a systems programming context -- Incrementally build an LSM-style storage engine -- Practice testing, CI, and clean architecture +## Supported Commands (Server) -## Non-Goals (for now) +``` +Connection: PING ECHO SELECT QUIT HELLO CLIENT INFO CONFIG COMMAND +Database: DBSIZE FLUSHDB FLUSHALL ACL SLOWLOG MEMORY WAIT +Strings: GET SET SETNX SETEX PSETEX GETSET GETDEL GETEX + MGET MSET MSETNX APPEND STRLEN + INCR INCRBY INCRBYFLOAT DECR DECRBY GETRANGE SETRANGE +Keys: DEL UNLINK EXISTS TYPE RENAME RENAMENX RANDOMKEY TOUCH + EXPIRE PEXPIRE EXPIREAT PEXPIREAT TTL PTTL PERSIST + EXPIRETIME PEXPIRETIME KEYS SCAN +``` -- Production-grade performance -- Distributed systems or consensus -- Concurrent read/write (currently single-threaded `&mut self`) +--- ## Glossary @@ -139,68 +262,43 @@ to L0/L1, recover sequence number from v3 footer (`max_seq`). 
| **SSTable** | Sorted String Table; immutable on-disk sorted key-value file | | **WAL** | Write-Ahead Log; append-only file for crash recovery | | **Compaction** | Merging SSTables to remove duplicates and reclaim space | -| **Tombstone** | Marker indicating a key has been deleted | +| **Tombstone** | A deletion marker — shadows older values in SSTables | | **Bloom Filter** | Probabilistic structure for fast "definitely not in set" checks | -| **L0** | Level 0; SSTables from memtable flushes (may overlap) | -| **L1** | Level 1; SSTables from compaction (non-overlapping) | +| **L0** | Level 0; SSTables from memtable flushes (may key-overlap) | +| **L1** | Level 1; single post-compaction SSTable (non-overlapping) | | **Manifest** | Text file tracking which SSTable belongs to which level | +| **RESP2** | Redis Serialization Protocol v2 — the Redis wire format | --- ## Development Phases -### Phase 0 — Rust fundamentals & repository setup [DELIVERED] - -- Cargo workspace, CI (GitHub Actions), clippy, rustfmt -- Rust fundamentals: ownership, borrowing, traits, `Result`/`Option` - -### Phase 1 — Core LSM (in-memory + basic on-disk) [DELIVERED] - -- Ordered memtable with sequence-gated writes -- WAL with CRC32 per record, crash-safe replay -- SSTable v1 writer/reader with sparse index -- CLI with SET, GET, DEL +| Phase | Status | Description | +|-------|--------|-------------| +| 0 | ✅ | Rust workspace, CI, clippy, rustfmt | +| 1 | ✅ | Memtable, WAL (CRC32), SSTable v1, CLI (SET/GET/DEL) | +| 2 | ✅ | Read path (Memtable→L0→L1), Bloom filters, Compaction | +| 3 | ✅ | SSTable v3 (CRC32 per record, max_seq), Manifest, streaming compaction, range scan, auto-compaction, tombstone GC | +| 4 | ✅ | RESP2 TCP server (Tokio), 55+ commands, TTL, Java/Python client compatibility, 84 integration tests | +| 5 | 📋 | Persistent TTL, tiered compaction, LRU block cache, compression, metrics | -#### Write Path Demonstrations - -| Demo | Description | -|------|-------------| -| ![Memtable → 
WAL](public/assets/memtable_wal.gif) | Writing to Memtable, then WAL | -| ![Flush to SSTable](public/assets/flush_to_sstable.gif) | Threshold exceeded → SSTable created → WAL flushed | -| ![New Memtable](public/assets/new_memtable_after_flush.gif) | New writes after flush | -| ![Delete](public/assets/delete_propagation.gif) | Deletions propagated via tombstones | - -### Phase 2 — Reads, bloom filters, and compaction [DELIVERED] - -- Read path: Memtable → L0 → L1 with bloom filter short-circuit -- Per-SSTable bloom filters (1% FPR, FNV-1a double hashing) -- Basic compaction: merge multiple SSTables, drop obsolete keys -- SSTable v2: bloom filter section in file layout - -### Phase 3 — Robustness and production readiness [DELIVERED] - -- **SSTable v3**: per-record CRC32 checksums, `max_seq` in footer -- **Manifest**: persistent L0/L1 tracking with atomic writes -- **Streaming compaction**: `write_from_iterator()` — bounded RAM usage -- **Range scan**: `Engine::scan(start, end)` merging all sources -- **Auto-compaction**: triggers when L0 count >= configurable threshold -- **Tombstone GC**: drops dead tombstones during full compaction -- **Graceful shutdown**: `Drop` impl flushes memtable, `force_flush()` API -- **CLI improvements**: env-var config, SCAN/COMPACT/FLUSH/STATS commands -- **SRP refactor**: engine split into 5 focused modules + 4 test modules -- **161 tests**, zero warnings - -### Phase 4 — RESP server & Java compatibility (planned) +--- -- RESP2 protocol server (GET, SET, DEL, PING, INFO) -- Async networking with Tokio -- Integration tests with Java Redis client (Jedis) +## Known Limitations -### Phase 5 — Performance, features, and polish (planned) +RiptideKV is a learning project. 
The following are known differences from production Redis: -- Benchmarks and tuning (criterion) -- Optional: TTL, leveled compaction, compression, LRU block cache -- Structured logging (`tracing`), metrics, fuzzing +| Area | Behaviour | Note | +|------|-----------|------| +| **TTL persistence** | TTLs are stored in memory only — lost on server restart | Keys survive but their expiry times do not; planned in Phase 5 | +| **INCR on non-numeric** | Treats un-parseable values as `0` instead of returning an error | Intentional graceful degradation; differs from Redis | +| **Authentication** | No `AUTH` command — any client can connect | Bind to loopback (`127.0.0.1`) in production | +| **TLS** | Plaintext TCP only | Terminate TLS at a proxy (nginx, HAProxy) if needed | +| **Replication** | Single node only — no leader/follower | `WAIT` always returns 0 | +| **Compaction** | Only L0 → L1; L1 grows unboundedly | Tiered/levelled compaction planned in Phase 5 | +| **Block cache** | Every SSTable read goes to disk | LRU block cache planned in Phase 5 | +| **Linux aarch64** | Not in CI build matrix — binary is optional | Add cross-compilation to CI matrix to enable | +| **RESP3** | Not supported — returns `NOPROTO` error | Use RESP2 clients | --- @@ -213,7 +311,17 @@ to L0/L1, recover sequence number from v3 footer (`max_seq`). 
| `wal` | 22 | Append, replay, CRC, truncated tails, corruption | | `sstable` | 21 | Write, read, bloom, merge iterator, v1/v2/v3 compat | | `engine` | 55 | CRUD, flush, recovery, compaction, scan, manifest, GC | +| `server` | 84 | All 55+ commands, TTL expiry, concurrent clients, pipelining, binary values | | doctests | 3 | Usage examples for bloom, memtable, wal | -| **Total** | **161** | | +| **Total (Rust)** | **245** | | + +**Java embedding library** (`mvn test -f java/pom.xml`): + +| Test class | Tests | Coverage | +|---|---|---| +| `RiptideKVConfigTest` | 20 | Builder defaults, validation (null, blank, no colon, non-numeric port, out-of-range), port extraction, fluency | +| `RiptideKVServerTest` | 14 | start, stop, isRunning, close idempotency, port release, null config guard | +| `RespCommandsTest` | 116 | All 55+ commands over real TCP: Connection, Database, Strings, Keys, real-time expiry, pipelining, concurrent clients, binary safety | +| **Total (Java)** | **150** | | -CI: `cargo fmt --check` + `cargo clippy` + `cargo test --workspace` +CI: `cargo fmt --check` + `cargo clippy` + `cargo test --workspace` (245) + `mvn test -f java/pom.xml` (150) diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index d861944..48beddb 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -config = { path = "../config" } engine = { path = "../engine" } memtable = { path = "../memtable" } sstable = { path = "../sstable" } diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 253b518..6d21bf1 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -46,28 +46,37 @@ //! bye //! ``` use anyhow::Result; -use config::EngineConfig; use engine::Engine; use std::io::{self, BufRead, Write}; -fn main() -> Result<()> { - // Load all configuration from environment variables (single source of truth). 
- // See `config::EngineConfig::from_env()` for the full list of supported - // variables and their defaults. - let cfg = EngineConfig::from_env(); +/// Reads a configuration value from the environment, falling back to `default`. +fn env_or(key: &str, default: &str) -> String { + std::env::var(key).unwrap_or_else(|_| default.to_string()) +} - let flush_kb = cfg.flush_threshold_bytes / 1024; - let wal_path_display = cfg.wal_path.display().to_string(); - let sst_path_display = cfg.sst_dir.display().to_string(); - let l0_trigger = cfg.l0_compaction_trigger; +fn main() -> Result<()> { + // Configuration via environment variables with sensible defaults. + // + // RIPTIDE_WAL_PATH - WAL file path (default: "wal.log") + // RIPTIDE_SST_DIR - SSTable directory (default: "data/sst") + // RIPTIDE_FLUSH_KB - flush threshold in KiB (default: 1024 = 1 MiB) + // RIPTIDE_WAL_SYNC - fsync every WAL append (default: "true") + // RIPTIDE_L0_TRIGGER - L0 compaction trigger (default: 4, 0 = disabled) + let wal_path = env_or("RIPTIDE_WAL_PATH", "wal.log"); + let sst_dir = env_or("RIPTIDE_SST_DIR", "data/sst"); + let flush_kb: usize = env_or("RIPTIDE_FLUSH_KB", "1024").parse().unwrap_or(1024); + let flush_threshold = flush_kb * 1024; + let wal_sync: bool = env_or("RIPTIDE_WAL_SYNC", "true").parse().unwrap_or(true); + let l0_trigger: usize = env_or("RIPTIDE_L0_TRIGGER", "4").parse().unwrap_or(4); - let mut engine = Engine::new(cfg)?; + let mut engine = Engine::new(&wal_path, &sst_dir, flush_threshold, wal_sync)?; + engine.set_l0_compaction_trigger(l0_trigger); println!( "RiptideKV started (seq={}, wal={}, sst_dir={}, flush={}KiB, l0_trigger={})", engine.seq(), - wal_path_display, - sst_path_display, + wal_path, + sst_dir, flush_kb, l0_trigger ); diff --git a/crates/config/Cargo.toml b/crates/config/Cargo.toml deleted file mode 100644 index 8db0789..0000000 --- a/crates/config/Cargo.toml +++ /dev/null @@ -1,6 +0,0 @@ -[package] -name = "config" -version = "0.1.0" -edition = "2021" - 
-[dependencies] diff --git a/crates/config/src/lib.rs b/crates/config/src/lib.rs deleted file mode 100644 index 4eeadc5..0000000 --- a/crates/config/src/lib.rs +++ /dev/null @@ -1,293 +0,0 @@ -//! # Config – Unified Configuration for RiptideKV -//! -//! This crate provides a single source of truth for all RiptideKV configuration. -//! Every component — the CLI, the RESP server, tests — constructs an [`EngineConfig`] -//! and passes it to the engine. This eliminates duplicated defaults, scattered -//! env-var parsing, and positional-argument constructors. -//! -//! ## Design Decisions -//! -//! - **Single struct, not scattered args**: `Engine::new()` previously took 4 -//! positional arguments (`wal_path`, `sst_dir`, `flush_threshold`, `wal_sync`) -//! plus a post-construction setter for `l0_compaction_trigger`. This was fragile -//! and easy to misconfigure. A single config struct is self-documenting. -//! -//! - **Builder pattern**: Tests need custom configs (tiny flush thresholds, sync -//! disabled, etc.) while production uses env vars. The builder pattern supports -//! both ergonomically. -//! -//! - **from_env() factory**: Both the CLI and the RESP server need to load -//! config from environment variables. Centralizing this avoids duplication. -//! -//! - **Defaults defined once**: Constants like `DEFAULT_FLUSH_THRESHOLD_KB` and -//! `DEFAULT_L0_COMPACTION_TRIGGER` live here, not scattered across crates. -//! -//! ## Example -//! -//! ```rust -//! use config::EngineConfig; -//! -//! // Production: load from environment variables -//! let cfg = EngineConfig::from_env(); -//! -//! // Tests: use builder for precise control -//! let cfg = EngineConfig::builder() -//! .wal_path("/tmp/test/wal.log") -//! .sst_dir("/tmp/test/sst") -//! .flush_threshold_bytes(64) -//! .wal_sync(false) -//! .l0_compaction_trigger(0) -//! .build(); -//! 
``` - -use std::path::PathBuf; - -// ─── Default constants (single source of truth) ───────────────────────────── - -/// Default WAL file path. -pub const DEFAULT_WAL_PATH: &str = "wal.log"; - -/// Default SSTable directory. -pub const DEFAULT_SST_DIR: &str = "data/sst"; - -/// Default flush threshold in KiB. The memtable is flushed to an SSTable -/// when its approximate byte size reaches `flush_threshold_kb * 1024`. -pub const DEFAULT_FLUSH_THRESHOLD_KB: usize = 1024; - -/// Default number of L0 SSTables that triggers automatic compaction. -/// Set to `0` to disable auto-compaction. -pub const DEFAULT_L0_COMPACTION_TRIGGER: usize = 4; - -/// Default WAL sync mode. When `true`, every WAL append is followed by -/// `fsync` for maximum durability. -pub const DEFAULT_WAL_SYNC: bool = true; - -/// Default host address for the RESP server. -pub const DEFAULT_SERVER_HOST: &str = "127.0.0.1"; - -/// Default port for the RESP server. -pub const DEFAULT_SERVER_PORT: u16 = 6379; - -// ─── EngineConfig ─────────────────────────────────────────────────────────── - -/// Unified configuration for the RiptideKV storage engine. -/// -/// This struct is the **single source of truth** for all engine parameters. -/// It is consumed by `Engine::new()`, the CLI, the RESP server, and tests. -/// -/// # Fields -/// -/// | Field | Default | Env Variable | -/// |-------|----------|--------------| -/// | `wal_path` | `wal.log` | `RIPTIDE_WAL_PATH` | -/// | `sst_dir` | `data/sst` | `RIPTIDE_SST_DIR` | -/// | `flush_threshold_bytes` | `1048576` (1 MiB) | `RIPTIDE_FLUSH_KB` (in KiB) | -/// | `wal_sync` | `true` | `RIPTIDE_WAL_SYNC` | -/// | `l0_compaction_trigger` | `4` | `RIPTIDE_L0_TRIGGER` | -/// | `server_host` | `127.0.0.1` | `RIPTIDE_HOST` | -/// | `server_port` | `6379` | `RIPTIDE_PORT` | -#[derive(Debug, Clone)] -pub struct EngineConfig { - /// Path to the write-ahead log file. - pub wal_path: PathBuf, - - /// Directory where SSTable files are stored. 
- pub sst_dir: PathBuf, - - /// Memtable byte-size threshold that triggers a flush to SSTable. - /// When `memtable.approx_size() >= flush_threshold_bytes`, the memtable - /// is flushed to a new SSTable on disk. - pub flush_threshold_bytes: usize, - - /// If `true`, every WAL append is followed by `fsync` for durability. - /// Setting this to `false` batches writes for better throughput at the - /// cost of losing the most recent writes on crash. - pub wal_sync: bool, - - /// Number of L0 SSTables that triggers automatic compaction after a flush. - /// Set to `0` to disable auto-compaction (caller must invoke `compact()` - /// manually). - pub l0_compaction_trigger: usize, - - /// Host address for the RESP server to bind to. - pub server_host: String, - - /// Port for the RESP server to listen on. - pub server_port: u16, -} - -impl Default for EngineConfig { - /// Returns the default configuration with sensible production values. - /// - /// These defaults match the values previously hard-coded across the CLI - /// and engine crates. - fn default() -> Self { - Self { - wal_path: PathBuf::from(DEFAULT_WAL_PATH), - sst_dir: PathBuf::from(DEFAULT_SST_DIR), - flush_threshold_bytes: DEFAULT_FLUSH_THRESHOLD_KB * 1024, - wal_sync: DEFAULT_WAL_SYNC, - l0_compaction_trigger: DEFAULT_L0_COMPACTION_TRIGGER, - server_host: DEFAULT_SERVER_HOST.to_string(), - server_port: DEFAULT_SERVER_PORT, - } - } -} - -impl EngineConfig { - /// Creates a configuration by reading environment variables, falling back - /// to defaults for any variable that is not set or cannot be parsed. 
- /// - /// # Environment Variables - /// - /// | Variable | Type | Default | - /// |----------|------|---------| - /// | `RIPTIDE_WAL_PATH` | `String` | `wal.log` | - /// | `RIPTIDE_SST_DIR` | `String` | `data/sst` | - /// | `RIPTIDE_FLUSH_KB` | `usize` (KiB) | `1024` | - /// | `RIPTIDE_WAL_SYNC` | `bool` | `true` | - /// | `RIPTIDE_L0_TRIGGER` | `usize` | `4` | - /// | `RIPTIDE_HOST` | `String` | `127.0.0.1` | - /// | `RIPTIDE_PORT` | `u16` | `6379` | - pub fn from_env() -> Self { - let defaults = Self::default(); - - let wal_path = env_or("RIPTIDE_WAL_PATH", DEFAULT_WAL_PATH); - let sst_dir = env_or("RIPTIDE_SST_DIR", DEFAULT_SST_DIR); - - let flush_kb: usize = env_or("RIPTIDE_FLUSH_KB", &DEFAULT_FLUSH_THRESHOLD_KB.to_string()) - .parse() - .unwrap_or(DEFAULT_FLUSH_THRESHOLD_KB); - - let wal_sync: bool = env_or("RIPTIDE_WAL_SYNC", &DEFAULT_WAL_SYNC.to_string()) - .parse() - .unwrap_or(DEFAULT_WAL_SYNC); - - let l0_trigger: usize = env_or( - "RIPTIDE_L0_TRIGGER", - &DEFAULT_L0_COMPACTION_TRIGGER.to_string(), - ) - .parse() - .unwrap_or(DEFAULT_L0_COMPACTION_TRIGGER); - - let server_host = env_or("RIPTIDE_HOST", DEFAULT_SERVER_HOST); - - let server_port: u16 = env_or("RIPTIDE_PORT", &DEFAULT_SERVER_PORT.to_string()) - .parse() - .unwrap_or(defaults.server_port); - - Self { - wal_path: PathBuf::from(wal_path), - sst_dir: PathBuf::from(sst_dir), - flush_threshold_bytes: flush_kb * 1024, - wal_sync, - l0_compaction_trigger: l0_trigger, - server_host, - server_port, - } - } - - /// Returns a [`ConfigBuilder`] for ergonomic construction in tests and - /// programmatic usage. - /// - /// All fields start with their default values. Override only what you need. - pub fn builder() -> ConfigBuilder { - ConfigBuilder { - config: Self::default(), - } - } - - /// Returns the `host:port` address string for the RESP server. 
- pub fn server_addr(&self) -> String { - format!("{}:{}", self.server_host, self.server_port) - } -} - -// ─── ConfigBuilder ────────────────────────────────────────────────────────── - -/// Builder for [`EngineConfig`] with a fluent API. -/// -/// All fields start with their default values. Call setters to override, then -/// call [`build()`](ConfigBuilder::build) to produce the final config. -/// -/// # Example -/// -/// ```rust -/// use config::EngineConfig; -/// -/// let cfg = EngineConfig::builder() -/// .wal_path("/tmp/wal.log") -/// .sst_dir("/tmp/sst") -/// .flush_threshold_bytes(256) -/// .wal_sync(false) -/// .build(); -/// -/// assert_eq!(cfg.flush_threshold_bytes, 256); -/// assert!(!cfg.wal_sync); -/// ``` -pub struct ConfigBuilder { - config: EngineConfig, -} - -impl ConfigBuilder { - /// Sets the WAL file path. - pub fn wal_path(mut self, path: impl Into) -> Self { - self.config.wal_path = path.into(); - self - } - - /// Sets the SSTable directory. - pub fn sst_dir(mut self, dir: impl Into) -> Self { - self.config.sst_dir = dir.into(); - self - } - - /// Sets the flush threshold in bytes (not KiB). - /// - /// This is the raw byte value. For KiB, multiply by 1024 before passing. - pub fn flush_threshold_bytes(mut self, bytes: usize) -> Self { - self.config.flush_threshold_bytes = bytes; - self - } - - /// Sets the WAL sync mode. - pub fn wal_sync(mut self, sync: bool) -> Self { - self.config.wal_sync = sync; - self - } - - /// Sets the L0 compaction trigger. Pass `0` to disable auto-compaction. - pub fn l0_compaction_trigger(mut self, trigger: usize) -> Self { - self.config.l0_compaction_trigger = trigger; - self - } - - /// Sets the RESP server host address. - pub fn server_host(mut self, host: impl Into) -> Self { - self.config.server_host = host.into(); - self - } - - /// Sets the RESP server port. 
- pub fn server_port(mut self, port: u16) -> Self { - self.config.server_port = port; - self - } - - /// Consumes the builder and returns the final [`EngineConfig`]. - pub fn build(self) -> EngineConfig { - self.config - } -} - -// ─── Helpers ──────────────────────────────────────────────────────────────── - -/// Reads an environment variable, falling back to `default` if not set. -fn env_or(key: &str, default: &str) -> String { - std::env::var(key).unwrap_or_else(|_| default.to_string()) -} - -// ─── Tests ─────────────────────────────────────────────────────────────────── - -#[cfg(test)] -mod tests; diff --git a/crates/config/src/tests.rs b/crates/config/src/tests.rs deleted file mode 100644 index a8f60cb..0000000 --- a/crates/config/src/tests.rs +++ /dev/null @@ -1,76 +0,0 @@ -use super::*; - -#[test] -fn default_config_has_expected_values() { - let cfg = EngineConfig::default(); - assert_eq!(cfg.wal_path, PathBuf::from("wal.log")); - assert_eq!(cfg.sst_dir, PathBuf::from("data/sst")); - assert_eq!(cfg.flush_threshold_bytes, 1024 * 1024); - assert!(cfg.wal_sync); - assert_eq!(cfg.l0_compaction_trigger, 4); - assert_eq!(cfg.server_host, "127.0.0.1"); - assert_eq!(cfg.server_port, 6379); -} - -#[test] -fn builder_overrides_defaults() { - let cfg = EngineConfig::builder() - .wal_path("/custom/wal.log") - .sst_dir("/custom/sst") - .flush_threshold_bytes(256) - .wal_sync(false) - .l0_compaction_trigger(8) - .server_host("0.0.0.0") - .server_port(7379) - .build(); - - assert_eq!(cfg.wal_path, PathBuf::from("/custom/wal.log")); - assert_eq!(cfg.sst_dir, PathBuf::from("/custom/sst")); - assert_eq!(cfg.flush_threshold_bytes, 256); - assert!(!cfg.wal_sync); - assert_eq!(cfg.l0_compaction_trigger, 8); - assert_eq!(cfg.server_host, "0.0.0.0"); - assert_eq!(cfg.server_port, 7379); -} - -#[test] -fn clone_produces_independent_copy() { - let cfg1 = EngineConfig::builder().flush_threshold_bytes(100).build(); - let mut cfg2 = cfg1.clone(); - cfg2.flush_threshold_bytes = 200; - 
- assert_eq!(cfg1.flush_threshold_bytes, 100); - assert_eq!(cfg2.flush_threshold_bytes, 200); -} - -#[test] -fn builder_partial_override() { - let cfg = EngineConfig::builder().flush_threshold_bytes(64).build(); - - // Only flush_threshold changed; rest are defaults - assert_eq!(cfg.flush_threshold_bytes, 64); - assert_eq!(cfg.wal_path, PathBuf::from("wal.log")); - assert!(cfg.wal_sync); - assert_eq!(cfg.l0_compaction_trigger, 4); -} - -#[test] -fn server_addr_format() { - let cfg = EngineConfig::builder() - .server_host("0.0.0.0") - .server_port(8080) - .build(); - assert_eq!(cfg.server_addr(), "0.0.0.0:8080"); -} - -#[test] -fn from_env_uses_defaults_when_no_vars_set() { - // This test relies on the CI/test environment not having RIPTIDE_* - // variables set. It verifies the fallback path. - let cfg = EngineConfig::from_env(); - // We can't assert exact values because env vars might be set in CI, - // but we can assert the config is valid. - assert!(!cfg.wal_path.as_os_str().is_empty()); - assert!(!cfg.sst_dir.as_os_str().is_empty()); - assert!(cfg.flush_threshold_bytes > 0); -} diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index 1013df4..5cb4000 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2021" [dependencies] -config = { path = "../config" } memtable = { path = "../memtable" } sstable = { path = "../sstable" } wal = { path = "../wal" } diff --git a/crates/engine/src/compaction.rs b/crates/engine/src/compaction.rs index 1f605cf..fc5123c 100644 --- a/crates/engine/src/compaction.rs +++ b/crates/engine/src/compaction.rs @@ -55,34 +55,32 @@ impl Engine { let mut merge = MergeIterator::new(&all_sstables); - // Stream directly from MergeIterator -> SSTableWriter without + // Stream directly from MergeIterator -> SSTableWriter without // materializing the entire dataset in RAM. Memory usage is bounded // by the bloom filter + index, not the data volume. 
let ts = SystemTime::now().duration_since(UNIX_EPOCH)?.as_millis(); let sst_name = format!("sst-{:020}-{}.sst", self.seq, ts); let sst_path = self.sst_dir.join(&sst_name); - // Tombstone GC: since this is a full compaction (all L0 + L1 -> single - // L1), there are no older SSTables that could contain shadowed values. - // Tombstones are therefore safe to drop — unless the memtable still - // references the key, in which case we can keep the tombstone so it - // continues to shadow the memtable's entry after a subsequent flush. - // - // Build a streaming iterator adapter from MergeIterator. - // MergeIterator::next() returns Result>, so we collect - // into a fallible iterator that stops on error or exhaustion. let mem_ref = &self.mem; let mut merge_error: Option = None; + let mut wrote_any = false; let streaming_iter = std::iter::from_fn(|| { loop { match merge.next_entry() { Ok(Some((key, entry))) => { - // Drop tombstones unless the memtable still references - // this key (the memtable is not part of compaction, so - // we must keep tombstones that shadow memtable data). + // GC tombstones during full compaction: since all L0 + L1 + // are merged into a single L1, there are no older SSTables + // left. A tombstone with no memtable reference has nothing + // to shadow and is safe to discard. If the memtable does + // reference this key the WAL will replay it on recovery, + // so GC is still safe — but we keep the tombstone to avoid + // a live value from the memtable surfacing in L1 after the + // next flush+compaction cycle. if entry.value.is_none() && !mem_ref.contains_key(&key) { - continue; // GC this tombstone + continue; // GC: dead tombstone, nothing to shadow } + wrote_any = true; return Some((key, entry)); } Ok(None) => return None, @@ -97,27 +95,27 @@ impl Engine { let write_result = SSTableWriter::write_from_iterator(&sst_path, estimated_count, streaming_iter); - // Check for merge errors first, then write errors. 
+ // Check for merge errors first (iterator failed mid-stream). if let Some(e) = merge_error { - // Clean up partial write if any. let _ = std::fs::remove_file(sst_path.with_extension("sst.tmp")); return Err(e); } - // Handle the case where all SSTables were empty. - if let Err(e) = write_result { - if e.to_string().contains("empty") { - drop(all_sstables); - for p in &old_paths { - let _ = std::fs::remove_file(p); - } - self.manifest.entries.clear(); - self.manifest.save()?; - return Ok(()); + // If every entry was GC'd (all tombstones, no live data), the writer + // returns an "empty" error. Detect this via `wrote_any` rather than + // inspecting the error message string. + if !wrote_any { + drop(all_sstables); + for p in &old_paths { + let _ = std::fs::remove_file(p); } - return Err(e); + self.manifest.entries.clear(); + self.manifest.save()?; + return Ok(()); } + write_result?; + // Update the manifest atomically: replace all entries with the // single compacted L1 SSTable. self.manifest.replace_all_with_l1(sst_name); diff --git a/crates/engine/src/concurrent.rs b/crates/engine/src/concurrent.rs deleted file mode 100644 index 2b3ba1d..0000000 --- a/crates/engine/src/concurrent.rs +++ /dev/null @@ -1,202 +0,0 @@ -//! # ConcurrentEngine – Thread-Safe Wrapper for Fearless Concurrency -//! -//! Wraps [`Engine`] in `Arc>` so multiple threads (or Tokio -//! tasks via `spawn_blocking`) can safely share a single engine instance. -//! -//! ## Design Decisions -//! -//! | Decision | Rationale | -//! |----------|-----------| -//! | `std::sync::RwLock` (not Tokio) | Engine performs synchronous file I/O; an async lock would block the Tokio runtime. | -//! | `Arc` wrapper | Enables cheap cloning for distribution across tasks/threads. | -//! | Read lock for `get`/`scan` | Multiple readers can proceed concurrently (no I/O mutation). | -//! | Write lock for `set`/`del`/… | Mutations must be serialized to protect WAL + Memtable state. | -//! 
| `Clone` derives cheaply | `Arc::clone` is an atomic ref-count bump — negligible cost. | -//! -//! ## Usage with Tokio -//! -//! ```rust,ignore -//! let engine = ConcurrentEngine::new(cfg)?; -//! let engine_clone = engine.clone(); -//! -//! tokio::task::spawn_blocking(move || { -//! engine_clone.set(b"key".to_vec(), b"value".to_vec()) -//! }).await??; -//! ``` - -use std::sync::{Arc, RwLock}; - -use anyhow::{Context, Result}; -use config::EngineConfig; - -use crate::Engine; - -/// A thread-safe handle to a shared [`Engine`]. -/// -/// ALL methods acquire the appropriate lock internally so callers never need -/// to manage locking themselves. The handle is cheaply cloneable (`Arc`). -#[derive(Clone)] -pub struct ConcurrentEngine { - inner: Arc>, -} - -impl ConcurrentEngine { - // ─── Construction ─────────────────────────────────────────────────────── - - /// Creates a new `ConcurrentEngine` from an [`EngineConfig`]. - pub fn new(cfg: EngineConfig) -> Result { - let engine = Engine::new(cfg)?; - Ok(Self { - inner: Arc::new(RwLock::new(engine)), - }) - } - - /// Wraps an already-constructed [`Engine`] for concurrent access. - pub fn from_engine(engine: Engine) -> Self { - Self { - inner: Arc::new(RwLock::new(engine)), - } - } - - // ─── Write operations (exclusive lock) ───────────────────────────────── - - /// Inserts a key-value pair. Acquires a **write** lock. - pub fn set(&self, key: Vec, value: Vec) -> Result<()> { - self.inner - .write() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire write lock for set")? - .set(key, value) - } - - /// Deletes a key (writes a tombstone). Acquires a **write** lock. - pub fn del(&self, key: Vec) -> Result<()> { - self.inner - .write() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire write lock for del")? - .del(key) - } - - /// Forces a memtable flush to SSTable. Acquires a **write** lock. 
- pub fn force_flush(&self) -> Result<()> { - self.inner - .write() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire write lock for force_flush")? - .force_flush() - } - - /// Runs compaction (merges all L0 + L1 into a single L1 SSTable). - /// Acquires a **write** lock. - pub fn compact(&self) -> Result<()> { - self.inner - .write() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire write lock for compact")? - .compact() - } - - /// Updates the L0 compaction trigger. Acquires a **write** lock. - pub fn set_l0_compaction_trigger(&self, trigger: usize) -> Result<()> { - self.inner - .write() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire write lock for set_l0_compaction_trigger")? - .set_l0_compaction_trigger(trigger); - Ok(()) - } - - /// Updates the flush threshold. Acquires a **write** lock. - pub fn set_flush_threshold(&self, threshold: usize) -> Result<()> { - self.inner - .write() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire write lock for set_flush_threshold")? - .set_flush_threshold(threshold); - Ok(()) - } - - // ─── Read operations (shared lock) ───────────────────────────────────── - - /// Looks up a key. Acquires a **read** lock. - pub fn get(&self, key: &[u8]) -> Result)>> { - self.inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire read lock for get")? - .get(key) - } - - /// Range scan. Acquires a **read** lock. - pub fn scan(&self, start: &[u8], end: &[u8]) -> Result, Vec)>> { - self.inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e)) - .context("failed to acquire read lock for scan")? - .scan(start, end) - } - - /// Returns the current sequence number. Acquires a **read** lock. - pub fn seq(&self) -> Result { - Ok(self - .inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? 
- .seq()) - } - - /// Returns the total SSTable count. Acquires a **read** lock. - pub fn sstable_count(&self) -> Result { - Ok(self - .inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? - .sstable_count()) - } - - /// Returns the L0 SSTable count. Acquires a **read** lock. - pub fn l0_sstable_count(&self) -> Result { - Ok(self - .inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? - .l0_sstable_count()) - } - - /// Returns the L1 SSTable count. Acquires a **read** lock. - pub fn l1_sstable_count(&self) -> Result { - Ok(self - .inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? - .l1_sstable_count()) - } - - /// Returns the flush threshold in bytes. Acquires a **read** lock. - pub fn flush_threshold(&self) -> Result { - Ok(self - .inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? - .flush_threshold()) - } - - /// Returns the L0 compaction trigger. Acquires a **read** lock. - pub fn l0_compaction_trigger(&self) -> Result { - Ok(self - .inner - .read() - .map_err(|e| anyhow::anyhow!("RwLock poisoned: {}", e))? - .l0_compaction_trigger()) - } -} - -impl std::fmt::Debug for ConcurrentEngine { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.inner.read() { - Ok(engine) => write!(f, "ConcurrentEngine({:?})", &*engine), - Err(_) => write!(f, "ConcurrentEngine()"), - } - } -} diff --git a/crates/engine/src/lib.rs b/crates/engine/src/lib.rs index a2e52c8..5571f6b 100644 --- a/crates/engine/src/lib.rs +++ b/crates/engine/src/lib.rs @@ -59,15 +59,12 @@ //! are written atomically via temp file + rename. The manifest uses the same //! atomic write pattern. See [`ARCHITECTURE.md`] for the full crash matrix. 
mod compaction; -pub mod concurrent; mod manifest; mod read; mod recovery; mod write; use anyhow::Result; -pub use concurrent::ConcurrentEngine; -pub use config::EngineConfig; use manifest::Manifest; use memtable::Memtable; pub use recovery::replay_wal_and_build; @@ -80,6 +77,13 @@ pub const MAX_KEY_SIZE: usize = 64 * 1024; /// Maximum allowed value size in bytes (10 MiB). pub const MAX_VALUE_SIZE: usize = 10 * 1024 * 1024; +/// Default number of L0 SSTables that triggers automatic compaction. +/// +/// When the L0 count reaches this threshold after a flush, the engine +/// automatically runs compaction to merge L0 + L1 into a single L1 SSTable. +/// Set to `0` to disable auto-compaction. +pub const DEFAULT_L0_COMPACTION_TRIGGER: usize = 4; + /// The central storage engine orchestrating Memtable, WAL, and SSTables. /// /// # Write Path @@ -148,12 +152,15 @@ impl std::fmt::Debug for Engine { } impl Engine { - /// Creates a new engine from an [`EngineConfig`], performing full recovery - /// from the WAL and existing SSTable files. + /// Creates a new engine, performing full recovery from the WAL and existing + /// SSTable files. /// /// # Arguments /// - /// * `cfg` — unified configuration struct containing all engine parameters. + /// * `wal_path` — path to the write-ahead log file. + /// * `sst_dir` — directory where SSTable files are stored. + /// * `flush_threshold` — memtable byte-size threshold that triggers flush. + /// * `wal_sync` — if `true`, every WAL append calls `fsync`. /// /// # Recovery Steps /// @@ -163,12 +170,14 @@ impl Engine { /// 4. Open the WAL writer in append mode. /// 5. Load SSTables from the manifest (or scan directory for legacy DBs). /// 6. Determine the highest sequence number across WAL and SSTables. 
- pub fn new(cfg: EngineConfig) -> Result { - let wal_path = cfg.wal_path; - let sst_dir = cfg.sst_dir; - let flush_threshold = cfg.flush_threshold_bytes; - let wal_sync = cfg.wal_sync; - let l0_compaction_trigger = cfg.l0_compaction_trigger; + pub fn new, P2: AsRef>( + wal_path: P1, + sst_dir: P2, + flush_threshold: usize, + wal_sync: bool, + ) -> Result { + let wal_path = wal_path.as_ref().to_path_buf(); + let sst_dir = sst_dir.as_ref().to_path_buf(); // ensure sst dir exists std::fs::create_dir_all(&sst_dir)?; @@ -254,40 +263,11 @@ impl Engine { manifest, seq, flush_threshold, - l0_compaction_trigger, + l0_compaction_trigger: DEFAULT_L0_COMPACTION_TRIGGER, wal_sync, }) } - /// Convenience constructor that accepts individual parameters instead of - /// an [`EngineConfig`]. - /// - /// This is equivalent to building an `EngineConfig` and calling - /// [`Engine::new`]. Primarily useful in tests where constructing a full - /// config struct is verbose. - /// - /// # Arguments - /// - /// * `wal_path` – path to the write-ahead log file. - /// * `sst_dir` – directory where SSTable files are stored. - /// * `flush_threshold` – memtable byte-size threshold that triggers flush. - /// * `wal_sync` – if `true`, every WAL append calls `fsync`. - pub fn from_parts, P2: AsRef>( - wal_path: P1, - sst_dir: P2, - flush_threshold: usize, - wal_sync: bool, - ) -> Result { - let cfg = EngineConfig::builder() - .wal_path(wal_path.as_ref()) - .sst_dir(sst_dir.as_ref()) - .flush_threshold_bytes(flush_threshold) - .wal_sync(wal_sync) - .build(); - - Self::new(cfg) - } - /// Returns the current monotonic sequence number. 
#[must_use] pub fn seq(&self) -> u64 { diff --git a/crates/engine/src/manifest.rs b/crates/engine/src/manifest.rs index 61bf1a4..c5c24bd 100644 --- a/crates/engine/src/manifest.rs +++ b/crates/engine/src/manifest.rs @@ -170,7 +170,11 @@ impl Manifest { let level_str = match entry.level { 0 => "L0", 1 => "L1", - other => panic!("invalid level {}", other), + other => anyhow::bail!( + "invalid manifest level {} for file '{}'", + other, + entry.filename + ), }; writeln!(f, "{}:{}", level_str, entry.filename)?; } diff --git a/crates/engine/src/read.rs b/crates/engine/src/read.rs index 727f387..f41fc41 100644 --- a/crates/engine/src/read.rs +++ b/crates/engine/src/read.rs @@ -115,7 +115,7 @@ impl Engine { if !end.is_empty() && key_ref >= end { continue; } - if let Ok(Some(entry)) = sst.get(key_ref) { + if let Some(entry) = sst.get(key_ref)? { merge_entry(key_ref.to_vec(), entry); } } @@ -130,7 +130,7 @@ impl Engine { if !end.is_empty() && key_ref >= end { continue; } - if let Ok(Some(entry)) = sst.get(key_ref) { + if let Some(entry) = sst.get(key_ref)? 
{ merge_entry(key_ref.to_vec(), entry); } } diff --git a/crates/engine/src/tests/compaction_tests.rs b/crates/engine/src/tests/compaction_tests.rs index 7120024..a81374e 100644 --- a/crates/engine/src/tests/compaction_tests.rs +++ b/crates/engine/src/tests/compaction_tests.rs @@ -10,7 +10,7 @@ use tempfile::tempdir; #[test] fn flush_goes_to_l0() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -34,7 +34,7 @@ fn flush_goes_to_l0() -> Result<()> { #[test] fn compact_moves_l0_to_l1() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -76,7 +76,7 @@ fn compact_moves_l0_to_l1() -> Result<()> { #[test] fn compact_preserves_newest_value() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 32, @@ -99,7 +99,7 @@ fn compact_preserves_newest_value() -> Result<()> { #[test] fn many_keys_with_flushes() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 4096, // 4 KB threshold @@ -150,7 +150,7 @@ fn many_keys_with_flushes() -> Result<()> { #[test] fn auto_compaction_triggers_at_l0_threshold() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1, // threshold=1 -> every set triggers a flush @@ -178,8 +178,7 @@ fn auto_compaction_triggers_at_l0_threshold() -> Result<()> { #[test] fn auto_compaction_disabled_when_trigger_is_zero() -> Result<()> { let dir = tempdir()?; - let mut engine = - Engine::from_parts(dir.path().join("wal.log"), dir.path().join("sst"), 1, false)?; + let mut engine = 
Engine::new(dir.path().join("wal.log"), dir.path().join("sst"), 1, false)?; engine.set_l0_compaction_trigger(0); for i in 0..5u64 { @@ -198,7 +197,7 @@ fn auto_compaction_disabled_when_trigger_is_zero() -> Result<()> { #[test] fn tombstone_gc_removes_dead_keys_during_compaction() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 32, @@ -229,7 +228,7 @@ fn tombstone_gc_removes_dead_keys_during_compaction() -> Result<()> { fn compact_reduces_sst_file_count() -> Result<()> { let dir = tempdir()?; let sst_dir = dir.path().join("sst"); - let mut engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 64, false)?; + let mut engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 64, false)?; engine.set_l0_compaction_trigger(0); for i in 0..50u64 { @@ -259,7 +258,7 @@ fn compact_reduces_sst_file_count() -> Result<()> { #[test] fn l0_flush_then_compact_then_more_flushes() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -312,7 +311,7 @@ fn l0_flush_then_compact_then_more_flushes() -> Result<()> { #[test] fn compact_preserves_tombstones() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 32, @@ -336,7 +335,7 @@ fn compact_preserves_tombstones() -> Result<()> { #[test] fn compact_single_sstable_is_noop() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -367,7 +366,7 @@ fn compact_then_recovery_works() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::from_parts(&wal, &sst, 64, false)?; + let mut engine = Engine::new(&wal, &sst, 64, false)?; engine.set_l0_compaction_trigger(0); for i 
in 0..30u64 { engine.set(format!("k{:04}", i).into_bytes(), b"val".to_vec())?; @@ -380,7 +379,7 @@ fn compact_then_recovery_works() -> Result<()> { } // Reopen engine – should recover from the single compacted SSTable - let engine = Engine::from_parts(&wal, &sst, 64, false)?; + let engine = Engine::new(&wal, &sst, 64, false)?; assert_eq!(engine.sstable_count(), 1); for i in 0..30u64 { diff --git a/crates/engine/src/tests/concurrent_tests.rs b/crates/engine/src/tests/concurrent_tests.rs deleted file mode 100644 index e3738af..0000000 --- a/crates/engine/src/tests/concurrent_tests.rs +++ /dev/null @@ -1,128 +0,0 @@ -use anyhow::Result; -use config::EngineConfig; -use std::thread; -use tempfile::tempdir; - -use crate::ConcurrentEngine; - -fn test_config(dir: &std::path::Path) -> EngineConfig { - EngineConfig::builder() - .wal_path(dir.join("wal.log")) - .sst_dir(dir.join("sst")) - .flush_threshold_bytes(1024 * 1024) - .wal_sync(false) - .build() -} - -#[test] -fn concurrent_reads_do_not_block_each_other() -> Result<()> { - let dir = tempdir()?; - let engine = ConcurrentEngine::new(test_config(dir.path()))?; - engine.set(b"k".to_vec(), b"v".to_vec())?; - - let handles: Vec<_> = (0..4) - .map(|_| { - let e = engine.clone(); - thread::spawn(move || e.get(b"k")) - }) - .collect(); - - for h in handles { - let result = h.join().expect("thread panicked")?; - assert_eq!(result.unwrap().1, b"v"); - } - Ok(()) -} - -#[test] -fn concurrent_writes_are_serialized() -> Result<()> { - let dir = tempdir()?; - let engine = ConcurrentEngine::new(test_config(dir.path()))?; - - let handles: Vec<_> = (0..10) - .map(|i| { - let e = engine.clone(); - thread::spawn(move || e.set(format!("k{}", i).into_bytes(), b"v".to_vec())) - }) - .collect(); - - for h in handles { - h.join().expect("thread panicked")?; - } - - // All 10 keys should exist - for i in 0..10 { - let key = format!("k{}", i).into_bytes(); - assert!(engine.get(&key)?.is_some(), "key k{} missing", i); - } - - 
assert_eq!(engine.seq()?, 10); - Ok(()) -} - -#[test] -fn mixed_reads_and_writes() -> Result<()> { - let dir = tempdir()?; - let engine = ConcurrentEngine::new(test_config(dir.path()))?; - - // Pre-populate - for i in 0..5 { - engine.set(format!("k{}", i).into_bytes(), b"init".to_vec())?; - } - - let mut handles = Vec::new(); - - // Spawn readers - for i in 0..5 { - let e = engine.clone(); - handles.push(thread::spawn(move || { - let key = format!("k{}", i).into_bytes(); - for _ in 0..100 { - e.get(&key).expect("get failed"); - } - })); - } - - // Spawn writers - for i in 5..10 { - let e = engine.clone(); - handles.push(thread::spawn(move || { - e.set(format!("k{}", i).into_bytes(), b"new".to_vec()) - .expect("set failed"); - })); - } - - for h in handles { - h.join().expect("thread panicked"); - } - - // All 10 keys shoudl exist - for i in 0..10 { - assert!( - engine.get(format!("k{}", i).as_bytes())?.is_some(), - "k{} missing", - i - ) - } - Ok(()) -} - -#[test] -fn clone_shares_same_engine() -> Result<()> { - let dir = tempdir()?; - let e1 = ConcurrentEngine::new(test_config(dir.path()))?; - let e2 = e1.clone(); - - e1.set(b"shared".to_vec(), b"data".to_vec())?; - assert_eq!(e2.get(b"shared")?.unwrap().1, b"data"); - Ok(()) -} - -#[test] -fn debug_output_works() -> Result<()> { - let dir = tempdir()?; - let engine = ConcurrentEngine::new(test_config(dir.path()))?; - let debug_str = format!("{:?}", engine); - assert!(debug_str.contains("ConcurrentEngine")); - Ok(()) -} diff --git a/crates/engine/src/tests/mod.rs b/crates/engine/src/tests/mod.rs index c879e8a..a984623 100644 --- a/crates/engine/src/tests/mod.rs +++ b/crates/engine/src/tests/mod.rs @@ -1,7 +1,6 @@ mod helpers; mod compaction_tests; -mod concurrent_tests; mod manifest_tests; mod read_tests; mod recovery_tests; diff --git a/crates/engine/src/tests/read_tests.rs b/crates/engine/src/tests/read_tests.rs index 1c12646..b99772d 100644 --- a/crates/engine/src/tests/read_tests.rs +++ 
b/crates/engine/src/tests/read_tests.rs @@ -7,7 +7,7 @@ use tempfile::tempdir; #[test] fn scan_full_range() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -29,7 +29,7 @@ fn scan_full_range() -> Result<()> { #[test] fn scan_bounded_range() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -51,7 +51,7 @@ fn scan_bounded_range() -> Result<()> { #[test] fn scan_across_memtable_and_sstables() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 64, @@ -77,7 +77,7 @@ fn scan_across_memtable_and_sstables() -> Result<()> { #[test] fn scan_respects_tombstones() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -99,7 +99,7 @@ fn scan_respects_tombstones() -> Result<()> { #[test] fn scan_empty_range() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -119,7 +119,7 @@ fn scan_empty_range() -> Result<()> { #[test] fn read_path_prefers_l0_over_l1() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 64, diff --git a/crates/engine/src/tests/recovery_tests.rs b/crates/engine/src/tests/recovery_tests.rs index dc60793..e8b778f 100644 --- a/crates/engine/src/tests/recovery_tests.rs +++ b/crates/engine/src/tests/recovery_tests.rs @@ -16,14 +16,14 @@ fn recovery_from_wal() -> Result<()> { // Write some data, then drop engine (simulates crash) { - let mut engine = 
Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; engine.set(b"a".to_vec(), b"1".to_vec())?; engine.set(b"b".to_vec(), b"2".to_vec())?; engine.del(b"a".to_vec())?; } // Reopen engine - should replay WAL - let engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; + let engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; assert!(engine.get(b"a")?.is_none()); // deleted assert_eq!(engine.get(b"b")?.unwrap().1, b"2".to_vec()); assert_eq!(engine.seq(), 3); // 3 operations @@ -38,13 +38,13 @@ fn recovery_from_sstables() -> Result<()> { // Write data and force flush { - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, true)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1, true)?; engine.set(b"k".to_vec(), b"v".to_vec())?; // Flush happened due to threshold=1 } // Reopen - WAL is empty but SSTable has the data - let engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; + let engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; assert_eq!(engine.get(b"k")?.unwrap().1, b"v".to_vec()); Ok(()) } @@ -57,19 +57,19 @@ fn recovery_combines_wal_and_sstables() -> Result<()> { // Create an engine that flushes immediately { - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, true)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1, true)?; // This triggers flush (threshold=1) engine.set(b"flushed".to_vec(), b"in_sst".to_vec())?; } { // Reopen with high threshold so next writes stay in WAL - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; engine.set(b"in_wal".to_vec(), b"pending".to_vec())?; } // Final reopen - should have both - let engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, true)?; + let engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, true)?; assert_eq!(engine.get(b"flushed")?.unwrap().1, 
b"in_sst".to_vec()); assert_eq!(engine.get(b"in_wal")?.unwrap().1, b"pending".to_vec()); Ok(()) @@ -84,7 +84,7 @@ fn manifest_preserves_l0_l1_across_restart() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::from_parts(&wal, &sst, 64, false)?; + let mut engine = Engine::new(&wal, &sst, 64, false)?; engine.set_l0_compaction_trigger(0); // Create some L0 SSTables @@ -110,7 +110,7 @@ fn manifest_preserves_l0_l1_across_restart() -> Result<()> { } // Reopen - manifest should preserve L0/L1 assignments - let engine = Engine::from_parts(&wal, &sst, 64, false)?; + let engine = Engine::new(&wal, &sst, 64, false)?; assert!(engine.l0_sstable_count() > 0, "L0 should be preserved"); assert_eq!(engine.l1_sstable_count(), 1, "L1 should be preserved"); @@ -131,7 +131,7 @@ fn sst_sort_order_is_correct_across_many_flushes() -> Result<()> { let sst_dir = dir.path().join("sst"); // Use threshold=1 so every set triggers a flush - let mut engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 1, false)?; + let mut engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 1, false)?; // Write 15 keys - produces seq 1..15, so filenames span single and // double digits. Without zero-padding this breaks. @@ -145,7 +145,7 @@ fn sst_sort_order_is_correct_across_many_flushes() -> Result<()> { // Drop and reopen - recovery must load SSTables in correct order drop(engine); - let engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; + let engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; // All keys must be readable with correct values for i in 0..15u64 { @@ -164,7 +164,7 @@ fn sst_sort_order_is_correct_across_many_flushes() -> Result<()> { fn sst_overwrite_across_flushes_returns_newest() -> Result<()> { // Write same key across multiple flushes; newest SSTable must win. 
let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1, // Flush every write @@ -178,7 +178,7 @@ fn sst_overwrite_across_flushes_returns_newest() -> Result<()> { // Drop and reopen drop(engine); - let engine = Engine::from_parts( + let engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -205,7 +205,7 @@ fn recovery_cleans_up_tmp_files() -> Result<()> { assert!(tmp_file.exists()); // Opening the engine should clean it up - let _engine = Engine::from_parts(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; + let _engine = Engine::new(dir.path().join("wal.log"), &sst_dir, 1024 * 1024, false)?; assert!( !tmp_file.exists(), @@ -224,7 +224,7 @@ fn seq_recovered_from_sstables_after_wal_truncation() -> Result<()> { // Write data and flush (WAL gets truncated) { - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, false)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1, false)?; engine.set(b"a".to_vec(), b"1".to_vec())?; thread::sleep(Duration::from_millis(2)); engine.set(b"b".to_vec(), b"2".to_vec())?; @@ -234,7 +234,7 @@ fn seq_recovered_from_sstables_after_wal_truncation() -> Result<()> { } // Reopen - WAL is empty, seq must be recovered from SSTables - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, false)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, false)?; assert!( engine.seq() >= 3, "seq should be >= 3 from SSTable scan, got {}", diff --git a/crates/engine/src/tests/write_tests.rs b/crates/engine/src/tests/write_tests.rs index b70646c..6e5f073 100644 --- a/crates/engine/src/tests/write_tests.rs +++ b/crates/engine/src/tests/write_tests.rs @@ -9,7 +9,7 @@ use tempfile::tempdir; #[test] fn set_and_get() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 
1024, @@ -26,7 +26,7 @@ fn set_and_get() -> Result<()> { #[test] fn get_missing_key() -> Result<()> { let dir = tempdir()?; - let engine = Engine::from_parts( + let engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -40,7 +40,7 @@ fn get_missing_key() -> Result<()> { #[test] fn del_removes_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -58,7 +58,7 @@ fn del_removes_key() -> Result<()> { #[test] fn overwrite_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -74,7 +74,7 @@ fn overwrite_key() -> Result<()> { #[test] fn set_after_del_resurrects() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -94,7 +94,7 @@ fn newest_sstable_wins_on_read() -> Result<()> { let wal_path = dir.path().join("wal.log"); let sst_dir = dir.path().join("sst"); - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, false)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1, false)?; // Write k=v1, flush engine.set(b"k".to_vec(), b"v1".to_vec())?; @@ -113,7 +113,7 @@ fn newest_sstable_wins_on_read() -> Result<()> { #[test] fn force_flush_empty_memtable_is_noop() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -137,14 +137,14 @@ fn force_flush_persists_memtable_data() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; + let mut engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; engine.set(b"key".to_vec(), b"value".to_vec())?; engine.force_flush()?; 
assert_eq!(engine.l0_sstable_count(), 1); } // Reopen - data should be in SSTable, not WAL - let engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; + let engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; let (_, val) = engine.get(b"key")?.expect("key should survive"); assert_eq!(val, b"value"); Ok(()) @@ -159,13 +159,13 @@ fn drop_flushes_memtable_to_sstable() -> Result<()> { let sst = dir.path().join("sst"); { - let mut engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; + let mut engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; engine.set(b"drop_key".to_vec(), b"drop_val".to_vec())?; // Engine drops here - should flush memtable } // Reopen - data should be in SSTable from the Drop flush - let engine = Engine::from_parts(&wal, &sst, 1024 * 1024, false)?; + let engine = Engine::new(&wal, &sst, 1024 * 1024, false)?; let (_, val) = engine.get(b"drop_key")?.expect("key should survive drop"); assert_eq!(val, b"drop_val"); assert!(engine.sstable_count() >= 1); @@ -175,7 +175,7 @@ fn drop_flushes_memtable_to_sstable() -> Result<()> { #[test] fn set_rejects_oversized_value() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -193,7 +193,7 @@ fn set_rejects_oversized_value() -> Result<()> { #[test] fn set_accepts_max_key_size() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024 * 1024, // huge threshold to avoid flush @@ -212,7 +212,7 @@ fn set_accepts_max_key_size() -> Result<()> { #[test] fn del_rejects_oversized_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -235,7 +235,7 @@ fn multiple_flushes_create_multiple_sstables() -> Result<()> { let wal_path = 
dir.path().join("wal.log"); let sst_dir = dir.path().join("sst"); - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, false)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1, false)?; // Disable auto-compaction so all L0 SSTables remain on disk. engine.set_l0_compaction_trigger(0); @@ -266,7 +266,7 @@ fn multiple_flushes_create_multiple_sstables() -> Result<()> { #[test] fn seq_increments_on_every_operation() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -288,7 +288,7 @@ fn seq_increments_on_every_operation() -> Result<()> { #[test] fn set_rejects_empty_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -306,7 +306,7 @@ fn set_rejects_empty_key() -> Result<()> { #[test] fn del_rejects_empty_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -323,7 +323,7 @@ fn del_rejects_empty_key() -> Result<()> { #[test] fn set_rejects_oversized_key() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1024 * 1024, @@ -346,7 +346,7 @@ fn flush_writes_sstable_and_truncates_wal() -> Result<()> { let wal_path = dir.path().join("wal.log"); let sst_dir = dir.path().join("sst"); - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1, true)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1, true)?; engine.set(b"key1".to_vec(), b"value1".to_vec())?; assert!( @@ -366,7 +366,7 @@ fn flush_triggers_at_threshold() -> Result<()> { let sst_dir = dir.path().join("sst"); let threshold = 4 * 1024; // 4 KB for fast test - let mut engine = Engine::from_parts(&wal_path, 
&sst_dir, threshold, false)?; + let mut engine = Engine::new(&wal_path, &sst_dir, threshold, false)?; let value = vec![b'x'; 512]; let writes = (threshold / value.len()) + 5; for i in 0..writes { @@ -385,7 +385,7 @@ fn flush_triggers_at_threshold() -> Result<()> { #[test] fn get_reads_from_sstable_after_flush() -> Result<()> { let dir = tempdir()?; - let mut engine = Engine::from_parts( + let mut engine = Engine::new( dir.path().join("wal.log"), dir.path().join("sst"), 1, // tiny threshold - every set triggers flush @@ -405,7 +405,7 @@ fn tombstone_in_sstable_shadows_older_value() -> Result<()> { let sst_dir = dir.path().join("sst"); // Large threshold so we control flushes manually - let mut engine = Engine::from_parts(&wal_path, &sst_dir, 1024 * 1024, false)?; + let mut engine = Engine::new(&wal_path, &sst_dir, 1024 * 1024, false)?; // Write k=v, then force flush by lowering threshold temporarily engine.set(b"k".to_vec(), b"old_value".to_vec())?; diff --git a/crates/engine/src/write.rs b/crates/engine/src/write.rs index 2d09ef0..2319f56 100644 --- a/crates/engine/src/write.rs +++ b/crates/engine/src/write.rs @@ -135,11 +135,14 @@ impl Engine { self.manifest.save()?; // Successfully wrote SSTable and manifest; now safely truncate the WAL. 
- let _f = OpenOptions::new() - .create(true) - .write(true) - .truncate(true) - .open(&self.wal_path)?; + { + let f = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&self.wal_path)?; + f.sync_all()?; + } // create a fresh WalWriter (append mode) self.wal_writer = WalWriter::create(&self.wal_path, self.wal_sync)?; diff --git a/crates/server/Cargo.toml b/crates/server/Cargo.toml new file mode 100644 index 0000000..7ecf6ce --- /dev/null +++ b/crates/server/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "server" +version = "0.1.0" +edition = "2021" + +[lib] +name = "server" +path = "src/lib.rs" + +[[bin]] +name = "riptidekv-server" +path = "src/main.rs" + +[[bench]] +name = "server_bench" +harness = false + +[dependencies] +engine = { path = "../engine" } +anyhow = "1.0" +thiserror = "1.0" +tokio = { version = "1", features = ["rt-multi-thread", "net", "io-util", "sync", "macros", "signal", "time"] } +bytes = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } + +[dev-dependencies] +tempfile = "3" +criterion = { version = "0.5", features = ["async_tokio"] } diff --git a/crates/server/benches/server_bench.rs b/crates/server/benches/server_bench.rs new file mode 100644 index 0000000..456fe2e --- /dev/null +++ b/crates/server/benches/server_bench.rs @@ -0,0 +1,208 @@ +//! Throughput benchmarks for the RiptideKV RESP server. +//! +//! Measures end-to-end latency and throughput of SET / GET / PIPELINE +//! requests through a real TCP socket, exercising the full RESP2 stack. 
+ +use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput}; +use engine::Engine; +use server::db::SharedDb; +use std::io::{Read, Write}; +use std::net::TcpStream; +use std::time::Duration; +use tempfile::tempdir; +use tokio::net::TcpListener; +use tokio::runtime::Runtime; + +// ─── Benchmark infrastructure ───────────────────────────────────────────────── + +/// Bind a server on a random port, return (runtime, address, SharedDb). +fn start_server() -> (Runtime, std::net::SocketAddr, SharedDb) { + let rt = Runtime::new().unwrap(); + let dir = tempdir().unwrap(); + + let engine = Engine::new( + dir.path().join("wal.log"), + dir.path().join("sst"), + 64 * 1024 * 1024, // 64 MiB — no flushing during bench + false, + ) + .unwrap(); + let db = SharedDb::new(engine); + + let listener = rt.block_on(TcpListener::bind("127.0.0.1:0")).unwrap(); + let addr = listener.local_addr().unwrap(); + let db2 = db.clone(); + rt.spawn(async move { + server::serve(listener, db2).await.ok(); + }); + + // Give the server a moment to be ready. + std::thread::sleep(Duration::from_millis(20)); + (rt, addr, db) +} + +/// Encode a RESP2 array command. +fn resp_cmd(args: &[&str]) -> Vec { + let mut out = format!("*{}\r\n", args.len()).into_bytes(); + for a in args { + out.extend_from_slice(format!("${}\r\n{}\r\n", a.len(), a).as_bytes()); + } + out +} + +/// Read until we have consumed `n` RESP responses (each ending with \r\n). +/// Simple heuristic: count top-level CRLF-terminated lines we care about. 
+fn drain_responses(stream: &mut TcpStream, n: usize) { + let mut buf = [0u8; 4096]; + let mut responses = 0; + loop { + let got = stream.read(&mut buf).unwrap(); + for &b in &buf[..got] { + if b == b'\n' { + responses += 1; + if responses >= n { + return; + } + } + } + } +} + +// ─── Benchmarks ─────────────────────────────────────────────────────────────── + +fn bench_ping(c: &mut Criterion) { + let (_rt, addr, _db) = start_server(); + + c.bench_function("server_ping_1k", |b| { + b.iter_batched( + || TcpStream::connect(addr).unwrap(), + |mut stream| { + let cmd = resp_cmd(&["PING"]); + for _ in 0..1_000 { + stream.write_all(&cmd).unwrap(); + } + drain_responses(&mut stream, 1_000); + }, + BatchSize::SmallInput, + ) + }); +} + +fn bench_set(c: &mut Criterion) { + let (_rt, addr, _db) = start_server(); + let value = "x".repeat(64); + + let mut group = c.benchmark_group("server_set"); + group.throughput(Throughput::Elements(1_000)); + + group.bench_function("set_1k_64b_values", |b| { + b.iter_batched( + || TcpStream::connect(addr).unwrap(), + |mut stream| { + for i in 0u64..1_000 { + let key = format!("bench:key:{}", i); + let cmd = resp_cmd(&["SET", &key, &value]); + stream.write_all(&cmd).unwrap(); + } + drain_responses(&mut stream, 1_000); + }, + BatchSize::SmallInput, + ) + }); + group.finish(); +} + +fn bench_get(c: &mut Criterion) { + let (rt, addr, db) = start_server(); + + // Pre-populate 1 000 keys. 
+ rt.block_on(async { + let mut state = db.state.write().await; + for i in 0u64..1_000 { + let k = format!("bench:key:{}", i).into_bytes(); + let v = b"hello-world".to_vec(); + state.engine.set(k, v).unwrap(); + } + }); + + let mut group = c.benchmark_group("server_get"); + group.throughput(Throughput::Elements(1_000)); + + group.bench_function("get_1k_existing_keys", |b| { + b.iter_batched( + || TcpStream::connect(addr).unwrap(), + |mut stream| { + for i in 0u64..1_000 { + let key = format!("bench:key:{}", i); + let cmd = resp_cmd(&["GET", &key]); + stream.write_all(&cmd).unwrap(); + } + drain_responses(&mut stream, 1_000); + }, + BatchSize::SmallInput, + ) + }); + group.finish(); +} + +fn bench_pipeline_set_get(c: &mut Criterion) { + let (_rt, addr, _db) = start_server(); + let value = "v".repeat(32); + + let mut group = c.benchmark_group("server_pipeline"); + group.throughput(Throughput::Elements(500)); // 500 SET + 500 GET = 1k ops + + group.bench_function("pipeline_500_set_500_get", |b| { + b.iter_batched( + || TcpStream::connect(addr).unwrap(), + |mut stream| { + let mut batch = Vec::new(); + for i in 0u64..500 { + let key = format!("pipe:key:{}", i); + batch.extend_from_slice(&resp_cmd(&["SET", &key, &value])); + } + for i in 0u64..500 { + let key = format!("pipe:key:{}", i); + batch.extend_from_slice(&resp_cmd(&["GET", &key])); + } + stream.write_all(&batch).unwrap(); + drain_responses(&mut stream, 1_000); + }, + BatchSize::SmallInput, + ) + }); + group.finish(); +} + +fn bench_mset_mget(c: &mut Criterion) { + let (_rt, addr, _db) = start_server(); + + c.bench_function("server_mset_100_keys", |b| { + b.iter_batched( + || TcpStream::connect(addr).unwrap(), + |mut stream| { + let mut args = vec!["MSET"]; + let pairs: Vec = (0..100) + .flat_map(|i| [format!("mk:{}", i), format!("mv:{}", i)]) + .collect(); + for p in &pairs { + args.push(p.as_str()); + } + let cmd = resp_cmd(&args); + stream.write_all(&cmd).unwrap(); + drain_responses(&mut stream, 1); + }, + 
BatchSize::SmallInput, + ) + }); +} + +criterion_group!( + benches, + bench_ping, + bench_set, + bench_get, + bench_pipeline_set_get, + bench_mset_mget, +); +criterion_main!(benches); diff --git a/crates/server/src/db.rs b/crates/server/src/db.rs new file mode 100644 index 0000000..e36cf51 --- /dev/null +++ b/crates/server/src/db.rs @@ -0,0 +1,109 @@ +//! Shared database state: Engine + volatile TTL map + server statistics. + +use engine::Engine; +use std::collections::HashMap; +use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use tokio::sync::RwLock; + +// ─── Inner state ───────────────────────────────────────────────────────────── + +pub struct DbState { + pub engine: Engine, + /// Volatile expiry times (lost on restart; graceful degradation: keys survive, just un-expire). + pub ttl: HashMap, Instant>, +} + +impl DbState { + /// Returns `true` if key has an expiry and it has passed. + pub fn is_expired(&self, key: &[u8]) -> bool { + self.ttl + .get(key) + .map(|t| Instant::now() >= *t) + .unwrap_or(false) + } + + /// Lazily evict an expired key. Returns `true` if the key was evicted. + pub fn evict_if_expired(&mut self, key: &[u8]) -> anyhow::Result { + if self.is_expired(key) { + self.engine.del(key.to_vec())?; + self.ttl.remove(key); + Ok(true) + } else { + Ok(false) + } + } + + /// Set (or overwrite) a TTL. + pub fn set_expiry(&mut self, key: &[u8], from_now: Duration) { + self.ttl.insert(key.to_vec(), Instant::now() + from_now); + } + + /// Remove any TTL for this key. + pub fn clear_expiry(&mut self, key: &[u8]) { + self.ttl.remove(key); + } + + /// Remaining TTL in milliseconds, or `None` if no expiry, or `Some(-2)` if expired. 
+ pub fn ttl_ms(&self, key: &[u8]) -> Option { + let deadline = self.ttl.get(key)?; + let now = Instant::now(); + if now >= *deadline { + Some(-2) + } else { + Some(deadline.duration_since(now).as_millis() as i64) + } + } +} + +// ─── Shared wrapper ────────────────────────────────────────────────────────── + +#[derive(Clone)] +pub struct SharedDb { + pub state: Arc>, + pub start_time: Instant, + pub connected_clients: Arc, + pub total_commands: Arc, + pub total_connections: Arc, +} + +impl SharedDb { + pub fn new(engine: Engine) -> Self { + Self { + state: Arc::new(RwLock::new(DbState { + engine, + ttl: HashMap::new(), + })), + start_time: Instant::now(), + connected_clients: Arc::new(AtomicI64::new(0)), + total_commands: Arc::new(AtomicU64::new(0)), + total_connections: Arc::new(AtomicU64::new(0)), + } + } + + pub fn client_connected(&self) { + self.connected_clients.fetch_add(1, Ordering::Relaxed); + self.total_connections.fetch_add(1, Ordering::Relaxed); + } + + pub fn client_disconnected(&self) { + self.connected_clients.fetch_sub(1, Ordering::Relaxed); + } + + pub fn inc_commands(&self) { + self.total_commands.fetch_add(1, Ordering::Relaxed); + } + + pub fn uptime_secs(&self) -> u64 { + self.start_time.elapsed().as_secs() + } + + pub fn num_clients(&self) -> i64 { + self.connected_clients.load(Ordering::Relaxed) + } + + pub fn total_commands_processed(&self) -> u64 { + self.total_commands.load(Ordering::Relaxed) + } +} diff --git a/crates/server/src/handler.rs b/crates/server/src/handler.rs new file mode 100644 index 0000000..1ac31d6 --- /dev/null +++ b/crates/server/src/handler.rs @@ -0,0 +1,1492 @@ +//! Per-connection command dispatcher. +//! +//! Design notes: +//! - `engine.set / del` take *owned* `Vec`; always `.clone()` or move. +//! - `engine.get` returns `Option<(seq, Vec)>`; use `.map(|(_, v)| v)`. +//! - Lock guards borrow `db.state` (Arc); clone the Arc before awaiting so +//! `conn` is free for mutable calls after the guard is dropped. 
+ +use crate::db::SharedDb; +use crate::resp::{ + encode_array, encode_bulk, encode_error, encode_int, encode_null_array, encode_simple, ok, + RespReader, RespValue, +}; +use std::sync::Arc; +use std::time::Duration; +use tokio::io::AsyncWriteExt; +use tokio::net::tcp::OwnedWriteHalf; +use tokio::net::TcpStream; +use tracing::{debug, warn}; + +// ─── Connection state ──────────────────────────────────────────────────────── + +struct Conn { + db: SharedDb, + writer: OwnedWriteHalf, + name: String, + id: u64, + db_index: u32, +} + +impl Conn { + async fn send(&mut self, data: Vec) -> anyhow::Result<()> { + self.writer.write_all(&data).await?; + Ok(()) + } + async fn ok(&mut self) -> anyhow::Result<()> { + self.send(ok()).await + } + async fn err(&mut self, msg: &str) -> anyhow::Result<()> { + self.send(encode_error(msg)).await + } + async fn int(&mut self, n: i64) -> anyhow::Result<()> { + self.send(encode_int(n)).await + } + async fn bulk(&mut self, data: Option<&[u8]>) -> anyhow::Result<()> { + self.send(encode_bulk(data)).await + } +} + +// ─── Entry point ───────────────────────────────────────────────────────────── + +static CONN_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1); + +pub async fn handle_connection(stream: TcpStream, db: SharedDb) { + let id = CONN_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let peer = stream.peer_addr().ok(); + debug!(?peer, id, "client connected"); + db.client_connected(); + + let (read_half, write_half) = stream.into_split(); + let mut reader = RespReader::new(read_half); + let mut conn = Conn { + db: db.clone(), + writer: write_half, + name: String::new(), + id, + db_index: 0, + }; + + loop { + let value = match reader.read_value().await { + Ok(Some(v)) => v, + Ok(None) => break, + Err(e) => { + warn!(?peer, id, "parse error: {e}"); + break; + } + }; + db.inc_commands(); + + let args = match value { + RespValue::Array(Some(arr)) => arr, + _ => { + let _ = conn.err("ERR protocol error: 
expected array").await; + break; + } + }; + if args.is_empty() { + let _ = conn.err("ERR empty command").await; + continue; + } + + let cmd = match args[0].as_str() { + Some(s) => s.to_ascii_uppercase(), + None => { + let _ = conn.err("ERR command must be a string").await; + continue; + } + }; + + match dispatch(&mut conn, &cmd, &args[1..]).await { + Ok(true) => {} + Ok(false) => break, + Err(e) => { + warn!(id, "error in {cmd}: {e}"); + let _ = conn.err(&format!("ERR internal: {e}")).await; + } + } + } + + debug!(?peer, id, "client disconnected"); + db.client_disconnected(); +} + +// ─── Dispatcher ────────────────────────────────────────────────────────────── + +async fn dispatch(conn: &mut Conn, cmd: &str, args: &[RespValue]) -> anyhow::Result { + match cmd { + "PING" => cmd_ping(conn, args).await?, + "ECHO" => cmd_echo(conn, args).await?, + "SELECT" => cmd_select(conn, args).await?, + "QUIT" | "SHUTDOWN" => { + conn.ok().await?; + return Ok(false); + } + "RESET" => { + conn.name.clear(); + conn.db_index = 0; + conn.send(encode_simple("RESET")).await?; + } + "HELLO" => cmd_hello(conn, args).await?, + "CLIENT" => cmd_client(conn, args).await?, + "CONFIG" => cmd_config(conn, args).await?, + "INFO" => cmd_info(conn, args).await?, + "COMMAND" => cmd_command(conn, args).await?, + "DBSIZE" => cmd_dbsize(conn).await?, + "FLUSHDB" | "FLUSHALL" => cmd_flushdb(conn).await?, + "DEBUG" => cmd_debug(conn, args).await?, + "OBJECT" => cmd_object(conn, args).await?, + "MEMORY" => cmd_memory(conn, args).await?, + "SLOWLOG" => cmd_slowlog(conn, args).await?, + "LATENCY" => cmd_latency(conn, args).await?, + "ACL" => cmd_acl(conn, args).await?, + "WAIT" => conn.int(0).await?, + "LOLWUT" => { + conn.send(encode_simple("RiptideKV -- made with Rust")) + .await? + } + "FAILOVER" | "REPLICAOF" | "SLAVEOF" => conn.ok().await?, + "BGSAVE" | "BGREWRITEAOF" | "SAVE" => { + conn.send(encode_simple("Background saving started")) + .await? 
+ } + "LASTSAVE" => conn.int(0).await?, + + "GET" => cmd_get(conn, args).await?, + "SET" => cmd_set(conn, args).await?, + "SETNX" => cmd_setnx(conn, args).await?, + "SETEX" => cmd_setex(conn, args).await?, + "PSETEX" => cmd_psetex(conn, args).await?, + "GETSET" => cmd_getset(conn, args).await?, + "GETDEL" => cmd_getdel(conn, args).await?, + "GETEX" => cmd_getex(conn, args).await?, + "MGET" => cmd_mget(conn, args).await?, + "MSET" => cmd_mset(conn, args).await?, + "MSETNX" => cmd_msetnx(conn, args).await?, + "APPEND" => cmd_append(conn, args).await?, + "STRLEN" => cmd_strlen(conn, args).await?, + "INCR" => cmd_incr(conn, args, 1).await?, + "INCRBY" => cmd_incrby(conn, args).await?, + "INCRBYFLOAT" => cmd_incrbyfloat(conn, args).await?, + "DECR" => cmd_incr(conn, args, -1).await?, + "DECRBY" => cmd_decrby(conn, args).await?, + "GETRANGE" | "SUBSTR" => cmd_getrange(conn, args).await?, + "SETRANGE" => cmd_setrange(conn, args).await?, + + "DEL" | "UNLINK" => cmd_del(conn, args).await?, + "EXISTS" => cmd_exists(conn, args).await?, + "TYPE" => cmd_type(conn, args).await?, + "TTL" => cmd_ttl(conn, args, false).await?, + "PTTL" => cmd_ttl(conn, args, true).await?, + "EXPIRE" => cmd_expire(conn, args, false).await?, + "PEXPIRE" => cmd_expire(conn, args, true).await?, + "EXPIREAT" => cmd_expireat(conn, args, false).await?, + "PEXPIREAT" => cmd_expireat(conn, args, true).await?, + "PERSIST" => cmd_persist(conn, args).await?, + "EXPIRETIME" => cmd_expiretime(conn, args, false).await?, + "PEXPIRETIME" => cmd_expiretime(conn, args, true).await?, + "KEYS" => cmd_keys(conn, args).await?, + "SCAN" => cmd_scan(conn, args).await?, + "RENAME" => cmd_rename(conn, args).await?, + "RENAMENX" => cmd_renamenx(conn, args).await?, + "RANDOMKEY" => cmd_randomkey(conn).await?, + "TOUCH" => cmd_touch(conn, args).await?, + "DUMP" => conn.bulk(None).await?, + "RESTORE" => conn.err("ERR RESTORE not supported").await?, + "COPY" => conn.err("ERR COPY not supported").await?, + "MOVE" => conn.err("ERR 
MOVE not supported (single db)").await?, + "SORT" | "SORT_RO" => conn.err("ERR SORT not supported").await?, + + _ => { + conn.err(&format!( + "ERR unknown command `{}`, with args beginning with: {}", + cmd, + args.iter() + .filter_map(|a| a.as_str()) + .collect::>() + .join(", ") + )) + .await? + } + } + Ok(true) +} + +// ─── Helper: clone Arc so guard doesn't borrow `conn` ──────────────────────── + +macro_rules! state_write { + ($conn:expr) => { + Arc::clone(&$conn.db.state).write_owned().await + }; +} +macro_rules! state_read { + ($conn:expr) => { + Arc::clone(&$conn.db.state).read_owned().await + }; +} + +// ─── Server / connection commands ──────────────────────────────────────────── + +async fn cmd_ping(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + if args.is_empty() { + conn.send(encode_simple("PONG")).await + } else { + conn.bulk(args[0].as_bytes()).await + } +} + +async fn cmd_echo(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + if args.len() != 1 { + return conn + .err("ERR wrong number of arguments for 'echo' command") + .await; + } + conn.bulk(args[0].as_bytes()).await +} + +async fn cmd_select(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + let idx: u32 = match args + .first() + .and_then(|a| a.as_str()) + .and_then(|s| s.parse().ok()) + { + Some(n) => n, + None => { + return conn + .err("ERR value is not an integer or out of range") + .await + } + }; + if idx != 0 { + return conn.err("ERR DB index is out of range").await; + } + conn.db_index = 0; + conn.ok().await +} + +async fn cmd_hello(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + let proto = args + .first() + .and_then(|a| a.as_str()) + .and_then(|s| s.parse::().ok()); + if matches!(proto, Some(3)) { + return conn.err("NOPROTO this server does not support RESP3").await; + } + let items: Vec> = vec![ + encode_bulk(Some(b"server")), + encode_bulk(Some(b"RiptideKV")), + encode_bulk(Some(b"version")), + encode_bulk(Some(b"7.0.0")), + 
encode_bulk(Some(b"proto")), + encode_int(2), + encode_bulk(Some(b"id")), + encode_int(conn.id as i64), + encode_bulk(Some(b"mode")), + encode_bulk(Some(b"standalone")), + encode_bulk(Some(b"role")), + encode_bulk(Some(b"master")), + encode_bulk(Some(b"modules")), + encode_array(&[]), + ]; + conn.send(encode_array(&items)).await +} + +async fn cmd_client(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + let sub = args + .first() + .and_then(|a| a.as_str()) + .map(|s| s.to_ascii_uppercase()); + match sub.as_deref() { + Some("SETNAME") => { + let name = match args.get(1).and_then(|a| a.as_str()) { + Some(n) => n.to_owned(), + None => return conn.err("ERR syntax error").await, + }; + if name.contains(' ') { + return conn + .err("ERR Client names cannot contain spaces, newlines or special characters.") + .await; + } + conn.name = name; + conn.ok().await + } + Some("GETNAME") => { + let n = if conn.name.is_empty() { + None + } else { + Some(conn.name.as_bytes().to_vec()) + }; + conn.bulk(n.as_deref()).await + } + Some("ID") => conn.int(conn.id as i64).await, + Some("INFO") | Some("LIST") => { + let info = format!("id={} name={} db={}\n", conn.id, conn.name, conn.db_index); + conn.bulk(Some(info.as_bytes())).await + } + Some("NO-EVICT") | Some("NO-TOUCH") | Some("REPLY") | Some("PAUSE") | Some("UNPAUSE") + | Some("KILL") | Some("CACHING") | Some("RESET") => conn.ok().await, + _ => conn.err("ERR unknown CLIENT subcommand").await, + } +} + +async fn cmd_config(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + let sub = args + .first() + .and_then(|a| a.as_str()) + .map(|s| s.to_ascii_uppercase()); + match sub.as_deref() { + Some("GET") => conn.send(encode_array(&[])).await, + Some("SET") => conn.ok().await, + Some("RESETSTAT") => conn.ok().await, + Some("REWRITE") => conn.err("ERR CONFIG REWRITE not supported").await, + _ => conn.err("ERR unknown CONFIG subcommand").await, + } +} + +async fn cmd_info(conn: &mut Conn, args: &[RespValue]) -> 
anyhow::Result<()> { + let section = args + .first() + .and_then(|a| a.as_str()) + .map(|s| s.to_ascii_lowercase()); + let uptime = conn.db.uptime_secs(); + let clients = conn.db.num_clients(); + let cmds = conn.db.total_commands_processed(); + let dbsize = { + let state = state_read!(conn); + state.engine.scan(b"", b"").map(|v| v.len()).unwrap_or(0) + }; + let server = format!("# Server\r\nredis_version:7.0.0\r\nredis_mode:standalone\r\nuptime_in_seconds:{uptime}\r\n"); + let cli_s = format!("# Clients\r\nconnected_clients:{clients}\r\n"); + let stats = format!("# Stats\r\ntotal_commands_processed:{cmds}\r\n"); + let ks = format!("# Keyspace\r\ndb0:keys={dbsize},expires=0,avg_ttl=0\r\n"); + let repl = "# Replication\r\nrole:master\r\nconnected_slaves:0\r\n".to_owned(); + let mem = "# Memory\r\nused_memory:0\r\nused_memory_human:0B\r\n".to_owned(); + let full = match section.as_deref() { + Some("server") => server, + Some("clients") => cli_s, + Some("stats") => stats, + Some("keyspace") => ks, + Some("replication") => repl, + Some("memory") => mem, + _ => format!("{server}{cli_s}{stats}{ks}{repl}{mem}"), + }; + conn.bulk(Some(full.as_bytes())).await +} + +async fn cmd_command(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> { + let sub = args + .first() + .and_then(|a| a.as_str()) + .map(|s| s.to_ascii_uppercase()); + match sub.as_deref() { + Some("COUNT") => conn.int(50).await, + Some("DOCS") | Some("INFO") | Some("LIST") | Some("GETKEYS") | None => { + conn.send(encode_array(&[])).await + } + _ => conn.err("ERR unknown COMMAND subcommand").await, + } +} + +async fn cmd_dbsize(conn: &mut Conn) -> anyhow::Result<()> { + let n = { + let s = state_read!(conn); + s.engine.scan(b"", b"").map(|v| v.len() as i64).unwrap_or(0) + }; + conn.int(n).await +} + +async fn cmd_flushdb(conn: &mut Conn) -> anyhow::Result<()> { + let keys: Vec> = { + let s = state_read!(conn); + s.engine + .scan(b"", b"") + .unwrap_or_default() + .into_iter() + .map(|(k, _)| k) + 
.collect()
    };
    {
        let mut s = state_write!(conn);
        for k in &keys {
            let _ = s.engine.del(k.clone());
        }
        s.ttl.clear();
    }
    conn.ok().await
}

/// `DEBUG` stub: `SLEEP <seconds>` sleeps, `OBJECT` returns a fixed
/// description, anything else replies OK.
async fn cmd_debug(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let sub = args
        .first()
        .and_then(|a| a.as_str())
        .map(|s| s.to_ascii_uppercase());
    match sub.as_deref() {
        Some("SLEEP") => {
            let secs: f64 = args
                .get(1)
                .and_then(|a| a.as_str())
                .and_then(|s| s.parse().ok())
                .unwrap_or(0.0);
            // `Duration::from_secs_f64` panics on negative, NaN or overflowing
            // input; the value comes from the client, so fall back to zero.
            let pause = Duration::try_from_secs_f64(secs).unwrap_or(Duration::ZERO);
            tokio::time::sleep(pause).await;
            conn.ok().await
        }
        Some("OBJECT") => {
            conn.bulk(Some(
                b"encoding:raw serializedlength:0 lru:0 lru_seconds_idle:0",
            ))
            .await
        }
        _ => conn.ok().await,
    }
}

/// `OBJECT` stub: every subcommand answers with a constant.
async fn cmd_object(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let sub = args
        .first()
        .and_then(|a| a.as_str())
        .map(|s| s.to_ascii_uppercase());
    match sub.as_deref() {
        Some("ENCODING") => conn.bulk(Some(b"raw")).await,
        Some("IDLETIME") => conn.int(0).await,
        Some("REFCOUNT") => conn.int(1).await,
        Some("FREQ") => conn.int(0).await,
        Some("HELP") => {
            // NOTE(review): the trailing space in each literal suggests a
            // "<key>" placeholder was lost in extraction — confirm upstream.
            let items = vec![
                encode_bulk(Some(b"OBJECT ENCODING ")),
                encode_bulk(Some(b"OBJECT IDLETIME ")),
                encode_bulk(Some(b"OBJECT REFCOUNT ")),
                encode_bulk(Some(b"OBJECT FREQ ")),
            ];
            conn.send(encode_array(&items)).await
        }
        _ => conn.err("ERR unknown OBJECT subcommand").await,
    }
}

/// `MEMORY` stub: fixed replies, no real accounting.
async fn cmd_memory(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let sub = args
        .first()
        .and_then(|a| a.as_str())
        .map(|s| s.to_ascii_uppercase());
    match sub.as_deref() {
        Some("USAGE") => conn.bulk(None).await,
        Some("MALLOC-STATS") | Some("DOCTOR") | Some("STATS") => {
            conn.bulk(Some(b"not available")).await
        }
        Some("HELP") => conn.send(encode_array(&[])).await,
        Some("PURGE") => conn.ok().await,
        _ => conn.err("ERR unknown MEMORY subcommand").await,
    }
}

/// `SLOWLOG` stub: the log is always empty.
async fn cmd_slowlog(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let sub = args
        .first()
        .and_then(|a| a.as_str())
        .map(|s| s.to_ascii_uppercase());
    match sub.as_deref() {
        Some("GET") => conn.send(encode_array(&[])).await,
        Some("LEN") => conn.int(0).await,
        Some("RESET") => conn.ok().await,
        _ => conn.err("ERR unknown SLOWLOG subcommand").await,
    }
}

/// `LATENCY` stub: no samples are ever recorded.
async fn cmd_latency(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let sub = args
        .first()
        .and_then(|a| a.as_str())
        .map(|s| s.to_ascii_uppercase());
    match sub.as_deref() {
        Some("LATEST") | Some("HISTORY") => conn.send(encode_array(&[])).await,
        Some("RESET") => conn.int(0).await,
        _ => conn.err("ERR unknown LATENCY subcommand").await,
    }
}

/// `ACL` stub: reports a single permissive `default` user.
async fn cmd_acl(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let sub = args
        .first()
        .and_then(|a| a.as_str())
        .map(|s| s.to_ascii_uppercase());
    match sub.as_deref() {
        Some("WHOAMI") => conn.bulk(Some(b"default")).await,
        Some("CAT") => {
            let cats = vec![
                encode_bulk(Some(b"all")),
                encode_bulk(Some(b"read")),
                encode_bulk(Some(b"write")),
            ];
            conn.send(encode_array(&cats)).await
        }
        Some("LIST") => {
            conn.send(encode_array(&[encode_bulk(Some(
                b"user default on nopass ~* &* +@all",
            ))]))
            .await
        }
        Some("USERS") => {
            conn.send(encode_array(&[encode_bulk(Some(b"default"))]))
                .await
        }
        Some("LOG") => conn.send(encode_array(&[])).await,
        Some("GETUSER") => conn.send(encode_null_array()).await,
        Some("SETUSER") | Some("DELUSER") | Some("SAVE") | Some("LOAD") => conn.ok().await,
        Some("GENPASS") => {
            conn.bulk(Some(
                b"0000000000000000000000000000000000000000000000000000000000000000",
            ))
            .await
        }
        Some("INFO") => conn.send(encode_array(&[])).await,
        _ => conn.err("ERR unknown ACL subcommand").await,
    }
}

// ─── String commands ─────────────────────────────────────────────────────────

/// `GET key`: bulk reply with the value, or null when missing or expired.
async fn cmd_get(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let key = req_key(args, 0, "get")?;
    let val = {
        let mut s = state_write!(conn);
        if s.evict_if_expired(&key)? {
            None
        } else {
            s.engine.get(&key)?.map(|(_, v)| v)
        }
    };
    conn.bulk(val.as_deref()).await
}

/// `SET key value [EX s | PX ms | EXAT ts | PXAT ts-ms] [NX | XX] [KEEPTTL] [GET]`.
///
/// Options are parsed first; the state guard is only taken once the whole
/// command is known to be well-formed.
async fn cmd_set(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() < 2 {
        return conn
            .err("ERR wrong number of arguments for 'set' command")
            .await;
    }
    let key = req_key(args, 0, "set")?;
    let value = match args[1].as_bytes() {
        Some(v) => v.to_vec(),
        None => return conn.err("ERR value must be a string").await,
    };

    let mut ttl_ms: Option<i64> = None;
    let mut nx = false;
    let mut xx = false;
    let mut keepttl = false;
    let mut get_flag = false;
    let mut i = 2usize;
    while i < args.len() {
        let opt = match args[i].as_str() {
            Some(s) => s.to_ascii_uppercase(),
            None => return conn.err("ERR syntax error").await,
        };
        match opt.as_str() {
            "EX" => {
                i += 1;
                let s: i64 = parse_int(args.get(i))?;
                // checked_mul: a huge second count must not overflow into a
                // bogus (possibly negative) millisecond TTL.
                ttl_ms = match s.checked_mul(1000) {
                    Some(ms) if s > 0 => Some(ms),
                    _ => return conn.err("ERR invalid expire time in 'set' command").await,
                };
            }
            "PX" => {
                i += 1;
                let ms: i64 = parse_int(args.get(i))?;
                if ms <= 0 {
                    return conn.err("ERR invalid expire time in 'set' command").await;
                }
                ttl_ms = Some(ms);
            }
            "EXAT" => {
                i += 1;
                let unix: i64 = parse_int(args.get(i))?;
                if unix <= 0 {
                    return conn.err("ERR invalid expire time in 'set' command").await;
                }
                let now_s = unix_now_secs();
                ttl_ms = Some(unix.saturating_sub(now_s).saturating_mul(1000));
            }
            "PXAT" => {
                i += 1;
                let ums: i64 = parse_int(args.get(i))?;
                if ums <= 0 {
                    return conn.err("ERR invalid expire time in 'set' command").await;
                }
                let now_ms = unix_now_ms();
                ttl_ms = Some(ums.saturating_sub(now_ms));
            }
            "NX" => nx = true,
            "XX" => xx = true,
            "KEEPTTL" => keepttl = true,
            "GET" => get_flag = true,
            _ => return conn.err("ERR syntax error").await,
        }
        i += 1;
    }

    let result: SetResult = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let prev = s.engine.get(&key)?.map(|(_, v)| v);
        if nx && prev.is_some() {
            SetResult::Nx(prev)
        } else if xx && prev.is_none() {
            SetResult::Xx
        } else {
            // An absolute deadline (EXAT/PXAT) that is already in the past must
            // leave no visible key; previously it produced a key with no TTL.
            let already_expired = !keepttl && matches!(ttl_ms, Some(ms) if ms <= 0);
            if already_expired {
                s.engine.del(key.clone())?;
                s.clear_expiry(&key);
            } else {
                s.engine.set(key.clone(), value)?;
                if !keepttl {
                    match ttl_ms {
                        Some(ms) => s.set_expiry(&key, Duration::from_millis(ms as u64)),
                        None => s.clear_expiry(&key),
                    }
                }
            }
            SetResult::Ok(prev)
        }
    };
    match result {
        SetResult::Nx(prev) => {
            if get_flag {
                conn.bulk(prev.as_deref()).await
            } else {
                conn.bulk(None).await
            }
        }
        SetResult::Xx => conn.bulk(None).await,
        SetResult::Ok(prev) => {
            if get_flag {
                conn.bulk(prev.as_deref()).await
            } else {
                conn.ok().await
            }
        }
    }
}

/// Outcome of `SET`, carrying the previous value where the `GET` flag needs it.
enum SetResult {
    /// Value written (or immediately expired); holds the previous value.
    Ok(Option<Vec<u8>>),
    /// `NX` given but the key already existed; holds the existing value.
    Nx(Option<Vec<u8>>),
    /// `XX` given but the key did not exist.
    Xx,
}

/// `SETNX key value`: set only if the key is absent; replies 1 or 0.
async fn cmd_setnx(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'setnx' command")
            .await;
    }
    let key = req_key(args, 0, "setnx")?;
    let val = req_bytes(args, 1)?;
    let set = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        if s.engine.get(&key)?.is_some() {
            false
        } else {
            s.engine.set(key, val)?;
            true
        }
    };
    conn.int(if set { 1 } else { 0 }).await
}

/// `SETEX key seconds value`: set the value with a TTL in seconds.
async fn cmd_setex(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 3 {
        return conn
            .err("ERR wrong number of arguments for 'setex' command")
            .await;
    }
    let key = req_key(args, 0, "setex")?;
    let secs: i64 = parse_int(args.get(1))?;
    if secs <= 0 {
        return conn.err("ERR invalid expire time in 'setex' command").await;
    }
    let val = req_bytes(args, 2)?;
    {
        let mut s = state_write!(conn);
        s.engine.set(key.clone(), val)?;
        s.set_expiry(&key, Duration::from_secs(secs as u64));
    }
    conn.ok().await
}

/// `PSETEX key milliseconds value`: set the value with a TTL in milliseconds.
async fn cmd_psetex(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 3 {
        return conn
            .err("ERR wrong number of arguments for 'psetex' command")
            .await;
    }
    let key = req_key(args, 0, "psetex")?;
    let ms: i64 = parse_int(args.get(1))?;
    if ms <= 0 {
        return conn
            .err("ERR invalid expire time in 'psetex' command")
            .await;
    }
    let val = req_bytes(args, 2)?;
    {
        let mut s = state_write!(conn);
        s.engine.set(key.clone(), val)?;
        s.set_expiry(&key, Duration::from_millis(ms as u64));
    }
    conn.ok().await
}

/// `GETSET key value`: store the new value, clear any TTL, reply with the old value.
async fn cmd_getset(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'getset' command")
            .await;
    }
    let key = req_key(args, 0, "getset")?;
    let val = req_bytes(args, 1)?;
    let prev = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let p = s.engine.get(&key)?.map(|(_, v)| v);
        s.engine.set(key.clone(), val)?;
        s.clear_expiry(&key);
        p
    };
    conn.bulk(prev.as_deref()).await
}

/// `GETDEL key`: reply with the value and delete the key if it existed.
async fn cmd_getdel(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 1 {
        return conn
            .err("ERR wrong number of arguments for 'getdel' command")
            .await;
    }
    let key = req_key(args, 0, "getdel")?;
    let prev = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let p = s.engine.get(&key)?.map(|(_, v)| v);
        if p.is_some() {
            s.engine.del(key.clone())?;
            s.clear_expiry(&key);
        }
        p
    };
    conn.bulk(prev.as_deref()).await
}

/// `GETEX key [EX s | PX ms | PERSIST]`: reply with the value and optionally
/// update its TTL. Unknown options are ignored; the last TTL option wins.
async fn cmd_getex(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() {
        return conn
            .err("ERR wrong number of arguments for 'getex' command")
            .await;
    }
    let key = req_key(args, 0, "getex")?;

    // Parse before taking the state guard so that a bad argument produces an
    // error reply instead of silently applying a zero (or, for negative input
    // cast to u64, an astronomically large) expiry.
    // None = leave TTL alone, Some(None) = PERSIST, Some(Some(d)) = new TTL.
    let mut ttl_change: Option<Option<Duration>> = None;
    let mut i = 1usize;
    while i < args.len() {
        let opt = args[i]
            .as_str()
            .map(|x| x.to_ascii_uppercase())
            .unwrap_or_default();
        match opt.as_str() {
            "EX" | "PX" => {
                i += 1;
                let n = match parse_int(args.get(i)) {
                    Ok(n) if n > 0 => n as u64,
                    _ => {
                        return conn
                            .err("ERR invalid expire time in 'getex' command")
                            .await
                    }
                };
                let dur = if opt == "EX" {
                    Duration::from_secs(n)
                } else {
                    Duration::from_millis(n)
                };
                ttl_change = Some(Some(dur));
            }
            "PERSIST" => ttl_change = Some(None),
            _ => {}
        }
        i += 1;
    }

    let val = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let v = s.engine.get(&key)?.map(|(_, v)| v);
        if v.is_some() {
            match ttl_change {
                Some(Some(dur)) => s.set_expiry(&key, dur),
                Some(None) => s.clear_expiry(&key),
                None => {}
            }
        }
        v
    };
    conn.bulk(val.as_deref()).await
}

/// `MGET key [key ...]`: array of bulk replies, null for missing keys.
async fn cmd_mget(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() {
        return conn
            .err("ERR wrong number of arguments for 'mget' command")
            .await;
    }
    let keys: Vec<Vec<u8>> = args
        .iter()
        .filter_map(|a| a.as_bytes().map(|b| b.to_vec()))
        .collect();
    let items = {
        let mut s = state_write!(conn);
        keys.iter()
            .map(|k| {
                // Best effort: eviction/lookup errors degrade to a null entry.
                let _ = s.evict_if_expired(k);
                let v = s.engine.get(k).ok().flatten().map(|(_, v)| v);
                encode_bulk(v.as_deref())
            })
            .collect::<Vec<_>>()
    };
    conn.send(encode_array(&items)).await
}

/// `MSET key value [key value ...]`: set every pair and clear their TTLs.
async fn cmd_mset(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() || !args.len().is_multiple_of(2) {
        return conn
            .err("ERR wrong number of arguments for 'mset' command")
            .await;
    }
    // Validate every pair up front so a bad argument cannot leave a
    // half-applied batch behind.
    let mut pairs = Vec::with_capacity(args.len() / 2);
    for pair in args.chunks_exact(2) {
        pairs.push((req_key(pair, 0, "mset")?, req_bytes(pair, 1)?));
    }
    {
        let mut s = state_write!(conn);
        for (k, v) in pairs {
            s.engine.set(k.clone(), v)?;
            s.clear_expiry(&k);
        }
    }
    conn.ok().await
}

/// `MSETNX key value [key value ...]`: set all pairs only if none of the keys
/// exist; replies 1 when written, 0 otherwise.
async fn cmd_msetnx(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() || !args.len().is_multiple_of(2) {
        return conn
            .err("ERR wrong number of arguments for 'msetnx' command")
            .await;
    }
    let mut pairs = Vec::with_capacity(args.len() / 2);
    for pair in args.chunks_exact(2) {
        pairs.push((req_key(pair, 0, "msetnx")?, req_bytes(pair, 1)?));
    }
    let set = {
        let mut s = state_write!(conn);
        let mut all_missing = true;
        for (k, _) in &pairs {
            // A logically expired key must not block the operation.
            s.evict_if_expired(k)?;
            if s.engine.get(k)?.is_some() {
                all_missing = false;
                break;
            }
        }
        if all_missing {
            for (k, v) in pairs {
                s.engine.set(k.clone(), v)?;
                // Same as MSET: a fresh value carries no TTL.
                s.clear_expiry(&k);
            }
        }
        all_missing
    };
    conn.int(if set { 1 } else { 0 }).await
}

/// `APPEND key value`: append to the (possibly absent) value; replies the new length.
async fn cmd_append(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'append' command")
            .await;
    }
    let key = req_key(args, 0, "append")?;
    let suffix = req_bytes(args, 1)?;
    let len = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let mut cur = s.engine.get(&key)?.map(|(_, v)| v).unwrap_or_default();
        cur.extend_from_slice(&suffix);
        let l = cur.len() as i64;
        s.engine.set(key, cur)?;
        l
    };
    conn.int(len).await
}

/// `STRLEN key`: length of the value in bytes, 0 when the key is missing.
async fn cmd_strlen(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 1 {
        return conn
            .err("ERR wrong number of arguments for 'strlen' command")
            .await;
    }
    let key = req_key(args, 0, "strlen")?;
    let len = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        s.engine
            .get(&key)?
            .map(|(_, v)| v.len() as i64)
            .unwrap_or(0)
    };
    conn.int(len).await
}

/// Shared implementation of `INCR`/`DECR`/`INCRBY`/`DECRBY`: add `delta` to
/// the integer stored at `args[0]` (a missing key counts as 0).
///
/// Returns an error if the stored value is not an integer or the sum would
/// overflow `i64`.
async fn cmd_incr(conn: &mut Conn, args: &[RespValue], delta: i64) -> anyhow::Result<()> {
    let key = req_key(args, 0, "incr")?;
    let next = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let cur: i64 = match s.engine.get(&key)? {
            None => 0,
            // A non-numeric value used to be treated as 0 and overwritten,
            // silently destroying the stored data.
            Some((_, v)) => std::str::from_utf8(&v)
                .ok()
                .and_then(|x| x.parse().ok())
                .ok_or_else(|| anyhow::anyhow!("ERR value is not an integer or out of range"))?,
        };
        let n = cur
            .checked_add(delta)
            .ok_or_else(|| anyhow::anyhow!("ERR increment or decrement would overflow"))?;
        s.engine.set(key, n.to_string().into_bytes())?;
        n
    };
    conn.int(next).await
}

/// `INCRBY key delta`.
async fn cmd_incrby(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'incrby' command")
            .await;
    }
    let delta: i64 = parse_int(args.get(1))?;
    cmd_incr(conn, &args[..1], delta).await
}

/// `DECRBY key delta`.
async fn cmd_decrby(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'decrby' command")
            .await;
    }
    let delta: i64 = parse_int(args.get(1))?;
    // `-i64::MIN` is not representable; plain negation would overflow.
    let Some(negated) = delta.checked_neg() else {
        return conn.err("ERR decrement would overflow").await;
    };
    cmd_incr(conn, &args[..1], negated).await
}

/// `INCRBYFLOAT key delta`: add a float to the stored value (a missing key
/// counts as 0) and reply with the new value as a bulk string.
async fn cmd_incrbyfloat(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'incrbyfloat' command")
            .await;
    }
    let key = req_key(args, 0, "incrbyfloat")?;
    let delta: f64 = args[1]
        .as_str()
        .and_then(|s| s.parse().ok())
        .ok_or_else(|| anyhow::anyhow!("ERR not a float"))?;
    let result = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let cur: f64 = match s.engine.get(&key)? {
            None => 0.0,
            // Refuse to overwrite a value that is not a float.
            Some((_, v)) => std::str::from_utf8(&v)
                .ok()
                .and_then(|x| x.parse().ok())
                .ok_or_else(|| anyhow::anyhow!("ERR value is not a valid float"))?,
        };
        let next = cur + delta;
        if next.is_nan() || next.is_infinite() {
            return Err(anyhow::anyhow!(
                "ERR increment would produce NaN or Infinity"
            ));
        }
        let repr = format_float(next);
        s.engine.set(key, repr.as_bytes().to_vec())?;
        repr
    };
    conn.bulk(Some(result.as_bytes())).await
}

/// `GETRANGE key start end`: inclusive byte range; negative indices count
/// from the end of the value.
async fn cmd_getrange(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 3 {
        return conn
            .err("ERR wrong number of arguments for 'getrange' command")
            .await;
    }
    let key = req_key(args, 0, "getrange")?;
    let start: i64 = parse_int(args.get(1)).unwrap_or(0);
    let end: i64 = parse_int(args.get(2)).unwrap_or(-1);
    let slice = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let val = s.engine.get(&key)?.map(|(_, v)| v).unwrap_or_default();
        let len = val.len() as i64;
        let st = if start >= 0 {
            start as usize
        } else {
            (len + start).max(0) as usize
        };
        let en = if end >= 0 {
            (end as usize + 1).min(val.len())
        } else {
            (len + end + 1).max(0) as usize
        };
        if st >= val.len() || st >= en {
            vec![]
        } else {
            val[st..en].to_vec()
        }
    };
    conn.bulk(Some(&slice)).await
}

/// `SETRANGE key offset value`: overwrite bytes starting at `offset`,
/// zero-padding the value if needed; replies the new length.
async fn cmd_setrange(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    /// Upper bound on the resulting value size (512 MiB).
    const MAX_STRING_BYTES: usize = 512 * 1024 * 1024;

    if args.len() != 3 {
        return conn
            .err("ERR wrong number of arguments for 'setrange' command")
            .await;
    }
    let key = req_key(args, 0, "setrange")?;
    let raw_offset: i64 = parse_int(args.get(1))?;
    // A negative offset cast with `as usize` becomes enormous and makes the
    // resize below overflow or try to allocate the whole address space.
    if raw_offset < 0 {
        return conn.err("ERR offset is out of range").await;
    }
    let offset = raw_offset as usize;
    let patch = req_bytes(args, 2)?;
    let needed = match offset.checked_add(patch.len()) {
        Some(n) if n <= MAX_STRING_BYTES => n,
        _ => {
            return conn
                .err("ERR string exceeds maximum allowed size (proto-max-bulk-len)")
                .await
        }
    };
    let len = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        let mut val = s.engine.get(&key)?.map(|(_, v)| v).unwrap_or_default();
        if val.len() < needed {
            val.resize(needed, 0);
        }
        val[offset..needed].copy_from_slice(&patch);
        let l = val.len() as i64;
        s.engine.set(key, val)?;
        l
    };
    conn.int(len).await
}

// ─── Key / generic commands ──────────────────────────────────────────────────

/// `DEL key [key ...]`: delete live keys; replies how many were removed.
async fn cmd_del(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() {
        return conn
            .err("ERR wrong number of arguments for 'del' command")
            .await;
    }
    let count = {
        let mut s = state_write!(conn);
        let mut c = 0i64;
        for a in args {
            if let Some(k) = a.as_bytes() {
                let expired = s.evict_if_expired(k)?;
                if !expired && s.engine.get(k)?.is_some() {
                    s.engine.del(k.to_vec())?;
                    s.ttl.remove(k);
                    c += 1;
                }
            }
        }
        c
    };
    conn.int(count).await
}

/// `EXISTS key [key ...]`: count of arguments that name a live key.
async fn cmd_exists(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() {
        return conn
            .err("ERR wrong number of arguments for 'exists' command")
            .await;
    }
    let count = {
        let mut s = state_write!(conn);
        let mut c = 0i64;
        for a in args {
            if let Some(k) = a.as_bytes() {
                let _ = s.evict_if_expired(k);
                if s.engine.get(k)?.is_some() {
                    c += 1;
                }
            }
        }
        c
    };
    conn.int(count).await
}

/// `TYPE key`: `string` for any live key, `none` otherwise.
async fn cmd_type(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 1 {
        return conn
            .err("ERR wrong number of arguments for 'type' command")
            .await;
    }
    let key = req_key(args, 0, "type")?;
    let t = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        if s.engine.get(&key)?.is_some() {
            "string"
        } else {
            "none"
        }
    };
    conn.send(encode_simple(t)).await
}

/// `TTL` / `PTTL` (`millis = true`): remaining lifetime, -1 when the key has
/// no expiry, -2 when the key does not exist.
async fn cmd_ttl(conn: &mut Conn, args: &[RespValue], millis: bool) -> anyhow::Result<()> {
    if args.len() != 1 {
        return conn.err("ERR wrong number of arguments for command").await;
    }
    let key = req_key(args, 0, "ttl")?;
    let n = {
        let mut s = state_write!(conn);
        if s.evict_if_expired(&key)? || s.engine.get(&key)?.is_none() {
            -2
        } else {
            match s.ttl_ms(&key) {
                None => -1,
                Some(-2) => {
                    let _ = s.engine.del(key.clone());
                    s.ttl.remove(&key);
                    -2
                }
                Some(ms) => {
                    if millis {
                        ms
                    } else {
                        ms / 1000
                    }
                }
            }
        }
    };
    conn.int(n).await
}

/// `EXPIRE` / `PEXPIRE` (`millis = true`): set a relative TTL; replies 1 if
/// the key exists, 0 otherwise.
async fn cmd_expire(conn: &mut Conn, args: &[RespValue], millis: bool) -> anyhow::Result<()> {
    if args.len() < 2 {
        return conn.err("ERR wrong number of arguments for command").await;
    }
    let key = req_key(args, 0, "expire")?;
    let t: i64 = parse_int(args.get(1))?;
    if t <= 0 {
        return conn.err("ERR invalid expire time in command").await;
    }
    let r = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        if s.engine.get(&key)?.is_none() {
            0i64
        } else {
            let dur = if millis {
                Duration::from_millis(t as u64)
            } else {
                Duration::from_secs(t as u64)
            };
            s.set_expiry(&key, dur);
            1
        }
    };
    conn.int(r).await
}

/// `EXPIREAT` / `PEXPIREAT` (`millis = true`): set an absolute deadline. A
/// deadline in the past deletes the key. Replies 1 if the key existed, 0
/// otherwise.
async fn cmd_expireat(conn: &mut Conn, args: &[RespValue], millis: bool) -> anyhow::Result<()> {
    if args.len() < 2 {
        return conn.err("ERR wrong number of arguments for command").await;
    }
    let key = req_key(args, 0, "expireat")?;
    let unix: i64 = parse_int(args.get(1))?;
    let now_ms = unix_now_ms();
    // Saturating arithmetic: extreme client-supplied timestamps must not overflow.
    let delta_ms = if millis {
        unix.saturating_sub(now_ms)
    } else {
        unix.saturating_mul(1000).saturating_sub(now_ms)
    };
    let r = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        if s.engine.get(&key)?.is_none() {
            // Previously a past deadline reported 1 even for a missing key.
            0i64
        } else if delta_ms <= 0 {
            s.engine.del(key.clone())?;
            s.ttl.remove(&key);
            1
        } else {
            s.set_expiry(&key, Duration::from_millis(delta_ms as u64));
            1
        }
    };
    conn.int(r).await
}

/// `EXPIRETIME` / `PEXPIRETIME` (`millis = true`): absolute Unix expiry time,
/// -1 when the key has no expiry, -2 when the key does not exist.
async fn cmd_expiretime(conn: &mut Conn, args: &[RespValue], millis: bool) -> anyhow::Result<()> {
    if args.len() != 1 {
        return conn.err("ERR wrong number of arguments for command").await;
    }
    let key = req_key(args, 0, "expiretime")?;
    let n = {
        let mut s = state_write!(conn);
        if s.evict_if_expired(&key)? || s.engine.get(&key)?.is_none() {
            -2
        } else {
            match s.ttl_ms(&key) {
                None => -1,
                Some(ms) => {
                    let now_ms = unix_now_ms();
                    let unix_ms = now_ms + ms;
                    if millis {
                        unix_ms
                    } else {
                        unix_ms / 1000
                    }
                }
            }
        }
    };
    conn.int(n).await
}

/// `PERSIST key`: remove the TTL; replies 1 if one was removed, 0 otherwise.
async fn cmd_persist(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 1 {
        return conn
            .err("ERR wrong number of arguments for 'persist' command")
            .await;
    }
    let key = req_key(args, 0, "persist")?;
    let r = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&key)?;
        if s.engine.get(&key)?.is_none() {
            0i64
        } else if s.ttl.remove(&key).is_some() {
            1
        } else {
            0
        }
    };
    conn.int(r).await
}

/// `KEYS pattern`: every live key matching the glob pattern.
async fn cmd_keys(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let pattern = match args.first().and_then(|a| a.as_bytes()) {
        Some(p) => String::from_utf8_lossy(p).into_owned(),
        None => {
            return conn
                .err("ERR wrong number of arguments for 'keys' command")
                .await
        }
    };
    let items = {
        let s = state_read!(conn);
        s.engine
            .scan(b"", b"")
            .unwrap_or_default()
            .into_iter()
            .filter(|(k, _)| !s.is_expired(k))
            .filter(|(k, _)| glob_match(&pattern, &String::from_utf8_lossy(k)))
            .map(|(k, _)| encode_bulk(Some(&k)))
            .collect::<Vec<_>>()
    };
    conn.send(encode_array(&items)).await
}

/// `SCAN cursor [MATCH pattern] [COUNT n] [TYPE t]`: returns all matching
/// keys in one page with cursor `0`; the cursor, `COUNT` and `TYPE` arguments
/// are accepted but ignored.
async fn cmd_scan(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    let mut pattern = "*".to_owned();
    // Index 0 is the cursor, which is not consulted.
    let mut i = 1usize;
    while i < args.len() {
        let opt = args[i]
            .as_str()
            .map(|s| s.to_ascii_uppercase())
            .unwrap_or_default();
        match opt.as_str() {
            "MATCH" => {
                i += 1;
                pattern = args
                    .get(i)
                    .and_then(|a| a.as_str())
                    .unwrap_or("*")
                    .to_owned();
            }
            "COUNT" | "TYPE" => {
                i += 1;
            }
            _ => {}
        }
        i += 1;
    }
    let items = {
        let s = state_read!(conn);
        s.engine
            .scan(b"", b"")
            .unwrap_or_default()
            .into_iter()
            .filter(|(k, _)| !s.is_expired(k))
            .filter(|(k, _)| glob_match(&pattern, &String::from_utf8_lossy(k)))
            .map(|(k, _)| encode_bulk(Some(&k)))
            .collect::<Vec<_>>()
    };
    let resp = encode_array(&[encode_bulk(Some(b"0")), encode_array(&items)]);
    conn.send(resp).await
}

/// `RENAME src dst`: move the value and its TTL to `dst`, overwriting it.
/// Errors when `src` does not exist.
async fn cmd_rename(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'rename' command")
            .await;
    }
    let src = req_key(args, 0, "rename")?;
    let dst = req_key(args, 1, "rename")?;
    {
        let mut s = state_write!(conn);
        s.evict_if_expired(&src)?;
        let val = s
            .engine
            .get(&src)?
            .map(|(_, v)| v)
            .ok_or_else(|| anyhow::anyhow!("ERR no such key"))?;
        // Renaming a key onto itself is a no-op; running the set-then-delete
        // sequence below would delete the key.
        if src != dst {
            let ttl = s.ttl.get(&src).cloned();
            s.engine.set(dst.clone(), val)?;
            s.engine.del(src.clone())?;
            s.ttl.remove(&src);
            match ttl {
                Some(t) => {
                    s.ttl.insert(dst, t);
                }
                None => {
                    s.ttl.remove(&dst);
                }
            }
        }
    }
    conn.ok().await
}

/// `RENAMENX src dst`: like `RENAME` but only when `dst` does not exist;
/// replies 1 when renamed, 0 otherwise.
async fn cmd_renamenx(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.len() != 2 {
        return conn
            .err("ERR wrong number of arguments for 'renamenx' command")
            .await;
    }
    let src = req_key(args, 0, "renamenx")?;
    let dst = req_key(args, 1, "renamenx")?;
    let r = {
        let mut s = state_write!(conn);
        s.evict_if_expired(&src)?;
        s.evict_if_expired(&dst)?;
        let val = s
            .engine
            .get(&src)?
            .map(|(_, v)| v)
            .ok_or_else(|| anyhow::anyhow!("ERR no such key"))?;
        if s.engine.get(&dst)?.is_some() {
            0i64
        } else {
            // Carry the TTL across, matching `cmd_rename`; it used to be dropped.
            let ttl = s.ttl.get(&src).cloned();
            s.engine.set(dst.clone(), val)?;
            s.engine.del(src.clone())?;
            s.ttl.remove(&src);
            match ttl {
                Some(t) => {
                    s.ttl.insert(dst, t);
                }
                None => {
                    s.ttl.remove(&dst);
                }
            }
            1
        }
    };
    conn.int(r).await
}

/// `RANDOMKEY`: a pseudo-random live key (index derived from the clock), or
/// null when the keyspace is empty.
async fn cmd_randomkey(conn: &mut Conn) -> anyhow::Result<()> {
    let result = {
        let s = state_read!(conn);
        let all = s.engine.scan(b"", b"").unwrap_or_default();
        let live: Vec<_> = all.iter().filter(|(k, _)| !s.is_expired(k)).collect();
        if live.is_empty() {
            None
        } else {
            let idx = std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map(|d| d.as_nanos() as usize % live.len())
                .unwrap_or(0);
            Some(live[idx].0.clone())
        }
    };
    conn.bulk(result.as_deref()).await
}

/// `TOUCH key [key ...]`: count of arguments that name a live key.
async fn cmd_touch(conn: &mut Conn, args: &[RespValue]) -> anyhow::Result<()> {
    if args.is_empty() {
        return conn
            .err("ERR wrong number of arguments for 'touch' command")
            .await;
    }
    let count = {
        // Nothing is mutated here, so the read guard (as used by KEYS/SCAN)
        // is sufficient.
        let s = state_read!(conn);
        let mut c = 0i64;
        for a in args {
            if let Some(k) = a.as_bytes() {
                if !s.is_expired(k) && s.engine.get(k)?.is_some() {
                    c += 1;
                }
            }
        }
        c
    };
    conn.int(count).await
}

// ─── Small helpers ───────────────────────────────────────────────────────────

/// Returns argument `i` as owned key bytes, or an error if it is missing or
/// not a string. `_cmd` is currently unused.
fn req_key(args: &[RespValue], i: usize, _cmd: &str) -> anyhow::Result<Vec<u8>> {
    args.get(i)
        .and_then(|a| a.as_bytes())
        .map(|b| b.to_vec())
        .ok_or_else(|| anyhow::anyhow!("ERR key must be a string"))
}

/// Returns argument `i` as owned bytes, or an error if it is missing or not a
/// string.
fn req_bytes(args: &[RespValue], i: usize) -> anyhow::Result<Vec<u8>> {
    args.get(i)
        .and_then(|a| a.as_bytes())
        .map(|b| b.to_vec())
        .ok_or_else(|| anyhow::anyhow!("ERR argument must be a string"))
}

/// Parses an optional argument as a signed 64-bit integer.
fn parse_int(v: Option<&RespValue>) -> anyhow::Result<i64> {
    v.and_then(|a| a.as_str())
        .and_then(|s| s.parse().ok())
        .ok_or_else(|| anyhow::anyhow!("ERR value is not an integer or out of range"))
}

/// Seconds since the Unix epoch (0 if the clock reads before the epoch).
fn unix_now_secs() -> i64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_secs() as i64)
        .unwrap_or(0)
}

/// Milliseconds since the Unix epoch (0 if the clock reads before the epoch).
fn unix_now_ms() -> i64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map(|d| d.as_millis() as i64)
        .unwrap_or(0)
}

/// Formats a float for `INCRBYFLOAT`: integral values below 1e15 in magnitude
/// print without a fractional part.
fn format_float(f: f64) -> String {
    if f == f.floor() && f.abs() < 1e15 {
        format!("{}", f as i64)
    } else {
        format!("{}", f)
    }
}

// ─── Glob matching ───────────────────────────────────────────────────────────

/// Returns whether `s` matches `pattern`, where `*` matches any run of
/// characters (including none) and `?` matches exactly one character.
fn glob_match(pattern: &str, s: &str) -> bool {
    let pat: Vec<char> = pattern.chars().collect();
    let str: Vec<char> = s.chars().collect();
    glob_inner(&pat, &str)
}

/// Iterative matcher behind [`glob_match`].
///
/// Only the most recent `*` is remembered for backtracking, which bounds the
/// work at O(pattern × text). The previous recursive form was exponential on
/// patterns such as `a*a*a*a*b`, which a client can supply via `KEYS`/`SCAN`.
fn glob_inner(pat: &[char], s: &[char]) -> bool {
    let mut p = 0usize;
    let mut i = 0usize;
    // (pattern index just after the last `*`, text index that `*` resumes from)
    let mut last_star: Option<(usize, usize)> = None;

    while i < s.len() {
        match pat.get(p) {
            Some(&'*') => {
                // Tentatively let the star match nothing.
                last_star = Some((p + 1, i));
                p += 1;
            }
            Some(&c) if c == '?' || c == s[i] => {
                p += 1;
                i += 1;
            }
            _ => match last_star {
                // Mismatch: let the last star swallow one more character.
                Some((after_star, resume)) => {
                    p = after_star;
                    i = resume + 1;
                    last_star = Some((after_star, resume + 1));
                }
                None => return false,
            },
        }
    }
    // Text consumed: whatever pattern remains must be stars only.
    pat[p..].iter().all(|&c| c == '*')
}

// diff --git a/crates/server/src/lib.rs b/crates/server/src/lib.rs
// new file mode 100644
// index 0000000..35519fd
// --- /dev/null
// +++ b/crates/server/src/lib.rs
// @@ -0,0 +1,41 @@
//! RiptideKV server library.
//!
//! The library interface makes the server independently testable without
//! launching a subprocess. Tests can call [`serve`] with a pre-bound listener.
//!
//! # Public surface
//!
//! - [`serve`] — accept loop (cancellation via dropping the listener)
//! - [`db`] — shared database state (`SharedDb`, `DbState`)
//! - [`resp`] — RESP2 parser / serializer helpers
//! - [`handler`] — per-connection dispatcher (all Redis commands)

pub mod db;
pub mod handler;
pub mod resp;

use db::SharedDb;
use tokio::net::TcpListener;
use tracing::error;

/// Run the accept loop on `listener` until it is dropped or returns an error.
///
/// Each accepted connection is handled in its own Tokio task. The loop
/// continues until `listener.accept()` fails (e.g. the listener is closed).
+pub async fn serve(listener: TcpListener, db: SharedDb) -> anyhow::Result<()> { + loop { + match listener.accept().await { + Ok((stream, _)) => { + stream.set_nodelay(true)?; + let db = db.clone(); + tokio::spawn(async move { + handler::handle_connection(stream, db).await; + }); + } + Err(e) => { + error!("accept error: {e}"); + return Err(e.into()); + } + } + } +} diff --git a/crates/server/src/main.rs b/crates/server/src/main.rs new file mode 100644 index 0000000..d3a1078 --- /dev/null +++ b/crates/server/src/main.rs @@ -0,0 +1,71 @@ +//! RiptideKV — RESP2 server entry point. +//! +//! This is a thin configuration wrapper around `server::serve()`. +//! +//! Configuration via environment variables: +//! RIPTIDE_BIND bind address (default 0.0.0.0:6379) +//! RIPTIDE_WAL_PATH WAL file path (default wal.log) +//! RIPTIDE_SST_DIR SSTable directory (default data/sst) +//! RIPTIDE_FLUSH_KB memtable flush threshold KiB (default 1024) +//! RIPTIDE_WAL_SYNC fsync after every WAL write (default true) + +use engine::Engine; +use server::db::SharedDb; +use tracing::{error, info}; +use tracing_subscriber::EnvFilter; + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env().add_directive("server=info".parse()?)) + .init(); + + let bind = std::env::var("RIPTIDE_BIND").unwrap_or_else(|_| "0.0.0.0:6379".into()); + let wal_path = std::env::var("RIPTIDE_WAL_PATH").unwrap_or_else(|_| "wal.log".into()); + let sst_dir = std::env::var("RIPTIDE_SST_DIR").unwrap_or_else(|_| "data/sst".into()); + let flush_kb: usize = std::env::var("RIPTIDE_FLUSH_KB") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(1024); + let wal_sync: bool = std::env::var("RIPTIDE_WAL_SYNC") + .ok() + .map(|v| v != "false" && v != "0") + .unwrap_or(true); + + std::fs::create_dir_all(&sst_dir)?; + + let engine = Engine::new(&wal_path, &sst_dir, flush_kb * 1024, wal_sync)?; + + let db = SharedDb::new(engine); + let listener = 
tokio::net::TcpListener::bind(&bind).await?; + info!("RiptideKV listening on {bind}"); + + tokio::select! { + res = server::serve(listener, db) => { + if let Err(e) = res { error!("server error: {e}"); } + } + _ = tokio::signal::ctrl_c() => { + info!("shutting down (SIGINT)"); + } + _ = sigterm() => { + info!("shutting down (SIGTERM)"); + } + } + + Ok(()) +} + +/// Resolves when SIGTERM is received on Unix; never resolves on other platforms. +#[cfg(unix)] +async fn sigterm() { + use tokio::signal::unix::{signal, SignalKind}; + signal(SignalKind::terminate()) + .expect("failed to install SIGTERM handler") + .recv() + .await; +} + +#[cfg(not(unix))] +async fn sigterm() { + std::future::pending::<()>().await; +} diff --git a/crates/server/src/resp.rs b/crates/server/src/resp.rs new file mode 100644 index 0000000..c9baeca --- /dev/null +++ b/crates/server/src/resp.rs @@ -0,0 +1,180 @@ +//! RESP2 protocol parser and serializer. +//! +//! Supports all five RESP2 types plus inline command parsing (for redis-cli +//! and telnet clients that don't use the full framing). + +use anyhow::{bail, Result}; +use bytes::Bytes; +use tokio::io::{AsyncBufReadExt, AsyncReadExt, BufReader}; +use tokio::net::tcp::OwnedReadHalf; + +/// A RESP2 value. +#[derive(Debug, Clone, PartialEq)] +pub enum RespValue { + SimpleString(String), + Error(String), + Integer(i64), + BulkString(Option), + Array(Option>), +} + +impl RespValue { + /// Convenience: unwrap a bulk-string or simple-string as UTF-8 bytes. + pub fn as_bytes(&self) -> Option<&[u8]> { + match self { + RespValue::BulkString(Some(b)) => Some(b), + RespValue::SimpleString(s) => Some(s.as_bytes()), + _ => None, + } + } + + /// Convenience: unwrap as a UTF-8 string slice. 
+ pub fn as_str(&self) -> Option<&str> { + match self { + RespValue::BulkString(Some(b)) => std::str::from_utf8(b).ok(), + RespValue::SimpleString(s) => Some(s.as_str()), + _ => None, + } + } +} + +// ─── Async parser ──────────────────────────────────────────────────────────── + +pub struct RespReader { + inner: BufReader, +} + +impl RespReader { + pub fn new(read_half: OwnedReadHalf) -> Self { + Self { + inner: BufReader::with_capacity(8 * 1024, read_half), + } + } + + /// Read one RESP value (or inline command) from the stream. + /// Returns `Ok(None)` on clean EOF. + /// + /// Arrays are parsed by reading each element with `read_item()` to avoid + /// recursive async functions (which require `Box::pin` indirection). + pub async fn read_value(&mut self) -> Result> { + let mut line = String::new(); + let n = self.inner.read_line(&mut line).await?; + if n == 0 { + return Ok(None); + } + let trimmed = line.trim_end_matches(['\r', '\n']); + + if let Some(rest) = trimmed.strip_prefix('*') { + let count: i64 = rest.parse()?; + if count < 0 { + return Ok(Some(RespValue::Array(None))); + } + let mut items = Vec::with_capacity(count as usize); + for _ in 0..count { + items.push(self.read_item().await?); + } + return Ok(Some(RespValue::Array(Some(items)))); + } + + // Non-array top-level value or inline command. + self.parse_scalar(trimmed).await.map(Some) + } + + /// Read exactly one scalar (non-array) RESP item from the stream. + /// Used to read the elements of an array without recursion. + async fn read_item(&mut self) -> Result { + let mut line = String::new(); + let n = self.inner.read_line(&mut line).await?; + if n == 0 { + bail!("unexpected EOF reading RESP item"); + } + let trimmed = line.trim_end_matches(['\r', '\n']); + self.parse_scalar(trimmed).await + } + + /// Parse a single non-array RESP line into a value. 
+ async fn parse_scalar(&mut self, line: &str) -> Result { + match line.as_bytes().first() { + Some(b'+') => Ok(RespValue::SimpleString(line[1..].to_owned())), + Some(b'-') => Ok(RespValue::Error(line[1..].to_owned())), + Some(b':') => Ok(RespValue::Integer(line[1..].parse()?)), + Some(b'$') => { + let len: i64 = line[1..].parse()?; + if len < 0 { + return Ok(RespValue::BulkString(None)); + } + let len = len as usize; + let mut buf = vec![0u8; len + 2]; // +2 for \r\n + self.inner.read_exact(&mut buf).await?; + buf.truncate(len); + Ok(RespValue::BulkString(Some(Bytes::from(buf)))) + } + _ => { + // Inline command: space-separated tokens + let parts: Vec = line + .split_ascii_whitespace() + .map(|s| RespValue::BulkString(Some(Bytes::from(s.as_bytes().to_vec())))) + .collect(); + if parts.is_empty() { + bail!("empty inline command"); + } + // Wrap inline tokens as an array so the dispatcher sees the same shape. + Ok(RespValue::Array(Some(parts))) + } + } + } +} + +// ─── Serializer ────────────────────────────────────────────────────────────── + +/// Encode a RESP2 simple string. +#[inline] +pub fn encode_simple(s: &str) -> Vec { + format!("+{}\r\n", s).into_bytes() +} + +/// Encode a RESP2 error. +#[inline] +pub fn encode_error(msg: &str) -> Vec { + format!("-{}\r\n", msg).into_bytes() +} + +/// Encode a RESP2 integer. +#[inline] +pub fn encode_int(n: i64) -> Vec { + format!(":{}\r\n", n).into_bytes() +} + +/// Encode a RESP2 bulk string (or null). +pub fn encode_bulk(data: Option<&[u8]>) -> Vec { + match data { + None => b"$-1\r\n".to_vec(), + Some(b) => { + let mut out = format!("${}\r\n", b.len()).into_bytes(); + out.extend_from_slice(b); + out.extend_from_slice(b"\r\n"); + out + } + } +} + +/// Encode a RESP2 array (or null). +pub fn encode_array(items: &[Vec]) -> Vec { + let mut out = format!("*{}\r\n", items.len()).into_bytes(); + for item in items { + out.extend_from_slice(item); + } + out +} + +/// Null array. 
+#[inline]
+pub fn encode_null_array() -> Vec<u8> {
+    b"*-1\r\n".to_vec()
+}
+
+/// OK simple string.
+#[inline]
+pub fn ok() -> Vec<u8> {
+    b"+OK\r\n".to_vec()
+}
diff --git a/crates/server/tests/integration.rs b/crates/server/tests/integration.rs
new file mode 100644
index 0000000..5008030
--- /dev/null
+++ b/crates/server/tests/integration.rs
@@ -0,0 +1,1103 @@
+//! End-to-end integration tests for the RiptideKV RESP2 server.
+//!
+//! Every test:
+//!   1. Spins up a real `TcpListener` on a free port (OS assigns port 0).
+//!   2. Starts the server via `server::serve()` in a background Tokio task.
+//!   3. Connects a lightweight `TestClient` that speaks raw RESP2.
+//!   4. Sends commands and asserts responses.
+//!
+//! Tests are deliberately self-contained — each gets a fresh engine whose
+//! WAL and SSTables live in its own temporary directory. This means tests
+//! can run in parallel without interfering with each other.
+
+use engine::Engine;
+use server::db::SharedDb;
+use std::time::Duration;
+use tempfile::tempdir;
+use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
+use tokio::net::{TcpListener, TcpStream};
+
+// ─── Test infrastructure ──────────────────────────────────────────────────────
+
+/// Start a server on an OS-assigned free port in a background Tokio task.
+/// Returns the bound socket address and a handle to the server's `SharedDb`.
+async fn start_server() -> (std::net::SocketAddr, SharedDb) {
+    let dir = Box::leak(Box::new(tempdir().unwrap())); // leaked on purpose: never dropped, so the dir outlives the server task
+    let engine = Engine::new(
+        dir.path().join("wal.log"),
+        dir.path().join("sst"),
+        64 * 1024 * 1024, // large threshold — no auto-flush noise in tests
+        false,
+    )
+    .unwrap();
+    let db = SharedDb::new(engine);
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let db2 = db.clone();
+    tokio::spawn(async move {
+        server::serve(listener, db2).await.ok();
+    });
+    (addr, db)
+}
+
+/// A minimal async RESP2 client for tests.
+struct TestClient { + reader: BufReader, + writer: tokio::net::tcp::OwnedWriteHalf, +} + +impl TestClient { + async fn connect(addr: std::net::SocketAddr) -> Self { + let stream = TcpStream::connect(addr).await.unwrap(); + let (r, w) = stream.into_split(); + Self { + reader: BufReader::new(r), + writer: w, + } + } + + /// Send a RESP2 array command (the format all Redis clients use). + async fn send(&mut self, args: &[&str]) { + let mut buf = format!("*{}\r\n", args.len()).into_bytes(); + for a in args { + buf.extend_from_slice(format!("${}\r\n{}\r\n", a.len(), a).as_bytes()); + } + self.writer.write_all(&buf).await.unwrap(); + } + + /// Read a single RESP2 response line (the first line of the response). + async fn read_line(&mut self) -> String { + let mut line = String::new(); + self.reader.read_line(&mut line).await.unwrap(); + line.trim_end_matches(['\r', '\n']).to_owned() + } + + /// Read a complete RESP2 response and return it as a `Response`. + async fn recv(&mut self) -> Response { + let line = self.read_line().await; + match line.as_bytes().first() { + Some(b'+') => Response::Simple(line[1..].to_owned()), + Some(b'-') => Response::Error(line[1..].to_owned()), + Some(b':') => Response::Int(line[1..].parse().unwrap()), + Some(b'$') => { + let n: i64 = line[1..].parse().unwrap(); + if n < 0 { + return Response::Null; + } + let n = n as usize; + let mut buf = vec![0u8; n + 2]; + self.reader.read_exact(&mut buf).await.unwrap(); + buf.truncate(n); + Response::Bulk(buf) + } + Some(b'*') => { + let count: i64 = line[1..].parse().unwrap(); + if count < 0 { + return Response::NullArray; + } + let mut items = Vec::new(); + for _ in 0..count { + let item = Box::pin(self.recv()).await; + items.push(item); + } + Response::Array(items) + } + _ => panic!("unexpected RESP line: {:?}", line), + } + } + + /// Convenience: send + recv in one call. 
+ async fn cmd(&mut self, args: &[&str]) -> Response { + self.send(args).await; + self.recv().await + } + + /// Convenience: expect "+OK". + async fn ok(&mut self, args: &[&str]) { + assert_eq!(self.cmd(args).await, Response::Simple("OK".into())); + } + + /// Convenience: expect a specific integer. + async fn int(&mut self, args: &[&str], expected: i64) { + assert_eq!(self.cmd(args).await, Response::Int(expected)); + } + + /// Convenience: expect a bulk string with this UTF-8 content. + async fn bulk_str(&mut self, args: &[&str], expected: &str) { + assert_eq!( + self.cmd(args).await, + Response::Bulk(expected.as_bytes().to_vec()) + ); + } + + /// Convenience: expect null bulk. + async fn null(&mut self, args: &[&str]) { + assert_eq!(self.cmd(args).await, Response::Null); + } +} + +#[derive(Debug, PartialEq, Clone)] +enum Response { + Simple(String), + Error(String), + Int(i64), + Bulk(Vec), + Null, + NullArray, + Array(Vec), +} + +impl Response { + fn as_str(&self) -> &str { + match self { + Response::Bulk(b) => std::str::from_utf8(b).unwrap(), + Response::Simple(s) => s.as_str(), + _ => panic!("not a string response"), + } + } + fn as_int(&self) -> i64 { + match self { + Response::Int(n) => *n, + _ => panic!("not an int response"), + } + } + fn is_error(&self) -> bool { + matches!(self, Response::Error(_)) + } + fn error_msg(&self) -> &str { + match self { + Response::Error(s) => s.as_str(), + _ => panic!("not an error"), + } + } +} + +// ─── Connection / server commands ───────────────────────────────────────────── + +#[tokio::test] +async fn test_ping_no_args() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + assert_eq!(c.cmd(&["PING"]).await, Response::Simple("PONG".into())); +} + +#[tokio::test] +async fn test_ping_with_message() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + assert_eq!( + c.cmd(&["PING", "hello"]).await, + Response::Bulk(b"hello".to_vec()) + ); +} + 
+#[tokio::test] +async fn test_echo() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.bulk_str(&["ECHO", "RiptideKV"], "RiptideKV").await; +} + +#[tokio::test] +async fn test_select_zero_ok() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SELECT", "0"]).await; +} + +#[tokio::test] +async fn test_select_nonzero_error() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["SELECT", "1"]).await; + assert!(r.is_error()); +} + +#[tokio::test] +async fn test_quit_closes_connection() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["QUIT"]).await; + // Next read should get EOF (connection closed by server). + let mut line = String::new(); + let n = c.reader.read_line(&mut line).await.unwrap(); + assert_eq!(n, 0, "server should close connection after QUIT"); +} + +#[tokio::test] +async fn test_client_setname_getname() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["CLIENT", "SETNAME", "myconn"]).await; + c.bulk_str(&["CLIENT", "GETNAME"], "myconn").await; +} + +#[tokio::test] +async fn test_client_id() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["CLIENT", "ID"]).await; + assert!(matches!(r, Response::Int(n) if n > 0)); +} + +#[tokio::test] +async fn test_hello_resp2() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["HELLO", "2"]).await; + assert!( + matches!(r, Response::Array(_)), + "HELLO should return an array" + ); +} + +#[tokio::test] +async fn test_hello_resp3_rejected() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["HELLO", "3"]).await; + assert!(r.is_error()); + assert!(r.error_msg().contains("NOPROTO")); +} + 
+#[tokio::test] +async fn test_info_returns_bulk() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["INFO"]).await; + assert!(matches!(&r, Response::Bulk(b) if !b.is_empty())); + assert!(r.as_str().contains("redis_version")); +} + +#[tokio::test] +async fn test_info_section_server() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["INFO", "server"]).await; + assert!(r.as_str().contains("uptime_in_seconds")); +} + +#[tokio::test] +async fn test_command_count() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["COMMAND", "COUNT"]).await; + assert!(matches!(r, Response::Int(n) if n > 0)); +} + +#[tokio::test] +async fn test_config_get_returns_empty_array() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["CONFIG", "GET", "*"]).await; + assert!(matches!(r, Response::Array(_))); +} + +#[tokio::test] +async fn test_unknown_command_error() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["XYZZY"]).await; + assert!(r.is_error()); + assert!(r.error_msg().contains("unknown command")); +} + +// ─── GET / SET ──────────────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_set_and_get() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "name", "alice"]).await; + c.bulk_str(&["GET", "name"], "alice").await; +} + +#[tokio::test] +async fn test_get_missing_key_returns_null() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.null(&["GET", "no-such-key"]).await; +} + +#[tokio::test] +async fn test_set_overwrites_existing_value() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v1"]).await; + 
c.ok(&["SET", "k", "v2"]).await; + c.bulk_str(&["GET", "k"], "v2").await; +} + +#[tokio::test] +async fn test_set_nx_only_sets_when_absent() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + // Key absent → set succeeds, returns OK. + c.ok(&["SET", "nx_key", "first", "NX"]).await; + // Key present → set fails, returns null. + c.null(&["SET", "nx_key", "second", "NX"]).await; + // Value unchanged. + c.bulk_str(&["GET", "nx_key"], "first").await; +} + +#[tokio::test] +async fn test_set_xx_only_sets_when_present() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + // Key absent → XX fails. + c.null(&["SET", "xx_key", "v", "XX"]).await; + // Create it first. + c.ok(&["SET", "xx_key", "original"]).await; + // Now XX succeeds. + c.ok(&["SET", "xx_key", "updated", "XX"]).await; + c.bulk_str(&["GET", "xx_key"], "updated").await; +} + +#[tokio::test] +async fn test_set_get_flag_returns_old_value() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "old"]).await; + let r = c.cmd(&["SET", "k", "new", "GET"]).await; + assert_eq!(r, Response::Bulk(b"old".to_vec())); + c.bulk_str(&["GET", "k"], "new").await; +} + +#[tokio::test] +async fn test_set_with_ex_expiry() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "expkey", "hello", "EX", "100"]).await; + let ttl = c.cmd(&["TTL", "expkey"]).await.as_int(); + assert!( + ttl > 0 && ttl <= 100, + "TTL should be between 1 and 100, got {}", + ttl + ); +} + +#[tokio::test] +async fn test_set_with_px_expiry() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "pxkey", "v", "PX", "100000"]).await; + let pttl = c.cmd(&["PTTL", "pxkey"]).await.as_int(); + assert!(pttl > 0 && pttl <= 100_000); +} + +#[tokio::test] +async fn test_set_keepttl_preserves_expiry() { + let (addr, _db) = 
start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v1", "EX", "100"]).await; + c.ok(&["SET", "k", "v2", "KEEPTTL"]).await; + let ttl = c.cmd(&["TTL", "k"]).await.as_int(); + assert!(ttl > 0, "KEEPTTL should preserve the TTL, got {}", ttl); +} + +#[tokio::test] +async fn test_set_invalid_ex_returns_error() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["SET", "k", "v", "EX", "-1"]).await; + assert!(r.is_error()); +} + +// ─── SETNX / SETEX / PSETEX ─────────────────────────────────────────────────── + +#[tokio::test] +async fn test_setnx() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["SETNX", "k", "v"], 1).await; + c.int(&["SETNX", "k", "v2"], 0).await; + c.bulk_str(&["GET", "k"], "v").await; +} + +#[tokio::test] +async fn test_setex() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SETEX", "k", "60", "val"]).await; + let ttl = c.cmd(&["TTL", "k"]).await.as_int(); + assert!(ttl > 0 && ttl <= 60); +} + +#[tokio::test] +async fn test_setex_invalid_timeout_returns_error() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["SETEX", "k", "0", "v"]).await; + assert!(r.is_error()); +} + +#[tokio::test] +async fn test_psetex() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["PSETEX", "k", "60000", "val"]).await; + let pttl = c.cmd(&["PTTL", "k"]).await.as_int(); + assert!(pttl > 0 && pttl <= 60_000); +} + +// ─── GETSET / GETDEL / GETEX ────────────────────────────────────────────────── + +#[tokio::test] +async fn test_getset() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "old"]).await; + let r = c.cmd(&["GETSET", "k", "new"]).await; + assert_eq!(r, Response::Bulk(b"old".to_vec())); + 
c.bulk_str(&["GET", "k"], "new").await; +} + +#[tokio::test] +async fn test_getset_missing_key_returns_null() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["GETSET", "absent", "val"]).await; + assert_eq!(r, Response::Null); + c.bulk_str(&["GET", "absent"], "val").await; +} + +#[tokio::test] +async fn test_getdel_deletes_and_returns() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + let r = c.cmd(&["GETDEL", "k"]).await; + assert_eq!(r, Response::Bulk(b"v".to_vec())); + c.null(&["GET", "k"]).await; +} + +#[tokio::test] +async fn test_getdel_missing_returns_null() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.null(&["GETDEL", "absent"]).await; +} + +#[tokio::test] +async fn test_getex_sets_expiry() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + c.bulk_str(&["GETEX", "k", "EX", "100"], "v").await; + let ttl = c.cmd(&["TTL", "k"]).await.as_int(); + assert!(ttl > 0 && ttl <= 100); +} + +#[tokio::test] +async fn test_getex_persist_removes_expiry() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v", "EX", "100"]).await; + c.bulk_str(&["GETEX", "k", "PERSIST"], "v").await; + c.int(&["TTL", "k"], -1).await; +} + +// ─── MGET / MSET / MSETNX ───────────────────────────────────────────────────── + +#[tokio::test] +async fn test_mset_and_mget() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "k1", "v1", "k2", "v2", "k3", "v3"]).await; + let r = c.cmd(&["MGET", "k1", "k2", "k3", "absent"]).await; + assert_eq!( + r, + Response::Array(vec![ + Response::Bulk(b"v1".to_vec()), + Response::Bulk(b"v2".to_vec()), + Response::Bulk(b"v3".to_vec()), + Response::Null, + ]) + ); +} + +#[tokio::test] 
+async fn test_msetnx_all_absent_succeeds() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["MSETNX", "a", "1", "b", "2"], 1).await; + c.bulk_str(&["GET", "a"], "1").await; +} + +#[tokio::test] +async fn test_msetnx_any_present_fails() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "existing", "x"]).await; + c.int(&["MSETNX", "existing", "new", "fresh", "y"], 0).await; + // "fresh" must not have been set either (atomicity). + c.null(&["GET", "fresh"]).await; +} + +// ─── APPEND / STRLEN ────────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_append_creates_and_extends() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["APPEND", "k", "hello"], 5).await; + c.int(&["APPEND", "k", " world"], 11).await; + c.bulk_str(&["GET", "k"], "hello world").await; +} + +#[tokio::test] +async fn test_strlen() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "hello"]).await; + c.int(&["STRLEN", "k"], 5).await; + c.int(&["STRLEN", "absent"], 0).await; +} + +// ─── INCR / INCRBY / INCRBYFLOAT / DECR / DECRBY ───────────────────────────── + +#[tokio::test] +async fn test_incr_creates_and_increments() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["INCR", "counter"], 1).await; + c.int(&["INCR", "counter"], 2).await; + c.int(&["INCR", "counter"], 3).await; +} + +#[tokio::test] +async fn test_incrby() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "n", "10"]).await; + c.int(&["INCRBY", "n", "5"], 15).await; + c.int(&["INCRBY", "n", "-3"], 12).await; +} + +#[tokio::test] +async fn test_decr() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "n", "10"]).await; + 
c.int(&["DECR", "n"], 9).await; +} + +#[tokio::test] +async fn test_decrby() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "n", "100"]).await; + c.int(&["DECRBY", "n", "30"], 70).await; +} + +#[tokio::test] +async fn test_incrbyfloat() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "f", "10"]).await; + let r = c.cmd(&["INCRBYFLOAT", "f", "1.5"]).await; + // Returns as a bulk string. + assert_eq!(r.as_str(), "11.5"); +} + +// ─── GETRANGE / SETRANGE ────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_getrange() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "hello world"]).await; + c.bulk_str(&["GETRANGE", "k", "0", "4"], "hello").await; + c.bulk_str(&["GETRANGE", "k", "6", "-1"], "world").await; + c.bulk_str(&["GETRANGE", "k", "0", "-1"], "hello world") + .await; +} + +#[tokio::test] +async fn test_setrange_extends_string() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "Hello World"]).await; + c.int(&["SETRANGE", "k", "6", "Redis"], 11).await; + c.bulk_str(&["GET", "k"], "Hello Redis").await; +} + +// ─── DEL / EXISTS / TYPE ────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_del_single_key() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + c.int(&["DEL", "k"], 1).await; + c.null(&["GET", "k"]).await; +} + +#[tokio::test] +async fn test_del_multiple_keys_returns_count() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "k1", "v1", "k2", "v2", "k3", "v3"]).await; + c.int(&["DEL", "k1", "k2", "k3", "absent"], 3).await; +} + +#[tokio::test] +async fn test_del_missing_key_returns_zero() { + let (addr, _db) = 
start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["DEL", "absent"], 0).await; +} + +#[tokio::test] +async fn test_exists_present_and_absent() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + c.int(&["EXISTS", "k"], 1).await; + c.int(&["EXISTS", "absent"], 0).await; +} + +#[tokio::test] +async fn test_exists_multiple_keys() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "k1", "v1", "k2", "v2"]).await; + // k1 counted twice (both references present). + c.int(&["EXISTS", "k1", "k1", "k2", "absent"], 3).await; +} + +#[tokio::test] +async fn test_type_string_and_none() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + assert_eq!( + c.cmd(&["TYPE", "k"]).await, + Response::Simple("string".into()) + ); + assert_eq!( + c.cmd(&["TYPE", "absent"]).await, + Response::Simple("none".into()) + ); +} + +// ─── TTL / PTTL / EXPIRE / PEXPIRE / PERSIST / EXPIRETIME ──────────────────── + +#[tokio::test] +async fn test_ttl_no_expiry_returns_minus_one() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + c.int(&["TTL", "k"], -1).await; + c.int(&["PTTL", "k"], -1).await; +} + +#[tokio::test] +async fn test_ttl_absent_key_returns_minus_two() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["TTL", "absent"], -2).await; +} + +#[tokio::test] +async fn test_expire_and_ttl() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + c.int(&["EXPIRE", "k", "60"], 1).await; + let ttl = c.cmd(&["TTL", "k"]).await.as_int(); + assert!( + ttl > 0 && ttl <= 60, + "TTL should be in (0, 60], got {}", + ttl + ); +} + +#[tokio::test] +async fn test_pexpire_and_pttl() { + 
let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + c.int(&["PEXPIRE", "k", "60000"], 1).await; + let pttl = c.cmd(&["PTTL", "k"]).await.as_int(); + assert!(pttl > 0 && pttl <= 60_000); +} + +#[tokio::test] +async fn test_persist_removes_expiry() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v", "EX", "60"]).await; + c.int(&["PERSIST", "k"], 1).await; + c.int(&["TTL", "k"], -1).await; + // Calling PERSIST again on a key with no TTL should return 0. + c.int(&["PERSIST", "k"], 0).await; +} + +#[tokio::test] +async fn test_key_expires_and_becomes_invisible() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + // Set a key with a short 50 ms TTL (long enough to observe it before it expires). + c.ok(&["SET", "k", "v", "PX", "50"]).await; + // Still visible immediately. + assert_ne!(c.cmd(&["GET", "k"]).await, Response::Null); + // Wait for expiry. + tokio::time::sleep(Duration::from_millis(100)).await; + // Should now be gone.
+ c.null(&["GET", "k"]).await; + c.int(&["EXISTS", "k"], 0).await; + c.int(&["TTL", "k"], -2).await; +} + +#[tokio::test] +async fn test_expireat_sets_unix_expiry() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v"]).await; + let future_unix = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs() + + 120; + let r = c.cmd(&["EXPIREAT", "k", &future_unix.to_string()]).await; + assert_eq!(r, Response::Int(1)); + let ttl = c.cmd(&["TTL", "k"]).await.as_int(); + assert!(ttl > 0 && ttl <= 120); +} + +// ─── KEYS / SCAN / DBSIZE / FLUSHDB ────────────────────────────────────────── + +#[tokio::test] +async fn test_keys_wildcard() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "user:1", "a", "user:2", "b", "item:1", "c"]) + .await; + let r = c.cmd(&["KEYS", "user:*"]).await; + let keys = match r { + Response::Array(v) => v, + _ => panic!("expected array"), + }; + assert_eq!(keys.len(), 2); + assert!(keys.iter().all(|k| k.as_str().starts_with("user:"))); +} + +#[tokio::test] +async fn test_keys_question_mark_pattern() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "foo", "1", "bar", "2", "baz", "3", "foobar", "4"]) + .await; + let r = c.cmd(&["KEYS", "???"]).await; + let keys = match r { + Response::Array(v) => v, + _ => panic!("expected array"), + }; + // foo, bar, baz should match (3 chars); foobar should not + assert_eq!(keys.len(), 3); +} + +#[tokio::test] +async fn test_scan_basic() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "k1", "v1", "k2", "v2", "k3", "v3"]).await; + let r = c.cmd(&["SCAN", "0"]).await; + match r { + Response::Array(v) => { + assert_eq!(v.len(), 2); + assert_eq!(v[0].as_str(), "0"); // cursor always 0 (we return all at once) + match &v[1] { + 
Response::Array(keys) => assert_eq!(keys.len(), 3), + _ => panic!("expected nested array for keys"), + } + } + _ => panic!("expected array from SCAN"), + } +} + +#[tokio::test] +async fn test_scan_with_match_pattern() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "prefix:1", "a", "prefix:2", "b", "other", "c"]) + .await; + let r = c.cmd(&["SCAN", "0", "MATCH", "prefix:*"]).await; + match r { + Response::Array(v) => match &v[1] { + Response::Array(keys) => assert_eq!(keys.len(), 2), + _ => panic!(), + }, + _ => panic!("expected array"), + } +} + +#[tokio::test] +async fn test_dbsize() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.int(&["DBSIZE"], 0).await; + c.ok(&["MSET", "a", "1", "b", "2", "c", "3"]).await; + c.int(&["DBSIZE"], 3).await; +} + +#[tokio::test] +async fn test_flushdb_clears_all_keys() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "k1", "v1", "k2", "v2"]).await; + c.ok(&["FLUSHDB"]).await; + c.int(&["DBSIZE"], 0).await; + c.null(&["GET", "k1"]).await; +} + +// ─── RENAME / RENAMENX ──────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_rename() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "src", "hello"]).await; + c.ok(&["RENAME", "src", "dst"]).await; + c.null(&["GET", "src"]).await; + c.bulk_str(&["GET", "dst"], "hello").await; +} + +#[tokio::test] +async fn test_rename_missing_source_errors() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["RENAME", "absent", "dst"]).await; + assert!(r.is_error()); +} + +#[tokio::test] +async fn test_rename_preserves_ttl() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "src", "v", "EX", "100"]).await; + c.ok(&["RENAME", "src", 
"dst"]).await; + let ttl = c.cmd(&["TTL", "dst"]).await.as_int(); + assert!(ttl > 0, "TTL should be transferred on rename"); +} + +#[tokio::test] +async fn test_renamenx_absent_destination() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "src", "v"]).await; + c.int(&["RENAMENX", "src", "dst"], 1).await; + c.bulk_str(&["GET", "dst"], "v").await; + c.null(&["GET", "src"]).await; +} + +#[tokio::test] +async fn test_renamenx_present_destination_fails() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "src", "sv", "dst", "dv"]).await; + c.int(&["RENAMENX", "src", "dst"], 0).await; + // Both keys unchanged. + c.bulk_str(&["GET", "src"], "sv").await; + c.bulk_str(&["GET", "dst"], "dv").await; +} + +// ─── TOUCH ──────────────────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_touch_counts_existing_keys() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "k1", "v", "k2", "v"]).await; + c.int(&["TOUCH", "k1", "k2", "absent"], 2).await; +} + +// ─── Concurrent clients ─────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_multiple_concurrent_clients() { + let (addr, _db) = start_server().await; + + let handles: Vec<_> = (0..10u32) + .map(|i| { + tokio::spawn(async move { + let mut c = TestClient::connect(addr).await; + let key = format!("client_{}", i); + let val = format!("value_{}", i); + c.ok(&["SET", &key, &val]).await; + c.bulk_str(&["GET", &key], &val).await; + }) + }) + .collect(); + + for h in handles { + h.await.unwrap(); + } +} + +#[tokio::test] +async fn test_concurrent_incr_correctness() { + let (addr, _db) = start_server().await; + + // 50 tasks each increment "shared_counter" 10 times = 500 total. 
+ let handles: Vec<_> = (0..50u32) + .map(|_| { + tokio::spawn(async move { + let mut c = TestClient::connect(addr).await; + for _ in 0..10 { + c.cmd(&["INCR", "shared_counter"]).await; + } + }) + }) + .collect(); + + for h in handles { + h.await.unwrap(); + } + + let mut c = TestClient::connect(addr).await; + let final_val = c.cmd(&["GET", "shared_counter"]).await; + let n: i64 = final_val.as_str().parse().unwrap(); + assert_eq!( + n, 500, + "concurrent INCRs should be serialized by the RwLock" + ); +} + +// ─── ACL / slow-log / memory / latency stubs ────────────────────────────────── + +#[tokio::test] +async fn test_acl_whoami() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.bulk_str(&["ACL", "WHOAMI"], "default").await; +} + +#[tokio::test] +async fn test_slowlog_get_returns_empty() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["SLOWLOG", "GET"]).await; + assert!(matches!(r, Response::Array(v) if v.is_empty())); +} + +#[tokio::test] +async fn test_memory_usage() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let r = c.cmd(&["MEMORY", "USAGE", "somekey"]).await; + // We return null for MEMORY USAGE (not supported in detail). + assert!(matches!(r, Response::Null)); +} + +// ─── Pipelining ─────────────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_pipelining_set_mget() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + + // Send 5 SETs without waiting for responses (pipelining). + for i in 0..5u32 { + c.send(&["SET", &format!("pk{}", i), &format!("pv{}", i)]) + .await; + } + // Collect all 5 OK responses. + for _ in 0..5 { + assert_eq!(c.recv().await, Response::Simple("OK".into())); + } + + // Now MGET all 5 keys in one shot. 
+ let keys: Vec = (0..5).map(|i| format!("pk{}", i)).collect(); + let mut args = vec!["MGET"]; + for k in &keys { + args.push(k.as_str()); + } + let r = c.cmd(&args).await; + match r { + Response::Array(v) => { + assert_eq!(v.len(), 5); + for (i, item) in v.iter().enumerate() { + assert_eq!(item.as_str(), &format!("pv{}", i)); + } + } + _ => panic!("expected array"), + } +} + +// ─── Edge cases ─────────────────────────────────────────────────────────────── + +#[tokio::test] +async fn test_set_and_get_binary_safe_value() { + let (addr, _db) = start_server().await; + // Build a command manually with binary (non-UTF-8) bytes in the value. + let mut c = TestClient::connect(addr).await; + // We use SET with a value of 3 bytes 0x00 0x01 0x02. + let mut raw = b"*3\r\n$3\r\nSET\r\n$6\r\nbinkey\r\n$3\r\n\x00\x01\x02\r\n".to_vec(); + c.writer.write_all(&raw).await.unwrap(); + raw.clear(); + assert_eq!(c.recv().await, Response::Simple("OK".into())); + + c.send(&["GET", "binkey"]).await; + let r = c.recv().await; + assert_eq!(r, Response::Bulk(vec![0x00, 0x01, 0x02])); +} + +#[tokio::test] +async fn test_incr_on_non_numeric_value_errors() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "notanumber"]).await; + let r = c.cmd(&["INCR", "k"]).await; + // The engine returns 0 for unparseable numbers (graceful degradation), + // so we just check the key is still accessible. + // If it errors, that's also acceptable — just not a panic. 
+ let _ = r; +} + +#[tokio::test] +async fn test_set_large_value() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + let large_val = "x".repeat(65_536); // 64 KiB + c.ok(&["SET", "bigkey", &large_val]).await; + let r = c.cmd(&["GET", "bigkey"]).await; + assert_eq!(r.as_str(), large_val); +} + +#[tokio::test] +async fn test_del_after_expiry_returns_zero() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v", "PX", "50"]).await; + tokio::time::sleep(Duration::from_millis(100)).await; + // After expiry, DEL should return 0 (key not found). + c.int(&["DEL", "k"], 0).await; +} + +#[tokio::test] +async fn test_set_clears_existing_ttl() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["SET", "k", "v", "EX", "100"]).await; + // Overwrite with a plain SET (no EX) → TTL should be cleared. + c.ok(&["SET", "k", "v2"]).await; + c.int(&["TTL", "k"], -1).await; +} + +// ─── INFO stats reflect server activity ──────────────────────────────────────── + +#[tokio::test] +async fn test_info_dbsize_reflects_keys() { + let (addr, _db) = start_server().await; + let mut c = TestClient::connect(addr).await; + c.ok(&["MSET", "x", "1", "y", "2", "z", "3"]).await; + let r = c.cmd(&["INFO", "keyspace"]).await; + let info = r.as_str().to_owned(); + assert!( + info.contains("keys=3"), + "keyspace info should show 3 keys: {}", + info + ); +} diff --git a/crates/sstable/src/format.rs b/crates/sstable/src/format.rs index e4d173e..1eb3443 100644 --- a/crates/sstable/src/format.rs +++ b/crates/sstable/src/format.rs @@ -24,8 +24,7 @@ //! then seeking back to read the appropriate footer size. 
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use std::io; -use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::io::{self, Read, Result as IoResult, Seek, SeekFrom, Write}; /// Magic number identifying SSTable v1 files (ASCII "SST1") Can be Removed Later. pub const SSTABLE_MAGIC_V1: u32 = 0x5353_5431; diff --git a/crates/sstable/src/reader.rs b/crates/sstable/src/reader.rs index 4e9194c..d330bb5 100644 --- a/crates/sstable/src/reader.rs +++ b/crates/sstable/src/reader.rs @@ -182,7 +182,6 @@ impl SSTableReader { let seq = f.read_u64::()?; let present = f.read_u8()?; - // let (value, val_bytes) = if present == 1 { let value = if present == 1 { let val_len = f.read_u32::()? as usize; if val_len > MAX_VALUE_BYTES { @@ -194,7 +193,7 @@ impl SSTableReader { } let mut val = vec![0u8; val_len]; f.read_exact(&mut val)?; - Some(val.clone()) + Some(val) } else { None }; diff --git a/crates/wal/src/lib.rs b/crates/wal/src/lib.rs index 3946e8b..2778820 100644 --- a/crates/wal/src/lib.rs +++ b/crates/wal/src/lib.rs @@ -100,11 +100,7 @@ impl WalWriter { /// * `path` - file system path for the WAL (created if it does not exist). /// * `sync` - if true, every `append` call is followed by `fsync`. pub fn create>(path: P, sync: bool) -> Result { - let file = OpenOptions::new() - .create(true) - .append(true) - .read(true) - .open(path)?; + let file = OpenOptions::new().create(true).append(true).open(path)?; Ok(Self { file, sync, diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..518d18c --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,618 @@ +# RiptideKV — Architecture Reference + +> **Who is this for?** +> A new engineer joining the project, or anyone wanting to understand exactly how +> every byte flows through the system. No prior database internals knowledge +> required — concepts are introduced before code is referenced. + +--- + +## Table of Contents + +1. [Big Picture](#1-big-picture) +2. 
[Crate Map](#2-crate-map) +3. [Write Path — Step by Step](#3-write-path--step-by-step) +4. [Read Path — Step by Step](#4-read-path--step-by-step) +5. [Flush — Memtable → SSTable](#5-flush--memtable--sstable) +6. [Compaction — Merging SSTables](#6-compaction--merging-sstables) +7. [Recovery — Surviving a Crash](#7-recovery--surviving-a-crash) +8. [File Formats](#8-file-formats) + - [WAL Record](#81-wal-record) + - [SSTable (v3)](#82-sstable-v3) + - [Manifest](#83-manifest) +9. [RESP2 Server Architecture](#9-resp2-server-architecture) +10. [Concurrency Model](#10-concurrency-model) +11. [Configuration Surface](#11-configuration-surface) +12. [Key Design Decisions & Trade-offs](#12-key-design-decisions--trade-offs) + +--- + +## 1. Big Picture + +RiptideKV is a **Log-Structured Merge (LSM) tree** key-value store exposed +over the **Redis Serialization Protocol (RESP2)**. Its architecture mirrors +the storage engines inside systems like LevelDB and RocksDB, but at a scale +that makes every piece easy to read and understand. 
+ +``` +┌──────────────────────────────────────────────────────────────────┐ +│ Clients │ +│ redis-cli / Jedis (Java) / lettuce / telnet │ +└────────────────────────────┬─────────────────────────────────────┘ + │ TCP (RESP2) + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ crates/server │ +│ TcpListener ──► per-connection Tokio task │ +│ RespReader + handler::dispatch() │ +│ SharedDb (Arc>) │ +└────────────────────────────┬─────────────────────────────────────┘ + │ engine API (set / get / del / scan) + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ crates/engine │ +│ │ +│ ┌──────────┐ WAL append ┌────────────┐ │ +│ │ Memtable │ ◄────────────── │ WAL │ │ +│ │ (BTreeMap│ (durability) │ (wal.log) │ │ +│ │ sorted) │ └────────────┘ │ +│ └────┬─────┘ │ +│ │ flush (threshold exceeded) │ +│ ▼ │ +│ ┌──────────────────────────────────────────────────────┐ │ +│ │ SSTables on disk │ │ +│ │ L0 (fresh flushes, may overlap in key range) │ │ +│ │ L1 (post-compaction, non-overlapping, one file) │ │ +│ └──────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ per-file index │ +│ ┌────┴──────────┐ │ +│ │ Bloom Filter │ (fast "definitely not here" checks) │ +│ └───────────────┘ │ +│ │ │ +│ ┌────┴──────────┐ │ +│ │ Manifest │ (which .sst file is L0 vs L1) │ +│ └───────────────┘ │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 2. 
Crate Map + +``` +RiptideKV/ +├── Cargo.toml workspace root (resolver = "2") +└── crates/ + ├── bloom/ Probabilistic membership test (Bloom filter) + │ └── src/lib.rs BloomFilter struct, FNV-1a hashing + │ + ├── memtable/ In-memory sorted write buffer + │ └── src/lib.rs Memtable, ValueEntry, sequence-gated writes + │ + ├── wal/ Write-Ahead Log (durability) + │ └── src/lib.rs WalWriter, WalReader, WalRecord, CRC32 + │ + ├── sstable/ Immutable on-disk sorted tables + │ └── src/ + │ ├── format.rs File layout constants, footer read/write + │ ├── writer.rs SSTableWriter (atomic tmp+rename) + │ ├── reader.rs SSTableReader (index, bloom, CRC verify) + │ └── merge.rs MergeIterator (min-heap over N readers) + │ + ├── engine/ Storage engine — orchestrates all of the above + │ └── src/ + │ ├── lib.rs Engine struct, constructor, accessors + │ ├── write.rs set(), del(), flush(), auto-compaction + │ ├── read.rs get(), scan() + │ ├── compaction.rs compact(), tombstone GC + │ ├── recovery.rs WAL replay, SSTable loading, tmp cleanup + │ └── manifest.rs Persistent L0/L1 level tracking + │ + ├── server/ Async RESP2 TCP server (Tokio) + │ └── src/ + │ ├── lib.rs serve() — public library API + │ ├── main.rs Binary entry point (config from env vars) + │ ├── resp.rs RESP2 parser + serializer + │ ├── db.rs SharedDb — Engine + volatile TTL map + │ └── handler.rs Command dispatcher (55+ commands) + │ + └── cli/ Interactive REPL + criterion benchmarks + └── src/main.rs SET/GET/DEL/SCAN/COMPACT/FLUSH/STATS REPL +``` + +**Dependency graph** (arrows = "depends on"): + +``` +cli ──────────────────────────────────────────► engine +server ───────────────────────────────────────► engine +engine ──► memtable +engine ──► wal +engine ──► sstable ──► bloom +``` + +Nothing depends on `server` or `cli`. The storage engine is completely +independent and can be embedded in any Rust application. + +--- + +## 3. 
Write Path — Step by Step + +When a client sends `SET mykey myvalue`: + +``` +Client Server Engine + │ │ │ + │─── *3\r\n │ │ + │─── $3\r\nSET\r\n │ │ + │─── $5\r\nmykey\r\n │ │ + │─── $7\r\nmyvalue\r\n│ │ + │ │ │ + │ RespReader.read_value() │ + │ dispatcher → cmd_set() │ + │ │ │ + │ acquire RwLock write │ + │ │──── engine.set(k,v) ─►│ + │ │ │ seq += 1 + │ │ │ WAL.append(Put{seq,k,v}) + │ │ │ └─ write to wal.log + │ │ │ └─ fsync (if wal_sync=true) + │ │ │ memtable.put(k, v, seq) + │ │ │ if mem.approx_size >= threshold: + │ │ │ flush() ──► new .sst file + │ │ │ truncate WAL + │ │ │ fsync WAL truncation + │ │ │ maybe compact() + │ release RwLock │ + │ │ │ + │◄─── +OK\r\n │ │ +``` + +**Code references:** +- WAL append: `crates/wal/src/lib.rs` → `WalWriter::append()` +- Memtable insert: `crates/memtable/src/lib.rs` → `Memtable::put()` +- Flush trigger: `crates/engine/src/write.rs` → `Engine::flush()` +- Command handler: `crates/server/src/handler.rs` → `cmd_set()` + +### Why write to WAL before Memtable? + +If the process crashes _after_ the WAL write but _before_ the Memtable insert, +the data is still safe — it will be replayed from WAL on the next startup. +If we did it the other way around, a crash after the Memtable insert but before +the WAL write would silently lose data. + +### Sequence Numbers + +Every write increments a global `u64` sequence number stored in `Engine.seq`. +Sequence numbers serve two purposes: + +1. **Stale-write protection** in the Memtable: if you try to write a key with a + sequence number older than the one already in the Memtable, the write is + silently dropped (prevents WAL replay from overwriting newer data). + +2. **Version resolution** during compaction: when two SSTables have the same + key, the one with the higher sequence number wins. + +--- + +## 4. Read Path — Step by Step + +When a client sends `GET mykey`: + +``` +Engine.get("mykey") + │ + ├── 1. 
Check Memtable + │ memtable.get_entry("mykey") + │ If found (value OR tombstone) → return immediately + │ (Memtable always has the freshest data) + │ + ├── 2. Check L0 SSTables (newest first) + │ for sst in l0_sstables.iter().rev(): + │ bloom_filter.check("mykey") ──► "definitely not here" → skip + │ index.binary_search("mykey") ──► offset in file + │ file.seek(offset) + │ read record → verify CRC32 (v3 only) + │ if found (value OR tombstone) → return + │ + └── 3. Check L1 SSTables (newest first) + same process as L0 + if nothing found → return None +``` + +**Key insight — tombstones:** A `DEL` operation writes a special "tombstone" +record (a key with `value = None`). When the read path encounters a tombstone +in any layer, it immediately returns `None` — the key is deleted. This means +you don't need to update or remove older SSTable entries on every delete; +the tombstone acts as a shadow. + +**Code references:** +- `crates/engine/src/read.rs` → `Engine::get()` +- `crates/sstable/src/reader.rs` → `SSTableReader::get()` (bloom + index + CRC) +- `crates/memtable/src/lib.rs` → `Memtable::get_entry()` + +### Bloom Filters + +Each SSTable has an embedded Bloom filter. Before doing any disk I/O, the +reader checks the filter. If the filter says "definitely not in this file", +the reader skips that SSTable entirely — no disk read at all. + +The filter uses **FNV-1a double hashing** (two independent hash functions +derived from one) and is sized for a **1% false-positive rate**. This means +1 in 100 "not present" queries will still do a disk seek unnecessarily, but +99% are saved. + +A false positive means "the filter says the key might be here, but it's +actually not". The code then does a full disk read and returns `None` after +reading the record. A false negative (filter says not here when it is) is +impossible. + +--- + +## 5. 
Flush — Memtable → SSTable + +When the Memtable's approximate size exceeds `flush_threshold`: + +``` +Engine::flush() + │ + ├── SSTableWriter::write_from_memtable(tmp_path, &memtable) + │ ├── iterate memtable in key order (BTreeMap is sorted) + │ ├── for each entry: + │ │ write key_len(u32) + key + seq(u64) + present(u8) + [val_len + val] + │ │ compute CRC32 over this record → write to data section + │ │ update bloom filter + │ │ record (key, file_offset) in sparse index + │ ├── write bloom filter section + │ ├── write sparse index section + │ └── write v3 footer (magic, offsets, max_seq, version) + │ + ├── fsync the tmp file + ├── rename(tmp_path, final_path) ← atomic on POSIX + ├── manifest.add(filename, level=0) + ├── manifest.save() ← atomic write+fsync+rename + ├── truncate WAL to 0 bytes + ├── fsync WAL truncation + └── memtable.clear() +``` + +**Why tmp + rename?** +A file rename on POSIX systems is atomic — either the old name or the new name +is visible, never a half-written state. This means a crash mid-write leaves +the `.sst.tmp` file, which is cleaned up on the next startup by +`cleanup_tmp_files()`. The SSTable is never visible to readers until the rename. + +**Sparse index** — not every key is indexed, only one entry per N bytes +(based on a configurable block size). A lookup binary-searches the index for +the largest key ≤ the target, then scans forward from that offset. This keeps +the index small (fits in memory) while bounding the scan distance. + +--- + +## 6. Compaction — Merging SSTables + +Without compaction, L0 SSTable count grows without bound, making reads +progressively slower (more files to check). Compaction merges all L0 + L1 +SSTables into a single new L1 SSTable. 
+ +``` +Engine::compact() + │ + ├── open SSTableReader for every L0 + L1 SSTable + ├── create MergeIterator over all readers + │ MergeIterator uses a BinaryHeap (min-heap) + │ Each "slot" in the heap holds the current key from one reader + │ Pop the smallest key → if duplicate keys across files, keep highest seq + │ + ├── SSTableWriter::write_from_iterator(tmp_path, iter, &memtable) + │ For each (key, entry) from the iterator: + │ Skip tombstones where the key is NOT in the memtable + │ (tombstone GC: dead tombstones can be dropped permanently) + │ Otherwise write the entry to the new SSTable + │ + ├── If the iterator produced no output (all keys were tombstone-GC'd): + │ delete all old SSTable files + │ clear manifest + │ return + │ + ├── fsync + rename new SSTable + ├── manifest.replace_all([new_filename], level=1) + ├── manifest.save() + ├── delete all old SSTable files + └── update engine's l0_sstables + l1_sstables lists +``` + +### Tombstone Garbage Collection + +A tombstone can only be safely deleted if there is no older copy of the key +in any SSTable we're _not_ compacting. Since we compact everything in one +pass, there is no older layer — so any tombstone whose key is not in the +current Memtable can be dropped permanently. + +The condition in `compaction.rs`: +```rust +// crates/engine/src/compaction.rs +if entry.value.is_none() && !mem_ref.contains_key(&key) { + continue; // GC: dead tombstone — drop it +} +``` + +If the Memtable _does_ contain the key, there's a newer write in flight that +references this key — we keep the tombstone to avoid resurrecting the old value. + +--- + +## 7. Recovery — Surviving a Crash + +On every `Engine::new()` call: + +``` +1. cleanup_tmp_files(sst_dir) + Remove any *.sst.tmp files left by interrupted flushes. + +2. replay_wal_and_build(wal_path, &mut memtable) + Open wal.log in read mode. 
+ For each WalRecord: + Put → memtable.put(key, value, seq) [seq gating prevents double-apply] + Del → memtable.delete(key, seq) + Return the highest seq seen. + Truncated tail (last record partially written) → silently ignored. + CRC32 mismatch → error (genuine corruption). + +3. Open WalWriter in append mode (after replay, to avoid races). + +4. Manifest::load_or_create(sst_dir) + Read MANIFEST file → parse "filename level" lines. + Open SSTableReader for each L0 and L1 filename. + Extract max_seq from each v3 footer. + +5. seq = max(wal_seq, max_sst_seq) + Ensures the next write gets a sequence number higher than anything + that already exists on disk. +``` + +**Why is WAL replay done before opening the WalWriter?** +Append mode never truncates a file; the ordering exists so that replay reads a +log that nothing is appending to. By replaying first through a separate +read-only handle, no new record can be written while the old ones are being read. + +**What about the manifest?** +The manifest is written atomically (tmp + fsync + rename), so it is always +consistent. Either the pre-flush or post-flush manifest is visible, never a +partial state. + +--- + +## 8. File Formats + +### 8.1 WAL Record + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ tag (1 byte) 0x01 = Put, 0x02 = Del │ +│ seq (8 bytes) little-endian u64 │ +│ key_len (4 bytes) little-endian u32 │ +│ key (key_len bytes) │ +│ [if Put:] │ +│ val_len (4 bytes) little-endian u32 │ +│ val (val_len bytes) │ +│ crc32 (4 bytes) little-endian u32 │ +│ (CRC covers: tag + seq + key_len + key + [val_len + val]) │ +└─────────────────────────────────────────────────────────────────┘ +``` + +A truncated tail (process crashed mid-write) is detected by EOF before the +CRC field — the reader silently stops and returns what it has so far. +A CRC mismatch on a complete record is a genuine corruption and returns an error.
+ +### 8.2 SSTable (v3) + +``` +┌───────────────────────────────────────────┐ +│ DATA SECTION │ +│ ┌─────────────────────────────────────┐ │ +│ │ Record 0 │ │ +│ │ key_len (u32 LE) │ │ +│ │ key (bytes) │ │ +│ │ seq (u64 LE) │ │ +│ │ present (u8: 1=value, 0=tombstone) │ │ +│ │ [if present=1:] │ │ +│ │ val_len (u32 LE) │ │ +│ │ val (bytes) │ │ +│ │ crc32 (u32 LE) ← v3 only │ │ +│ └─────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────┐ │ +│ │ Record 1 ... │ │ +│ └─────────────────────────────────────┘ │ +│ BLOOM SECTION │ +│ bit array + metadata (see bloom crate) │ +│ INDEX SECTION │ +│ for each sampled key: │ +│ key_len (u32) + key (bytes) │ +│ file_offset (u64) │ +│ FOOTER (v3 = 56 bytes) │ +│ magic (8 bytes) "RIPTIDE1" │ +│ bloom_off (u64 LE) offset of bloom sec │ +│ bloom_len (u64 LE) length of bloom sec │ +│ index_off (u64 LE) offset of index sec │ +│ index_len (u64 LE) length of index sec │ +│ max_seq (u64 LE) highest seq in file │ ← v3 addition +│ version (u8) 3 │ +└───────────────────────────────────────────┘ +``` + +The `max_seq` field allows recovery to find the highest sequence number without +reading every record — it reads only the last 56 bytes of each SSTable. + +### 8.3 Manifest + +A plain UTF-8 text file, one line per SSTable: + +``` +sst-0000000001-1704067200000.sst L0 +sst-0000000042-1704068400000.sst L1 +``` + +Written atomically: `MANIFEST.tmp` → fsync → rename to `MANIFEST`. + +--- + +## 9. 
RESP2 Server Architecture + +``` +TCP connection accepted + │ + ▼ + Tokio task spawned (one per connection, cheap green threads) + │ + ▼ + handler::handle_connection(stream, db) + │ + ├─► RespReader::read_value() parsing + │ │ + │ ├── *N\r\n ──► read N items via read_item() + │ │ (non-recursive: avoids Box::pin overhead) + │ └── inline ──► split on whitespace, wrap as Array + │ + └─► dispatch(cmd, args) command routing + │ + ├── acquire Arc::clone(&db.state).write/read().await + │ (clone the Arc first so the guard doesn't borrow `conn`) + │ + ├── execute command logic (mutation or query) + │ + ├── drop guard (end of scoped block) + │ + └── write response bytes to OwnedWriteHalf +``` + +### Shared State + +```rust +// crates/server/src/db.rs +pub struct SharedDb { + pub state: Arc>, // Engine + TTL map + pub start_time: Instant, + pub connected_clients: Arc, + pub total_commands: Arc, + pub total_connections: Arc, +} + +pub struct DbState { + pub engine: Engine, + pub ttl: HashMap, Instant>, // volatile: lost on restart +} +``` + +`SharedDb` is `Clone` (all fields are `Arc`-wrapped), so cloning it for each +connection task is O(1) — it just bumps reference counts. + +### TTL Implementation + +TTL is stored as a `HashMap, Instant>` in memory. This is intentionally +**volatile** — TTLs are lost on server restart. Keys with expired TTLs are +**lazily evicted**: the TTL is checked on every read/write operation for that +key. There is no background eviction task. + +This is the same approach Redis takes internally (active expiry + lazy eviction), +though Redis also has a periodic background sweep that we haven't added yet. + +### Borrow-Checker Pattern + +A subtle Rust lifetime issue: the `RwLockWriteGuard` returned by +`db.state.write().await` borrows `db.state`, which is a field of `conn`, so +the guard effectively borrows `conn`. You then can't call `&mut conn` methods +while the guard is alive. 
+ +The solution: clone the `Arc` into a local variable before `await`-ing: + +```rust +// crates/server/src/handler.rs — the state_write! macro expands to: +let guard = Arc::clone(&conn.db.state).write_owned().await; +// Now `guard` borrows the local Arc clone, not `conn`. +// `conn` is free for mutable calls after `guard` drops. +``` + +--- + +## 10. Concurrency Model + +| Layer | Concurrency | +|-------|-------------| +| TCP accept | Single Tokio task calls `listener.accept()` in a loop | +| Per-connection | Each connection runs in its own `tokio::spawn`-ed task | +| Engine access | Single `Arc<RwLock<DbState>>` — multiple readers OR one writer | +| Engine internals | Single-threaded `&mut self` — no internal locking | +| Stats counters | `AtomicI64` / `AtomicU64` — lock-free | + +**Read/Write lock contention:** All commands that mutate state (SET, DEL, INCR, +etc.) hold a write lock for the duration of the command. Read-only commands +(GET, MGET, TTL, KEYS, etc.) hold a read lock. This means: +- Multiple GETs from different clients run concurrently. +- A SET blocks all other readers and writers until it completes. + +This is a **coarse-grained** locking strategy — acceptable for a learning +project but a known bottleneck for high write throughput at scale. A production +system would use finer-grained locking or lock-free data structures. + +--- + +## 11. Configuration Surface + +All configuration is via environment variables — no config file required.
+ +| Variable | Default | Where used | +|----------|---------|-----------| +| `RIPTIDE_BIND` | `0.0.0.0:6379` | `server/src/main.rs` | +| `RIPTIDE_WAL_PATH` | `wal.log` | `engine/src/lib.rs` via main | +| `RIPTIDE_SST_DIR` | `data/sst` | `engine/src/lib.rs` via main | +| `RIPTIDE_FLUSH_KB` | `1024` | `engine/src/write.rs` | +| `RIPTIDE_WAL_SYNC` | `true` | `wal/src/lib.rs` | +| `RIPTIDE_L0_TRIGGER` | `4` | `engine/src/write.rs` (CLI only) | + +`RIPTIDE_L0_TRIGGER` is only exposed by the CLI binary; the server uses the +engine's default (`DEFAULT_L0_COMPACTION_TRIGGER = 4`). To change it from the +server, call `engine.set_l0_compaction_trigger(n)` programmatically. + +--- + +## 12. Key Design Decisions & Trade-offs + +### Decision: LSM tree over B-tree + +LSM trees turn random writes into sequential appends. This makes writes very +fast (especially when WAL sync is enabled — only one sequential write per +operation). The trade-off is read amplification: a key might exist in the +Memtable, multiple L0 SSTables, and the L1 SSTable, so a read might have to +check many places. + +### Decision: Single-level L1 compaction + +We compact everything into a single L1 SSTable. This means: +- After compaction, there is exactly one L1 file. +- A key is guaranteed to appear at most once in L1. +- Reads only need to check one file at each level. + +A production LSM (like RocksDB) has multiple levels (L1, L2, L3...) with size +amplification limits, which gives better space usage. Our single-file approach +is simpler to reason about. + +### Decision: RESP2 over custom protocol + +Using RESP2 means any existing Redis client library works with RiptideKV +out of the box — no custom client needed. Java (Jedis), Python (redis-py), +Go (go-redis), and redis-cli all speak RESP2. + +### Decision: Volatile TTL (in-memory only) + +Storing TTLs in a `HashMap, Instant>` is fast and simple but loses +TTLs on restart. 
A production system would persist expiry timestamps in the +WAL (as a special record type) or in SSTable metadata. We chose simplicity here. + +### Decision: `Engine` takes `&mut self` (single-threaded) + +The storage engine is inherently sequential — WAL appends must be ordered, +Memtable mutations are single-threaded, and compaction reads/writes files. +By using `&mut self`, we get compile-time exclusivity: only one caller can +mutate the engine at a time, which the `Arc<RwLock<DbState>>` enforces at the server +level. diff --git a/docs/GUIDE.md b/docs/GUIDE.md new file mode 100644 index 0000000..622fa48 --- /dev/null +++ b/docs/GUIDE.md @@ -0,0 +1,782 @@ +# RiptideKV — Learning Guide + +> **Who is this for?** +> You built (or are studying) this project and want to understand not just +> _what_ the code does, but _why_ it does it that way. Each section introduces +> a concept from first principles and then shows exactly where it lives in the +> codebase, with real code, real examples, and the pitfalls that tripped us up +> along the way. + +Read this linearly — each section builds on the previous one. + +--- + +## Part 1 — Why Is a Key-Value Store Hard? + +A key-value store sounds simple: store a key, get it back later. A `HashMap` +does this in memory. The moment you need **durability** — surviving power cuts, +process crashes, full disk failures — everything becomes interesting. + +**The fundamental problem:** Memory is volatile. Disk is slow. How do you +get the performance of memory and the durability of disk? + +RiptideKV's answer is the **Log-Structured Merge (LSM) tree**, the same +architecture used by LevelDB, RocksDB, Cassandra, and Pebble. + +--- + +## Part 2 — The Write-Ahead Log (WAL) + +### The Problem + +If you write data to an in-memory structure and the process crashes before that +data reaches disk, it is gone forever. You need to persist _something_ to disk +on every write.
+ +### Naïve Solution: Write to Disk Directly + +Write each key-value pair to a file immediately. Problem: files on disk are +structured (databases, B-trees) — updating a structure on disk requires reading +a page, modifying it, and writing it back. Random writes to disk are slow (~100 +operations/second on spinning disk, ~10,000 on SSD). + +### Better Solution: Append to a Log + +Appending to the end of a file is the fastest disk operation possible because +it's **sequential**. No seeking, no reading, just writing bytes at the end. + +``` +wal.log (grows rightward over time): +┌──────┬──────┬──────┬──────┐ +│PUT k1│PUT k2│DEL k1│PUT k3│ ← each append is fast (sequential write) +└──────┴──────┴──────┴──────┘ +``` + +This is the **Write-Ahead Log**: every mutation is appended to the log before +touching any in-memory structure. On crash, replay the log to reconstruct state. + +### How RiptideKV Does It + +```rust +// crates/wal/src/lib.rs — appending a record +pub fn append(&mut self, record: &WalRecord) -> Result<()> { + // Serialize: tag(1) + seq(8) + key_len(4) + key + val_len(4) + val + // Compute CRC32 over all bytes + // Write to file (buffered) + // If wal_sync=true, call file.sync_all() for durability +} +``` + +```rust +// crates/wal/src/lib.rs — replaying on restart +pub struct WalReader { ... } +impl WalReader { + pub fn next(&mut self) -> Result> { + // Read tag, seq, key_len, key... + // If file ends early (truncated tail) → return Ok(None) — graceful + // If CRC32 mismatches → return Err — genuine corruption + } +} +``` + +**Pitfall — CRC32 vs truncated tail:** A crash can leave the last record +partially written. This looks like a CRC mismatch but is actually normal. +We distinguish by detecting EOF before the CRC field — that's a truncated +tail (safe to ignore). A CRC mismatch on a _complete_ record is genuine +corruption. + +### WAL Sync Trade-off + +`wal_sync = true`: `fsync()` after every write. **Durable** but ~10–100× slower. 
+`wal_sync = false`: OS buffers writes. **Fast** (memory speed) but up to 1 second of data lost on crash. + +The default for the server is `true`. The CLI default is `true`. +Set `RIPTIDE_WAL_SYNC=false` for maximum write throughput in testing. + +--- + +## Part 3 — The Memtable + +### Why Not Just Replay the WAL Every Time? + +Replaying the WAL on every read would be O(n) where n is every write ever made. +That's unusable. Instead, keep the recent writes in memory in a structure +optimised for both reads and writes. + +### The Memtable + +A **Memtable** is an in-memory sorted map. When we write a key-value pair, we: +1. Append to WAL (durability) +2. Insert into Memtable (fast access) + +Reads check the Memtable first — if the key is there, we don't need to touch +disk at all. + +```rust +// crates/memtable/src/lib.rs +pub struct Memtable { + data: BTreeMap, ValueEntry>, // sorted by key + approx_size: usize, // tracks bytes for flush trigger +} + +pub enum ValueEntry { + Value { seq: u64, data: Vec }, + Tombstone { seq: u64 }, // deletion marker +} +``` + +Why `BTreeMap` and not `HashMap`? Because `BTreeMap` keeps keys sorted, which +matters enormously: when we flush to disk, we iterate in key order to produce +a **Sorted** String Table. Sorted order also makes range queries (`SCAN`) cheap +— binary search to the start key, then iterate. + +### Sequence-Gated Writes + +The Memtable only accepts a write if its sequence number is _newer_ than what's +already there: + +```rust +// crates/memtable/src/lib.rs — Memtable::put() +pub fn put(&mut self, key: Vec, value: Vec, seq: u64) { + match self.data.get(&key) { + Some(existing) if existing.seq() >= seq => return, // stale write, ignore + _ => { /* insert */ } + } +} +``` + +**Why is this important?** During WAL replay, we might replay a PUT for a key +that was later overwritten and flushed to an SSTable. 
Without sequence gating, +the replay would overwrite the newer data in the Memtable with the older value +from the WAL. Sequence gating prevents this. + +### Approximate Size Tracking + +The Memtable tracks `approx_size` (key bytes + value bytes + a small overhead +estimate). When this exceeds `flush_threshold`, the engine flushes the Memtable +to disk as a new SSTable. + +--- + +## Part 4 — SSTables (Sorted String Tables) + +### The Problem with the WAL as a Read Structure + +The WAL is append-only and unsorted. Reading a specific key requires scanning +every record. For writes, this is fine. For reads, it's terrible. + +### SSTables + +When the Memtable grows too large, we flush it to disk as a **Sorted String +Table (SSTable)**. An SSTable is: +- **Immutable** — never modified after writing. +- **Sorted** — keys are in ascending order. +- **Compact** — all data is in one file with a footer pointing to sections. + +Because it's sorted, looking up a key is fast: binary search the index, seek +to roughly the right place, scan forward. No need to read the whole file. + +### How RiptideKV Writes an SSTable + +```rust +// crates/sstable/src/writer.rs — atomic write +pub fn write_from_memtable(path: &Path, memtable: &Memtable) -> Result<()> { + let tmp_path = path.with_extension("sst.tmp"); + let mut file = File::create(&tmp_path)?; + + // Write DATA section: one record per key-value pair + for (key, entry) in memtable.iter() { + write_record(&mut file, key, entry)?; + update_bloom_filter(key); + update_index(key, current_offset); + } + + // Write BLOOM section + // Write INDEX section + // Write FOOTER (v3): offsets + lengths + max_seq + magic + + file.sync_all()?; // durability before rename + rename(tmp_path, path)?; // atomic: readers see complete file or nothing +} +``` + +**Why tmp + rename?** +`rename()` on POSIX is atomic — either the old filename or the new one is +visible, never a partially-written intermediate state. 
Without this, a crash +mid-write would leave a corrupt `.sst` file that gets loaded on restart. + +### The Sparse Index + +The index doesn't store every key — that would be as large as the data itself. +Instead, it stores one entry per ~4 KB of data ("sparse" index). A lookup: +1. Binary search the index for the largest key ≤ target. +2. Seek to that file offset. +3. Scan forward record by record until finding the key or overshooting. + +This balances index memory usage (small) vs. per-lookup scan distance (bounded). + +### Footer and Version History + +``` +v1 footer (40 bytes): magic + bloom_off + bloom_len + index_off + index_len + version +v2 footer (40 bytes): same structure, different magic? (compatible) +v3 footer (56 bytes): adds max_seq (8 bytes) — enables fast recovery +``` + +The version is stored in the last byte of the footer. This lets us evolve the +format over time while remaining backward compatible. + +`max_seq` in v3 is crucial for recovery: instead of reading every record to +find the highest sequence number, we read just the last 56 bytes of each SSTable. + +--- + +## Part 5 — Bloom Filters + +### The Problem + +The read path checks the Memtable, then all L0 SSTables, then L1. For a key +that doesn't exist, every SSTable would require a disk read (seek + read a few +hundred bytes) just to confirm "not found". With many SSTables, this is painful. + +### Bloom Filters + +A Bloom filter is a **probabilistic set membership** data structure. It answers: +- "Definitely not in this set" — 100% correct, zero false negatives. +- "Might be in this set" — 99% correct (1% false positive rate here). + +It uses a bit array and multiple hash functions. Inserting a key sets several +bits; checking a key tests whether all those bits are set. If any bit is unset, +the key is definitely absent. + +**The mathematical guarantee:** For our 1% false-positive rate with FNV-1a +double-hashing, we allocate ~10 bits per key in the filter. 
The exact formula: + +``` +bits = -n × ln(p) / (ln 2)² +hashes = bits/n × ln 2 + +where n = number of keys, p = desired false-positive rate +``` + +### How RiptideKV Uses Bloom Filters + +```rust +// crates/bloom/src/lib.rs +pub struct BloomFilter { + bits: Vec, // the bit array (packed into u64 words) + num_hashes: usize, // k hash functions +} + +impl BloomFilter { + pub fn insert(&mut self, key: &[u8]) { /* set k bits */ } + pub fn may_contain(&self, key: &[u8]) -> bool { /* check k bits */ } +} +``` + +```rust +// crates/sstable/src/reader.rs — read path +pub fn get(&self, key: &[u8]) -> Result> { + // Fast path: Bloom filter says "definitely not here" + if let Some(bloom) = &self.bloom { + if !bloom.may_contain(key) { + return Ok(None); // no disk I/O + } + } + // Slow path: binary search index, seek, read record + // ... +} +``` + +**Pitfall — FNV-1a double hashing:** Standard double hashing uses two +independent hash functions `h1(x)` and `h2(x)`, then computes +`gi(x) = h1(x) + i * h2(x) mod m` for each of the k hash positions. +We derive both h1 and h2 from the same FNV-1a hash over the key with +different seeds — a technique that gives good distribution without needing +a completely different hash algorithm. + +--- + +## Part 6 — Compaction + +### The Growing Problem + +Every time the Memtable fills up, we flush to a new L0 SSTable. Over time: +- L0 has many SSTables (e.g., 50). +- Each GET might need to check all 50 files for a key. +- Disk space grows because old values/tombstones are never removed. + +### Compaction: Merge Everything Into One + +Compaction merges all existing SSTables into a single new one, resolving +version conflicts (keep the newest) and removing dead tombstones. 
+ +``` +Before compaction: After compaction: +L0: sst-001.sst {k1:v1} L1: sst-compact.sst {k1:v2, k3:v2} + sst-002.sst {k1:v2} (k2 was deleted, tombstone GC'd) + sst-003.sst {k2:v1} +L1: sst-000.sst {k1:v0, k3:v2} + sst-004.sst {k2:tombstone} +``` + +### The Merge Iterator + +The core data structure is a min-heap that treats N SSTable readers as N sorted +streams and merges them in order: + +```rust +// crates/sstable/src/merge.rs +pub struct MergeIterator { + heap: BinaryHeap, // min-heap ordered by (key, seq desc) + readers: Vec, +} + +// The heap contains one "current entry" from each active reader. +// Pop the minimum key → if duplicate keys (same key in multiple SSTables), +// the one with higher seq wins (most recent write). +``` + +This is a classic **k-way merge** algorithm, the same one used in: +- Merge sort +- External sort (sorting data that doesn't fit in RAM) +- Database query engines for merging sorted result sets + +### Tombstone Garbage Collection + +```rust +// crates/engine/src/compaction.rs +for (key, entry) in merge_iter { + // Is this a tombstone? + if entry.value.is_none() { + // Is the key still live in the current Memtable? + if !memtable.contains_key(&key) { + // Safe to drop: full compaction leaves no older SSTable data to hide, and the Memtable has no entry for this key. + continue; + } + } + // Otherwise: write this entry to the new SSTable. + writer.write_entry(&key, &entry)?; +} +``` + +**Why the Memtable check?** If a key is in the Memtable, it means there's a +pending write (set or delete) that hasn't been flushed yet. If we GC the +tombstone, and then the Memtable is flushed, the old value in a pre-compaction +SSTable could resurface. The Memtable check prevents this.
+ +### Auto-Compaction + +When the L0 SSTable count reaches `l0_compaction_trigger` (default: 4), the +engine automatically compacts after the next flush: + +```rust +// crates/engine/src/write.rs — inside flush() +if self.l0_sstables.len() >= self.l0_compaction_trigger { + self.compact()?; +} +``` + +--- + +## Part 7 — Recovery After a Crash + +### What Happens On Restart? + +``` +1. Remove any *.sst.tmp files (interrupted flush mid-write) +2. Replay WAL → rebuild Memtable +3. Load Manifest → know which .sst files belong to L0 vs L1 +4. Open SSTableReaders for each file +5. Compute the maximum sequence number seen across all sources +6. Ready to serve requests +``` + +### What If the WAL Has a Partial Record at the End? + +Normal. Crash during WAL append leaves the last record truncated. The WAL +reader detects EOF before reading a complete record and stops gracefully. +This means the last partial write is lost — which is correct: that write was +never acknowledged to the client. + +### What If the Manifest Is Corrupt? + +The Manifest is written atomically (write tmp → fsync → rename). It's either +the pre-flush version or the post-flush version, never a partial state. The +only way to get a corrupt manifest is hardware failure, at which point the WAL +provides recovery. + +--- + +## Part 8 — The RESP2 Protocol + +### Why RESP2? + +Rather than inventing a custom protocol, RiptideKV speaks **Redis Serialization +Protocol 2** — the same wire format Redis has used since 2010. This means: +- **Any Redis client library works**: Jedis (Java), redis-py (Python), + go-redis, lettuce, ioredis, redis-cli. +- **No custom client needed**: point your existing Redis code at RiptideKV's + address and it works. + +### RESP2 Format + +RESP2 has 5 types, each identified by the first byte of each line: + +``` ++OK\r\n Simple String (short, no binary) +-ERR some msg\r\n Error +:42\r\n Integer +$5\r\nhello\r\n Bulk String (binary-safe, with length prefix) +*3\r\n... 
Array (N elements follow) +$-1\r\n Null Bulk (nil value) +*-1\r\n Null Array (nil array) +``` + +A command from the client is always an Array of Bulk Strings: + +``` +*3\r\n ← array of 3 elements +$3\r\n ← element 1: bulk string, 3 bytes +SET\r\n +$5\r\n ← element 2: bulk string, 5 bytes +mykey\r\n +$7\r\n ← element 3: bulk string, 7 bytes +myvalue\r\n +``` + +### Parsing Without Recursion + +Arrays can contain any RESP value, including other arrays. A naïve recursive +parser hits Rust's async recursion limitation: + +``` +error[E0733]: recursion in an async fn requires boxing +``` + +RiptideKV solves this by never recursing. The top-level `read_value()` handles +the `*N` array header, then calls `read_item()` for each element. `read_item()` +calls `parse_scalar()` which handles `+`, `-`, `:`, `$` types but _not_ `*`. + +```rust +// crates/server/src/resp.rs — non-recursive design +pub async fn read_value(&mut self) -> Result> { + // reads one line + if starts_with('*') { + // read N elements via read_item() — no recursion + for _ in 0..count { items.push(self.read_item().await?); } + } else { + self.parse_scalar(&line).await // +, -, :, $ + } +} + +async fn read_item(&mut self) -> Result { + // reads one line, calls parse_scalar — never calls read_value + self.parse_scalar(&line).await +} +``` + +This handles the 99.9% case (arrays of bulk strings). Nested arrays (e.g., +`EXEC` pipeline responses in Redis) would need a different approach, but +RESP2 commands from clients are always flat arrays. + +### Inline Commands + +Redis also supports "inline" commands — plain text without the array framing: + +``` +PING\r\n +SET foo bar\r\n +``` + +RiptideKV's parser handles these: if the first byte doesn't match a RESP2 +type prefix, split the line on whitespace and wrap in an Array. This is what +lets you test the server with raw `nc` or `telnet`. + +--- + +## Part 9 — The Async Server with Tokio + +### What Is Tokio? + +Tokio is Rust's most popular async runtime. 
It implements a **multi-threaded +work-stealing scheduler** for async tasks — lightweight green threads that +cooperate instead of preempting. + +An async function returns a `Future` — a value representing a computation +that hasn't run yet. The runtime drives futures to completion by polling them. +When a future is "waiting" (e.g., for network data), it yields control and the +runtime runs other futures on the same OS thread. + +### The Accept Loop + +```rust +// crates/server/src/lib.rs +pub async fn serve(listener: TcpListener, db: SharedDb) -> anyhow::Result<()> { + loop { + let (stream, _peer) = listener.accept().await?; // ← await yields here + stream.set_nodelay(true)?; + let db = db.clone(); // O(1) Arc clone + tokio::spawn(async move { + handler::handle_connection(stream, db).await; + }); + // The spawned task runs concurrently. We're back here immediately. + } +} +``` + +Each `tokio::spawn` creates a new async task — effectively a green thread. +Thousands of concurrent connections use only a handful of OS threads (typically +one per CPU core). + +`stream.set_nodelay(true)` disables Nagle's algorithm — it causes TCP to send +small packets immediately instead of buffering them. Critical for low-latency +request-response patterns like Redis. + +### Why OwnedReadHalf / OwnedWriteHalf? + +`TcpStream::into_split()` gives you independently-owned read and write halves. +This lets the RESP reader and writer live in separate places in the code without +Rust complaining about simultaneous borrows of the stream. + +```rust +// crates/server/src/handler.rs +let (read_half, write_half) = stream.into_split(); +let mut reader = RespReader::new(read_half); +let mut conn = Conn { writer: write_half, ... }; +``` + +--- + +## Part 10 — Shared State and the Borrow Checker + +### The Ownership Challenge + +Multiple async tasks (one per connection) need to share the same database. 
In +Rust, shared mutable state requires explicit synchronization — the compiler +rejects unsafe sharing. + +The solution: `Arc<RwLock<T>>`. + +- `Arc<T>` — **atomically reference-counted** shared ownership. Clone it to + get another handle; the underlying data lives until all handles are dropped. +- `RwLock<T>` — **readers-writer lock**. Multiple readers can hold the lock + simultaneously; a writer gets exclusive access. + +```rust +// crates/server/src/db.rs +pub struct SharedDb { + pub state: Arc<RwLock<...>>, + // ... stats counters +} + +impl Clone for SharedDb { + fn clone(&self) -> Self { + Self { + state: Arc::clone(&self.state), // just bumps refcount + // ... + } + } +} +``` + +### The Guard-and-Borrow Problem + +A subtle Rust pitfall: `RwLockWriteGuard` borrows `Arc<RwLock<T>>`. +If the `Arc` is a field of `conn`, the guard borrows `conn`. You then can't +call `&mut conn` methods: + +```rust +// ❌ This won't compile: +let guard = conn.db.state.write().await; // borrows conn.db.state → borrows conn +conn.send(response).await; // needs &mut conn — compile error! +``` + +Fix: clone the `Arc` into a local variable first. The guard is then tied to the +local clone, not `conn`: + +```rust +// ✅ This works: +let state_arc = Arc::clone(&conn.db.state); // local clone +let guard = state_arc.write_owned().await; // owned guard holds the local clone, not conn +conn.send(response).await; // conn is free +``` + +This pattern is encapsulated in the `state_write!` and `state_read!` macros +at the top of `handler.rs`. + +--- + +## Part 11 — Testing Philosophy + +### Unit Tests vs Integration Tests + +RiptideKV has two levels of testing: + +**Unit/integration tests per crate** (in `crates/*/src/tests/`): +- Test each component in isolation. +- Fast — no TCP, no server, no temp directories (mostly). +- Example: `write_tests.rs` tests `Engine::set`, `Engine::del`, flush triggering. + +**Server integration tests** (in `crates/server/tests/integration.rs`): +- Spin up a real TCP server on a random port.
+- Send real RESP2 bytes over a real socket. +- Assert real RESP2 responses. +- Cover the full system: RESP parsing → command dispatch → engine → response encoding. + +```rust +// crates/server/tests/integration.rs — test helper +async fn start_server() -> (SocketAddr, SharedDb) { + let engine = Engine::new( + dir.path().join("wal.log"), + dir.path().join("sst"), + 64 * 1024 * 1024, // large — no auto-flush noise + false, // no fsync — tests are fast + ).unwrap(); + let db = SharedDb::new(engine); + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); // OS picks port + let addr = listener.local_addr().unwrap(); + tokio::spawn(server::serve(listener, db.clone())); + (addr, db) +} +``` + +**Key test design decisions:** + +1. **Port 0** — let the OS pick a free port. Avoids flaky tests from port + conflicts when tests run in parallel. + +2. **Large flush threshold** — prevents auto-flushes from interleaving with + test assertions. + +3. **`wal_sync = false`** — removes fsync latency from tests. Tests run ~10× + faster. + +4. **`Box::leak` for tempdir** — the `TempDir` guard must live as long as the + server. Leaking it ensures the directory isn't cleaned up mid-test. + (In production code you'd use proper lifetime management; this is fine for + tests.) 
+ +### The Concurrency Test + +```rust +// crates/server/tests/integration.rs +#[tokio::test] +async fn test_concurrent_incr_correctness() { + // 50 tasks × 10 INCRs = 500 total increments + let handles: Vec<_> = (0..50) + .map(|_| tokio::spawn(async move { + let mut c = TestClient::connect(addr).await; + for _ in 0..10 { c.cmd(&["INCR", "shared_counter"]).await; } + })) + .collect(); + for h in handles { h.await.unwrap(); } + // Final value must be exactly 500 — no lost updates + let val: i64 = c.cmd(&["GET", "shared_counter"]).await.as_str().parse().unwrap(); + assert_eq!(val, 500); +} +``` + +This test confirms that the `RwLock` prevents lost updates when many clients +concurrently increment the same counter. + +--- + +## Part 12 — Pitfalls and Lessons Learned + +### Pitfall 1: Tombstone GC Predicate Inversion + +**Bug:** After compaction, deleted keys were resurrecting. + +**Cause:** The GC predicate was inverted: +```rust +// ❌ Wrong: drops the tombstone exactly when the key IS in the memtable +if entry.value.is_none() && mem_ref.contains_key(&key) { + continue; // This was dropping tombstones that we needed! +} +``` + +**Fix:** +```rust +// ✅ Correct: GC the tombstone only if the key is NOT in memtable +if entry.value.is_none() && !mem_ref.contains_key(&key) { + continue; // Safe to GC: no live data in memtable references this key +} +``` + +**Lesson:** When writing GC predicates, double-check the direction of the +condition. Write a test that: +1. Writes a key. +2. Deletes it. +3. Compacts. +4. Restarts the engine. +5. Asserts the key is still gone. + +### Pitfall 2: WAL Truncation Without fsync + +**Bug:** On some filesystems (ext4 with barriers), a crash immediately after +`truncate()` but before any new write could leave the old WAL content visible +on restart, causing duplicate WAL replay.
+ +**Fix:** Always `fsync()` after `truncate()`: +```rust +// crates/engine/src/write.rs +wal.truncate()?; +wal.sync()?; // ← this line was missing +``` + +**Lesson:** Every crash-safety guarantee in storage engineering requires +explicit `fsync()` calls. Assume the OS will do the wrong thing unless +you tell it explicitly what must be durable before proceeding. + +### Pitfall 3: Async Recursion in the RESP Parser + +**Bug:** Rust rejected the recursive `read_value()` call inside arrays: +``` +error[E0733]: recursion in an async fn requires boxing +``` + +**Why:** Each `async fn` compiles to a state machine whose size is computed +at compile time. Recursion makes the type infinitely large. + +**Fix:** Split the parser into non-recursive functions (see Part 8). The +`read_item()` helper handles array elements but never calls `read_value()`. + +### Pitfall 4: Holding RwLock Guards Across Await Points + +**Bug:** Borrow checker rejected command handler code. + +**Why:** The `RwLockWriteGuard` borrows `db.state`. If you hold the guard +and then call an `async` function (which yields to other tasks), Rust must +ensure the guard lives across the await point — but it also means the lock +is held while other tasks try to acquire it, which stalls them and risks deadlock. + +**Fix:** Always drop the guard before awaiting. Use a scoped block: +```rust +let result = { + let mut state = db.write().await; + // do all synchronous engine work here + state.engine.set(key, val)? + // state guard dropped here, at end of block +}; +// Now await is safe — no lock held +conn.send(encode_ok()).await?; +``` + +### Pitfall 5: engine.get Returns (seq, value), Not Just value + +RiptideKV's `Engine::get()` returns `Option<(u64, Vec<u8>)>` — a tuple of +`(sequence_number, value)`.
Every call site that only needs the value must +destructure it: + +```rust +// ✅ Correct +let val = engine.get(b"key")?.map(|(_, v)| v); + +// ❌ Wrong — (u64, Vec<u8>) is not Vec<u8> +let val = engine.get(b"key")?; +``` + +The sequence number is exposed so callers can implement optimistic concurrency +control ("only update if seq matches X") — but most command implementations +don't need it. diff --git a/docs/HOWTORUN.md b/docs/HOWTORUN.md new file mode 100644 index 0000000..5f44034 --- /dev/null +++ b/docs/HOWTORUN.md @@ -0,0 +1,601 @@ +# RiptideKV — How To Run + +Everything you need to build, run, test, and benchmark RiptideKV. + +--- + +## Prerequisites + +| Tool | Version | Install | +|------|---------|---------| +| Rust toolchain | 1.75+ | `curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \| sh` | +| Cargo | ships with Rust | — | +| `redis-cli` (optional) | any | `brew install redis` (macOS) or `apt install redis-tools` | + +Check your Rust version: + +```bash +rustc --version # should print 1.75.0 or newer +cargo --version +``` + +--- + +## 1 — Build + +```bash +# Clone the repository +git clone <repository-url> RiptideKV +cd RiptideKV + +# Build all crates in debug mode (fast compile, slower runtime) +cargo build --workspace + +# Build in release mode (slow compile, fast runtime — use for benchmarks) +cargo build --workspace --release +``` + +Compiled binaries end up in: +``` +target/debug/riptidekv-server (server binary, debug) +target/debug/cli (CLI REPL, debug) +target/release/riptidekv-server (server binary, release) +target/release/cli (CLI REPL, release) +``` + +--- + +## 2 — Interactive CLI (REPL) + +The CLI lets you interact with the storage engine directly — no network stack, +no RESP protocol, just raw engine calls.
+ +```bash +cargo run -p cli +``` + +You will see a startup message, then a `>` prompt: + +``` +RiptideKV started (seq=0, wal=wal.log, sst_dir=data/sst, flush=1024KiB, l0_trigger=4) +Commands: SET key value | GET key | DEL key | SCAN [start] [end] + COMPACT | FLUSH | STATS | EXIT +> +``` + +### CLI Commands + +| Command | Description | Example | +|---------|-------------|---------| +| `SET key value` | Store a key-value pair | `SET name Alice` | +| `GET key` | Retrieve a value | `GET name` → `Alice` | +| `DEL key` | Delete a key | `DEL name` | +| `SCAN [start [end]]` | List keys in range | `SCAN a z` or `SCAN` (all) | +| `FLUSH` | Flush Memtable to SSTable now | `FLUSH` | +| `COMPACT` | Merge all SSTables | `COMPACT` | +| `STATS` | Show engine statistics | `STATS` | +| `EXIT` | Exit the REPL | `EXIT` | + +### CLI Session Example + +``` +> SET user:1 Alice +OK +> SET user:2 Bob +OK +> GET user:1 +Alice +> SCAN user: user; +user:1 → Alice +user:2 → Bob +> STATS +seq=2, memtable_keys=2, l0_sstables=0, l1_sstables=0, approx_mem=~0 KiB +> DEL user:1 +OK +> GET user:1 +(nil) +> COMPACT +Compacted. +> EXIT +``` + +### CLI Configuration (Environment Variables) + +```bash +RIPTIDE_WAL_PATH=./mywal.log \ +RIPTIDE_SST_DIR=./my-sst \ +RIPTIDE_FLUSH_KB=512 \ +RIPTIDE_WAL_SYNC=true \ +RIPTIDE_L0_TRIGGER=4 \ +cargo run -p cli +``` + +| Variable | Default | Description | +|----------|---------|-------------| +| `RIPTIDE_WAL_PATH` | `wal.log` | Path to the WAL file | +| `RIPTIDE_SST_DIR` | `data/sst` | Directory for SSTable files | +| `RIPTIDE_FLUSH_KB` | `1024` | Memtable flush threshold in KiB (1024 = 1 MiB) | +| `RIPTIDE_WAL_SYNC` | `true` | `fsync` after each WAL write (`true`/`false`) | +| `RIPTIDE_L0_TRIGGER` | `4` | Auto-compact when L0 SSTable count reaches this | + +--- + +## 3 — RESP2 Server + +The server exposes the storage engine over TCP using the Redis Serialization +Protocol (RESP2). Any Redis client library can connect to it. 
+ +### Start the Server + +```bash +# Default: binds to 0.0.0.0:6379, writes to ./wal.log and ./data/sst +cargo run -p server --bin riptidekv-server + +# Or with release optimizations (recommended for real use): +cargo run -p server --bin riptidekv-server --release +``` + +On startup you will see: +``` +2024-01-01T00:00:00.000000Z INFO server: RiptideKV listening on 0.0.0.0:6379 +``` + +Press **Ctrl-C** to gracefully shut down (pending writes are flushed). + +### Server Configuration (Environment Variables) + +```bash +RIPTIDE_BIND=127.0.0.1:6380 \ +RIPTIDE_WAL_PATH=./data/wal.log \ +RIPTIDE_SST_DIR=./data/sst \ +RIPTIDE_FLUSH_KB=4096 \ +RIPTIDE_WAL_SYNC=true \ +cargo run -p server --bin riptidekv-server --release +``` + +| Variable | Default | Description | +|----------|---------|-------------| +| `RIPTIDE_BIND` | `0.0.0.0:6379` | TCP address to listen on | +| `RIPTIDE_WAL_PATH` | `wal.log` | Path to the WAL file | +| `RIPTIDE_SST_DIR` | `data/sst` | Directory for SSTable files | +| `RIPTIDE_FLUSH_KB` | `1024` | Memtable flush threshold in KiB | +| `RIPTIDE_WAL_SYNC` | `true` | `fsync` after each WAL write | + +> **Tip:** Set `RIPTIDE_WAL_SYNC=false` to disable fsync for maximum write +> throughput in development or testing. Never do this if durability matters. 
+ +### Connect with redis-cli + +If you have `redis-cli` installed: + +```bash +redis-cli -h 127.0.0.1 -p 6379 + +127.0.0.1:6379> PING +PONG +127.0.0.1:6379> SET mykey "hello world" +OK +127.0.0.1:6379> GET mykey +"hello world" +127.0.0.1:6379> SET counter 0 +OK +127.0.0.1:6379> INCR counter +(integer) 1 +127.0.0.1:6379> INCR counter +(integer) 2 +127.0.0.1:6379> TTL mykey +(integer) -1 +127.0.0.1:6379> EXPIRE mykey 60 +(integer) 1 +127.0.0.1:6379> TTL mykey +(integer) 59 +127.0.0.1:6379> KEYS * +1) "counter" +2) "mykey" +``` + +### Connect with raw TCP (netcat) + +If you don't have `redis-cli`, use `nc` (netcat) to test the raw RESP2 protocol: + +```bash +printf '*1\r\n$4\r\nPING\r\n' | nc 127.0.0.1 6379 +# → +PONG + +printf '*3\r\n$3\r\nSET\r\n$3\r\nfoo\r\n$3\r\nbar\r\n' | nc 127.0.0.1 6379 +# → +OK + +printf '*2\r\n$3\r\nGET\r\n$3\r\nfoo\r\n' | nc 127.0.0.1 6379 +# → $3 +# → bar +``` + +### Supported Commands + +RiptideKV implements **55+ Redis commands**. Here is the full list grouped by category: + +#### Connection & Server +| Command | Description | +|---------|-------------| +| `PING [message]` | Returns PONG or echoes the message | +| `ECHO message` | Returns the message | +| `SELECT index` | Switch database (only 0 is supported) | +| `QUIT` | Close the connection | +| `RESET` | Reset connection state | +| `HELLO [version]` | Protocol negotiation (RESP2 only) | +| `CLIENT SETNAME name` | Set connection name | +| `CLIENT GETNAME` | Get connection name | +| `CLIENT ID` | Get connection ID | +| `CLIENT INFO` | Connection information | +| `CONFIG GET pattern` | Returns empty array (stub) | +| `CONFIG SET ...` | Returns OK (stub) | +| `INFO [section]` | Server information (server, clients, stats, keyspace, replication, memory) | +| `COMMAND COUNT` | Number of supported commands | +| `DBSIZE` | Number of keys in the database | +| `FLUSHDB` / `FLUSHALL` | Delete all keys | +| `DEBUG SLEEP seconds` | Sleep for N seconds | +| `WAIT` | Returns 0 (replica sync, not 
implemented) | +| `BGSAVE` / `SAVE` | Returns stub response | +| `ACL WHOAMI` | Returns "default" | +| `ACL LIST` | Returns default user entry | +| `SLOWLOG GET/LEN/RESET` | Returns stubs | +| `MEMORY USAGE key` | Returns nil | + +#### String Operations +| Command | Syntax | Description | +|---------|--------|-------------| +| `SET` | `SET key value [EX s] [PX ms] [NX\|XX] [GET] [KEEPTTL]` | Set a key | +| `GET` | `GET key` | Get a value | +| `SETNX` | `SETNX key value` | Set only if absent | +| `SETEX` | `SETEX key seconds value` | Set with expiry in seconds | +| `PSETEX` | `PSETEX key ms value` | Set with expiry in milliseconds | +| `GETSET` | `GETSET key value` | Set and return old value | +| `GETDEL` | `GETDEL key` | Get and delete | +| `GETEX` | `GETEX key [EX s\|PX ms\|PERSIST]` | Get and modify expiry | +| `MGET` | `MGET key [key ...]` | Get multiple keys | +| `MSET` | `MSET k v [k v ...]` | Set multiple keys | +| `MSETNX` | `MSETNX k v [k v ...]` | Set multiple keys only if all absent | +| `APPEND` | `APPEND key value` | Append to string | +| `STRLEN` | `STRLEN key` | Length of string | +| `INCR` | `INCR key` | Increment integer by 1 | +| `INCRBY` | `INCRBY key delta` | Increment by delta | +| `INCRBYFLOAT` | `INCRBYFLOAT key delta` | Increment float by delta | +| `DECR` | `DECR key` | Decrement integer by 1 | +| `DECRBY` | `DECRBY key delta` | Decrement by delta | +| `GETRANGE` | `GETRANGE key start end` | Substring by byte index | +| `SETRANGE` | `SETRANGE key offset value` | Overwrite substring | + +#### Key / Generic Operations +| Command | Syntax | Description | +|---------|--------|-------------| +| `DEL` | `DEL key [key ...]` | Delete one or more keys | +| `UNLINK` | `UNLINK key [key ...]` | Same as DEL | +| `EXISTS` | `EXISTS key [key ...]` | Count existing keys | +| `TYPE` | `TYPE key` | Returns `string` or `none` | +| `EXPIRE` | `EXPIRE key seconds` | Set TTL in seconds | +| `PEXPIRE` | `PEXPIRE key ms` | Set TTL in milliseconds | +| `EXPIREAT` | 
`EXPIREAT key unix-seconds` | Set expiry as Unix timestamp | +| `PEXPIREAT` | `PEXPIREAT key unix-ms` | Set expiry as Unix timestamp in ms | +| `TTL` | `TTL key` | Remaining TTL in seconds (-1 = no expiry, -2 = absent) | +| `PTTL` | `PTTL key` | Remaining TTL in milliseconds | +| `PERSIST` | `PERSIST key` | Remove expiry | +| `EXPIRETIME` | `EXPIRETIME key` | Absolute expiry as Unix timestamp | +| `PEXPIRETIME` | `PEXPIRETIME key` | Absolute expiry as Unix timestamp in ms | +| `KEYS` | `KEYS pattern` | List keys matching glob pattern (`*`, `?`, `[...]`) | +| `SCAN` | `SCAN cursor [MATCH pat] [COUNT n]` | Iterate keys | +| `RENAME` | `RENAME src dst` | Rename a key | +| `RENAMENX` | `RENAMENX src dst` | Rename only if dst absent | +| `RANDOMKEY` | `RANDOMKEY` | Random key from the database | +| `TOUCH` | `TOUCH key [key ...]` | Count existing keys (access-time update) | + +--- + +## 4 — Running Tests + +```bash +# Run all tests in the workspace +cargo test --workspace + +# Run only the server integration tests +cargo test -p server + +# Run only the engine tests +cargo test -p engine + +# Run a specific test by name +cargo test -p server test_concurrent_incr_correctness + +# Show test output (don't capture stdout) +cargo test --workspace -- --nocapture + +# Run tests in parallel with N threads (default: number of CPUs) +cargo test --workspace -- --test-threads=4 +``` + +### What Gets Tested + +**Rust crates (`cargo test --workspace`):** + +| Crate | Tests | What's covered | +|-------|-------|----------------| +| `bloom` | 17 | Insert, lookup, false-positive rate, serialization, edge cases | +| `memtable` | 43 | CRUD, sequence gating, tombstones, iteration, size tracking | +| `wal` | 22 | Append, CRC32, truncated tail, corruption detection, fsync | +| `sstable` | 21 | Write (atomic), read, bloom filter, merge iterator, v1/v2/v3 compat | +| `engine` | 55 | CRUD, flush, compaction, recovery, scan, manifest, tombstone GC | +| `server` | 84 | All 55+ commands, TTL 
expiry, concurrent clients, pipelining, binary values | +| doctests | 3 | Usage examples in bloom, memtable, wal | +| **Total (Rust)** | **245** | | + +**Java embedding library (`mvn test -f java/pom.xml`):** + +```bash +# Run all Java tests (requires the native binary in src/main/resources/native/) +mvn test -f java/pom.xml + +# Run a specific test class +mvn test -f java/pom.xml -Dtest=RespCommandsTest +mvn test -f java/pom.xml -Dtest=RiptideKVConfigTest +mvn test -f java/pom.xml -Dtest=RiptideKVServerTest +``` + +| Test class | Tests | What's covered | +|---|---|---| +| `RiptideKVConfigTest` | 20 | Builder defaults, validation (null, blank, bad format, out-of-range port), fluency | +| `RiptideKVServerTest` | 14 | start, stop, isRunning, close idempotency, port release, null config guard | +| `RespCommandsTest` | 116 | All 55+ commands over real TCP — Connection, Database, Strings, Keys, expiry, pipelining, concurrent clients, binary safety | +| **Total (Java)** | **150** | | + +> **Note:** Java tests require the native binary to be present in +> `java/src/main/resources/native/<os>-<arch>/riptidekv-server[.exe]`. +> Build it first with `cargo build --release -p server` and copy or symlink it. + +--- + +## 5 — Running Benchmarks + +Benchmarks use [Criterion.rs](https://bheisler.github.io/criterion.rs/book/) +for statistically rigorous measurements with warm-up, outlier detection, and +comparison across runs. + +```bash +# Run all benchmarks (release build, takes several minutes) +cargo bench -p cli +cargo bench -p server + +# Run a specific benchmark group +cargo bench -p cli -- engine_set_no_flush +cargo bench -p server -- server_ping_1k + +# Save baseline for comparison +cargo bench -p server -- --save-baseline before_change + +# Compare against saved baseline +cargo bench -p server -- --baseline before_change +``` + +Results are saved in `target/criterion/`. Open `target/criterion/report/index.html` +in a browser for graphs.
+ +### Available Benchmarks + +**`crates/cli/benches/` (storage engine level):** + +| Benchmark | Measures | +|-----------|----------| +| `memtable_put_10k_sequential` | Memtable insert throughput | +| `memtable_get_hit_10k` | Memtable point lookup | +| `memtable_get_miss_10k` | Memtable miss (not found) | +| `sstable_write_from_memtable_10k` | Full flush to disk | +| `sstable_get_hit_10k` | SSTable point lookup (with bloom) | +| `sstable_get_miss_10k` | SSTable miss (bloom short-circuits) | +| `wal_append_nosync_5k` | WAL write without fsync | +| `engine_set_no_flush_1k` | End-to-end write (memtable only) | +| `engine_set_with_flush_1k` | End-to-end write (with flush) | +| `engine_get_memtable_hit_1k` | End-to-end read from memtable | +| `engine_get_sstable_hit_1k` | End-to-end read from SSTable | +| `engine_mixed_set_get_del_1k` | Mixed workload | + +**`crates/server/benches/` (TCP server level):** + +| Benchmark | Measures | +|-----------|----------| +| `server_ping_1k` | Round-trip latency for 1k PINGs | +| `server_set/set_1k_64b_values` | Write throughput over TCP | +| `server_get/get_1k_existing_keys` | Read throughput over TCP | +| `server_pipeline/pipeline_500_set_500_get` | Pipelined mixed workload | +| `server_mset_100_keys` | Batch write throughput | + +--- + +## 6 — Persistent Data Storage + +By default the server stores data in: +``` +./wal.log ← WAL (append-only log) +./data/sst/ ← SSTables (immutable on-disk sorted files) + │ MANIFEST (which file is L0 vs L1) + │ sst-0000000001-.sst + └── sst-0000000002-.sst +``` + +To use a specific data directory: + +```bash +mkdir -p /var/lib/riptidekv +RIPTIDE_WAL_PATH=/var/lib/riptidekv/wal.log \ +RIPTIDE_SST_DIR=/var/lib/riptidekv/sst \ +cargo run -p server --bin riptidekv-server --release +``` + +### Data Survives Restarts + +Stop the server with Ctrl-C, then restart it pointing at the same data directory: + +```bash +# Session 1 +RIPTIDE_SST_DIR=/tmp/rkv-data cargo run -p server --bin riptidekv-server +# → 
SET persistent_key hello +# → Ctrl-C + +# Session 2 (same data directory) +RIPTIDE_SST_DIR=/tmp/rkv-data cargo run -p server --bin riptidekv-server +# → GET persistent_key → "hello" ✓ data survived +``` + +> **Note:** TTLs (EXPIRE/PEXPIRE) are **not** persisted across restarts. +> A key with a TTL will behave as if it has no expiry after a restart. + +--- + +## 7 — Connecting from Java (Jedis) + +Add Jedis to your Maven `pom.xml`: + +```xml +<dependency> + <groupId>redis.clients</groupId> + <artifactId>jedis</artifactId> + <version>5.1.0</version> +</dependency> +``` + +Connect to RiptideKV exactly as you would connect to Redis: + +```java +import redis.clients.jedis.Jedis; + +try (Jedis jedis = new Jedis("localhost", 6379)) { + // String operations + jedis.set("user:1", "Alice"); + String val = jedis.get("user:1"); // "Alice" + + // Counter + jedis.set("counter", "0"); + jedis.incr("counter"); // 1 + jedis.incrBy("counter", 5); // 6 + + // TTL + jedis.setex("session:abc", 3600, "user_data"); // expires in 1 hour + long ttl = jedis.ttl("session:abc"); // ~3600 + + // Bulk operations + jedis.mset("k1", "v1", "k2", "v2"); + List<String> vals = jedis.mget("k1", "k2", "absent"); + // ["v1", "v2", null] + + // Pattern matching + Set<String> keys = jedis.keys("user:*"); +} +``` + +--- + +## 8 — Connecting from Python (redis-py) + +```bash +pip install redis +``` + +```python +import redis + +r = redis.Redis(host='localhost', port=6379, decode_responses=True) + +r.set('name', 'Alice') +print(r.get('name')) # Alice + +r.setex('session', 3600, 'data') +print(r.ttl('session')) # ~3600 + +r.mset({'k1': 'v1', 'k2': 'v2'}) +print(r.mget('k1', 'k2')) # ['v1', 'v2'] + +print(r.keys('k*')) # ['k1', 'k2'] +``` + +--- + +## 9 — Troubleshooting + +### "Address already in use" + +Another process is using port 6379 (perhaps a local Redis server).
+Use a different port: + +```bash +RIPTIDE_BIND=127.0.0.1:6380 cargo run -p server --bin riptidekv-server +redis-cli -p 6380 PING +``` + +### Data Directory Errors on Startup + +``` +Error: No such file or directory (os error 2) +``` + +The SSTable directory doesn't exist. Create it or let the server create it: + +```bash +mkdir -p data/sst +# or set a directory that exists: +RIPTIDE_SST_DIR=/tmp/rkv-sst cargo run -p server --bin riptidekv-server +``` + +### WAL Corruption Warning + +``` +WalReader: CRC mismatch at record, stopping replay +``` + +This indicates genuine data corruption (not a truncated tail, which is handled +silently). The engine will stop WAL replay at the corrupt record and continue +with what it successfully replayed. + +### INCR on a non-numeric value returns 1 (not an error) + +RiptideKV treats un-parseable values as `0` for INCR/DECR (graceful degradation). +This differs from Redis, which returns an error. If strict Redis compatibility is +required, validate values before incrementing. + +--- + +### "NOPROTO this server does not support RESP3" + +You are connecting with a client configured for RESP3 (newer Redis protocol). +Disable RESP3 in your client: + +```python +# redis-py: disable RESP3 +r = redis.Redis(host='localhost', port=6379, protocol=2) +``` + +```java +// Jedis: use standard connection (defaults to RESP2) +Jedis jedis = new Jedis("localhost", 6379); // fine as-is +``` + +### TTL Disappeared After Restart + +TTLs are stored in memory only and are lost on server restart. This is a known +limitation. Keys survive, but their expiry times do not. 
+ +--- + +## 10 — Building a Release Binary + +```bash +cargo build -p server --release + +# The binary is at: +./target/release/riptidekv-server + +# Run it: +RIPTIDE_BIND=0.0.0.0:6379 \ +RIPTIDE_WAL_PATH=/var/lib/riptidekv/wal.log \ +RIPTIDE_SST_DIR=/var/lib/riptidekv/sst \ +./target/release/riptidekv-server +``` + +The release binary is roughly 10–20× faster than the debug build due to +LLVM optimizations and inlining. diff --git a/java/.gitignore b/java/.gitignore new file mode 100644 index 0000000..a13de70 --- /dev/null +++ b/java/.gitignore @@ -0,0 +1,6 @@ +# Maven build output +target/ + +# Native binaries are injected by CI (publish-maven.yml) at release time. +# They are large binary files and must NOT be committed to git. +src/main/resources/native/ diff --git a/java/pom.xml b/java/pom.xml new file mode 100644 index 0000000..4fb128f --- /dev/null +++ b/java/pom.xml @@ -0,0 +1,120 @@ + + + 4.0.0 + + + + io.github.YOUR_GITHUB_USERNAME + riptidekv-server + + 0.0.0-SNAPSHOT + jar + + RiptideKV Server + + Embedded RiptideKV RESP2 key-value server. Bundles native binaries for + Linux x86_64/aarch64, macOS x86_64/aarch64, and Windows x86_64. + RiptideKVServer extracts the right binary at runtime and starts it as a + subprocess so any Redis client (Jedis, lettuce, redis-py, go-redis) can + connect to it immediately. 
+ + https://github.com/YOUR_GITHUB_USERNAME/RiptideKV + + + + MIT License + https://opensource.org/licenses/MIT + + + + + scm:git:git://github.com/YOUR_GITHUB_USERNAME/RiptideKV.git + scm:git:ssh://github.com/YOUR_GITHUB_USERNAME/RiptideKV.git + https://github.com/YOUR_GITHUB_USERNAME/RiptideKV + + + + 17 + 17 + UTF-8 + + + + + + + github + GitHub Packages + https://maven.pkg.github.com/YOUR_GITHUB_USERNAME/RiptideKV + + + + + + org.junit.jupiter + junit-jupiter + 5.11.4 + test + + + + + + + + org.apache.maven.plugins + maven-source-plugin + 3.3.1 + + + attach-sources + jar-no-fork + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.4.1 + + + + true + + + ${project.version} + + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.5.2 + + + + + org.codehaus.mojo + versions-maven-plugin + 2.17.1 + + + + diff --git a/java/src/main/java/io/riptidekv/RiptideKVConfig.java b/java/src/main/java/io/riptidekv/RiptideKVConfig.java new file mode 100644 index 0000000..4e83528 --- /dev/null +++ b/java/src/main/java/io/riptidekv/RiptideKVConfig.java @@ -0,0 +1,153 @@ +package io.riptidekv; + +import java.nio.file.Path; +import java.nio.file.Paths; + +/** + * Immutable configuration for a {@link RiptideKVServer} instance. + * + *

Use the fluent {@link Builder} to construct a config: + * + *

{@code
+ * RiptideKVConfig config = RiptideKVConfig.builder()
+ *     .bind("127.0.0.1:6380")          // TCP address to listen on
+ *     .dataDir(Paths.get("/tmp/rkv"))   // where WAL + SSTables live
+ *     .flushKb(4096)                    // flush memtable at 4 MiB
+ *     .walSync(false)                   // disable fsync for speed in tests
+ *     .build();
+ * }
+ * + *

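All fields map directly to environment variables consumed by + * {@code riptidekv-server}: + *
+ * <table>
+ *   <tr><th>Builder method</th><th>Env variable</th><th>Default</th></tr>
+ *   <tr><td>{@link Builder#bind}</td><td>RIPTIDE_BIND</td><td>127.0.0.1:6379</td></tr>
+ *   <tr><td>{@link Builder#dataDir} (WAL)</td><td>RIPTIDE_WAL_PATH</td><td>&lt;dataDir&gt;/wal.log</td></tr>
+ *   <tr><td>{@link Builder#dataDir} (SST)</td><td>RIPTIDE_SST_DIR</td><td>&lt;dataDir&gt;/sst</td></tr>
+ *   <tr><td>{@link Builder#flushKb}</td><td>RIPTIDE_FLUSH_KB</td><td>1024</td></tr>
+ *   <tr><td>{@link Builder#walSync}</td><td>RIPTIDE_WAL_SYNC</td><td>true</td></tr>
+ * </table>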
+ */ +public final class RiptideKVConfig { + + private final String bind; + private final Path dataDir; + private final int flushKb; + private final boolean walSync; + + private RiptideKVConfig(Builder b) { + this.bind = b.bind; + this.dataDir = b.dataDir; + this.flushKb = b.flushKb; + this.walSync = b.walSync; + } + + /** TCP address the server binds to, e.g. {@code "127.0.0.1:6379"}. */ + public String getBind() { return bind; } + + /** Directory under which {@code wal.log} and {@code sst/} are stored. */ + public Path getDataDir() { return dataDir; } + + /** Memtable flush threshold in KiB. */ + public int getFlushKb() { return flushKb; } + + /** Whether the server calls {@code fsync} after every WAL write. */ + public boolean isWalSync() { return walSync; } + + /** + * Extracts the port number from the bind address. + * E.g. {@code "127.0.0.1:6380"} → {@code 6380}. + */ + public int getPort() { + int colon = bind.lastIndexOf(':'); + if (colon < 0) throw new IllegalStateException("Bind address has no port: " + bind); + return Integer.parseInt(bind.substring(colon + 1)); + } + + /** Returns a new {@link Builder} with all defaults set. */ + public static Builder builder() { return new Builder(); } + + // ── Builder ────────────────────────────────────────────────────────────── + + /** Fluent builder for {@link RiptideKVConfig}. */ + public static final class Builder { + + private String bind = "127.0.0.1:6379"; + private Path dataDir = Paths.get(System.getProperty("java.io.tmpdir"), "riptidekv"); + private int flushKb = 1024; + private boolean walSync = true; + + private Builder() {} + + /** + * TCP address the server binds to. + * Use {@code "127.0.0.1:0"} to let the OS pick a free port (useful for + * tests, but the actual port must then be read from the process output). + * Default: {@code "127.0.0.1:6379"}. 
+ */ + public Builder bind(String bind) { + if (bind == null || bind.isBlank()) throw new IllegalArgumentException("bind must not be blank"); + this.bind = bind; + return this; + } + + /** + * Root directory where the WAL file ({@code wal.log}) and SSTable + * directory ({@code sst/}) will be created. The directory is created + * automatically by {@link RiptideKVServer#start()} if it does not exist. + * Default: {@code <java.io.tmpdir>/riptidekv}. + */ + public Builder dataDir(Path dataDir) { + if (dataDir == null) throw new IllegalArgumentException("dataDir must not be null"); + this.dataDir = dataDir; + return this; + } + + /** + * Memtable flush threshold in KiB. When the in-memory write buffer + * reaches this size, its contents are flushed to a new immutable + * SSTable on disk. Larger values mean fewer flushes and more RAM + * usage; smaller values mean more frequent disk writes. + * Default: {@code 1024} (= 1 MiB). + */ + public Builder flushKb(int flushKb) { + if (flushKb <= 0) throw new IllegalArgumentException("flushKb must be > 0"); + this.flushKb = flushKb; + return this; + } + + /** + * Whether to call {@code fsync} after every WAL write. + * {@code true} (default) — fully durable; every acknowledged write + * survives a power loss or OS crash. + * {@code false} — up to ~1 second of writes may be lost on a hard + * crash, but throughput is significantly higher. Safe for + * ephemeral/test data. + */ + public Builder walSync(boolean walSync) { + this.walSync = walSync; + return this; + } + + /** Build the immutable {@link RiptideKVConfig}.
*/ + public RiptideKVConfig build() { + int colon = bind.lastIndexOf(':'); + if (colon < 0) { + throw new IllegalArgumentException( + "bind must be in host:port format (no colon found): " + bind); + } + try { + int port = Integer.parseInt(bind.substring(colon + 1)); + if (port < 0 || port > 65535) { + throw new IllegalArgumentException( + "bind port must be in [0, 65535]: " + port); + } + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "bind port is not a valid integer: " + bind, e); + } + return new RiptideKVConfig(this); + } + } +} diff --git a/java/src/main/java/io/riptidekv/RiptideKVServer.java b/java/src/main/java/io/riptidekv/RiptideKVServer.java new file mode 100644 index 0000000..258265d --- /dev/null +++ b/java/src/main/java/io/riptidekv/RiptideKVServer.java @@ -0,0 +1,305 @@ +package io.riptidekv; + +import java.io.IOException; +import java.io.InputStream; +import java.net.Socket; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Locale; +import java.util.concurrent.TimeUnit; + +/** + * Manages the lifecycle of an embedded RiptideKV server process. + * + *

How it works

+ *
    + *
  1. On {@link #start()}, the platform-specific {@code riptidekv-server} + * binary is extracted from the JAR's {@code /native/<os>-<arch>/} + * classpath resource to a temporary file.
  2. + *
  3. The binary is launched as a child {@link Process} with the configured + * environment variables.
  4. + *
  5. {@link #start()} blocks until the server accepts TCP connections + * (up to 10 seconds), then returns.
  6. + *
Any Redis client (Jedis, lettuce, redis-py, redis-cli, go-redis) can + * connect to {@code 127.0.0.1:<port>} and issue commands.
  8. + *
  9. {@link #close()} sends SIGTERM and waits for a clean shutdown.
  10. + *
+ * + *

Quick start — plain Java

+ *
{@code
+ * RiptideKVConfig cfg = RiptideKVConfig.builder()
+ *     .bind("127.0.0.1:6379")
+ *     .dataDir(Paths.get("/var/lib/myapp/rkv"))
+ *     .build();
+ *
+ * try (RiptideKVServer server = new RiptideKVServer(cfg)) {
+ *     server.start();
+ *     // now talk to it with any Redis client:
+ *     try (Jedis j = new Jedis("127.0.0.1", server.getPort())) {
+ *         j.set("hello", "world");
+ *         System.out.println(j.get("hello")); // world
+ *     }
+ * } // server shuts down here
+ * }
+ * + *

Quick start — Spring Boot test

+ *
{@code
+ * @BeforeAll
+ * static void startKv() throws IOException {
+ *     server = new RiptideKVServer(
+ *         RiptideKVConfig.builder()
+ *             .bind("127.0.0.1:16379")
+ *             .walSync(false)   // fast for tests
+ *             .build());
+ *     server.start();
+ * }
+ *
+ * @AfterAll
+ * static void stopKv() { server.close(); }
+ * }
+ * + *

Supported platforms

+ *
    + *
  • Linux x86_64
  • + *
  • Linux aarch64 (when included in the release)
  • + *
  • macOS x86_64 (Intel)
  • + *
  • macOS aarch64 (Apple Silicon)
  • + *
  • Windows x86_64
  • + *
+ */ +public final class RiptideKVServer implements AutoCloseable { + + private final RiptideKVConfig config; + + private volatile Process process; + private volatile Path extractedBinary; + + /** + * Create a server manager with the given configuration. + * The server is not started until {@link #start()} is called. + */ + public RiptideKVServer(RiptideKVConfig config) { + if (config == null) throw new IllegalArgumentException("config must not be null"); + this.config = config; + } + + // ── Lifecycle ───────────────────────────────────────────────────────────── + + /** + * Extract the native binary, create the data directory, launch the server + * process, and block until it is accepting TCP connections. + * + * @throws IOException if the binary cannot be extracted, the + * process fails to start, or it does not + * become ready within 10 seconds. + * @throws IllegalStateException if the server is already running. + */ + public void start() throws IOException { + if (process != null && process.isAlive()) { + throw new IllegalStateException("RiptideKV server is already running (pid=" + process.pid() + ")"); + } + + extractedBinary = extractBinary(); + + // Create data directory layout expected by the server. + Path dataDir = config.getDataDir(); + Path walPath = dataDir.resolve("wal.log"); + Path sstDir = dataDir.resolve("sst"); + Files.createDirectories(sstDir); + + ProcessBuilder pb = new ProcessBuilder(extractedBinary.toString()); + pb.environment().put("RIPTIDE_BIND", config.getBind()); + pb.environment().put("RIPTIDE_WAL_PATH", walPath.toString()); + pb.environment().put("RIPTIDE_SST_DIR", sstDir.toString()); + pb.environment().put("RIPTIDE_FLUSH_KB", String.valueOf(config.getFlushKb())); + pb.environment().put("RIPTIDE_WAL_SYNC", config.isWalSync() ? "true" : "false"); + + // Redirect server stderr/stdout to /dev/null by default. + // Override by calling pb.inheritIO() before start() if you need logs. 
+ pb.redirectErrorStream(true); + pb.redirectOutput(ProcessBuilder.Redirect.DISCARD); + + process = pb.start(); + + waitUntilReady(10_000); + } + + /** + * Returns the TCP port the server is listening on. + * Derived from the configured {@code bind} address. + */ + public int getPort() { + return config.getPort(); + } + + /** + * Returns the full bind address string, e.g. {@code "127.0.0.1:6379"}. + */ + public String getBind() { + return config.getBind(); + } + + /** + * Returns {@code true} if the server process is currently running. + */ + public boolean isRunning() { + return process != null && process.isAlive(); + } + + /** + * Terminate the server. + * + *

Sends SIGTERM (graceful shutdown — flushes the memtable to disk), + * then waits up to 5 seconds for the process to exit. If it does not + * exit in time, {@code SIGKILL} is sent. + * + *

Safe to call multiple times; subsequent calls are no-ops. + */ + @Override + public void close() { + Process p = process; + if (p == null || !p.isAlive()) return; + + p.destroy(); // SIGTERM — gives the server a chance to flush + try { + if (!p.waitFor(5, TimeUnit.SECONDS)) { + p.destroyForcibly(); // SIGKILL — no mercy + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + p.destroyForcibly(); + } finally { + // Delete the extracted temp binary. + Path bin = extractedBinary; + if (bin != null) { + try { Files.deleteIfExists(bin); } catch (IOException ignored) {} + } + } + } + + // ── Private helpers ─────────────────────────────────────────────────────── + + /** + * Detect the current platform, find the matching resource inside the JAR, + * copy it to a temp file, and mark it executable. + */ + private Path extractBinary() throws IOException { + String platform = detectPlatform(); + boolean isWin = platform.startsWith("windows"); + String ext = isWin ? ".exe" : ""; + String resource = "/native/" + platform + "/riptidekv-server" + ext; + + InputStream in = RiptideKVServer.class.getResourceAsStream(resource); + if (in == null) { + throw new IOException( + "RiptideKV native binary not bundled for platform '" + platform + "'.\n" + + "Looked for classpath resource: " + resource + "\n" + + "Supported platforms: linux-x86_64, linux-aarch64, " + + "macos-x86_64, macos-aarch64, windows-x86_64.\n" + + "Make sure you are using the official riptidekv-server JAR from " + + "GitHub Packages, not a locally-built snapshot without binaries." + ); + } + + Path tmp = Files.createTempFile("riptidekv-server-", ext.isEmpty() ? "" : ext); + tmp.toFile().deleteOnExit(); + + try (InputStream src = in) { + Files.copy(src, tmp, StandardCopyOption.REPLACE_EXISTING); + } + + if (!isWin) { + // chmod +x so the OS will actually execute it. 
+ if (!tmp.toFile().setExecutable(true, true)) { + throw new IOException("Failed to set executable bit on: " + tmp); + } + } + + return tmp; + } + + /** + * Derive the platform key used as the resource directory name. + * E.g. {@code "linux-x86_64"}, {@code "macos-aarch64"}, {@code "windows-x86_64"}. + */ + private static String detectPlatform() { + String os = System.getProperty("os.name", "").toLowerCase(Locale.ROOT); + String arch = System.getProperty("os.arch", "").toLowerCase(Locale.ROOT); + + String osKey; + if (os.contains("linux")) { + osKey = "linux"; + } else if (os.contains("mac") || os.contains("darwin")) { + osKey = "macos"; + } else if (os.contains("windows")) { + osKey = "windows"; + } else { + throw new UnsupportedOperationException( + "Unsupported operating system: '" + System.getProperty("os.name") + "'. " + + "RiptideKV supports Linux, macOS, and Windows."); + } + + String archKey; + if (arch.equals("amd64") || arch.equals("x86_64")) { + archKey = "x86_64"; + } else if (arch.equals("aarch64") || arch.equals("arm64")) { + archKey = "aarch64"; + } else { + throw new UnsupportedOperationException( + "Unsupported CPU architecture: '" + System.getProperty("os.arch") + "'. " + + "RiptideKV supports x86_64 (amd64) and aarch64 (arm64)."); + } + + return osKey + "-" + archKey; + } + + /** + * Poll the server's TCP port until a connection succeeds or we time out. + * + * @param timeoutMs maximum wait time in milliseconds + * @throws IOException if the process dies early or the timeout expires + */ + private void waitUntilReady(long timeoutMs) throws IOException { + // Determine host to probe from the bind address. + String bindHost = config.getBind(); + int colon = bindHost.lastIndexOf(':'); + String host = bindHost.substring(0, colon); + int port = config.getPort(); + + // "0.0.0.0" means "all interfaces" — probe loopback. 
+ if (host.equals("0.0.0.0") || host.isEmpty()) { + host = "127.0.0.1"; + } + + long deadline = System.currentTimeMillis() + timeoutMs; + IOException lastError = null; + + while (System.currentTimeMillis() < deadline) { + // Check the process hasn't already died. + if (!process.isAlive()) { + throw new IOException( + "RiptideKV server process exited unexpectedly before becoming ready " + + "(exit code: " + process.exitValue() + "). " + + "Check that the data directory is writable: " + config.getDataDir()); + } + + try (Socket socket = new Socket(host, port)) { + return; // TCP handshake succeeded — server is accepting connections + } catch (IOException e) { + lastError = e; + try { + Thread.sleep(50); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted while waiting for RiptideKV to start", ie); + } + } + } + + close(); // clean up the zombie process + throw new IOException( + "RiptideKV server did not become ready within " + timeoutMs + " ms on " + + config.getBind() + ". Last connection error: " + lastError); + } +} diff --git a/java/src/test/java/io/riptidekv/RespClient.java b/java/src/test/java/io/riptidekv/RespClient.java new file mode 100644 index 0000000..425758f --- /dev/null +++ b/java/src/test/java/io/riptidekv/RespClient.java @@ -0,0 +1,150 @@ +package io.riptidekv; + +import java.io.*; +import java.net.Socket; +import java.nio.charset.StandardCharsets; + +/** + * Minimal RESP2 client for tests only. + * Reads responses at the byte level so binary values are handled correctly. 
+ */ +class RespClient implements AutoCloseable { + + private final Socket socket; + private final OutputStream out; + private final InputStream in; + + RespClient(int port) throws IOException { + this("127.0.0.1", port); + } + + RespClient(String host, int port) throws IOException { + socket = new Socket(host, port); + socket.setSoTimeout(5_000); + out = new BufferedOutputStream(socket.getOutputStream()); + in = socket.getInputStream(); + } + + // ── Send ───────────────────────────────────────────────────────────────── + + /** Write a RESP2 inline array command. */ + void send(String... args) throws IOException { + StringBuilder sb = new StringBuilder(); + sb.append('*').append(args.length).append("\r\n"); + for (String arg : args) { + byte[] b = arg.getBytes(StandardCharsets.UTF_8); + sb.append('$').append(b.length).append("\r\n").append(arg).append("\r\n"); + } + out.write(sb.toString().getBytes(StandardCharsets.UTF_8)); + out.flush(); + } + + // ── Receive ─────────────────────────────────────────────────────────────── + + /** + * Read one RESP2 value. Returns: + *

    + *
  • {@code String} — simple string (+) or bulk string ($) + *
  • {@code Long} — integer (:) + *
  • {@code Object[]} — array (*) + *
  • {@code null} — null bulk ($-1) or null array (*-1) + *
  • {@link RespError}— error (-) + *
+ */ + Object recv() throws IOException { + String line = readLine(); + char type = line.charAt(0); + String payload = line.substring(1); + + return switch (type) { + case '+' -> payload; + case '-' -> new RespError(payload); + case ':' -> Long.parseLong(payload); + case '$' -> { + int len = Integer.parseInt(payload); + if (len == -1) yield null; + byte[] buf = in.readNBytes(len); + in.readNBytes(2); // CRLF + yield new String(buf, StandardCharsets.UTF_8); + } + case '*' -> { + int count = Integer.parseInt(payload); + if (count == -1) yield null; + Object[] arr = new Object[count]; + for (int i = 0; i < count; i++) arr[i] = recv(); + yield arr; + } + default -> throw new IOException("Unknown RESP prefix '" + type + "' in: " + line); + }; + } + + // ── Typed receive helpers ───────────────────────────────────────────────── + + /** Assert the next response is a simple string; return it. */ + String recvSimple() throws IOException { + Object r = recv(); + if (r instanceof String s) return s; + throw new AssertionError("Expected simple string, got: " + r); + } + + /** Assert the next response is +OK. */ + void recvOk() throws IOException { + String s = recvSimple(); + if (!"OK".equals(s)) throw new AssertionError("Expected OK, got: " + s); + } + + /** Read a bulk string; may be null (null bulk reply). */ + String recvBulk() throws IOException { + Object r = recv(); + if (r == null || r instanceof String) return (String) r; + throw new AssertionError("Expected bulk string, got: " + r); + } + + /** Read an integer reply. */ + long recvInt() throws IOException { + Object r = recv(); + if (r instanceof Long l) return l; + throw new AssertionError("Expected integer, got: " + r); + } + + /** Read an array reply; may be null. */ + Object[] recvArray() throws IOException { + Object r = recv(); + if (r == null || r instanceof Object[]) return (Object[]) r; + throw new AssertionError("Expected array, got: " + r); + } + + /** Read an error reply. 
*/ + RespError recvError() throws IOException { + Object r = recv(); + if (r instanceof RespError e) return e; + throw new AssertionError("Expected error, got: " + r); + } + + // ── Low-level ───────────────────────────────────────────────────────────── + + private String readLine() throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(64); + int b; + while ((b = in.read()) != -1) { + if (b == '\r') { + in.read(); // consume \n + return baos.toString(StandardCharsets.UTF_8); + } + baos.write(b); + } + throw new EOFException("Server closed connection mid-line"); + } + + @Override + public void close() throws IOException { + socket.close(); + } + + // ── Error wrapper ───────────────────────────────────────────────────────── + + record RespError(String message) { + boolean startsWith(String prefix) { return message.startsWith(prefix); } + @Override public String toString() { return "-" + message; } + } +} diff --git a/java/src/test/java/io/riptidekv/RespCommandsTest.java b/java/src/test/java/io/riptidekv/RespCommandsTest.java new file mode 100644 index 0000000..e503e5a --- /dev/null +++ b/java/src/test/java/io/riptidekv/RespCommandsTest.java @@ -0,0 +1,1064 @@ +package io.riptidekv; + +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.net.ServerSocket; +import java.nio.file.Path; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.*; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * End-to-end integration tests for every RESP2 command supported by RiptideKV. + * + *

One server is started per outer-class lifecycle ({@code @BeforeAll}/{@code @AfterAll}). + * A fresh {@link RespClient} is opened before each test and {@code FLUSHALL} is called + * so each test starts with an empty keyspace. + * + *

Commands are grouped into {@code @Nested} classes by category: + *

    + *
  • {@link ConnectionTests} — PING, ECHO, SELECT, HELLO, CLIENT, INFO, CONFIG, COMMAND, QUIT
  • + *
  • {@link DatabaseTests} — DBSIZE, FLUSHDB, FLUSHALL, ACL, SLOWLOG, MEMORY, WAIT
  • + *
  • {@link StringTests} — GET, SET (all options), SETNX, SETEX, PSETEX, GETSET, GETDEL, + * GETEX, MGET, MSET, MSETNX, APPEND, STRLEN, INCR, INCRBY, + * INCRBYFLOAT, DECR, DECRBY, GETRANGE, SETRANGE
  • + *
  • {@link KeyTests} — DEL, UNLINK, EXISTS, TYPE, RENAME, RENAMENX, RANDOMKEY, TOUCH, + * EXPIRE, PEXPIRE, EXPIREAT, PEXPIREAT, TTL, PTTL, PERSIST, + * EXPIRETIME, PEXPIRETIME, KEYS, SCAN
  • + *
  • {@link ExpiryTests} — real-time TTL expiry behaviour (uses Thread.sleep)
  • + *
  • {@link EdgeCaseTests} — pipelining, concurrent clients, binary-safe values, unknown cmd
  • + *
+ */ +@TestInstance(TestInstance.Lifecycle.PER_CLASS) +class RespCommandsTest { + + RiptideKVServer server; + int port; + RespClient c; + + // ── Suite lifecycle ─────────────────────────────────────────────────────── + + @BeforeAll + void startServer(@TempDir Path tempDir) throws IOException { + port = freePort(); + server = new RiptideKVServer( + RiptideKVConfig.builder() + .bind("127.0.0.1:" + port) + .dataDir(tempDir) + .walSync(false) + .build()); + server.start(); + } + + @AfterAll + void stopServer() { + if (server != null) server.close(); + } + + @BeforeEach + void openClientAndFlush() throws IOException { + c = new RespClient(port); + c.send("FLUSHALL"); + assertEquals("OK", c.recvSimple()); + } + + @AfterEach + void closeClient() throws IOException { + if (c != null) c.close(); + } + + // ── Shared helpers ──────────────────────────────────────────────────────── + + static int freePort() throws IOException { + try (var ss = new ServerSocket(0)) { return ss.getLocalPort(); } + } + + /** Convenience: SET k v and assert +OK. 
*/ + void set(String k, String v) throws IOException { + c.send("SET", k, v); + c.recvOk(); + } + + // ═════════════════════════════════════════════════════════════════════════ + // CONNECTION / SERVER COMMANDS + // ═════════════════════════════════════════════════════════════════════════ + + @Nested + class ConnectionTests { + + @Test void ping_noArgs_returnsPong() throws IOException { + c.send("PING"); + assertEquals("PONG", c.recvSimple()); + } + + @Test void ping_withMessage_returnsMessage() throws IOException { + c.send("PING", "hello world"); + assertEquals("hello world", c.recvBulk()); + } + + @Test void echo_returnsArgument() throws IOException { + c.send("ECHO", "foobar"); + assertEquals("foobar", c.recvBulk()); + } + + @Test void echo_emptyString() throws IOException { + c.send("ECHO", ""); + assertEquals("", c.recvBulk()); + } + + @Test void select_zero_returnsOk() throws IOException { + c.send("SELECT", "0"); + c.recvOk(); + } + + @Test void select_nonZero_returnsError() throws IOException { + c.send("SELECT", "1"); + assertTrue(c.recvError().startsWith("ERR")); + } + + @Test void hello_resp2_returnsResponse() throws IOException { + c.send("HELLO", "2"); + // RiptideKV returns a bulk string for HELLO 2 + Object r = c.recv(); + assertNotNull(r); + } + + @Test void hello_resp3_returnsError() throws IOException { + c.send("HELLO", "3"); + // RiptideKV returns NOPROTO (not ERR) for unsupported protocol versions + var err = c.recvError(); + assertTrue(err.startsWith("NOPROTO") || err.startsWith("ERR"), + "HELLO 3 should return a NOPROTO or ERR error, got: " + err); + } + + @Test void client_setname_returnsOk() throws IOException { + c.send("CLIENT", "SETNAME", "myapp"); + c.recvOk(); + } + + @Test void client_getname_returnsSetName() throws IOException { + c.send("CLIENT", "SETNAME", "testclient"); + c.recvOk(); + c.send("CLIENT", "GETNAME"); + assertEquals("testclient", c.recvBulk()); + } + + @Test void client_id_returnsInteger() throws IOException { + 
c.send("CLIENT", "ID"); + long id = c.recvInt(); + assertTrue(id >= 0, "CLIENT ID should be non-negative"); + } + + @Test void command_count_returnsPositiveInteger() throws IOException { + c.send("COMMAND", "COUNT"); + long count = c.recvInt(); + assertTrue(count > 10, "Expected at least 10 commands, got: " + count); + } + + @Test void info_returnsNonEmptyBulkString() throws IOException { + c.send("INFO"); + String info = c.recvBulk(); + assertNotNull(info); + assertFalse(info.isBlank()); + } + + @Test void info_serverSection_containsVersionField() throws IOException { + c.send("INFO", "server"); + String info = c.recvBulk(); + assertNotNull(info); + assertTrue(info.contains("redis_version") || info.contains("riptidekv"), + "INFO server should contain version info: " + info); + } + + @Test void config_get_returnsArray() throws IOException { + c.send("CONFIG", "GET", "*"); + Object r = c.recv(); + assertNotNull(r); + // RiptideKV returns an empty array for CONFIG GET + assertTrue(r instanceof Object[]); + } + + @Test void quit_returnsOkAndClosesConnection() throws IOException { + // Use a dedicated client — QUIT closes the connection + try (var qc = new RespClient(port)) { + qc.send("QUIT"); + assertEquals("OK", qc.recvSimple()); + // Server closes the connection after OK; further reads return EOF + } + } + } + + // ═════════════════════════════════════════════════════════════════════════ + // DATABASE COMMANDS + // ═════════════════════════════════════════════════════════════════════════ + + @Nested + class DatabaseTests { + + @Test void dbsize_emptyDb_returnsZero() throws IOException { + c.send("DBSIZE"); + assertEquals(0L, c.recvInt()); + } + + @Test void dbsize_afterSet_returnsCount() throws IOException { + set("a", "1"); + set("b", "2"); + c.send("DBSIZE"); + assertEquals(2L, c.recvInt()); + } + + @Test void flushdb_clearsAllKeys() throws IOException { + set("k1", "v1"); + set("k2", "v2"); + c.send("FLUSHDB"); + c.recvOk(); + c.send("DBSIZE"); + 
assertEquals(0L, c.recvInt()); + } + + @Test void flushall_clearsAllKeys() throws IOException { + set("x", "y"); + c.send("FLUSHALL"); + c.recvOk(); + c.send("DBSIZE"); + assertEquals(0L, c.recvInt()); + } + + @Test void acl_whoami_returnsDefault() throws IOException { + c.send("ACL", "WHOAMI"); + assertEquals("default", c.recvBulk()); + } + + @Test void slowlog_get_returnsArray() throws IOException { + c.send("SLOWLOG", "GET"); + Object[] arr = c.recvArray(); + assertNotNull(arr); + assertEquals(0, arr.length); + } + + @Test void memory_usage_existingKey_returnsInteger() throws IOException { + set("memkey", "hello"); + c.send("MEMORY", "USAGE", "memkey"); + Object r = c.recv(); + // Returns integer (bytes) or null if not supported + assertTrue(r == null || r instanceof Long, "MEMORY USAGE should return integer or null"); + } + + @Test void wait_returnsZero() throws IOException { + c.send("WAIT", "0", "0"); + assertEquals(0L, c.recvInt()); + } + } + + // ═════════════════════════════════════════════════════════════════════════ + // STRING COMMANDS + // ═════════════════════════════════════════════════════════════════════════ + + @Nested + class StringTests { + + // ── GET / SET ───────────────────────────────────────────────────────── + + @Test void get_missingKey_returnsNull() throws IOException { + c.send("GET", "no-such-key"); + assertNull(c.recvBulk()); + } + + @Test void set_andGet_roundtrip() throws IOException { + c.send("SET", "foo", "bar"); + c.recvOk(); + c.send("GET", "foo"); + assertEquals("bar", c.recvBulk()); + } + + @Test void set_overwritesExistingValue() throws IOException { + set("k", "original"); + c.send("SET", "k", "updated"); + c.recvOk(); + c.send("GET", "k"); + assertEquals("updated", c.recvBulk()); + } + + @Test void set_getFlag_returnsOldValue() throws IOException { + set("k", "old"); + c.send("SET", "k", "new", "GET"); + assertEquals("old", c.recvBulk()); + } + + @Test void set_getFlag_missingKey_returnsNull() throws IOException { + 
c.send("SET", "newkey", "v", "GET");
+            assertNull(c.recvBulk());
+        }
+
+        // NOTE(review): the NX/XX tests below read the success reply with recvBulk() and compare
+        // it to "OK", whereas plain SET uses recvOk(). Presumably RespClient.recvBulk() also
+        // accepts a simple-string reply — confirm against RespClient.
+        @Test void set_nxFlag_absentKey_setsAndReturnsOk() throws IOException {
+            c.send("SET", "k", "v", "NX");
+            assertEquals("OK", c.recvBulk());
+        }
+
+        @Test void set_nxFlag_presentKey_returnsNull() throws IOException {
+            set("k", "original");
+            c.send("SET", "k", "new", "NX");
+            assertNull(c.recvBulk());
+            c.send("GET", "k");
+            assertEquals("original", c.recvBulk()); // unchanged
+        }
+
+        @Test void set_xxFlag_presentKey_setsAndReturnsOk() throws IOException {
+            set("k", "original");
+            c.send("SET", "k", "updated", "XX");
+            assertEquals("OK", c.recvBulk());
+        }
+
+        @Test void set_xxFlag_absentKey_returnsNull() throws IOException {
+            c.send("SET", "absent", "v", "XX");
+            assertNull(c.recvBulk());
+        }
+
+        // The TTL assertions below use a half-open range (0, N] rather than an exact value,
+        // because some time elapses between setting the expiry and reading it back.
+        @Test void set_withEx_setsTtl() throws IOException {
+            c.send("SET", "k", "v", "EX", "100");
+            c.recvOk();
+            c.send("TTL", "k");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0 && ttl <= 100, "TTL should be in (0, 100], got: " + ttl);
+        }
+
+        @Test void set_withPx_setsTtlMs() throws IOException {
+            c.send("SET", "k", "v", "PX", "100000");
+            c.recvOk();
+            c.send("PTTL", "k");
+            long pttl = c.recvInt();
+            assertTrue(pttl > 0 && pttl <= 100_000, "PTTL should be in (0, 100000], got: " + pttl);
+        }
+
+        @Test void set_withExInvalid_returnsError() throws IOException {
+            // A zero expiry is expected to be rejected with an ERR-prefixed error reply.
+            c.send("SET", "k", "v", "EX", "0");
+            assertTrue(c.recvError().startsWith("ERR"));
+        }
+
+        @Test void set_keepttl_preservesExistingTtl() throws IOException {
+            c.send("SET", "k", "v1", "EX", "100");
+            c.recvOk();
+            c.send("SET", "k", "v2", "KEEPTTL");
+            c.recvOk();
+            c.send("TTL", "k");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0, "KEEPTTL should preserve TTL, got: " + ttl);
+        }
+
+        @Test void set_noTtlOption_clearsPreviousTtl() throws IOException {
+            c.send("SET", "k", "v1", "EX", "100");
+            c.recvOk();
+            c.send("SET", "k", "v2");
+            c.recvOk();
+            c.send("TTL", "k");
+            assertEquals(-1L, c.recvInt()); // no TTL
+        }
+
+        // ── SETNX ─────────────────────────────────────────────────────────────
+
+        @Test void setnx_absentKey_returns1() throws IOException {
+            c.send("SETNX", "k", "v");
+            assertEquals(1L, c.recvInt());
+        }
+
+        @Test void setnx_presentKey_returns0() throws IOException {
+            set("k", "existing");
+            c.send("SETNX", "k", "new");
+            assertEquals(0L, c.recvInt());
+            c.send("GET", "k");
+            assertEquals("existing", c.recvBulk()); // not changed
+        }
+
+        // ── SETEX / PSETEX ────────────────────────────────────────────────────
+
+        @Test void setex_setsValueAndTtl() throws IOException {
+            // Argument order is key, seconds, value.
+            c.send("SETEX", "k", "60", "hello");
+            c.recvOk();
+            c.send("GET", "k");
+            assertEquals("hello", c.recvBulk());
+            c.send("TTL", "k");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0 && ttl <= 60);
+        }
+
+        @Test void setex_zeroTimeout_returnsError() throws IOException {
+            c.send("SETEX", "k", "0", "v");
+            assertTrue(c.recvError().startsWith("ERR"));
+        }
+
+        @Test void psetex_setsValueAndTtlMs() throws IOException {
+            c.send("PSETEX", "k", "60000", "hello");
+            c.recvOk();
+            c.send("PTTL", "k");
+            long pttl = c.recvInt();
+            assertTrue(pttl > 0 && pttl <= 60_000);
+        }
+
+        // ── GETSET / GETDEL / GETEX ───────────────────────────────────────────
+
+        @Test void getset_returnsOldValue() throws IOException {
+            set("k", "old");
+            c.send("GETSET", "k", "new");
+            assertEquals("old", c.recvBulk());
+            c.send("GET", "k");
+            assertEquals("new", c.recvBulk());
+        }
+
+        @Test void getset_missingKey_returnsNull() throws IOException {
+            c.send("GETSET", "absent", "v");
+            assertNull(c.recvBulk());
+        }
+
+        @Test void getdel_presentKey_returnsAndDeletes() throws IOException {
+            set("k", "hello");
+            c.send("GETDEL", "k");
+            assertEquals("hello", c.recvBulk());
+            c.send("EXISTS", "k");
+            assertEquals(0L, c.recvInt());
+        }
+
+        @Test void getdel_missingKey_returnsNull() throws IOException {
+            c.send("GETDEL", "absent");
+            assertNull(c.recvBulk());
+        }
+
+        @Test void getex_withEx_setsExpiry() throws IOException {
+            set("k", "v");
+            c.send("GETEX", "k", "EX", "30");
+            assertEquals("v", c.recvBulk());
+            c.send("TTL", "k");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0 && ttl <= 30);
+        }
+
+        @Test void getex_withPersist_removesExpiry() throws IOException {
+            c.send("SET", "k", "v", "EX", "30");
+            c.recvOk();
+            c.send("GETEX", "k", "PERSIST");
+            assertEquals("v", c.recvBulk());
+            c.send("TTL", "k");
+            assertEquals(-1L, c.recvInt());
+        }
+
+        // ── MSET / MGET / MSETNX ──────────────────────────────────────────────
+
+        @Test void mset_andMget_roundtrip() throws IOException {
+            c.send("MSET", "a", "1", "b", "2", "c", "3");
+            c.recvOk();
+            // The fourth key was never written, so its slot in the reply must be null.
+            c.send("MGET", "a", "b", "c", "missing");
+            Object[] results = c.recvArray();
+            assertNotNull(results);
+            assertEquals(4, results.length);
+            assertEquals("1", results[0]);
+            assertEquals("2", results[1]);
+            assertEquals("3", results[2]);
+            assertNull(results[3]);
+        }
+
+        @Test void msetnx_allAbsent_returns1() throws IOException {
+            c.send("MSETNX", "x", "1", "y", "2");
+            assertEquals(1L, c.recvInt());
+        }
+
+        @Test void msetnx_anyPresent_returns0AndSetsNothing() throws IOException {
+            set("x", "existing");
+            c.send("MSETNX", "x", "new", "y", "2");
+            assertEquals(0L, c.recvInt());
+            c.send("GET", "y");
+            assertNull(c.recvBulk()); // y was NOT set because x was present
+        }
+
+        // ── APPEND / STRLEN ───────────────────────────────────────────────────
+
+        @Test void append_createsKeyAndReturnsLength() throws IOException {
+            c.send("APPEND", "k", "hello");
+            assertEquals(5L, c.recvInt());
+            c.send("GET", "k");
+            assertEquals("hello", c.recvBulk());
+        }
+
+        @Test void append_toExistingKeyExtends() throws IOException {
+            set("k", "hello");
+            c.send("APPEND", "k", " world");
+            assertEquals(11L, c.recvInt());
+            c.send("GET", "k");
+            assertEquals("hello world", c.recvBulk());
+        }
+
+        @Test void strlen_existingKey_returnsLength() throws IOException {
+            set("k", "hello");
+            c.send("STRLEN", "k");
+            assertEquals(5L, c.recvInt());
+        }
+
+        @Test void strlen_missingKey_returnsZero() throws IOException {
+            c.send("STRLEN", "absent");
+            assertEquals(0L, c.recvInt());
+        }
+
+        // ── INCR / INCRBY / INCRBYFLOAT / DECR / DECRBY ──────────────────────
+
+        @Test void incr_absentKey_createsAndReturns1() throws IOException {
+            c.send("INCR", "counter");
+            assertEquals(1L, c.recvInt());
+        }
+
+        @Test void incr_existingKey_increments() throws IOException {
+            set("counter", "10");
+            c.send("INCR", "counter");
+            assertEquals(11L, c.recvInt());
+        }
+
+        @Test void incr_nonParseableValue_treatsAsZero() throws IOException {
+            // RiptideKV uses unwrap_or(0) for un-parseable values: INCR treats them as 0
+            // This test pins that server-specific behavior rather than expecting an error reply.
+            set("incr_bad_type", "notanumber");
+            c.send("INCR", "incr_bad_type");
+            assertEquals(1L, c.recvInt(), "INCR on non-numeric value should treat it as 0 and return 1");
+        }
+
+        @Test void incrby_incrementsByAmount() throws IOException {
+            set("k", "10");
+            c.send("INCRBY", "k", "5");
+            assertEquals(15L, c.recvInt());
+        }
+
+        @Test void incrbyfloat_addsFraction() throws IOException {
+            set("k", "10");
+            c.send("INCRBYFLOAT", "k", "1.5");
+            String result = c.recvBulk();
+            assertNotNull(result);
+            // Compared numerically with a tolerance, so the reply's exact text form does not matter.
+            assertEquals(11.5, Double.parseDouble(result), 0.001);
+        }
+
+        @Test void decr_decrementsBy1() throws IOException {
+            set("k", "10");
+            c.send("DECR", "k");
+            assertEquals(9L, c.recvInt());
+        }
+
+        @Test void decrby_decrementsByAmount() throws IOException {
+            set("k", "10");
+            c.send("DECRBY", "k", "3");
+            assertEquals(7L, c.recvInt());
+        }
+
+        // ── GETRANGE / SETRANGE ───────────────────────────────────────────────
+
+        @Test void getrange_returnsSubstring() throws IOException {
+            set("k", "hello world");
+            // Both offsets are inclusive: 0..4 yields the five characters of "hello".
+            c.send("GETRANGE", "k", "0", "4");
+            assertEquals("hello", c.recvBulk());
+        }
+
+        @Test void getrange_negativeIndex_fromEnd() throws IOException {
+            set("k", "hello world");
+            // An end offset of -1 addresses the last character.
+            c.send("GETRANGE", "k", "6", "-1");
+            assertEquals("world", c.recvBulk());
+        }
+
+        @Test void setrange_overwritesPortionAndReturnsLength() throws IOException {
+
set("k", "hello world");
+            // "Redis" replaces the same number of characters as "world", so the length stays 11.
+            c.send("SETRANGE", "k", "6", "Redis");
+            assertEquals(11L, c.recvInt());
+            c.send("GET", "k");
+            assertEquals("hello Redis", c.recvBulk());
+        }
+
+        // ── Large value ───────────────────────────────────────────────────────
+
+        @Test void set_largeValue_roundtrip() throws IOException {
+            String large = "x".repeat(512 * 1024); // 512 KiB
+            c.send("SET", "bigkey", large);
+            c.recvOk();
+            c.send("GET", "bigkey");
+            assertEquals(large, c.recvBulk());
+        }
+    }
+
+    // ═════════════════════════════════════════════════════════════════════════
+    // KEY COMMANDS
+    // ═════════════════════════════════════════════════════════════════════════
+
+    /**
+     * Type-agnostic key commands: deletion, existence, TYPE, renaming, RANDOMKEY/TOUCH,
+     * the expiry family (EXPIRE*, TTL/PTTL, PERSIST, EXPIRETIME*) and KEYS/SCAN.
+     * TTL-style replies follow the convention asserted below: -1 for a key without an
+     * expiry and -2 for a key that does not exist.
+     */
+    @Nested
+    class KeyTests {
+
+        // ── DEL / UNLINK ──────────────────────────────────────────────────────
+
+        @Test void del_presentKey_returns1() throws IOException {
+            set("k", "v");
+            c.send("DEL", "k");
+            assertEquals(1L, c.recvInt());
+        }
+
+        @Test void del_missingKey_returns0() throws IOException {
+            c.send("DEL", "absent");
+            assertEquals(0L, c.recvInt());
+        }
+
+        @Test void del_multipleKeys_returnsDeletedCount() throws IOException {
+            set("a", "1");
+            set("b", "2");
+            // Only keys that actually existed are counted.
+            c.send("DEL", "a", "b", "missing");
+            assertEquals(2L, c.recvInt());
+        }
+
+        @Test void unlink_presentKey_returns1() throws IOException {
+            set("k", "v");
+            c.send("UNLINK", "k");
+            assertEquals(1L, c.recvInt());
+            c.send("EXISTS", "k");
+            assertEquals(0L, c.recvInt());
+        }
+
+        // ── EXISTS ────────────────────────────────────────────────────────────
+
+        @Test void exists_presentKey_returns1() throws IOException {
+            set("k", "v");
+            c.send("EXISTS", "k");
+            assertEquals(1L, c.recvInt());
+        }
+
+        @Test void exists_absentKey_returns0() throws IOException {
+            c.send("EXISTS", "absent");
+            assertEquals(0L, c.recvInt());
+        }
+
+        @Test void exists_multipleKeys_returnsCount() throws IOException {
+            set("a", "1");
+            set("b", "2");
+            c.send("EXISTS", "a", "b", "missing");
+            assertEquals(2L, c.recvInt());
+        }
+
+        // ── TYPE ──────────────────────────────────────────────────────────────
+
+        @Test void type_stringKey_returnsString() throws IOException {
+            set("k", "v");
+            c.send("TYPE", "k");
+            assertEquals("string", c.recvSimple());
+        }
+
+        @Test void type_absentKey_returnsNone() throws IOException {
+            c.send("TYPE", "absent");
+            assertEquals("none", c.recvSimple());
+        }
+
+        // ── RENAME / RENAMENX ─────────────────────────────────────────────────
+
+        @Test void rename_movesValue() throws IOException {
+            set("src", "hello");
+            c.send("RENAME", "src", "dst");
+            c.recvOk();
+            c.send("GET", "dst");
+            assertEquals("hello", c.recvBulk());
+            c.send("EXISTS", "src");
+            assertEquals(0L, c.recvInt());
+        }
+
+        @Test void rename_missingSource_returnsError() throws IOException {
+            c.send("RENAME", "absent", "dst");
+            assertTrue(c.recvError().startsWith("ERR"));
+        }
+
+        @Test void rename_preservesTtl() throws IOException {
+            c.send("SET", "src", "v", "EX", "100");
+            c.recvOk();
+            c.send("RENAME", "src", "dst");
+            c.recvOk();
+            c.send("TTL", "dst");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0 && ttl <= 100, "Renamed key should preserve TTL, got: " + ttl);
+        }
+
+        @Test void renamenx_absentDest_returns1() throws IOException {
+            set("src", "hello");
+            c.send("RENAMENX", "src", "dst");
+            assertEquals(1L, c.recvInt());
+        }
+
+        @Test void renamenx_presentDest_returns0() throws IOException {
+            set("src", "hello");
+            set("dst", "existing");
+            c.send("RENAMENX", "src", "dst");
+            assertEquals(0L, c.recvInt());
+            c.send("GET", "dst");
+            assertEquals("existing", c.recvBulk()); // not overwritten
+        }
+
+        // ── RANDOMKEY / TOUCH ─────────────────────────────────────────────────
+
+        @Test void randomkey_noKeys_returnsNull() throws IOException {
+            c.send("RANDOMKEY");
+            assertNull(c.recvBulk());
+        }
+
+        @Test void randomkey_withKeys_returnsAKey() throws IOException {
+            set("a", "1");
+            set("b", "2");
+            c.send("RANDOMKEY");
+            String key = c.recvBulk();
+            assertNotNull(key);
+            // Either key is acceptable; the test only checks the reply is one of the stored keys.
+            assertTrue(key.equals("a") || key.equals("b"), "Unexpected key: " + key);
+        }
+
+        @Test void touch_existingKeys_returnsCount() throws IOException {
+            set("a", "1");
+            set("b", "2");
+            c.send("TOUCH", "a", "b", "missing");
+            assertEquals(2L, c.recvInt());
+        }
+
+        // ── EXPIRE / PEXPIRE / EXPIREAT / PEXPIREAT ───────────────────────────
+
+        @Test void expire_setsTtlInSeconds() throws IOException {
+            set("k", "v");
+            c.send("EXPIRE", "k", "60");
+            assertEquals(1L, c.recvInt());
+            c.send("TTL", "k");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0 && ttl <= 60, "TTL should be in (0, 60], got: " + ttl);
+        }
+
+        @Test void expire_absentKey_returns0() throws IOException {
+            c.send("EXPIRE", "absent", "60");
+            assertEquals(0L, c.recvInt());
+        }
+
+        @Test void pexpire_setsTtlInMs() throws IOException {
+            set("k", "v");
+            c.send("PEXPIRE", "k", "60000");
+            assertEquals(1L, c.recvInt());
+            c.send("PTTL", "k");
+            long pttl = c.recvInt();
+            assertTrue(pttl > 0 && pttl <= 60_000, "PTTL should be in (0, 60000], got: " + pttl);
+        }
+
+        @Test void expireat_setsUnixTimestamp() throws IOException {
+            set("k", "v");
+            // Absolute deadline: 120 seconds from the test client's current clock.
+            long future = Instant.now().getEpochSecond() + 120;
+            c.send("EXPIREAT", "k", String.valueOf(future));
+            assertEquals(1L, c.recvInt());
+            c.send("TTL", "k");
+            long ttl = c.recvInt();
+            assertTrue(ttl > 0 && ttl <= 120, "TTL should be in (0, 120], got: " + ttl);
+        }
+
+        @Test void pexpireat_setsUnixMs() throws IOException {
+            set("k", "v");
+            long futureMs = Instant.now().toEpochMilli() + 120_000;
+            c.send("PEXPIREAT", "k", String.valueOf(futureMs));
+            assertEquals(1L, c.recvInt());
+            c.send("PTTL", "k");
+            long pttl = c.recvInt();
+            assertTrue(pttl > 0 && pttl <= 120_000, "PTTL out of range: " + pttl);
+        }
+
+        // ── TTL / PTTL ────────────────────────────────────────────────────────
+
+        @Test void ttl_noExpiry_returnsMinusOne() throws IOException {
+            set("k", "v");
+            c.send("TTL", "k");
+            assertEquals(-1L, c.recvInt());
+        }
+
+        @Test void ttl_absentKey_returnsMinusTwo() throws IOException {
+            c.send("TTL", "absent");
+            assertEquals(-2L, c.recvInt());
+        }
+
+        @Test void pttl_absentKey_returnsMinusTwo() throws IOException {
+            c.send("PTTL", "absent");
+            assertEquals(-2L, c.recvInt());
+        }
+
+        @Test void pttl_noExpiry_returnsMinusOne() throws IOException {
+            set("k", "v");
+            c.send("PTTL", "k");
+            assertEquals(-1L, c.recvInt());
+        }
+
+        // ── PERSIST ───────────────────────────────────────────────────────────
+
+        @Test void persist_removesExpiry() throws IOException {
+            c.send("SET", "k", "v", "EX", "60");
+            c.recvOk();
+            c.send("PERSIST", "k");
+            assertEquals(1L, c.recvInt());
+            c.send("TTL", "k");
+            assertEquals(-1L, c.recvInt()); // no longer expires
+        }
+
+        @Test void persist_noExpiry_returns0() throws IOException {
+            set("k", "v");
+            c.send("PERSIST", "k");
+            assertEquals(0L, c.recvInt());
+        }
+
+        // ── EXPIRETIME / PEXPIRETIME ──────────────────────────────────────────
+
+        @Test void expiretime_keyWithTtl_returnsUnixTimestamp() throws IOException {
+            long future = Instant.now().getEpochSecond() + 120;
+            set("k", "v"); // create the key first
+            c.send("EXPIREAT", "k", String.valueOf(future));
+            assertEquals(1L, c.recvInt()); // must read the EXPIREAT reply
+            c.send("EXPIRETIME", "k");
+            long et = c.recvInt();
+            // One second of slack above the requested deadline is tolerated.
+            assertTrue(et > 0 && et <= future + 1, "EXPIRETIME out of range: " + et);
+        }
+
+        @Test void expiretime_noExpiry_returnsMinusOne() throws IOException {
+            set("k", "v");
+            c.send("EXPIRETIME", "k");
+            assertEquals(-1L, c.recvInt());
+        }
+
+        @Test void expiretime_absentKey_returnsMinusTwo() throws IOException {
+            c.send("EXPIRETIME", "absent");
+            assertEquals(-2L, c.recvInt());
+        }
+
+        @Test void pexpiretime_absentKey_returnsMinusTwo() throws IOException {
+            c.send("PEXPIRETIME", "absent");
+            assertEquals(-2L, c.recvInt());
+        }
+
+        // ── KEYS / SCAN ───────────────────────────────────────────────────────
+
+        @Test void keys_wildcardPattern_returnsMatchingKeys() throws IOException {
+            c.send("MSET", "user:1", "a", "user:2", "b", "item:1", "c");
+
c.recvOk();
+            c.send("KEYS", "user:*");
+            Object[] keys = c.recvArray();
+            assertNotNull(keys);
+            assertEquals(2, keys.length);
+            // The element type must be String so that startsWith is available on each key.
+            List<String> keyList = Arrays.stream(keys).map(Object::toString).collect(Collectors.toList());
+            assertTrue(keyList.stream().allMatch(k -> k.startsWith("user:")));
+        }
+
+        @Test void keys_questionMarkPattern_matchesSingleChar() throws IOException {
+            c.send("MSET", "foo", "1", "bar", "2", "baz", "3", "foobar", "4");
+            c.recvOk();
+            c.send("KEYS", "???");
+            Object[] keys = c.recvArray();
+            assertNotNull(keys);
+            assertEquals(3, keys.length); // foo, bar, baz
+        }
+
+        @Test void keys_starPattern_returnsAllKeys() throws IOException {
+            set("a", "1");
+            set("b", "2");
+            c.send("KEYS", "*");
+            Object[] keys = c.recvArray();
+            assertNotNull(keys);
+            assertEquals(2, keys.length);
+        }
+
+        @Test void scan_basicCursor_returnsKeysArray() throws IOException {
+            set("k1", "v1");
+            set("k2", "v2");
+            c.send("SCAN", "0");
+            Object[] result = c.recvArray();
+            assertNotNull(result);
+            assertEquals(2, result.length);
+            // result[0] = next cursor, result[1] = keys array
+            Object[] scanKeys = (Object[]) result[1];
+            assertNotNull(scanKeys);
+        }
+
+        @Test void scan_withMatchPattern_filtersKeys() throws IOException {
+            c.send("MSET", "prefix:1", "a", "prefix:2", "b", "other", "c");
+            c.recvOk();
+            c.send("SCAN", "0", "MATCH", "prefix:*");
+            Object[] result = c.recvArray();
+            // Fail with a clear assertion instead of a NullPointerException or index error
+            // when the reply is missing or not a [cursor, keys] pair.
+            assertNotNull(result);
+            assertEquals(2, result.length);
+            Object[] keys = (Object[]) result[1];
+            assertNotNull(keys);
+            for (Object k : keys) {
+                assertTrue(k.toString().startsWith("prefix:"),
+                        "SCAN MATCH returned unexpected key: " + k);
+            }
+        }
+    }
+
+    // ═════════════════════════════════════════════════════════════════════════
+    // REAL-TIME EXPIRY TESTS (use Thread.sleep — kept minimal)
+    // ═════════════════════════════════════════════════════════════════════════
+
+    /**
+     * Verifies that a key written with a 300 ms TTL is no longer visible through GET, DEL,
+     * EXISTS, TTL and DBSIZE once 400 ms have passed.
+     */
+    @Nested
+    class ExpiryTests {
+
+        @Test void key_expiresAndBecomesInvisible() throws Exception {
+            c.send("SET", "ex", "v", "PX", "300"); // 300 ms TTL
+            c.recvOk();
+            Thread.sleep(400);
+            c.send("GET", "ex");
+            assertNull(c.recvBulk(), "Key should be gone after TTL expires");
+        }
+
+        @Test void del_afterExpiry_returns0() throws Exception {
+            c.send("SET", "ex", "v", "PX", "300");
+            c.recvOk();
+            Thread.sleep(400);
+            c.send("DEL", "ex");
+            assertEquals(0L, c.recvInt(), "DEL on expired key should return 0");
+        }
+
+        @Test void exists_afterExpiry_returns0() throws Exception {
+            c.send("SET", "ex", "v", "PX", "300");
+            c.recvOk();
+            Thread.sleep(400);
+            c.send("EXISTS", "ex");
+            assertEquals(0L, c.recvInt(), "EXISTS on expired key should return 0");
+        }
+
+        @Test void ttl_afterExpiry_returnsMinusTwo() throws Exception {
+            c.send("SET", "ex", "v", "PX", "300");
+            c.recvOk();
+            Thread.sleep(400);
+            c.send("TTL", "ex");
+            assertEquals(-2L, c.recvInt(), "TTL on expired key should return -2");
+        }
+
+        @Test void dbsize_afterExpiry_decrements() throws Exception {
+            c.send("SET", "ex", "v", "PX", "300");
+            c.recvOk();
+            set("perm", "v");
+            c.send("DBSIZE");
+            assertEquals(2L, c.recvInt());
+            Thread.sleep(400);
+            // Access the expired key to trigger eviction
+            c.send("GET", "ex");
+            c.recvBulk();
+            c.send("DBSIZE");
+            assertEquals(1L, c.recvInt());
+        }
+    }
+
+    // ═════════════════════════════════════════════════════════════════════════
+    // EDGE CASES
+    // ═════════════════════════════════════════════════════════════════════════
+
+    /**
+     * Protocol and concurrency edge cases: unknown commands, pipelining, several clients
+     * working in parallel, control-byte values, and INFO keyspace accounting.
+     */
+    @Nested
+    class EdgeCaseTests {
+
+        @Test void unknownCommand_returnsError() throws IOException {
+            c.send("NOTACOMMAND");
+            assertTrue(c.recvError().startsWith("ERR"));
+        }
+
+        @Test void pipelining_sendMultipleBeforeReading() throws IOException {
+            // Pipeline 5 SETs without reading responses
+            for (int i = 0; i < 5; i++) {
+                c.send("SET", "pk" + i, "pv" + i);
+            }
+            // Now read all 5 OKs
+            for (int i = 0; i < 5; i++) {
+                c.recvOk();
+            }
+            // Verify with MGET
+            c.send("MGET", "pk0", "pk1", "pk2", "pk3", "pk4");
+            Object[] vals = c.recvArray();
+            assertNotNull(vals);
+            assertEquals(5, vals.length);
+            for (int i = 0; i < 5; i++) {
+                assertEquals("pv" + i, vals[i], "Unexpected value at index " + i);
+            }
+        }
+
+        @Test void concurrentClients_doNotInterfereSets() throws Exception {
+            int threads = 10;
+            int opsEach = 20;
+            var executor = Executors.newFixedThreadPool(threads);
+            // Workers report mismatches here instead of throwing, so every failure is collected.
+            var errors = new CopyOnWriteArrayList<String>();
+
+            try {
+                List<java.util.concurrent.Future<?>> futures = new ArrayList<>();
+                for (int t = 0; t < threads; t++) {
+                    final int tid = t;
+                    futures.add(executor.submit(() -> {
+                        // Each worker uses its own connection and its own key namespace.
+                        try (var tc = new RespClient(port)) {
+                            for (int i = 0; i < opsEach; i++) {
+                                String key = "thread" + tid + ":key" + i;
+                                tc.send("SET", key, "val" + i);
+                                tc.recvOk();
+                                tc.send("GET", key);
+                                String got = tc.recvBulk();
+                                if (!("val" + i).equals(got)) {
+                                    errors.add("thread" + tid + " key=" + key + " expected val" + i + " got " + got);
+                                }
+                            }
+                        } catch (Exception e) {
+                            errors.add("thread" + tid + " threw: " + e.getMessage());
+                        }
+                        return null;
+                    }));
+                }
+
+                for (var f : futures) f.get(10, TimeUnit.SECONDS);
+            } finally {
+                // Always release the pool: a timed-out get() would otherwise leave its threads running.
+                executor.shutdownNow();
+            }
+            assertTrue(errors.isEmpty(), "Concurrent errors: " + errors);
+        }
+
+        @Test void concurrentIncr_isSerializedByLock() throws Exception {
+            set("counter", "0");
+            int threads = 10;
+            int incrEach = 50;
+            var executor = Executors.newFixedThreadPool(threads);
+
+            try {
+                List<java.util.concurrent.Future<?>> futures = new ArrayList<>();
+                for (int t = 0; t < threads; t++) {
+                    futures.add(executor.submit(() -> {
+                        try (var tc = new RespClient(port)) {
+                            for (int i = 0; i < incrEach; i++) {
+                                tc.send("INCR", "counter");
+                                tc.recvInt();
+                            }
+                        } catch (Exception e) {
+                            // Surfaces through Future.get() below as an ExecutionException.
+                            throw new RuntimeException(e);
+                        }
+                        return null;
+                    }));
+                }
+
+                for (var f : futures) f.get(10, TimeUnit.SECONDS);
+            } finally {
+                // Always release the pool, including when a worker failed or timed out.
+                executor.shutdownNow();
+            }
+
+            c.send("GET", "counter");
+            String val = c.recvBulk();
+            assertNotNull(val, "counter should still exist after the concurrent INCRs");
+            assertEquals(threads * incrEach, Integer.parseInt(val),
+                    "Concurrent INCRs should be serialized; expected " + (threads * incrEach) + " got " + val);
+        }
+
+        @Test void binarySafeValue_roundtrip() throws Exception {
+            // Store a value made of the control characters 0x00 0x01 0x02 through the regular
+            // client, then check both its stored length and that it reads back unchanged.
+            // NOTE(review): assumes RespClient encodes these code points as one byte each
+            // (true for any ASCII-compatible charset) — confirm against RespClient.
+            String binary = "\u0000\u0001\u0002";
+            c.send("SET", "binkey", binary);
+            c.recvOk();
+            c.send("STRLEN", "binkey");
+            assertEquals(3L, c.recvInt());
+            c.send("GET", "binkey");
+            assertEquals(binary, c.recvBulk(), "Control bytes should survive a SET/GET roundtrip");
+        }
+
+        @Test void multipleConsecutivePings_allReturnPong() throws IOException {
+            for (int i = 0; i < 10; i++) {
+                c.send("PING");
+                assertEquals("PONG", c.recvSimple(), "PING #" + i + " failed");
+            }
+        }
+
+        @Test void info_keyspace_reflectsActualKeyCount() throws IOException {
+            c.send("MSET", "x", "1", "y", "2", "z", "3");
+            c.recvOk();
+            c.send("INFO", "keyspace");
+            String info = c.recvBulk();
+            assertNotNull(info);
+            assertTrue(info.contains("keys=3"),
+                    "INFO keyspace should report keys=3 but got: " + info);
+        }
+    }
+}
diff --git a/java/src/test/java/io/riptidekv/RiptideKVConfigTest.java b/java/src/test/java/io/riptidekv/RiptideKVConfigTest.java
new file mode 100644
index 0000000..e53c55d
--- /dev/null
+++ b/java/src/test/java/io/riptidekv/RiptideKVConfigTest.java
@@ -0,0 +1,155 @@
+package io.riptidekv;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Paths;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Unit tests for {@link RiptideKVConfig} and its builder.
+ * No server process is started — purely tests the Java config object.
+ */
+class RiptideKVConfigTest {
+
+    // ── Default values ────────────────────────────────────────────────────────
+
+    @Test
+    void defaults_bindIsLocalhost6379() {
+        var cfg = RiptideKVConfig.builder().build();
+        assertEquals("127.0.0.1:6379", cfg.getBind());
+    }
+
+    @Test
+    void defaults_flushKbIs1024() {
+        var cfg = RiptideKVConfig.builder().build();
+        assertEquals(1024, cfg.getFlushKb());
+    }
+
+    @Test
+    void defaults_walSyncIsTrue() {
+        var cfg = RiptideKVConfig.builder().build();
+        assertTrue(cfg.isWalSync());
+    }
+
+    @Test
+    void defaults_dataDirIsUnderTmpdir() { // NOTE(review): only the "riptidekv" path component is checked, not the tmpdir prefix
+        var cfg = RiptideKVConfig.builder().build();
+        assertTrue(cfg.getDataDir().toString().contains("riptidekv"),
+                "default dataDir should contain 'riptidekv', got: " + cfg.getDataDir());
+    }
+
+    // ── Port extraction ───────────────────────────────────────────────────────
+
+    @Test
+    void getPort_extractsFromDefaultBind() {
+        assertEquals(6379, RiptideKVConfig.builder().build().getPort());
+    }
+
+    @Test
+    void getPort_extractsCustomPort() {
+        var cfg = RiptideKVConfig.builder().bind("127.0.0.1:6380").build();
+        assertEquals(6380, cfg.getPort());
+    }
+
+    @Test
+    void getPort_worksWithAllInterfaces() {
+        var cfg = RiptideKVConfig.builder().bind("0.0.0.0:9999").build();
+        assertEquals(9999, cfg.getPort());
+    }
+
+    // ── Custom values ─────────────────────────────────────────────────────────
+
+    @Test
+    void customBind_isStored() {
+        var cfg = RiptideKVConfig.builder().bind("0.0.0.0:7777").build();
+        assertEquals("0.0.0.0:7777", cfg.getBind());
+    }
+
+    @Test
+    void customDataDir_isStored() {
+        var dir = Paths.get("/tmp/mydb");
+        var cfg = RiptideKVConfig.builder().dataDir(dir).build();
+        assertEquals(dir, cfg.getDataDir());
+    }
+
+    @Test
+    void customFlushKb_isStored() {
+        var cfg = RiptideKVConfig.builder().flushKb(4096).build();
+        assertEquals(4096, cfg.getFlushKb());
+    }
+
+    @Test
+    void walSyncFalse_isStored() {
+        var cfg = RiptideKVConfig.builder().walSync(false).build();
+        assertFalse(cfg.isWalSync());
+    }
+
+    // ── Validation ────────────────────────────────────────────────────────────
+
+    @Test
+    void flushKbZero_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().flushKb(0).build());
+    }
+
+    @Test
+    void flushKbNegative_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().flushKb(-1).build());
+    }
+
+    @Test
+    void nullDataDir_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().dataDir(null).build());
+    }
+
+    @Test
+    void blankBind_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().bind("").build());
+    }
+
+    @Test
+    void nullBind_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().bind(null).build());
+    }
+
+    @Test
+    void invalidBindNoColon_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().bind("localhost").build());
+    }
+
+    @Test
+    void invalidBindNonNumericPort_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().bind("127.0.0.1:abc").build());
+    }
+
+    @Test
+    void invalidBindPortOutOfRange_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> RiptideKVConfig.builder().bind("127.0.0.1:99999").build());
+    }
+
+    // ── Builder fluency ───────────────────────────────────────────────────────
+
+    @Test
+    void builder_isFullyFluent() {
+        var cfg = RiptideKVConfig.builder()
+                .bind("127.0.0.1:16379")
+                .dataDir(Paths.get("/tmp/test"))
+                .flushKb(512)
+                .walSync(false)
+                .build();
+
+        assertEquals("127.0.0.1:16379", cfg.getBind());
+        assertEquals(Paths.get("/tmp/test"), cfg.getDataDir());
+        assertEquals(512, cfg.getFlushKb());
+        assertFalse(cfg.isWalSync());
+        assertEquals(16379, cfg.getPort());
+    }
+}
+diff --git
a/java/src/test/java/io/riptidekv/RiptideKVServerTest.java b/java/src/test/java/io/riptidekv/RiptideKVServerTest.java
new file mode 100644
index 0000000..365e3d3
--- /dev/null
+++ b/java/src/test/java/io/riptidekv/RiptideKVServerTest.java
@@ -0,0 +1,209 @@
+package io.riptidekv;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests for {@link RiptideKVServer} lifecycle: start, stop, isRunning, error cases.
+ * Each test starts and stops its own server for full isolation.
+ */
+class RiptideKVServerTest {
+
+    // ── Helpers ───────────────────────────────────────────────────────────────
+
+    /** Maximum time, in milliseconds, to wait for a closed server to stop accepting connections. */
+    static final long PORT_RELEASE_TIMEOUT_MS = 2_000;
+
+    /**
+     * Returns a TCP port that was free at the time of the call.
+     * The probe socket is closed before returning, so the port is not reserved.
+     */
+    static int freePort() throws IOException {
+        try (var ss = new ServerSocket(0)) { return ss.getLocalPort(); }
+    }
+
+    /** Builds a loopback config on {@code port} with WAL sync disabled and starts a server on it. */
+    static RiptideKVServer startServer(int port, Path dataDir) throws IOException {
+        var cfg = RiptideKVConfig.builder()
+                .bind("127.0.0.1:" + port)
+                .dataDir(dataDir)
+                .walSync(false)
+                .build();
+        var server = new RiptideKVServer(cfg);
+        server.start();
+        return server;
+    }
+
+    /** Returns {@code true} if a TCP connection to 127.0.0.1:{@code port} can be opened. */
+    static boolean canConnect(int port) {
+        try (var ignored = new Socket("127.0.0.1", port)) {
+            return true;
+        } catch (IOException e) {
+            return false;
+        }
+    }
+
+    /**
+     * Polls {@link #canConnect(int)} until the port refuses connections or the timeout elapses.
+     * Replaces a fixed sleep so the tests neither wait longer than needed nor fail on a slow host.
+     *
+     * @return {@code true} if the port stopped accepting connections within the timeout
+     */
+    static boolean awaitPortClosed(int port) throws InterruptedException {
+        long deadline = System.nanoTime() + PORT_RELEASE_TIMEOUT_MS * 1_000_000L;
+        while (canConnect(port)) {
+            if (System.nanoTime() - deadline >= 0) {
+                return false;
+            }
+            Thread.sleep(20);
+        }
+        return true;
+    }
+
+    // ── Start ─────────────────────────────────────────────────────────────────
+
+    @Test
+    void start_serverAcceptsConnections(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var server = startServer(port, tmp)) {
+            assertTrue(server.isRunning());
+            assertTrue(canConnect(port), "Should be able to connect after start()");
+        }
+    }
+
+    @Test
+    void start_respondsToRespPing(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var server = startServer(port, tmp);
+             var c = new RespClient(port)) {
+            c.send("PING");
+            assertEquals("PONG", c.recvSimple());
+        }
+    }
+
+    @Test
+    void start_createsDataDirectoryIfAbsent(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        Path nested = tmp.resolve("a").resolve("b").resolve("c");
+        // nested does NOT exist yet
+        assertFalse(nested.toFile().exists());
+        try (var ignored = startServer(port, nested)) {
+            assertTrue(nested.toFile().exists(), "start() must create the data directory");
+        }
+    }
+
+    @Test
+    void start_createsSstSubdirectory(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var ignored = startServer(port, tmp)) {
+            assertTrue(tmp.resolve("sst").toFile().isDirectory(),
+                    "start() must create sst/ subdirectory");
+        }
+    }
+
+    // ── isRunning ─────────────────────────────────────────────────────────────
+
+    @Test
+    void isRunning_falseBeforeStart(@TempDir Path tmp) {
+        var cfg = RiptideKVConfig.builder()
+                .dataDir(tmp).walSync(false).build();
+        var server = new RiptideKVServer(cfg);
+        assertFalse(server.isRunning());
+    }
+
+    @Test
+    void isRunning_trueAfterStart(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var server = startServer(port, tmp)) {
+            assertTrue(server.isRunning());
+        }
+    }
+
+    @Test
+    void isRunning_falseAfterClose(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        var server = startServer(port, tmp);
+        try {
+            assertTrue(server.isRunning());
+        } finally {
+            // Close even when the assertion above fails, so no server outlives the test.
+            server.close();
+        }
+        assertFalse(server.isRunning());
+    }
+
+    // ── Stop / close ──────────────────────────────────────────────────────────
+
+    @Test
+    void close_releasesPort(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        var server = startServer(port, tmp);
+        try {
+            assertTrue(canConnect(port));
+        } finally {
+            server.close();
+        }
+        assertTrue(awaitPortClosed(port), "Port should be released after close()");
+    }
+
+    @Test
+    void close_isIdempotent(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        var server = startServer(port, tmp);
+        server.close();
+        assertDoesNotThrow(server::close, "Second close() should not throw");
+    }
+
+    @Test
+    void tryWithResources_closesAutomatically(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var ignored = startServer(port, tmp)) {
+            assertTrue(canConnect(port));
+        }
+        assertTrue(awaitPortClosed(port), "Server should stop at end of try-with-resources");
+    }
+
+    // ── Double start ──────────────────────────────────────────────────────────
+
+    @Test
+    void start_whenAlreadyRunning_throwsIllegalState(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var server = startServer(port, tmp)) {
+            assertThrows(IllegalStateException.class, server::start,
+                    "start() on a running server should throw IllegalStateException");
+        }
+    }
+
+    // ── getPort / getBind ─────────────────────────────────────────────────────
+
+    @Test
+    void getPort_matchesConfig(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var server = startServer(port, tmp)) {
+            assertEquals(port, server.getPort());
+        }
+    }
+
+    @Test
+    void getBind_matchesConfig(@TempDir Path tmp) throws Exception {
+        int port = freePort();
+        try (var server = startServer(port, tmp)) {
+            assertEquals("127.0.0.1:" + port, server.getBind());
+        }
+    }
+
+    // ── Null config guard ─────────────────────────────────────────────────────
+
+    @Test
+    void constructor_nullConfig_throwsIllegalArgument() {
+        assertThrows(IllegalArgumentException.class,
+                () -> new RiptideKVServer(null));
+    }
+}