diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2af6678
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,139 @@
+# Continuous integration: rustfmt, clippy, and the test suite on Ubuntu and
+# Windows. Triggered by every push, pull request, and manual dispatch.
+name: CI
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_BACKTRACE: "1"
+  # Promote every rustc warning to an error in all jobs.
+  RUSTFLAGS: "-D warnings"
+
+jobs:
+  fmt:
+    name: Rustfmt
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - name: Print runner info
+        run: |
+          echo "cores: $(nproc)"
+          echo "cpu model: $(lscpu | sed -n 's/^Model name:[[:space:]]*//p' | head -n 1)"
+          echo "ram: $(free -h | awk '/Mem:/ {print $2}')"
+      - run: rustup component add rustfmt
+      - run: cargo fmt -- --check
+
+  clippy:
+    name: Clippy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - name: Print runner info
+        run: |
+          echo "cores: $(nproc)"
+          echo "cpu model: $(lscpu | sed -n 's/^Model name:[[:space:]]*//p' | head -n 1)"
+          echo "ram: $(free -h | awk '/Mem:/ {print $2}')"
+      - uses: actions/cache@v5
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: clippy-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
+      - run: rustup component add clippy
+      - run: cargo clippy --all-targets -- -D warnings
+
+  test-ubuntu:
+    name: Test (Ubuntu, ${{ matrix.profile }})
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        profile: [debug, release]
+    steps:
+      - uses: actions/checkout@v6
+      - name: Print runner info
+        run: |
+          echo "cores: $(nproc)"
+          echo "cpu model: $(lscpu | sed -n 's/^Model name:[[:space:]]*//p' | head -n 1)"
+          echo "ram: $(free -h | awk '/Mem:/ {print $2}')"
+      - uses: actions/cache@v5
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: test-ubuntu-${{ matrix.profile }}-${{ hashFiles('**/Cargo.lock') }}
+      - name: Run tests
+        run: |
+          if [ "${{ matrix.profile }}" = "release" ]; then
+            cargo test --release -- --skip test_crash_recovery --test-threads=1
+          else
+            cargo test -- --skip test_crash_recovery --test-threads=1
+          fi
+      - name: Run examples
+        run: |
+          cargo run --example simple
+          cargo run --example multithreaded
+          cargo run --example atomics
+          cargo run --example lists
+          cargo run --example typed
+          cargo test --features whitebox-testing --test whitebox
+      - name: Run perf
+        if: matrix.profile == 'release'
+        run: cargo run --release --example perf
+      - name: Run crasher
+        if: matrix.profile == 'release'
+        run: cargo test --release --test crasher -- --nocapture
+
+  test-windows:
+    name: Test (Windows, ${{ matrix.profile }})
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        profile: [debug, release]
+    steps:
+      - uses: actions/checkout@v6
+      - name: Print runner info
+        shell: pwsh
+        run: |
+          $cpu = Get-CimInstance Win32_Processor | Select-Object -First 1
+          $computer = Get-CimInstance Win32_ComputerSystem
+          Write-Host "cores: $($cpu.NumberOfCores)"
+          Write-Host "cpu model: $($cpu.Name.Trim())"
+          Write-Host "ram: $([math]::Round($computer.TotalPhysicalMemory / 1GB, 2)) GB"
+      - uses: actions/cache@v5
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: test-windows-${{ matrix.profile }}-${{ hashFiles('**/Cargo.lock') }}
+      - name: Run tests
+        # Windows runners default to pwsh, hence the PowerShell syntax.
+        run: |
+          if ("${{ matrix.profile }}" -eq "release") {
+            cargo test --release -- --skip test_crash_recovery --test-threads=1
+          } else {
+            cargo test -- --skip test_crash_recovery --test-threads=1
+          }
+      - name: Run examples
+        # Explicit bash: it runs with `-e`, so the first failing command fails
+        # the step. The default pwsh shell only reports the last exit code.
+        shell: bash
+        run: |
+          cargo run --example simple
+          cargo run --example multithreaded
+          cargo run --example atomics
+          cargo run --example lists
+          cargo run --example typed
+          cargo test --features whitebox-testing --test whitebox
+      - name: Run perf
+        if: matrix.profile == 'release'
+        run: cargo run --release --example perf
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
deleted file mode 100644
index 7e98560..0000000
--- a/.github/workflows/ubuntu.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-name: Linux
-
-on:
-  push:
-    branches: [ "main" ]
-  pull_request:
-    branches: [ "main" ]
-
-env:
-  CARGO_TERM_COLOR: always
-
-jobs: - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Build - run: cargo build - - name: Run tests - run: cargo test --release -- --nocapture - - name: Run simple example - run: cargo run --example simple - - name: Run multithreaded example - run: cargo run --example multithreaded - - name: Run lists example - run: cargo run --example lists - - name: Run typed example - run: cargo run --example typed - - name: Run perftest - run: cd candy-perf; cargo run --release - - name: Run crasher - run: cd candy-crasher; cargo run --release - - name: Run longliving - run: cd candy-longliving; cargo run --release -- 10 40001 10000 - - name: Run mini-candy - run: cd mini-candy; cargo run - - name: Run test-list-collisions - run: cargo test -F whitebox_testing --test test_list_collisions -- --nocapture - - name: Run test-flush-agg - run: cargo test -F flush_aggregation --test test_flush_agg -- --nocapture diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml deleted file mode 100644 index 7e079e1..0000000 --- a/.github/workflows/windows.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Windows - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -env: - CARGO_TERM_COLOR: always - -jobs: - build: - - runs-on: windows-latest - - steps: - - uses: actions/checkout@v4 - - name: Build - run: cargo build - - name: Run simple example - run: cargo run --example simple diff --git a/.gitignore b/.gitignore index 95d8dd0..ea8c4bf 100644 --- a/.gitignore +++ b/.gitignore @@ -1,18 +1 @@ -# Generated by Cargo -# will have compiled files and executables -debug/ -target/ -dbdir/ - -# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries -# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html -#Cargo.lock - -# These are backup files generated by rustfmt -**/*.rs.bk - -# MSVC Windows builds of rustc generate these, which store debugging information -*.pdb 
- -# Jetbrains files -.idea +/target diff --git a/Cargo.lock b/Cargo.lock index b62552e..84e8834 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,81 +4,79 @@ version = 4 [[package]] name = "anyhow" -version = "1.0.100" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] -name = "bitflags" -version = "2.10.0" +name = "atomic-polyfill" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" +dependencies = [ + "critical-section", +] [[package]] -name = "bumpalo" -version = "3.19.0" +name = "autocfg" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] -name = "bytemuck" -version = "1.24.0" +name = "bit-set" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" dependencies = [ - "bytemuck_derive", + "bit-vec", ] [[package]] -name = "bytemuck_derive" -version = "1.10.2" +name = "bit-vec" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] -name = "candy-crasher" -version = "0.1.0" -dependencies = [ - "candystore", - "libc", - "rand 0.8.5", -] +name = 
"bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] -name = "candy-longliving" -version = "0.1.0" -dependencies = [ - "candystore", -] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] -name = "candy-perf" -version = "0.1.0" -dependencies = [ - "candystore", -] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "candystore" -version = "0.5.6" +version = "1.0.0" dependencies = [ - "anyhow", - "bytemuck", - "crossbeam-channel", - "databuf", + "crc16-ibm3740-fast", "fslock", "libc", - "memmap", + "memmap2", + "num_cpus", "parking_lot", - "rand 0.9.2", + "postcard", + "proptest", + "rand 0.10.0", + "serde", "simd-itertools", "siphasher", + "smallvec", + "tempfile", + "thiserror", "uuid", + "zerocopy", ] [[package]] @@ -88,48 +86,108 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] -name = "crossbeam-channel" -version = "0.5.15" +name = "chacha20" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ - "crossbeam-utils", + "cfg-if", + "cpufeatures", + "rand_core 0.10.0", ] [[package]] -name = "crossbeam-utils" -version = "0.8.21" +name = "cobs" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +checksum = 
"0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" +dependencies = [ + "thiserror", +] [[package]] -name = "databuf" -version = "0.5.0" +name = "core_detect" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8f80099a98041a3d1622845c271458a2d73e688351bf3cb999266764b81d48" + +[[package]] +name = "cpufeatures" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1ad1d99bee317a8dac0b7cd86896c5a5f24307009292985dabbf3e412c8b9d" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" dependencies = [ - "databuf-derive", + "libc", ] [[package]] -name = "databuf-derive" +name = "crc-fast-gen" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04040c9fc8fcb4084222a26c99faf5b3014772a6115e076b7a50fe49bf25d0ea" +checksum = "8d4e7ca1a78a554d1675e8489415c76c5fd804686a7b6902ed8ce55ab498364d" + +[[package]] +name = "crc16-ibm3740-fast" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bd5030bcadf3aa65886c31c60bb36fab8db9eae235ff081acb64ea962aa5d6" dependencies = [ - "databuf_derive_impl", + "core_detect", + "crc-fast-gen", ] [[package]] -name = "databuf_derive_impl" -version = "0.2.3" +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + +[[package]] +name = "embedded-io" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf656eb071fe87d23716f933788a35a8ad6baa6fdbf66a67a261dbd3f9dc81a" +checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" + +[[package]] +name = "embedded-io" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" + 
+[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ - "quote2", - "syn", + "libc", + "windows-sys", ] +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "fslock" version = "0.2.1" @@ -142,42 +200,131 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "wasi", + "r-efi 5.3.0", + "wasip2", ] [[package]] name = "getrandom" -version = "0.3.4" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", + "rand_core 0.10.0", "wasip2", + "wasip3", +] + +[[package]] +name = "hash32" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heapless" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version", + "serde", + "spin", + "stable_deref_trait", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + 
[[package]] name = "js-sys" -version = "0.3.82" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "libc" -version = "0.2.177" +version = "0.2.183" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "lock_api" @@ -189,21 +336,24 @@ dependencies = [ ] [[package]] -name = "memmap" -version = "0.7.0" +name = "log" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi", -] +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] -name = "mini-candy" -version = "0.1.0" +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" dependencies = [ - "memmap", - "siphasher", + "libc", ] [[package]] @@ -228,11 +378,30 @@ dependencies = [ 
"target-features", ] +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "parking_lot" @@ -257,6 +426,19 @@ dependencies = [ "windows-link", ] +[[package]] +name = "postcard" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" +dependencies = [ + "cobs", + "embedded-io 0.4.0", + "embedded-io 0.6.1", + "heapless", + "serde", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -267,39 +449,57 @@ dependencies = [ ] [[package]] -name = "proc-macro2" -version = "1.0.103" +name = "prettyplease" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ - "unicode-ident", + "proc-macro2", + "syn", ] [[package]] -name = "quote" -version = "1.0.42" +name = "proc-macro2" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ - 
"proc-macro2", + "unicode-ident", ] [[package]] -name = "quote2" -version = "0.7.0" +name = "proptest" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "970573b86f7e5795c8c6c50c56ef602368593f0687188da27fd489a59e253630" +checksum = "37566cb3fdacef14c0737f9546df7cfeadbfbc9fef10991038bf5015d0c80532" dependencies = [ - "proc-macro2", - "quote", - "quote2-macros", + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand 0.9.2", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", ] [[package]] -name = "quote2-macros" -version = "0.7.0" +name = "quick-error" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4b89c37b2d870a28629ad20da669bb0e7d7214878d0d5111b304aa466e1977" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] [[package]] name = "r-efi" @@ -308,15 +508,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] -name = "rand" -version = "0.8.5" +name = "r-efi" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" @@ -324,18 +519,19 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "rand_chacha 0.9.0", - "rand_core 0.9.3", + "rand_chacha", + 
"rand_core 0.9.5", ] [[package]] -name = "rand_chacha" -version = "0.3.1" +name = "rand" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.0", ] [[package]] @@ -345,25 +541,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.3", + "rand_core 0.9.5", ] [[package]] name = "rand_core" -version = "0.6.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.3.4", ] [[package]] name = "rand_core" -version = "0.9.3" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" dependencies = [ - "getrandom 0.3.4", + "rand_core 0.9.5", ] [[package]] @@ -375,12 +577,52 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -388,26 +630,68 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "simd-itertools" -version = "0.3.0" +name = "semver" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a037ed5ba0cb7102a5b720453b642c5b2cf39960edd2ceace91af8ec3743082a" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ - "multiversion", + "serde_core", + "serde_derive", ] [[package]] -name = "simulator" -version = "0.1.0" +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ - "rand 0.8.5", + "serde_derive", +] + +[[package]] +name = 
"serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "simd-itertools" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a037ed5ba0cb7102a5b720453b642c5b2cf39960edd2ceace91af8ec3743082a" +dependencies = [ + "multiversion", ] [[package]] name = "siphasher" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "smallvec" @@ -415,11 +699,26 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + [[package]] name = "syn" -version = "2.0.111" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" 
dependencies = [ "proc-macro2", "quote", @@ -432,42 +731,99 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" -version = "1.0.22" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" [[package]] name = "uuid" -version = "1.18.1" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +checksum = 
"a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" +name = "wait-timeout" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.105" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", @@ -478,9 +834,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.105" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -488,9 +844,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.105" +version = "0.2.114" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ "bumpalo", "proc-macro2", @@ -501,13 +857,47 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.105" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "winapi" version = "0.3.9" @@ -536,28 +926,125 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "wit-bindgen" -version = "0.46.0" +version = 
"0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] 
[[package]] name = "zerocopy" -version = "0.8.30" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea879c944afe8a2b25fef16bb4ba234f47c694565e97383b36f3a878219065c" +checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.30" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf955aa904d6040f70dc8e9384444cb1030aed272ba3cb09bbc4ab9e7c1f34f5" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" dependencies = [ "proc-macro2", "quote", "syn", ] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 15241b8..7c6cc01 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,29 +1,32 @@ [package] name = "candystore" -version = "0.5.6" -edition = "2021" +version = "1.0.0" +edition = "2024" license = "Apache-2.0" keywords = ["key-value", "database", "persistent", "store", "rocksdb"] -description = "A lean, efficient and fast peristent in-process key-value store" +description = "A lean, efficient and fast persistent in-process key-value store" repository = "https://github.com/sweet-security/candystore" [dependencies] -bytemuck = { version = "1.24.0", features = ["derive"] } -databuf = "0.5.0" -memmap = "0.7.0" -siphasher = "1.0.1" -anyhow = "1.0.100" -parking_lot = "0.12.5" -uuid = "1.10.0" -rand = "0.9.2" +crc16-ibm3740-fast = "0.5.0" fslock = "0.2.1" -libc = "0.2.158" -crossbeam-channel = "0.5.15" +libc = "0.2.183" +memmap2 = "0.9.10" +num_cpus = "1.17.0" +parking_lot = "0.12.5" +postcard = { version = "1.1.3", features = ["use-std"] } +serde = { version = "1", features = ["derive"] } simd-itertools = "0.3.0" +siphasher = "1.0.2" +smallvec = { version = "1.15.1", features 
= ["write"] } +thiserror = "2.0.18" +uuid = "1.18.1" +zerocopy = { version = "0.8.47", features = ["derive"] } -[features] -whitebox_testing = [] -flush_aggregation = [] +[dev-dependencies] +proptest = "1.10.0" +tempfile = "3" +rand = "0.10.0" -[workspace] -members = ["simulator", "candy-crasher", "candy-longliving", "candy-perf", "mini-candy"] +[features] +whitebox-testing = [] diff --git a/README.md b/README.md index 4d7c365..70d9400 100644 --- a/README.md +++ b/README.md @@ -1,150 +1,181 @@ -
- -
v0.5.6 fixes some potential concurrency issues and panics on missing shards after a crash
-
🪟 v0.5.5 now supports Windows (experimental)! 🎉
-
+
+ +> [!NOTE] +> 😸 v1.0 brings true crash-consistency, improved compaction and an overall simpler design. +> v1.0 marks the data-file format as stable. +> +> The append-only data files are the compatibility boundary. The index format +> may still evolve, and when the data-file format is recognized but the index +> format is outdated, Candy recreates the index on open by default. +> +> Pre-v1.0 stores are not covered by this compatibility promise. # CandyStore -A pure rust implementation of a fast (*blazingly* :tm:, of course), persistent, in-process key-value store, that relies -on a novel sharding algorithm. Just how blazingly? It's over 9000! + +A pure Rust implementation of a fast (*blazingly* :tm:, of course), persistent, in-process +key-value store that relies on a hash-based sharding algorithm. All operations — lookup, +insert, and removal — are O(1). | Operation | Time* | |-----------|--------| | Lookup | < 1us | -| Insert | < 2us | -| Removal | < 1us | +| Insert | < 1us | +| Update | < 2us | +| Removal | < 2us | + +On my laptop (32 core AMD RYZEN AI MAX+ 395 with 64GB RAM, running Ubuntu 25.10 kernel `6.17.0-19-generic`) I'm getting + +```bash +$ cargo run --release --example perf + +Testing key-value using 1 threads, each with 1000000 items (key size: 16, value size: 16) + Inserts: 0.514698 us/op + Updates: 0.608783 us/op + Positive Lookups: 0.308571 us/op + Negative Lookups: 0.047365 us/op + Iter all: 0.360074 us/op + Removes: 0.605519 us/op +``` -The algorithm can be thought of as a "zero-overhead" extension to a hash table stored over files, -as it's designed to minimizes IO operations. See [the benchmark](candy-perf/README.md) and -[how to interpret the results*](#how-to-interpret-the-performance-results). +See [how to interpret the results\*](#how-to-interpret-the-performance-results). -## Overview -Being a hash-table, the key is hashed, producing a 64 bit number. 
The 16 most significant bits select -the *shard*, followed by 16 bits selecting the *row* in the shard, and the remaining 32 bits serve as an -opaque signature. The signature is matched against the signature array within the selected row. -The row also stores the file offset of the entry, which is used to retrive the entry's key and value. +## APIs -![](diagram.png) +Candy offers +* A simple key-value API (`get`, `set`, `remove` and atomic operations like `replace`) +* A typed interface on top (`get`, `set`, etc.) +* Double-ended queues (`push_to_queue_tail`, `pop_queue_head`, etc.) as well as a typed interface on top of them +* Lists (`get_from_list`, `set_in_list`, etc.) as well as a typed interface on top of them +* The DB is completely thread-safe in idiomatic Rust (just `Arc<>` it) -Each shard is mapped to a shard file, and a shard file can cover a wide range of consecutive shards. -We begin with a single shard file covering the whole shard span of `[0-65536]`. +```rust +use candystore::{CandyStore, Config, Result}; -When a shard file gets too big, or when one of its rows gets full, it undergoes a *split*. -This operation takes all entries and splits them into a bottom half and a top half (of roughly -equal sizes). For instance, if the file covered shards `[0-65536)`, after the split we have two files, -one covering `[0-32768)` and the other covering `[32768-65536)`. This process repeats as needed, -and essentially builds a tree of shard files. Each file is split independently, and the amount of work -is constant (unlike LSM trees). 
+fn main() -> Result<()> { + let db = CandyStore::open("/path/to/db", Config::default())?; -``` - [0-65536) - / \ - / \ - [0-32768) [32768-65536) - / \ - / \ - [0-16384) [16384-32768) + db.set("hello", "world")?; + let val = db.get("hello")?; + assert_eq!(val, Some(b"world".to_vec())); + db.remove("hello")?; + + db.set_in_list("cities", "Barcelona", "Spain")?; + db.set_in_list("cities", "Chicago", "USA")?; + db.set_in_list("cities", "Caracas", "Venezuela")?; + + let cities: Vec<String> = db.iter_list("cities") + .map(|res| String::from_utf8(res.unwrap().0).unwrap()) + .collect(); + + assert_eq!(cities, vec!["Barcelona", "Chicago", "Caracas"]); + + Ok(()) +} ``` -The shard file's header (the rows, signatures and file offsets) are kept in an `mmap`, and the rest -of the file's data is accessed using `pread` and `pwrite`. The file is only ever extended (until either -a split or *compaction* takes place), so the algorithm is *crash safe*, in the sense that it will always -return some valid version of a key-value pair, although it might lose unflushed data. +## Algorithm -The library puts its faith in the kernel's page cache, and assumes the `mmap` and writes are flushed to -disk every so often. This allows us to forgo a journal or write-ahead log (WAL). +The algorithm can be thought of as a "zero-overhead extension" to a hash table stored over +files, designed to minimize IO operations. It does not employ a WAL or a journal, and instead +uses append-only files that serve both as a source of truth and as the final data structure. +Unlike LSM-based stores that need to maintain large SSTables in memory, sort them and later +merge them, Candy uses a small mmap'ed index that points to on-disk data directly. -The default parameters (chosen by simulations) are of shards with 64 rows, each with 512 entries.
The chances -of collisions with these parameters are minimal, and they allow for ~90% utilization of the shard, while -requiring relatively small header tables (32K entries, taking up 384KB). With the expected 90% utilization, -you should be expect to hold 29.5K keys per shard. For a shard file of 64MB, that's 0.6% overhead. +![Algorithm](algo.png) -Because the data structure is a hash table rather than a search tree, insertion, lookup and removal are -all O(1) operations. +The core of the algorithm is the concept of *hash coordinates*: breaking up a 64-bit hash +into two 32-bit values, a *row selector* and a *signature*, which can be thought of +as coordinates into the rows table. First, the row selector is used to locate the relevant +row in the table, and the signature locates the column. To find the row, we take the row +selector's bits and mask them with a *split level mask*, essentially, the number of rows +in the table. -The concept can be extended to a distributed database, by adding a layer of master-shards that select a -server, followed by the normal sharding mechanism described above. +To locate the signature (32 bits) within the row, we employ a parallel lookup (using SIMD) +to find the matching column(s). Then we fetch the corresponding pointers for the matched +columns, from which we extract another 18 bits of entropy. If both match, we fetch the entry +from the relevant file (the pointer stores a file index and a file offset). -## Example -```rust -use candystore::{CandyStore, Config, Result}; +Note: the chances of a collision (meaning we fetch a wrong entry from the file) are +virtually zero, about 1 in 20 billion according to the birthday paradox (a collision in 336 +uniformly-distributed 50-bit numbers). -fn main() -> Result<()> { - let db = CandyStore::open("/tmp/candy-dir", Config::default())?; +Candy supports up to 4096 files, each up to 1GB in size (a span of 4TB).
In terms of key-space, +Candy allows 2^21 rows, each with 336 keys, so a total of 704M keys. The maximum size of a key +is 16KB and the maximum size of a value is 64KB. Of course, these are theoretical limits; +it would be wise to halve them in practice due to imbalances. - // simple API +### Splitting - db.set("mykey", "myval")?; - assert_eq!(db.get("mykey")?, Some("myval".into())); +What happens when a row reaches its limit of 336 keys? We need to split it, of course. +To do that, we increase the row's split level by one, which means we take an extra bit +into account when selecting the row. For example, row 2 (0b010) will be split into +rows 2 (0b0010) and 10 (0b1010). Because the bits are uniformly distributed, we expect about +half of the entries to move from row 2 to row 10. - assert_eq!(db.get("yourkey")?, None); +![Split](split.png) - assert_eq!(db.iter().count(), 1); +Note that splitting may require increasing the global split level (the size of the table), +which doubles the mmap's size. This may sound like a costly operation, but since +it's file-backed it's mostly only page-table work, and it's amortized. And since we only +split a single row -- we do not need to rehash the whole table -- the amount of work we +do is O(1). - for res in db.iter() { - let (k, v) = res?; - assert_eq!(k, Vec::::from("mykey")); - assert_eq!(v, Vec::::from("myval")); - } +Another optimization is that the pointer contains 18 bits of the row selector, which means +we do not need to read and recompute the hash coordinates of the keys. Splitting is thus +memory-bound. - assert_eq!(db.iter().count(), 1); +### Compaction - // lists API +Data is always written (appended) to the *active data file*. When a file reaches a certain size, +Candy rotates the active file and creates a new one. The old file becomes an *immutable data +file*.
- db.set_in_list("mylist", "key1", "123")?; - db.set_in_list("mylist", "key2", "456")?; - assert_eq!(db.get_from_list("mylist", "key1")?, Some("123".into())); +As data is created, updated and removed, the store accumulates waste. To handle it we have +*background compaction*: a thread that iterates over the rows table, finds all entries that +belong to files that should be compacted and rewrites them to the active file. After such +a pass, it simply deletes the old immutable file since no entry points to it. - assert_eq!(db.iter_list("mylist").count(), 2); +You can configure the throughput (bytes per second) of compaction. - for res in db.iter_list("mylist") { - let (k, v) = res?; - println!("{k:?} => {v:?}"); - } - Ok(()) -} -``` +### Checkpointing & Rebuild + +We trust the operating system to flush the data files and mmap'ed rows table to storage, +which means that even if your process crashes, your data will be fully consistent. However, +this is not true on a power failure or a kernel panic — in which case the state of the +index file is unknown relative to the data files. + +To handle this gracefully, Candy employs **background checkpointing**. Instead of synchronously `fsync`ing index and data files on every write (which would block the writer), a background worker asynchronously persists a consistent snapshot of the current state at user-defined intervals or after a configured amount of bytes have been written. + +On an unexpected crash or an unclean shutdown, Candy features an efficient rebuild mechanism. It resumes from the latest successful checkpoint and rapidly replays only the recent mutating operations, restoring the full, robust state from the append-only data files. + +Starting with v1.0, those append-only data files are also the on-disk compatibility contract. 
By default (`Config::port_to_current_format = true`), Candy uses that same rebuild path when it encounters an outdated index-file version alongside data files whose format is still recognized. In that case it recreates only the `index` and `rows` files and rebuilds them from the append-only data files. + +This does not make arbitrary older releases compatible. The v1.0 compatibility promise applies to stores written with the stable v1.x data-file format; if the data-file format itself is not recognized, open still fails. ## Design Goals -* Fast and efficient, with a very low memory footprint (~0.6% overhead) -* No heavy/unbounded merges -* No Write-Ahead Log (WAL) or journalling of any kind -* Process crash safe: you may lose the latest operations, but never be in an inconsistent state - if the process crashes. However, if the machine itself crashes, the data on disk may be in an - inconsistent state. -* Splitting/compaction happens per-shard, so there's no global locking -* Suitable for both write-heavy and read-heavy workloads -* Concurrent by design (multiple threads getting/setting/removing keys at the same time) -* The backing store is taken to be an SSD, thus it's not optimized for HDDs - -## Notes -* The file format is not yet stable -* Uses very little `unsafe` (required due to `mmap`) - -## Roadmap -* Distributed protocol based on file locks (meant to run on a shared network folder) -* Add generations as an adapter on top, so that older generations are compacted into exponentially larger - time spans. It's an alternative to TTL, and amortizes the number of times an entry will move around as the - dataset grows. -* Maybe add Arithmethic coding/Huffman coding as a cheap compression for the keys and values -## How to Interpret the Performance Results -While the numbers above are incredible, it is obvious that any file-backed store will be limited by the -filesystem's latency and bandwidth. 
For example, you can expect a read latency of 20-100us from SSDs (NVMe), -so that's the lower bound on reading a random location in the file. +Unlike many key-value stores, Candy serves the purpose of *reducing* the memory footprint of your +process, e.g., offloading data to the disk instead of keeping it in-memory. It intentionally does not +include any caching/LRU layer like many traditional KVs/DBs. + +Example use cases for Candy are +* A hash table that needs to hold more keys than you can hold in memory +* Persistent work queues (e.g., producers append large work items to a queue and consumers then fetch + and perform them) +* A caching layer for your application logic -What the numbers above measure is the performance of the *algorithm*, not the *storage*: given you can spare an -overhead of 0.6% mapped into memory, lookup/insert/removal require a single disk IO. Replacing (updating) an -existing element requires two IOs, since it needs to compare the key before writing it anew. -These IOs may return from the kernel's page cache, in which case it's practically immediate, or from disk, -in which case you can expect it to take 1-2 round-trip times of your device. +It is designed to be durable for process crashes (where the kernel will flush everything properly) +but it does not attempt to optimize for durability under kernel panics (full rebuild). -Inserting to/removing from a lists require 2-3 IOs, since these operations need to update the list's -head or tail, as well as a "chain" element. Such operations should really be done with a "large enough page cache". -Updating/fetching an existing element element in a list is a single IO as above. +## How to Interpret the Performance Results -If your memory is too constrainted for keeping the lookup tables mapped-in (i.e., they get evicted to disk), -you'll incur one more unit of "IO latency" for fetching the row from the table. Since the row spans 2KB (and -aligned to 4KB), it should behave nicely with 4K IOs. 
+While the numbers shown above are incredible, it is obvious that any persistent store will be +limited by the underlying latency and bandwidth of the storage. For example, you can expect a +read latency of 20-100us from SSDs (NVMe), so that's the lower bound on reading a random +location in the file. -See also [this guide to LTO/PGO](https://github.com/sweet-security/candystore/issues/7) by Alexander Zaitsev. +What the numbers above measure is the performance of the *algorithm*, not the *storage layer*: +given the index can be kept mapped into memory (12 bytes per item), lookup and insert require +a single disk IO, while updating or removing requires two IOs. These IOs may be served +from the kernel's page cache, in which case you only pay for the syscall's latency, or +from disk, in which case you can expect it to take 1-2 round-trip times of your device. diff --git a/algo.png b/algo.png new file mode 100644 index 0000000..c374baa Binary files /dev/null and b/algo.png differ diff --git a/candy-crasher/.gitignore b/candy-crasher/.gitignore deleted file mode 100644 index 9a59ec5..0000000 --- a/candy-crasher/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dbdir diff --git a/candy-crasher/Cargo.toml b/candy-crasher/Cargo.toml deleted file mode 100644 index 71cb49b..0000000 --- a/candy-crasher/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "candy-crasher" -version = "0.1.0" -edition = "2021" - -[dependencies] -libc = "0.2.155" -rand = "0.8.5" -candystore={path=".."} diff --git a/candy-crasher/README.md b/candy-crasher/README.md deleted file mode 100644 index 72d62c4..0000000 --- a/candy-crasher/README.md +++ /dev/null @@ -1,31 +0,0 @@ -## VickiStore Crasher - -Fork a child process to insert 1M keys into the DB, while the parent kills it repeatedly. The test -makes sure the child is able to make progress as well as making sure the DB remains consistent. 
- -Note: the store is not meant to be used by multiple processes concurrently -- it uses thread syncrhonization, -not inter-process synchronization. The test uses the store only from a single process at a time. - - -``` -$ cargo run -child starting at 0 -[0] killing child -child starting at 20445 -[1] killing child -child starting at 31656 -[2] killing child -child starting at 55500 -. -. -. -child starting at 978418 -[219] killing child -child starting at 982138 -[220] killing child -child starting at 991255 -child finished -child finished in 221 iterations -Parent starts validating the DB... -DB validated successfully -``` diff --git a/candy-longliving/.gitignore b/candy-longliving/.gitignore deleted file mode 100644 index ca63e2e..0000000 --- a/candy-longliving/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -dbdir - diff --git a/candy-longliving/Cargo.toml b/candy-longliving/Cargo.toml deleted file mode 100644 index 347d19d..0000000 --- a/candy-longliving/Cargo.toml +++ /dev/null @@ -1,7 +0,0 @@ -[package] -name = "candy-longliving" -version = "0.1.0" -edition = "2021" - -[dependencies] -candystore={path=".."} diff --git a/candy-longliving/src/main.rs b/candy-longliving/src/main.rs deleted file mode 100644 index 4527053..0000000 --- a/candy-longliving/src/main.rs +++ /dev/null @@ -1,69 +0,0 @@ -use std::{ - sync::{atomic::AtomicU64, Arc}, - time::Instant, -}; - -use candystore::{CandyStore, CandyTypedList, Config, Result}; - -fn main() -> Result<()> { - let args = std::env::args().collect::>(); - assert!( - args.len() == 4, - "usage: {} ", - args[0] - ); - let num_threads: usize = args[1].parse().expect("num_threads not a number"); - let num_iters: usize = args[2].parse().expect("num_iters not a number"); - let tail_length: usize = args[3].parse().expect("tail_length not a number"); - - let db = Arc::new(CandyStore::open( - "dbdir", - Config { - min_compaction_threashold: 1024 * 1024, - ..Default::default() - }, - )?); - db.clear()?; - - let mut handles = vec![]; - - 
let ops = Arc::new(AtomicU64::new(0)); - - for thd in 0..num_threads { - let db = db.clone(); - let ops = ops.clone(); - let h = std::thread::spawn(move || { - println!("started thread {thd}"); - let typed = CandyTypedList::::new(db.clone()); - let listname = format!("mylist"); //format!("mylist{thd}"); - let mut t0 = Instant::now(); - for i in 0..num_iters { - if i % 10000 == 0 { - let t1 = Instant::now(); - println!( - "thread {thd} at {i} {} rate={}us", - db.stats(), - t1.duration_since(t0).as_micros() / 10_000, - ); - t0 = t1; - } - - typed.set(&listname, &(thd * num_iters + i), "xxx")?; - ops.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - if i >= tail_length { - typed.remove(&listname, &(thd * num_iters + i - tail_length))?; - ops.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - } - } - Result::<()>::Ok(()) - }); - handles.push(h); - } - - for h in handles { - h.join().unwrap()?; - } - println!("ops={}", ops.load(std::sync::atomic::Ordering::Relaxed)); - - Ok(()) -} diff --git a/candy-perf/.gitignore b/candy-perf/.gitignore deleted file mode 100644 index 9a59ec5..0000000 --- a/candy-perf/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dbdir diff --git a/candy-perf/Cargo.toml b/candy-perf/Cargo.toml deleted file mode 100644 index 0490d50..0000000 --- a/candy-perf/Cargo.toml +++ /dev/null @@ -1,7 +0,0 @@ -[package] -name = "candy-perf" -version = "0.1.0" -edition = "2021" - -[dependencies] -candystore={path=".."} diff --git a/candy-perf/README.md b/candy-perf/README.md deleted file mode 100644 index 1f8bc69..0000000 --- a/candy-perf/README.md +++ /dev/null @@ -1,75 +0,0 @@ -Performance results from my machine - -* Ubuntu 24.04 LTS -* Lenovo ThinkPad X1 Carbon Gen 10 (12th Gen Intel® Core™ i7-1260P × 16) -* RAM: 32.0 GiB -* SSD: 512 GB -* Built with `cargo build --release` -* Running on a local filesystem - -### Smallish entries (4 byte keys, 3 byte values) -``` -1000000 small entries with pre-split - Small entries insert: 1.347us - Small entries get 100% 
existing: 0.477us - Small entries get 50% existing: 0.474us - Small entries removal: 0.493us - Small entries mixed: 1.822us - -1000000 small entries without pre-split - Small entries insert: 4.151us - Small entries get 100% existing: 0.517us - Small entries get 50% existing: 0.515us - Small entries removal: 0.535us - Small entries mixed: 4.633us -``` - -### Largish entries (100 byte keys, 300 byte values) -``` -500000 large entries with pre-split - Large entries insert: 1.624us - Large entries get 100% existing: 0.618us - Large entries removal: 0.128us - -500000 large entries without pre-split - Large entries insert: 5.422us - Large entries get 100% existing: 0.731us - Large entries removal: 0.139us -``` - -### Lists -``` -10 collections with 100000 items in each - Inserts: 8.104us - Updates: 2.593us - Gets: 0.612us - Iterations: 0.556us - Removal of 50% items: 7.945us - Discards: 0.972us -``` - -### Threads without contention (different keys) -``` -No-contention: 10 threads accessing 100000 different keys - with pre-split - Inserts: 3.238us - Gets: 1.004us - Removals: 0.929us - -No-contention: 10 threads accessing 100000 different keys - without pre-split - Inserts: 19.497us - Gets: 1.119us - Removals: 1.001us -``` - -### Threads with contention (same keys) -``` -Contention: 10 threads accessing 1000000 same keys - with pre-split - Inserts: 4.556us - Gets: 1.204us - Removals: 1.334us - -Contention: 10 threads accessing 1000000 same keys - without pre-split - Inserts: 12.167us - Gets: 2.195us - Removals: 2.257us -``` diff --git a/candy-perf/src/main.rs b/candy-perf/src/main.rs deleted file mode 100644 index 6712f4d..0000000 --- a/candy-perf/src/main.rs +++ /dev/null @@ -1,431 +0,0 @@ -use candystore::{CandyStore, Config, Result}; -use std::{ - hint::black_box, - sync::{atomic::AtomicU64, Arc}, - time::Instant, -}; - -fn run2(msg: &str, iters: u32, mut func: impl FnMut() -> Result<()>) -> Result<()> { - let t0 = Instant::now(); - func()?; - let t1 = Instant::now(); 
- println!( - "{msg}: {:.3}us", - ((t1.duration_since(t0).as_nanos() as f64) / 1000.0) / (iters as f64) - ); - Ok(()) -} - -fn run(msg: &str, iters: u32, mut func: impl FnMut(u32) -> Result<()>) -> Result<()> { - run2(msg, iters, || { - for i in 0u32..iters { - func(i)?; - } - Ok(()) - }) -} - -fn test_small_keys(num_keys: u32) -> Result<()> { - for pre_split in [true, false] { - let db = CandyStore::open( - "./dbdir", - Config { - expected_number_of_keys: if pre_split { num_keys as usize } else { 0 }, - ..Default::default() - }, - )?; - - db.clear()?; - - if pre_split { - println!("{num_keys} small entries with pre-split"); - } else { - println!("{num_keys} small entries without pre-split"); - } - - run(" Small entries insert", num_keys, |i| { - db.set(&(i * 2).to_le_bytes(), "xxx")?; - Ok(()) - })?; - - run(" Small entries get 100% existing", num_keys, |i| { - let val = db.get(&(i * 2).to_le_bytes())?; - black_box(val.unwrap()); - Ok(()) - })?; - - run(" Small entries get 50% existing", num_keys, |i| { - let val = db.get(&(i * 2).to_le_bytes())?; - black_box(val.unwrap()); - Ok(()) - })?; - - run(" Small entries removal", num_keys, |i| { - let val = db.remove(&(i * 2).to_le_bytes())?; - black_box(val.unwrap()); - Ok(()) - })?; - - db.clear()?; - - run(" Small entries mixed", num_keys, |i| { - db.set(&(i * 2).to_le_bytes(), "xxx")?; - let val = db.get(&(i / 2).to_le_bytes())?; - black_box(val); - if i % 8 == 7 { - db.remove(&(i / 2).to_le_bytes())?; - } - Ok(()) - })?; - - println!(); - } - - Ok(()) -} - -fn test_large_keys(num_keys: u32) -> Result<()> { - for pre_split in [true, false] { - let db = CandyStore::open( - "./dbdir", - Config { - expected_number_of_keys: if pre_split { num_keys as usize } else { 0 }, - ..Default::default() - }, - )?; - - db.clear()?; - - if pre_split { - println!("{num_keys} large entries with pre-split"); - } else { - println!("{num_keys} large entries without pre-split"); - } - - run(" Large entries insert", num_keys, |i| { - let 
mut key = [99u8; 100]; - key[0..4].copy_from_slice(&i.to_le_bytes()); - let val = [7u8; 300]; - db.set(&key, &val)?; - Ok(()) - })?; - - run(" Large entries get 100% existing", num_keys, |i| { - let mut key = [99u8; 100]; - key[0..4].copy_from_slice(&i.to_le_bytes()); - let val = db.get(&key)?; - black_box(val); - Ok(()) - })?; - - run(" Large entries removal", num_keys, |i| { - let mut key = [99u8; 100]; - key[0..4].copy_from_slice(&i.to_le_bytes()); - let val = db.remove(&(i * 2).to_le_bytes())?; - black_box(val); - Ok(()) - })?; - - println!(); - } - - Ok(()) -} - -fn test_lists(num_lists: u32, num_items_per_list: u32) -> Result<()> { - let db = CandyStore::open( - "./dbdir", - Config { - expected_number_of_keys: (num_lists * num_items_per_list) as usize, - ..Default::default() - }, - )?; - - println!("{num_lists} lists with {num_items_per_list} items in each"); - run2(" Inserts", num_lists * num_items_per_list, || { - for list in 0..num_lists { - for item in 0..num_items_per_list { - db.set_in_list(&list.to_le_bytes(), &item.to_le_bytes(), "xxx")?; - } - } - Ok(()) - })?; - - run2(" Updates", num_lists * num_items_per_list, || { - for list in 0..num_lists { - for item in 0..num_items_per_list { - db.set_in_list(&list.to_le_bytes(), &item.to_le_bytes(), "yyy")?; - } - } - Ok(()) - })?; - - run2(" Gets", num_lists * num_items_per_list, || { - for list in 0..num_lists { - for item in 0..num_items_per_list { - let val = db.get_from_list(&list.to_le_bytes(), &item.to_le_bytes())?; - black_box(val); - } - } - Ok(()) - })?; - - run2(" Iterations", num_lists * num_items_per_list, || { - for list in 0..num_lists { - let count = db.iter_list(&list.to_le_bytes()).count(); - black_box(count); - debug_assert_eq!(count, num_items_per_list as usize); - } - Ok(()) - })?; - - run2( - " Removal of 50% items", - num_lists * num_items_per_list / 2, - || { - for list in 0..num_lists { - for item in 0..num_items_per_list { - if item % 2 == 0 { - let val = 
db.remove_from_list(&list.to_le_bytes(), &item.to_le_bytes())?; - black_box(val.unwrap()); - } - } - } - Ok(()) - }, - )?; - - run2(" Discards", num_lists * num_items_per_list / 2, || { - for list in 0..num_lists { - db.discard_list(&list.to_le_bytes())?; - } - Ok(()) - })?; - - println!(); - - Ok(()) -} - -fn test_concurrency_without_contention(num_threads: u32, num_keys: u32) -> Result<()> { - for pre_split in [true, false] { - let db = Arc::new(CandyStore::open( - "./dbdir", - Config { - expected_number_of_keys: if pre_split { - (num_threads * num_keys) as usize - } else { - 0 - }, - ..Default::default() - }, - )?); - db.clear()?; - - if pre_split { - println!("No-contention: {num_threads} threads accessing {num_keys} different keys - with pre-split"); - } else { - println!( - "No-contention: {num_threads} threads accessing {num_keys} different keys - without pre-split" - ); - } - - let insert_time_ns = Arc::new(AtomicU64::new(0)); - let get_time_ns = Arc::new(AtomicU64::new(0)); - let removal_time_ns = Arc::new(AtomicU64::new(0)); - - let mut handles = vec![]; - for thd in 0..num_threads { - let db = db.clone(); - let insert_time_ns = insert_time_ns.clone(); - let get_time_ns = get_time_ns.clone(); - let removal_time_ns = removal_time_ns.clone(); - - let h = std::thread::spawn(move || { - { - let t0 = Instant::now(); - for i in thd * num_keys..(thd + 1) * num_keys { - let status = db.set(&i.to_le_bytes(), &thd.to_le_bytes())?; - debug_assert!(status.was_created()); - } - insert_time_ns.fetch_add( - Instant::now().duration_since(t0).as_nanos() as u64, - std::sync::atomic::Ordering::SeqCst, - ); - } - - { - let t0 = Instant::now(); - for i in thd * num_keys..(thd + 1) * num_keys { - let val = db.get(&i.to_le_bytes())?; - debug_assert_eq!(val, Some(thd.to_le_bytes().to_vec()), "thd={thd} i={i}"); - black_box(val.unwrap()); - } - get_time_ns.fetch_add( - Instant::now().duration_since(t0).as_nanos() as u64, - std::sync::atomic::Ordering::SeqCst, - ); - } - - { - let 
t0 = Instant::now(); - for i in thd * num_keys..(thd + 1) * num_keys { - let val = db.remove(&i.to_le_bytes())?; - debug_assert!(val.is_some()); - black_box(val.unwrap()); - } - removal_time_ns.fetch_add( - Instant::now().duration_since(t0).as_nanos() as u64, - std::sync::atomic::Ordering::SeqCst, - ); - } - - Result::<()>::Ok(()) - }); - handles.push(h); - } - for h in handles { - h.join().unwrap()?; - } - - let insert_time_ns = insert_time_ns.load(std::sync::atomic::Ordering::SeqCst) as f64; - let get_time_ns = get_time_ns.load(std::sync::atomic::Ordering::SeqCst) as f64; - let removal_time_ns = removal_time_ns.load(std::sync::atomic::Ordering::SeqCst) as f64; - let ops = (num_threads * num_keys) as f64; - - println!(" Inserts: {:.3}us", (insert_time_ns / 1000.0) / ops); - println!(" Gets: {:.3}us", (get_time_ns / 1000.0) / ops); - println!(" Removals: {:.3}us", (removal_time_ns / 1000.0) / ops); - println!(); - } - - Ok(()) -} - -fn do_inserts( - thd: u32, - num_keys: u32, - insert_time_ns: &Arc, - db: &Arc, -) -> Result<()> { - let t0 = Instant::now(); - for i in 0..num_keys { - db.set(&i.to_le_bytes(), &thd.to_le_bytes())?; - } - insert_time_ns.fetch_add( - Instant::now().duration_since(t0).as_nanos() as u64, - std::sync::atomic::Ordering::SeqCst, - ); - Ok(()) -} - -fn do_gets(num_keys: u32, get_time_ns: &Arc, db: &Arc) -> Result<()> { - let t0 = Instant::now(); - for i in 0..num_keys { - let val = db.get(&i.to_le_bytes())?; - black_box(val); - } - get_time_ns.fetch_add( - Instant::now().duration_since(t0).as_nanos() as u64, - std::sync::atomic::Ordering::SeqCst, - ); - Ok(()) -} - -fn do_removals( - num_keys: u32, - removal_time_ns: &Arc, - db: &Arc, -) -> Result<()> { - let t0 = Instant::now(); - for i in 0..num_keys { - let val = db.remove(&i.to_le_bytes())?; - black_box(val); - } - removal_time_ns.fetch_add( - Instant::now().duration_since(t0).as_nanos() as u64, - std::sync::atomic::Ordering::SeqCst, - ); - Ok(()) -} - -fn 
test_concurrency_with_contention(num_threads: u32, num_keys: u32) -> Result<()> { - for pre_split in [true, false] { - let db = Arc::new(CandyStore::open( - "./dbdir", - Config { - expected_number_of_keys: if pre_split { - (num_threads * num_keys) as usize - } else { - 0 - }, - ..Default::default() - }, - )?); - db.clear()?; - - if pre_split { - println!( - "Contention: {num_threads} threads accessing {num_keys} same keys - with pre-split" - ); - } else { - println!("Contention: {num_threads} threads accessing {num_keys} same keys - without pre-split"); - } - - let insert_time_ns = Arc::new(AtomicU64::new(0)); - let get_time_ns = Arc::new(AtomicU64::new(0)); - let removal_time_ns = Arc::new(AtomicU64::new(0)); - - let mut handles = vec![]; - for thd in 0..num_threads { - let db = db.clone(); - let insert_time_ns = insert_time_ns.clone(); - let get_time_ns = get_time_ns.clone(); - let removal_time_ns = removal_time_ns.clone(); - - let h = std::thread::spawn(move || { - if thd % 3 == 0 { - do_inserts(thd, num_keys, &insert_time_ns, &db)?; - do_gets(num_keys, &get_time_ns, &db)?; - do_removals(num_keys, &removal_time_ns, &db)?; - } else if thd % 3 == 1 { - do_gets(num_keys, &get_time_ns, &db)?; - do_removals(num_keys, &removal_time_ns, &db)?; - do_inserts(thd, num_keys, &insert_time_ns, &db)?; - } else { - do_removals(num_keys, &removal_time_ns, &db)?; - do_inserts(thd, num_keys, &insert_time_ns, &db)?; - do_gets(num_keys, &get_time_ns, &db)?; - } - - Result::<()>::Ok(()) - }); - handles.push(h); - } - for h in handles { - h.join().unwrap()?; - } - - let insert_time_ns = insert_time_ns.load(std::sync::atomic::Ordering::SeqCst) as f64; - let get_time_ns = get_time_ns.load(std::sync::atomic::Ordering::SeqCst) as f64; - let removal_time_ns = removal_time_ns.load(std::sync::atomic::Ordering::SeqCst) as f64; - let ops = (num_threads * num_keys) as f64; - - println!(" Inserts: {:.3}us", (insert_time_ns / 1000.0) / ops); - println!(" Gets: {:.3}us", (get_time_ns / 1000.0) / 
ops); - println!(" Removals: {:.3}us", (removal_time_ns / 1000.0) / ops); - println!(); - } - - Ok(()) -} - -fn main() -> Result<()> { - test_small_keys(1_000_000)?; - test_large_keys(500_000)?; - test_lists(10, 100_000)?; - test_concurrency_without_contention(10, 100_000)?; - test_concurrency_with_contention(10, 1_000_000)?; - - Ok(()) -} diff --git a/diagram.png b/diagram.png deleted file mode 100644 index 8485586..0000000 Binary files a/diagram.png and /dev/null differ diff --git a/examples/atomics.rs b/examples/atomics.rs index bc2a269..e108abc 100644 --- a/examples/atomics.rs +++ b/examples/atomics.rs @@ -14,12 +14,9 @@ use candystore::{CandyStore, Config, GetOrCreateStatus, Result}; // ... fn main() -> Result<()> { + _ = std::fs::remove_dir_all("/tmp/candy-dir"); let db = Arc::new(CandyStore::open("/tmp/candy-dir", Config::default())?); - // clear the DB just in case we has something there before. in real-life scenarios you would probably - // not clear the DB every time - db.clear()?; - let mut handles = vec![]; for thd in 0..3 { let db = db.clone(); diff --git a/examples/lists.rs b/examples/lists.rs index 1c27853..c9e813b 100644 --- a/examples/lists.rs +++ b/examples/lists.rs @@ -1,12 +1,9 @@ use candystore::{CandyStore, Config, Result}; fn main() -> Result<()> { + _ = std::fs::remove_dir_all("/tmp/candy-dir"); let db = CandyStore::open("/tmp/candy-dir", Config::default())?; - // clear the DB just in case we has something there before. 
in real-life scenarios you would probably - // not clear the DB every time - db.clear()?; - db.set_in_list("asia", "iraq", "arabic")?; db.set_in_list("asia", "china", "chinese")?; db.set_in_list("asia", "russia", "russian")?; diff --git a/examples/multithreaded.rs b/examples/multithreaded.rs index 7a0503c..c9f6a59 100644 --- a/examples/multithreaded.rs +++ b/examples/multithreaded.rs @@ -4,17 +4,14 @@ use std::{sync::Arc, time::Duration}; use candystore::{CandyStore, Config, Result}; fn main() -> Result<()> { + _ = std::fs::remove_dir_all("/tmp/candy-dir"); let db = Arc::new(CandyStore::open("/tmp/candy-dir-mt", Config::default())?); - // clear the DB just in case we has something there before. in real-life scenarios you would probably - // not clear the DB every time - db.clear()?; - // clone db and spawn thread 1 let db1 = db.clone(); let h1 = std::thread::spawn(move || -> Result<()> { for i in 0..100 { - db1.set(&format!("key{i}"), "thread 1")?; + db1.set(format!("key{i}"), "thread 1")?; std::thread::sleep(Duration::from_millis(1)); } Ok(()) @@ -24,7 +21,7 @@ fn main() -> Result<()> { let db2 = db.clone(); let h2 = std::thread::spawn(move || -> Result<()> { for i in 0..100 { - db2.set(&format!("key{i}"), "thread 2")?; + db2.set(format!("key{i}"), "thread 2")?; std::thread::sleep(Duration::from_millis(1)); } Ok(()) @@ -33,7 +30,7 @@ fn main() -> Result<()> { h1.join().unwrap()?; h2.join().unwrap()?; - for res in db.iter() { + for res in db.iter_items() { let (k, v) = res?; println!( "{} = {}", diff --git a/examples/perf.rs b/examples/perf.rs new file mode 100644 index 0000000..202091d --- /dev/null +++ b/examples/perf.rs @@ -0,0 +1,348 @@ +use candystore::{CandyStore, Config}; +use std::{ + hint::black_box, + sync::{Arc, atomic::AtomicU64}, + thread, + time::Instant, +}; + +fn run_perf( + store: Arc, + n: u32, + n_threads: usize, + key_size: usize, + val_size: usize, +) -> Result<(), Box> { + let mut handles = Vec::new(); + + let inserts_us = 
Arc::new(AtomicU64::new(0)); + let updates_us = Arc::new(AtomicU64::new(0)); + let pos_gets_us = Arc::new(AtomicU64::new(0)); + let neg_gets_us = Arc::new(AtomicU64::new(0)); + let iter_us = Arc::new(AtomicU64::new(0)); + let removes_us = Arc::new(AtomicU64::new(0)); + + for t in 0..n_threads { + let store = store.clone(); + let inserts_us = inserts_us.clone(); + let updates_us = updates_us.clone(); + let pos_gets_us = pos_gets_us.clone(); + let neg_gets_us = neg_gets_us.clone(); + let iter_us = iter_us.clone(); + let removes_us = removes_us.clone(); + + let handle = thread::spawn(move || { + let mut key = vec![b'k'; key_size.max(4)]; + let value1 = vec![b'v'; val_size]; + let value2 = vec![b'V'; val_size]; + let start_idx = t as u32 * n; + let end_idx = start_idx + n; + + { + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.set(&key, &value1).unwrap(); + } + let duration = t0.elapsed(); + inserts_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.set(&key, &value2).unwrap(); + } + let duration = t0.elapsed(); + updates_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.get(&key).unwrap().unwrap(); + } + let duration = t0.elapsed(); + pos_gets_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let mut key = vec![b'Q'; key_size.max(4)]; + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + assert!(store.get(&key).unwrap().is_none()); + } + let duration = t0.elapsed(); + neg_gets_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 
= Instant::now(); + black_box(store.iter_items().count()); + let duration = t0.elapsed(); + iter_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.remove(&key).unwrap(); + } + let duration = t0.elapsed(); + removes_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + }); + handles.push(handle); + } + + println!( + "Testing key-value using {} threads, each with {} items (key size: {}, value size: {})", + n_threads, n, key_size, val_size + ); + + for handle in handles { + handle.join().unwrap(); + } + + println!( + " Inserts: {} us/op", + inserts_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + println!( + " Updates: {} us/op", + updates_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + println!( + " Positive Lookups: {} us/op", + pos_gets_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + println!( + " Negative Lookups: {} us/op", + neg_gets_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + println!( + " Iter all: {} us/op", + iter_us.load(std::sync::atomic::Ordering::Relaxed) as f64 / (n_threads * n as usize) as f64 + ); + println!( + " Removes: {} us/op\n", + removes_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + + Ok(()) +} + +fn run_queue_perf( + store: Arc, + n: u32, + n_threads: usize, + val_size: usize, +) -> Result<(), Box> { + let mut handles = Vec::new(); + + let pushes_us = Arc::new(AtomicU64::new(0)); + let pops_us = Arc::new(AtomicU64::new(0)); + + for _ in 0..n_threads { + let store = store.clone(); + let pushes_us = pushes_us.clone(); + let pops_us = pops_us.clone(); + + let handle = thread::spawn(move || { + let value = 
vec![b'v'; val_size]; + { + let t0 = Instant::now(); + for _ in 0..n { + store.push_to_queue_tail("myqueue", &value).unwrap(); + } + let duration = t0.elapsed(); + pushes_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 = Instant::now(); + for _ in 0..n { + store.pop_queue_head("myqueue").unwrap().unwrap(); + } + let duration = t0.elapsed(); + pops_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + }); + handles.push(handle); + } + + println!( + "Testing a queue using {} threads, each with {} items (value size: {})", + n_threads, n, val_size + ); + for handle in handles { + handle.join().unwrap(); + } + + println!( + " Pushes: {} us/op", + pushes_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + println!( + " Pops: {} us/op\n", + pops_us.load(std::sync::atomic::Ordering::Relaxed) as f64 / (n_threads * n as usize) as f64 + ); + + Ok(()) +} + +fn run_list_perf( + store: Arc, + n: u32, + n_threads: usize, + key_size: usize, + val_size: usize, +) -> Result<(), Box> { + let mut handles = Vec::new(); + + let sets_us = Arc::new(AtomicU64::new(0)); + let gets_us = Arc::new(AtomicU64::new(0)); + let removes_us = Arc::new(AtomicU64::new(0)); + + for t in 0..n_threads { + let store = store.clone(); + let sets_us = sets_us.clone(); + let gets_us = gets_us.clone(); + let removes_us = removes_us.clone(); + + let handle = thread::spawn(move || { + let mut key = vec![b'k'; key_size.max(4)]; + let value = vec![b'v'; val_size]; + let start_idx = t as u32 * n; + let end_idx = start_idx + n; + + { + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.set_in_list("mylist", &key, &value).unwrap(); + } + let duration = t0.elapsed(); + sets_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 = Instant::now(); + for i in 
start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.get_from_list("mylist", &key).unwrap(); + } + let duration = t0.elapsed(); + gets_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + + { + let t0 = Instant::now(); + for i in start_idx..end_idx { + key[..4].copy_from_slice(&i.to_le_bytes()); + store.remove_from_list("mylist", &key).unwrap(); + } + let duration = t0.elapsed(); + removes_us.fetch_add( + duration.as_micros() as u64, + std::sync::atomic::Ordering::Relaxed, + ); + } + }); + handles.push(handle); + } + + println!( + "Testing a list using {} threads, each with {} items (value size: {})", + n_threads, n, val_size + ); + for handle in handles { + handle.join().unwrap(); + } + + println!( + " Sets: {} us/op", + sets_us.load(std::sync::atomic::Ordering::Relaxed) as f64 / (n_threads * n as usize) as f64 + ); + println!( + " Gets: {} us/op", + gets_us.load(std::sync::atomic::Ordering::Relaxed) as f64 / (n_threads * n as usize) as f64 + ); + println!( + " Removes: {} us/op\n", + removes_us.load(std::sync::atomic::Ordering::Relaxed) as f64 + / (n_threads * n as usize) as f64 + ); + + Ok(()) +} + +fn main() -> Result<(), Box> { + let dir = tempfile::tempdir()?; + let store = Arc::new(CandyStore::open( + dir.path(), + Config { + checkpoint_delta_bytes: None, + checkpoint_interval: None, + ..Default::default() + }, + )?); + + // single threaded + run_perf(store.clone(), 1_000_000, 1, 16, 16)?; + run_perf(store.clone(), 100_000, 1, 1024, 4096)?; + + // multi threaded + run_perf(store.clone(), 250_000, 4, 16, 16)?; + //run_perf(store.clone(), 10_000, 20, 16, 16)?; + + // queues + run_queue_perf(store.clone(), 500_000, 1, 16)?; + run_queue_perf(store.clone(), 100_000, 4, 16)?; + + // lists + run_list_perf(store.clone(), 500_000, 1, 16, 16)?; + run_list_perf(store.clone(), 100_000, 4, 16, 16)?; + + Ok(()) +} diff --git a/examples/simple.rs b/examples/simple.rs index df1e568..3b6a4bc 100644 --- 
a/examples/simple.rs +++ b/examples/simple.rs @@ -3,12 +3,9 @@ use core::str; use candystore::{CandyStore, Config, Result}; fn main() -> Result<()> { + _ = std::fs::remove_dir_all("/tmp/candy-dir"); let db = CandyStore::open("/tmp/candy-dir", Config::default())?; - // clear the DB just in case we has something there before. in real-life scenarios you would probably - // not clear the DB every time - db.clear()?; - println!("{:?}", db.get("mykey")?); // None db.set("mykey", "myval")?; @@ -20,9 +17,9 @@ fn main() -> Result<()> { println!("{:?}", db.get("mykey")?); // None for i in 0..10 { - db.set(&format!("mykey{i}"), &format!("myval{i}"))?; + db.set(format!("mykey{i}"), format!("myval{i}"))?; } - for res in db.iter() { + for res in db.iter_items() { let (k, v) = res?; println!( "{} = {}", diff --git a/examples/typed.rs b/examples/typed.rs index f5085d2..71bbf7c 100644 --- a/examples/typed.rs +++ b/examples/typed.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use candystore::{CandyStore, CandyTypedStore, Config, Result}; fn main() -> Result<()> { + _ = std::fs::remove_dir_all("/tmp/candy-dir"); let db = Arc::new(CandyStore::open("/tmp/candy-dir", Config::default())?); let typed = CandyTypedStore::>::new(db); diff --git a/mini-candy/Cargo.toml b/mini-candy/Cargo.toml deleted file mode 100644 index 8fe30fd..0000000 --- a/mini-candy/Cargo.toml +++ /dev/null @@ -1,8 +0,0 @@ -[package] -name = "mini-candy" -version = "0.1.0" -edition = "2021" - -[dependencies] -memmap = "0.7.0" -siphasher = "1.0.1" diff --git a/mini-candy/README.md b/mini-candy/README.md deleted file mode 100644 index f0e32e9..0000000 --- a/mini-candy/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Mini Candy -A very minimal implementation of the algorithm in ~250 lines of code, for educational purposes diff --git a/mini-candy/src/main.rs b/mini-candy/src/main.rs deleted file mode 100644 index 263098d..0000000 --- a/mini-candy/src/main.rs +++ /dev/null @@ -1,294 +0,0 @@ -//! 
a very minimal implementation of CandyStore, for educational purposes. handles single-threaded get/set/remove/iter -//! -use std::{ - cell::RefCell, - fs::{File, OpenOptions}, - io::{Seek, Write}, - os::unix::fs::FileExt, - path::{Path, PathBuf}, -}; - -use memmap::{MmapMut, MmapOptions}; -use siphasher::sip::SipHasher24; - -type Result = std::io::Result; -const WIDTH: usize = 512; -const ROWS: usize = 64; - -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] -struct PartedHash(u64); - -impl PartedHash { - const INVALID_SIG: u32 = 0; - fn new(buf: &[u8]) -> Self { - Self(SipHasher24::new().hash(buf)) - } - fn sig(&self) -> u32 { - if self.0 as u32 == Self::INVALID_SIG { - 0x12345678 // can't return INVALID_SIG - } else { - self.0 as u32 - } - } - fn row(&self) -> usize { - (self.0 as usize >> 32) % ROWS - } - fn shard(&self) -> u32 { - (self.0 >> 48) as u32 - } -} - -#[derive(Debug, Clone, Copy)] -#[repr(C)] -struct Descriptor { - offset: u32, - klen: u16, - vlen: u16, -} - -#[repr(C)] -struct ShardRow { - sigs: [u32; WIDTH], - descs: [Descriptor; WIDTH], -} - -#[repr(C)] -struct ShardHeader { - rows: [ShardRow; ROWS], -} - -struct ShardFile { - start: u32, - end: u32, - file: RefCell, - mmap: MmapMut, -} - -type Buf = Vec; -type KV = (Buf, Buf); - -impl ShardFile { - const HEADER_SIZE: u64 = size_of::() as u64; - - fn open(dirpath: impl AsRef, start: u32, end: u32) -> Result { - let filepath = dirpath.as_ref().join(format!("{start}-{end}")); - let mut file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) - .open(filepath)?; - file.set_len(Self::HEADER_SIZE)?; - file.seek(std::io::SeekFrom::End(0))?; - let mmap = unsafe { - MmapOptions::new() - .len(Self::HEADER_SIZE as usize) - .map_mut(&file) - }?; - Ok(Self { - start, - end, - file: RefCell::new(file), - mmap, - }) - } - - fn header_row(&self, r: usize) -> &mut ShardRow { - &mut unsafe { &mut *(self.mmap.as_ptr() as *const ShardHeader as *mut ShardHeader) }.rows[r] - } - - 
fn read(&self, desc: Descriptor) -> Result { - let mut k = vec![0; desc.klen as usize]; - let mut v = vec![0; desc.vlen as usize]; - let f = self.file.borrow(); - f.read_exact_at(&mut k, desc.offset as u64)?; - f.read_exact_at(&mut v, desc.offset as u64 + desc.klen as u64)?; - Ok((k, v)) - } - fn write(&self, key: &[u8], val: &[u8]) -> Result { - let mut f = self.file.borrow_mut(); - let offset = f.stream_position()?; - f.write_all(key)?; - f.write_all(val)?; - Ok(Descriptor { - offset: offset as u32, - klen: key.len() as u16, - vlen: val.len() as u16, - }) - } - - fn get(&self, ph: PartedHash, key: &[u8]) -> Result> { - let row = self.header_row(ph.row()); - for (i, s) in row.sigs.iter().enumerate() { - if *s == ph.sig() { - let desc = row.descs[i]; - let (k, v) = self.read(desc)?; - if k == key { - return Ok(Some(v)); - } - } - } - Ok(None) - } - - fn set(&mut self, ph: PartedHash, key: &[u8], val: &[u8]) -> Result { - let row = self.header_row(ph.row()); - for (i, s) in row.sigs.iter().enumerate() { - if *s == ph.sig() { - let desc = row.descs[i]; - let (k, _) = self.read(desc)?; - if k == key { - row.descs[i] = self.write(key, val)?; - return Ok(true); - } - } - } - - for (i, s) in row.sigs.iter_mut().enumerate() { - if *s == PartedHash::INVALID_SIG { - // insert new - *s = ph.sig(); - row.descs[i] = self.write(key, val)?; - return Ok(true); - } - } - - Ok(false) - } - - fn remove(&mut self, ph: PartedHash, key: &[u8]) -> Result { - let row = self.header_row(ph.row()); - for (i, s) in row.sigs.iter_mut().enumerate() { - if *s == ph.sig() { - let desc = row.descs[i]; - let (k, _) = self.read(desc)?; - if k == key { - *s = PartedHash::INVALID_SIG; - return Ok(true); - } - } - } - Ok(false) - } - - fn iter<'a>(&'a self) -> impl Iterator> + 'a { - (0..ROWS).map(|r| self.header_row(r)).flat_map(|row| { - row.sigs.iter().enumerate().filter_map(|(i, sig)| { - if *sig == PartedHash::INVALID_SIG { - return None; - } - Some(self.read(row.descs[i])) - }) - }) - } -} - 
-struct Store { - dirpath: PathBuf, - shards: Vec, -} - -impl Store { - const MAX_SHARD: u32 = u16::MAX as u32 + 1; - - fn open(dirpath: impl AsRef) -> Result { - let dirpath = dirpath.as_ref().to_path_buf(); - std::fs::create_dir_all(&dirpath)?; - let first_shard = ShardFile::open(&dirpath, 0, Self::MAX_SHARD)?; - Ok(Self { - dirpath, - shards: vec![first_shard], - }) - } - - fn get(&self, key: &[u8]) -> Result> { - let ph = PartedHash::new(key); - for shard in self.shards.iter() { - if ph.shard() < shard.end { - return shard.get(ph, key); - } - } - unreachable!(); - } - - fn remove(&mut self, key: &[u8]) -> Result { - let ph = PartedHash::new(key); - for shard in self.shards.iter_mut() { - if ph.shard() < shard.end { - return shard.remove(ph, key); - } - } - unreachable!(); - } - - fn split(&mut self, shard_idx: usize) -> Result<()> { - let removed_shard = self.shards.remove(shard_idx); - - let start = removed_shard.start; - let end = removed_shard.end; - let mid = (removed_shard.start + removed_shard.end) / 2; - println!("splitting [{start}, {end}) to [{start}, {mid}) and [{mid}, {end})"); - - let mut bottom = ShardFile::open(&self.dirpath, start, mid)?; - let mut top = ShardFile::open(&self.dirpath, mid, end)?; - - for res in removed_shard.iter() { - let (key, val) = res?; - let ph = PartedHash::new(&key); - if ph.shard() < mid { - bottom.set(ph, &key, &val)?; - } else { - top.set(ph, &key, &val)?; - } - } - - std::fs::remove_file(self.dirpath.join(format!("{start}-{end}")))?; - - self.shards.push(bottom); - self.shards.push(top); - self.shards.sort_by(|x, y| x.end.cmp(&y.end)); - Ok(()) - } - - fn set(&mut self, key: &[u8], val: &[u8]) -> Result { - let ph = PartedHash::new(key); - loop { - let mut shard_to_split = None; - for (i, shard) in self.shards.iter_mut().enumerate() { - if ph.shard() < shard.end { - if shard.set(ph, key, val)? 
{ - return Ok(true); - } - shard_to_split = Some(i); - break; - } - } - self.split(shard_to_split.unwrap())?; - } - } - - fn iter<'a>(&'a self) -> impl Iterator> + 'a { - self.shards.iter().flat_map(|shard| shard.iter()) - } -} - -fn main() -> Result<()> { - let mut db = Store::open("/tmp/mini-dbdir")?; - db.set(b"hello", b"world")?; - - println!("{:?}", db.get(b"hello")?); - println!("{:?}", db.get(b"nonexistent")?); - - db.remove(b"hello")?; - println!("{:?}", db.get(b"hello")?); - - println!("{}", db.iter().count()); - - for i in 0..100_000u32 { - db.set(&i.to_le_bytes(), &(i * 2).to_le_bytes())?; - } - - println!("{}", db.iter().count()); - - Ok(()) -} diff --git a/simulator/Cargo.lock b/simulator/Cargo.lock deleted file mode 100644 index c8878b3..0000000 --- a/simulator/Cargo.lock +++ /dev/null @@ -1,140 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 3 - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "getrandom" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "libc" -version = "0.2.155" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" - -[[package]] -name = "ppv-lite86" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee4364d9f3b902ef14fab8a1ddffb783a1cb6b4bba3bfc1fa3922732c7de97f" -dependencies = [ - 
"zerocopy", -] - -[[package]] -name = "proc-macro2" -version = "1.0.86" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "simulator" -version = "0.1.0" -dependencies = [ - "rand", -] - -[[package]] -name = "syn" -version = "2.0.72" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "zerocopy" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854e949ac82d619ee9a14c66a1b674ac730422372ccb759ce0c39cabcf2bf8e6" -dependencies = [ - "byteorder", - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/simulator/Cargo.toml b/simulator/Cargo.toml deleted file mode 100644 index bde7056..0000000 --- a/simulator/Cargo.toml +++ /dev/null @@ -1,7 +0,0 @@ -[package] -name = "simulator" -version = "0.1.0" -edition = "2021" - -[dependencies] -rand = "0.8.5" diff --git a/simulator/README.md b/simulator/README.md deleted file mode 100644 index a86e22e..0000000 --- a/simulator/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# Candy Simulator - -* Tests the fill level that each shard can have using different params -* Tests the number of collisions in the same row (signatures) -* Tests the performance of position_simd for various sizes (compile with `--release`) - -# Results -``` -r= 32 w= 32 avg=0.687102 elems= 1024 sz= 12KB collisions=0 collisions-probability=0.000000115483993 -r= 32 w= 64 avg=0.755089 elems= 2048 sz= 24KB collisions=0 collisions-probability=0.000000469386467 -r= 32 w= 128 avg=0.832785 elems= 4096 sz= 48KB collisions=0 collisions-probability=0.000001892445681 GOOD -r= 32 w= 256 avg=0.871744 elems= 8192 sz= 96KB collisions=0 collisions-probability=0.000007599563332 GOOD -r= 32 w= 512 avg=0.907163 elems= 16384 sz= 192KB collisions=0 collisions-probability=0.000030457509641 GOOD -r= 32 w=1024 avg=0.935280 elems= 32768 sz= 384KB collisions=0 collisions-probability=0.000121943667477 GOOD -r= 64 w= 32 avg=0.647315 elems= 2048 sz= 24KB collisions=0 
collisions-probability=0.000000115483993 -r= 64 w= 64 avg=0.728652 elems= 4096 sz= 48KB collisions=0 collisions-probability=0.000000469386467 -r= 64 w= 128 avg=0.805568 elems= 8192 sz= 96KB collisions=0 collisions-probability=0.000001892445681 GOOD -r= 64 w= 256 avg=0.853133 elems= 16384 sz= 192KB collisions=0 collisions-probability=0.000007599563332 GOOD -r= 64 w= 512 avg=0.899420 elems= 32768 sz= 384KB collisions=0 collisions-probability=0.000030457509641 GOOD -r= 64 w=1024 avg=0.927043 elems= 65536 sz= 768KB collisions=6 collisions-probability=0.000121943667477 GOOD -r= 128 w= 32 avg=0.615332 elems= 4096 sz= 48KB collisions=0 collisions-probability=0.000000115483993 -r= 128 w= 64 avg=0.708627 elems= 8192 sz= 96KB collisions=0 collisions-probability=0.000000469386467 -r= 128 w= 128 avg=0.784355 elems= 16384 sz= 192KB collisions=0 collisions-probability=0.000001892445681 -r= 128 w= 256 avg=0.843362 elems= 32768 sz= 384KB collisions=0 collisions-probability=0.000007599563332 GOOD -r= 128 w= 512 avg=0.884743 elems= 65536 sz= 768KB collisions=0 collisions-probability=0.000030457509641 GOOD -r= 128 w=1024 avg=0.920297 elems= 131072 sz=1536KB collisions=3 collisions-probability=0.000121943667477 GOOD BIG -r= 256 w= 32 avg=0.599061 elems= 8192 sz= 96KB collisions=0 collisions-probability=0.000000115483993 -r= 256 w= 64 avg=0.688738 elems= 16384 sz= 192KB collisions=0 collisions-probability=0.000000469386467 -r= 256 w= 128 avg=0.768617 elems= 32768 sz= 384KB collisions=0 collisions-probability=0.000001892445681 -r= 256 w= 256 avg=0.832496 elems= 65536 sz= 768KB collisions=0 collisions-probability=0.000007599563332 GOOD -r= 256 w= 512 avg=0.877548 elems= 131072 sz=1536KB collisions=0 collisions-probability=0.000030457509641 GOOD BIG -r= 256 w=1024 avg=0.914863 elems= 262144 sz=3072KB collisions=6 collisions-probability=0.000121943667477 GOOD BIG -``` - -``` -width= 32 time per simd= 4ns -width= 64 time per simd= 21ns -width= 128 time per simd= 26ns -width= 256 time per 
simd= 36ns -width= 512 time per simd= 59ns -width=1024 time per simd= 100ns -``` - -``` -width= 32 time per non-simd= 25ns -width= 64 time per non-simd= 53ns -width= 128 time per non-simd= 85ns -width= 256 time per non-simd= 145ns -width= 512 time per non-simd= 266ns -width=1024 time per non-simd= 507ns -``` diff --git a/simulator/rust-toolchain.toml b/simulator/rust-toolchain.toml deleted file mode 100644 index 5d56faf..0000000 --- a/simulator/rust-toolchain.toml +++ /dev/null @@ -1,2 +0,0 @@ -[toolchain] -channel = "nightly" diff --git a/simulator/src/main.rs b/simulator/src/main.rs deleted file mode 100644 index 8da6c1c..0000000 --- a/simulator/src/main.rs +++ /dev/null @@ -1,226 +0,0 @@ -#![feature(btree_cursors)] -use std::{collections::BTreeMap, sync::atomic::AtomicUsize, time::Instant, u32}; - -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] -struct PartedHash { - shard_idx: u32, - row_idx: u32, - signature: u32, -} - -impl PartedHash { - fn new_random() -> Self { - Self { - shard_idx: rand::random(), - row_idx: rand::random(), - signature: rand::random(), - } - } -} - -#[derive(Debug, Default, Clone)] -struct ShardRow { - entries: Vec, -} - -static TOTAL_COLLISIONS: AtomicUsize = AtomicUsize::new(0); - -#[derive(Debug)] -struct Shard { - row_width: usize, - total: usize, - rows: Vec, -} -impl Shard { - fn new(num_rows: usize, row_width: usize) -> Self { - Self { - row_width, - total: 0, - rows: vec![ShardRow::default(); num_rows], - } - } - fn add(&mut self, h: PartedHash) -> bool { - let len = self.rows.len(); - let row = &mut self.rows[(h.row_idx as usize) % len]; - if row.entries.len() >= self.row_width { - false - } else { - if row - .entries - .iter() - .find(|h2| h2.signature == h.signature) - .is_some() - { - TOTAL_COLLISIONS.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } - row.entries.push(h); - self.total += 1; - true - } - } -} - -struct DB { - num_rows: usize, - row_width: usize, - total: usize, - num_splits: usize, - 
fill_level_on_split: usize, - fill_levels: Vec, - shards: BTreeMap, -} -impl DB { - fn new(num_rows: usize, row_width: usize) -> Self { - let mut bt = BTreeMap::new(); - bt.insert(1 << 32, Shard::new(num_rows, row_width)); - Self { - num_rows, - row_width, - total: 0, - num_splits: 0, - fill_level_on_split: 0, - fill_levels: vec![], - shards: bt, - } - } - fn add(&mut self, to_add: PartedHash) { - let (key_before, key_after) = { - let shard_idx = to_add.shard_idx as u64; - let mut cursor = self - .shards - .lower_bound_mut(std::ops::Bound::Excluded(&shard_idx)); - let key_before = cursor.peek_prev().map(|(k, _)| *k).unwrap_or(0); - let Some((key_after, shard)) = cursor.peek_next() else { - panic!("no key_after for 0x{:x}", to_add.shard_idx); - }; - - if shard.add(to_add) { - self.total += 1; - return; - } - (key_before, *key_after) - }; - - let prev_shard = self.shards.remove(&key_after).unwrap(); - let midpoint = (key_before / 2) + (key_after / 2); - self.shards - .insert(midpoint, Shard::new(self.num_rows, self.row_width)); - self.shards - .insert(key_after, Shard::new(self.num_rows, self.row_width)); - - self.num_splits += 1; - self.fill_level_on_split += prev_shard.total; - - /*println!( - "split ({:3}) 0x{key_before:08x}..0x{midpoint:08x}..0x{key_after:09x} [total: {:8}, shard avg fill: {:.4}, shard size: {}]", - self.num_splits, - self.total, - ((self.fill_level_on_split as f64) / (self.num_splits as f64)) - / ((self.num_rows * self.row_width) as f64), - self.num_rows * self.row_width - );*/ - self.fill_levels.push( - ((self.fill_level_on_split as f64) / (self.num_splits as f64)) - / ((self.num_rows * self.row_width) as f64), - ); - self.total -= prev_shard.total; - - for row in prev_shard.rows.iter() { - for h in row.entries.iter() { - self.add(*h); - } - } - self.add(to_add); - } -} - -fn main() { - for rows in [32, 64, 128, 256] { - for width in [32, 64, 128, 256, 512, 1024] { - let mut db = DB::new(rows, width); - let mut added = 0; - 
TOTAL_COLLISIONS.store(0, std::sync::atomic::Ordering::SeqCst); - for _ in 0..100 { - for _ in 0..db.num_rows * db.row_width { - db.add(PartedHash::new_random()); - added += 1; - } - } - - let mut summed = 0; - for (_, sh) in db.shards.iter() { - summed += sh.total; - } - - let mut summed_last_fills = 0.0; - for lf in db.fill_levels.iter() { - summed_last_fills += lf; - } - - assert_eq!(db.total, summed); - assert_eq!(db.total, added); - let avg = summed_last_fills / (db.fill_levels.len() as f64); - let sz = (db.num_rows * db.row_width * 12) / 1024; - println!( - "r={rows:4} w={width:4} avg={:.6} elems={:7} sz={:4}KB collisions={} collisions-probability={:.015} {} {}", - avg, - db.num_rows * db.row_width, - sz, - TOTAL_COLLISIONS.load(std::sync::atomic::Ordering::SeqCst), - 1.0 - (-(width as f64) * (width as f64 - 1.0) / ((1u64 << 33) as f64)).exp(), - if avg > 0.8 {"GOOD"} else {""}, - if sz > 800 {"BIG"} else {""}, - ); - } - } - - let reps = 10_000_000usize; - for width in [32, 64, 128, 256, 512, 1024] { - let mut v = vec![0u32; width]; - for i in 0..width { - v[i] = i as u32; - } - v[width - 1] = 80808080; - assert_eq!(v.iter().position(|x| *x == 80808080), Some(width - 1)); - assert_eq!(v.iter().position(|x| *x == 80808081), None); - let mut pos: usize = 0; - - let t0 = Instant::now(); - for _ in 0..reps { - pos += v.iter().position(|x| *x == 80808080).unwrap_or(0); - pos += v.iter().position(|x| *x == 80808081).unwrap_or(0); - } - - println!( - "width={width:4} time per simd={:4}ns", - Instant::now().duration_since(t0).as_nanos() as usize / reps, - ); - - assert_eq!(pos, (width - 1) * reps); - } - - let reps = 10_000_000usize; - for width in [32, 64, 128, 256, 512, 1024] { - let mut v = vec![0u32; width]; - for i in 0..width { - v[i] = i as u32; - } - v[width - 1] = 80808080; - assert_eq!(v.iter().position(|x| *x == 80808080), Some(width - 1)); - assert_eq!(v.iter().position(|x| *x == 80808081), None); - let mut pos: usize = 0; - - let t0 = Instant::now(); - 
for _ in 0..reps { - pos += v.iter().position(|x| *x == 80808080).unwrap_or(0); - pos += v.iter().position(|x| *x == 80808081).unwrap_or(0); - } - - println!( - "width={width:4} time per non-simd={:4}ns", - Instant::now().duration_since(t0).as_nanos() as usize / reps, - ); - - assert_eq!(pos, (width - 1) * reps); - } -} diff --git a/split.png b/split.png new file mode 100644 index 0000000..cc80e5a Binary files /dev/null and b/split.png differ diff --git a/src/data_file.rs b/src/data_file.rs new file mode 100644 index 0000000..fbb1074 --- /dev/null +++ b/src/data_file.rs @@ -0,0 +1,808 @@ +use parking_lot::{Mutex, RwLock}; +use smallvec::SmallVec; +use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; + +use std::{ + collections::VecDeque, + fs::File, + mem::size_of, + path::Path, + sync::{ + Arc, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, +}; + +use crate::internal::{ + DATA_ENTRY_OFFSET_MAGIC, DATA_ENTRY_OFFSET_MASK, DATA_FILE_SIGNATURE, DATA_FILE_VERSION, + EntryType, FILE_OFFSET_ALIGNMENT, KEY_NAMESPACE_BITS, KVBuf, KVRef, KeyNamespace, + MAX_KEY_NAMESPACE, PAGE_SIZE, READ_BUFFER_SIZE, SIZE_HINT_UNIT, data_file_path, + invalid_data_error, read_available_at, read_into_at, sync_dir, sync_file_range, write_all_at, +}; +use crate::types::{Config, Error, MAX_USER_KEY_SIZE, MAX_USER_VALUE_SIZE, Result}; + +const INLINE_SCRATCH_BUFFER_SIZE: usize = 1024; + +struct ParsedDataEntry { + data_len: usize, + vlen: u16, + ns: u8, +} + +#[derive(Clone, Copy, FromBytes, IntoBytes, KnownLayout, Immutable)] +#[repr(C)] +struct DataFileHeader { + magic: [u8; 8], + version: u32, + _padding0: u32, + ordinal: u64, + _trailer: [u8; 4096 - 24], +} + +const _: () = assert!(size_of::() == PAGE_SIZE); + +struct InflightSlot { + seq: AtomicU64, + ordinal: AtomicU64, + offset: AtomicU64, +} + +pub(crate) struct InflightTracker { + snapshot_barrier: RwLock<()>, + next_seq: AtomicU64, + slots: Vec, + completed_deltas: Vec>>, +} + +impl InflightTracker { + pub(crate) fn 
new(num_shards: usize) -> Self { + Self { + snapshot_barrier: RwLock::new(()), + next_seq: AtomicU64::new(1), + slots: (0..num_shards) + .map(|_| InflightSlot { + seq: AtomicU64::new(0), + ordinal: AtomicU64::new(0), + offset: AtomicU64::new(0), + }) + .collect(), + completed_deltas: (0..num_shards) + .map(|_| Mutex::new(VecDeque::new())) + .collect(), + } + } + + fn reserve<'a>( + &'a self, + data_file: &DataFile, + shard_idx: usize, + len: u64, + delta: i8, + ) -> Result<(u64, InflightGuard<'a>)> { + let _barrier = self.snapshot_barrier.read(); + let offset = data_file.allocate(len)?; + let ordinal = data_file.file_ordinal; + let seq = self.next_seq.fetch_add(1, Ordering::Relaxed); + let slot = &self.slots[shard_idx]; + slot.ordinal.store(ordinal, Ordering::Relaxed); + slot.offset.store(offset, Ordering::Relaxed); + slot.seq.store(seq, Ordering::Release); + + Ok(( + offset, + InflightGuard { + tracker: self, + shard_idx, + seq, + delta, + armed: true, + }, + )) + } + + pub(crate) fn checkpoint_progress(&self, active_file: &DataFile) -> (u64, u64, i64) { + let _barrier = self.snapshot_barrier.write(); + + let mut min_slot: Option<(u64, u64, u64)> = None; + for slot in &self.slots { + let seq = slot.seq.load(Ordering::Acquire); + if seq == 0 { + continue; + } + let ordinal = slot.ordinal.load(Ordering::Relaxed); + let offset = slot.offset.load(Ordering::Relaxed); + let current = (seq, ordinal, offset); + min_slot = Some(min_slot.map_or(current, |min_current| min_current.min(current))); + } + + let checkpoint = min_slot + .map(|(_, ordinal, offset)| (ordinal, offset)) + .unwrap_or_else(|| (active_file.file_ordinal, active_file.used_bytes())); + let completed_before_seq = min_slot.map_or(u64::MAX, |(seq, _, _)| seq); + let mut committed_delta = 0i64; + for queue in &self.completed_deltas { + let mut queue = queue.lock(); + while let Some(&(seq, delta)) = queue.front() { + if seq >= completed_before_seq { + break; + } + queue.pop_front(); + committed_delta += delta; + 
} + } + + (checkpoint.0, checkpoint.1, committed_delta) + } + + pub(crate) fn clear_all(&self) { + let _barrier = self.snapshot_barrier.write(); + for slot in &self.slots { + slot.seq.store(0, Ordering::Release); + slot.ordinal.store(0, Ordering::Relaxed); + slot.offset.store(0, Ordering::Relaxed); + } + for queue in &self.completed_deltas { + queue.lock().clear(); + } + } + + fn clear_matching(&self, shard_idx: usize, expected_seq: u64) { + let _barrier = self.snapshot_barrier.read(); + let slot = &self.slots[shard_idx]; + if slot.seq.load(Ordering::Acquire) == expected_seq { + slot.seq.store(0, Ordering::Release); + } + } + + fn complete_matching(&self, shard_idx: usize, expected_seq: u64, delta: i8) { + let _barrier = self.snapshot_barrier.read(); + let slot = &self.slots[shard_idx]; + if slot.seq.load(Ordering::Acquire) == expected_seq { + slot.seq.store(0, Ordering::Release); + if delta != 0 { + self.completed_deltas[shard_idx] + .lock() + .push_back((expected_seq, i64::from(delta))); + } + } + } +} + +pub(crate) struct InflightGuard<'a> { + tracker: &'a InflightTracker, + shard_idx: usize, + seq: u64, + delta: i8, + armed: bool, +} + +impl InflightGuard<'_> { + pub(crate) fn complete(mut self) { + if self.armed { + self.tracker + .complete_matching(self.shard_idx, self.seq, self.delta); + self.armed = false; + } + } +} + +impl Drop for InflightGuard<'_> { + fn drop(&mut self) { + if self.armed { + self.tracker.clear_matching(self.shard_idx, self.seq); + } + } +} + +pub(crate) struct DataFile { + pub(crate) file: File, + file_offset: AtomicU64, + last_synced_offset: AtomicU64, + sealed_for_rotation: AtomicBool, + config: Arc, + pub(crate) file_idx: u16, + pub(crate) file_ordinal: u64, + preallocated: bool, + recovery_tail_upper_bound: u64, +} + +impl DataFile { + fn read_header(file: &File) -> Result { + let header = + read_available_at(file, size_of::(), 0).map_err(Error::IOError)?; + if header.len() < size_of::() { + return 
Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "data file header too short", + ))); + } + + let header = DataFileHeader::read_from_bytes(&header).map_err(|_| { + Error::IOError(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "invalid data file header size", + )) + })?; + if &header.magic != DATA_FILE_SIGNATURE || header.version != DATA_FILE_VERSION { + return Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "invalid data file header", + ))); + } + + Ok(header) + } + + pub(crate) fn read_ordinal(base_path: &Path, file_idx: u16) -> Result { + let file = File::options() + .read(true) + .open(data_file_path(base_path, file_idx)) + .map_err(Error::IOError)?; + Ok(Self::read_header(&file)?.ordinal) + } + + pub(crate) fn used_bytes(&self) -> u64 { + self.file_offset.load(Ordering::Acquire) + } + + pub(crate) fn recovery_tail_upper_bound(&self) -> u64 { + self.recovery_tail_upper_bound + } + + pub(crate) fn sync_data(&self, start_offset: u64, end_offset: u64) -> Result<()> { + let used_bytes = self.used_bytes(); + let start_offset = start_offset.min(used_bytes); + let end_offset = end_offset.min(used_bytes); + if end_offset <= start_offset { + return Ok(()); + } + + if !self.preallocated { + self.file.sync_data().map_err(Error::IOError)?; + } else { + sync_file_range( + &self.file, + size_of::() as u64 + start_offset, + end_offset - start_offset, + )?; + } + self.last_synced_offset + .fetch_max(end_offset, Ordering::Release); + Ok(()) + } + + pub(crate) fn sync_to_current(&self) -> Result<()> { + let start = self.last_synced_offset.load(Ordering::Acquire); + self.sync_data(start, self.used_bytes()) + } + + pub(crate) fn truncate_to_offset(&self, file_offset: u64) -> Result<()> { + debug_assert_eq!(file_offset % FILE_OFFSET_ALIGNMENT, 0); + if self.preallocated { + // A crash between the two set_len calls would leave the file + // non-preallocated. 
That is harmless: the next open will + // detect it as non-preallocated and fall back to sync_all + // until rotation creates a fresh preallocated file. + self.file + .set_len(size_of::() as u64 + file_offset) + .map_err(Error::IOError)?; + self.file + .set_len(size_of::() as u64 + self.config.max_data_file_size as u64) + .map_err(Error::IOError)?; + } else { + self.file + .set_len(size_of::() as u64 + file_offset) + .map_err(Error::IOError)?; + } + self.file_offset.store(file_offset, Ordering::Release); + self.file.sync_all().map_err(Error::IOError)?; + self.last_synced_offset + .store(file_offset, Ordering::Release); + Ok(()) + } + + fn used_data_upper_bound(file: &File, physical_data_len: u64) -> Result { + if physical_data_len == 0 { + return Ok(0); + } + + let mut end = physical_data_len; + while end > 0 { + let start = end.saturating_sub(READ_BUFFER_SIZE as u64); + let chunk = read_available_at( + file, + (end - start) as usize, + size_of::() as u64 + start, + ) + .map_err(Error::IOError)?; + if let Some(rel) = chunk.iter().rposition(|byte| *byte != 0) { + let aligned = (start + rel as u64 + 1).next_multiple_of(FILE_OFFSET_ALIGNMENT); + return Ok(aligned.min(physical_data_len)); + } + end = start; + } + + Ok(0) + } + + /// Scans forward from offset 0, parsing each entry, and returns the + /// aligned end of the last valid entry. We temporarily set `file_offset` + /// to `tail_upper_bound` so that `read_next_entry_ref` won't short-circuit + /// before reaching it. This is safe because `open` is single-threaded; + /// the real value is overwritten by the caller immediately after. 
+ fn detect_used_bytes(&self, tail_upper_bound: u64) -> Result { + if tail_upper_bound == 0 { + return Ok(0); + } + + self.file_offset.store(tail_upper_bound, Ordering::Release); + + let mut offset = 0u64; + let mut read_buf = Vec::new(); + let mut buf_file_offset = 0u64; + let mut last_durable_offset = 0u64; + while let Some((_, _, next_offset)) = + self.read_next_entry_ref(offset, &mut read_buf, &mut buf_file_offset)? + { + offset = next_offset; + last_durable_offset = next_offset.next_multiple_of(FILE_OFFSET_ALIGNMENT); + } + + Ok(last_durable_offset) + } + + fn parse_data_entry(buf: &[u8], offset: u64) -> Result { + if buf.len() < 8 { + return Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "entry too short", + ))); + } + + let header = u32::from_le_bytes(buf[0..4].try_into().unwrap()); + let magic_offset = (((offset / FILE_OFFSET_ALIGNMENT) as u32) ^ DATA_ENTRY_OFFSET_MAGIC) + & DATA_ENTRY_OFFSET_MASK; + + if header & DATA_ENTRY_OFFSET_MASK != magic_offset { + return Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "corrupt entry magic", + ))); + } + + let klen = u16::from_le_bytes(buf[4..6].try_into().unwrap()); + let vlen = u16::from_le_bytes(buf[6..8].try_into().unwrap()); + let entry_len = 4 + 4 + klen as usize + vlen as usize + 2; + if buf.len() < entry_len { + return Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "entry too short", + ))); + } + + let checksum = u16::from_le_bytes(buf[entry_len - 2..entry_len].try_into().unwrap()); + if checksum != crc16_ibm3740_fast::hash(&buf[..entry_len - 2]) as u16 { + return Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "checksum mismatch", + ))); + } + + let ns = ((header >> 24) & ((1 << KEY_NAMESPACE_BITS) - 1)) as u8; + let entry_type = (header >> 30) & 0b11; + if entry_type != EntryType::Insert as u32 && entry_type != EntryType::Update as u32 { + return Err(Error::IOError(std::io::Error::new( 
+ std::io::ErrorKind::InvalidData, + "invalid entry type", + ))); + } + + Ok(ParsedDataEntry { + data_len: 8 + vlen as usize + klen as usize, + vlen, + ns, + }) + } + + pub(crate) fn open( + base_path: &Path, + config: Arc, + file_idx: u16, + validate_tail: bool, + ) -> Result { + let file = File::options() + .read(true) + .write(true) + .open(data_file_path(base_path, file_idx)) + .map_err(Error::IOError)?; + let header = Self::read_header(&file)?; + let physical_data_len = file + .metadata() + .map_err(Error::IOError)? + .len() + .saturating_sub(size_of::() as u64); + let preallocated = physical_data_len == config.max_data_file_size as u64; + let recovery_tail_upper_bound = Self::used_data_upper_bound(&file, physical_data_len)?; + + let inst = Self { + file, + file_offset: AtomicU64::new(physical_data_len), + last_synced_offset: AtomicU64::new(0), + sealed_for_rotation: AtomicBool::new(false), + config, + file_idx, + file_ordinal: header.ordinal, + preallocated, + recovery_tail_upper_bound, + }; + let used_bytes = if validate_tail { + inst.detect_used_bytes(recovery_tail_upper_bound)? 
+ } else { + recovery_tail_upper_bound + }; + inst.file_offset.store(used_bytes, Ordering::Release); + inst.last_synced_offset.store(used_bytes, Ordering::Release); + + Ok(inst) + } + + pub(crate) fn create( + base_path: &Path, + config: Arc, + file_idx: u16, + ordinal: u64, + ) -> Result { + let file = File::options() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(data_file_path(base_path, file_idx)) + .map_err(Error::IOError)?; + file.set_len(size_of::() as u64 + config.max_data_file_size as u64) + .map_err(Error::IOError)?; + let header = DataFileHeader { + magic: *DATA_FILE_SIGNATURE, + version: DATA_FILE_VERSION, + _padding0: 0, + ordinal, + _trailer: [0; 4096 - 24], + }; + write_all_at(&file, header.as_bytes(), 0).map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + sync_dir(base_path)?; + Ok(Self { + file, + file_offset: AtomicU64::new(0), + last_synced_offset: AtomicU64::new(0), + sealed_for_rotation: AtomicBool::new(false), + config, + file_idx, + file_ordinal: ordinal, + preallocated: true, + recovery_tail_upper_bound: 0, + }) + } + + pub(crate) fn seal_for_rotation(&self) { + self.sealed_for_rotation.store(true, Ordering::SeqCst); + } + + fn allocate(&self, len: u64) -> Result { + if self.sealed_for_rotation.load(Ordering::SeqCst) { + return Err(Error::RotateDataFile(self.file_idx)); + } + let mut file_offset = self.file_offset.load(Ordering::Relaxed); + loop { + if file_offset + len > self.config.max_data_file_size as u64 { + return Err(Error::RotateDataFile(self.file_idx)); + } + match self.file_offset.compare_exchange( + file_offset, + file_offset + len, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => return Ok(file_offset), + Err(current) => file_offset = current, + } + } + } + + fn append_entry<'a>( + &self, + entry_type: EntryType, + ns: KeyNamespace, + key: &[u8], + val: Option<&[u8]>, + shard_idx: usize, + inflight_tracker: &'a InflightTracker, + ) -> Result<(u64, usize, InflightGuard<'a>)> { 
+ debug_assert!(key.len() <= MAX_USER_KEY_SIZE); + debug_assert!(ns as u8 <= MAX_KEY_NAMESPACE); + + let val_len = val.map_or(0, |v| v.len()); + if let Some(v) = val { + debug_assert!(v.len() <= MAX_USER_VALUE_SIZE); + } + + let entry_len = 4 + if val.is_some() { 4 } else { 2 } + val_len + key.len() + 2; + let aligned_len = entry_len.next_multiple_of(FILE_OFFSET_ALIGNMENT as usize); + let delta = match entry_type { + EntryType::Insert => 1, + EntryType::Tombstone => -1, + _ => 0, + }; + let (file_offset, inflight_guard) = + inflight_tracker.reserve(self, shard_idx, aligned_len as u64, delta)?; + debug_assert!(file_offset % FILE_OFFSET_ALIGNMENT == 0); + + let mut buf = SmallVec::<[u8; INLINE_SCRATCH_BUFFER_SIZE]>::with_capacity(aligned_len); + // We overwrite the entry bytes below and only zero the alignment padding. + unsafe { buf.set_len(aligned_len) }; + let buf = &mut buf[..]; + + let magic_offset = (((file_offset / FILE_OFFSET_ALIGNMENT) as u32) + ^ DATA_ENTRY_OFFSET_MAGIC) + & DATA_ENTRY_OFFSET_MASK; + let header = magic_offset | ((entry_type as u32) << 30) | ((ns as u32) << 24); + + buf[0..4].copy_from_slice(&header.to_le_bytes()); + buf[4..6].copy_from_slice(&(key.len() as u16).to_le_bytes()); + + if let Some(v) = val { + buf[6..8].copy_from_slice(&(v.len() as u16).to_le_bytes()); + buf[8..8 + v.len()].copy_from_slice(v); + buf[8 + v.len()..8 + v.len() + key.len()].copy_from_slice(key); + } else { + buf[6..6 + key.len()].copy_from_slice(key); + } + + buf[entry_len..aligned_len].fill(0); + let checksum = crc16_ibm3740_fast::hash(&buf[..entry_len - 2]) as u16; + buf[entry_len - 2..entry_len].copy_from_slice(&checksum.to_le_bytes()); + + let res = write_all_at( + &self.file, + buf, + size_of::() as u64 + file_offset, + ) + .map_err(Error::IOError); + res?; + + Ok((file_offset, aligned_len, inflight_guard)) + } + + pub(crate) fn append_kv<'a>( + &self, + entry_type: EntryType, + ns: KeyNamespace, + key: &[u8], + val: &[u8], + shard_idx: usize, + 
inflight_tracker: &'a InflightTracker, + ) -> Result<(u64, usize, InflightGuard<'a>)> { + debug_assert!(matches!(entry_type, EntryType::Insert | EntryType::Update)); + self.append_entry(entry_type, ns, key, Some(val), shard_idx, inflight_tracker) + } + + pub(crate) fn append_tombstone<'a>( + &self, + ns: KeyNamespace, + key: &[u8], + shard_idx: usize, + inflight_tracker: &'a InflightTracker, + ) -> Result<(u64, usize, InflightGuard<'a>)> { + self.append_entry( + EntryType::Tombstone, + ns, + key, + None, + shard_idx, + inflight_tracker, + ) + } + + pub(crate) fn read_kv_into<'a>( + &self, + offset: u64, + size_hint: usize, + buf: &'a mut Vec, + ) -> Result> { + debug_assert!(size_hint >= SIZE_HINT_UNIT); + read_into_at( + &self.file, + buf, + size_hint, + size_of::() as u64 + offset, + ) + .map_err(Error::IOError)?; + let parsed = Self::parse_data_entry(buf, offset)?; + buf.truncate(parsed.data_len); + Ok(KVRef { + buf, + vlen: parsed.vlen, + header_len: 8, + ns: parsed.ns, + entry_type: EntryType::Insert, + }) + } + + pub(crate) fn read_kv(&self, offset: u64, size_hint: usize) -> Result { + debug_assert!(size_hint >= SIZE_HINT_UNIT); + let mut buf = read_available_at( + &self.file, + size_hint, + size_of::() as u64 + offset, + ) + .map_err(Error::IOError)?; + let parsed = Self::parse_data_entry(&buf, offset)?; + buf.truncate(parsed.data_len); + Ok(KVBuf { + buf, + vlen: parsed.vlen, + header_len: 8, + ns: parsed.ns, + entry_type: EntryType::Insert, + }) + } + + fn ensure_verified_entry( + &self, + read_buf: &mut Vec, + buf_file_offset: &mut u64, + rel: usize, + entry_len: usize, + offset: u64, + ) -> Result> { + let start = if read_buf.len() - rel >= entry_len { + rel + } else { + read_into_at( + &self.file, + read_buf, + entry_len, + size_of::() as u64 + offset, + ) + .map_err(Error::IOError)?; + *buf_file_offset = offset; + if read_buf.len() < entry_len { + return Ok(None); + } + 0 + }; + + let entry_bytes = &read_buf[start..start + entry_len]; + let checksum = 
+ u16::from_le_bytes(entry_bytes[entry_len - 2..entry_len].try_into().unwrap()); + if checksum != crc16_ibm3740_fast::hash(&entry_bytes[..entry_len - 2]) as u16 { + return Ok(None); + } + + Ok(Some(start)) + } + + pub(crate) fn read_next_entry_ref<'a>( + &self, + mut offset: u64, + read_buf: &'a mut Vec, + buf_file_offset: &mut u64, + ) -> Result, u64, u64)>> { + let used_bytes = self.used_bytes(); + if offset >= used_bytes { + return Ok(None); + } + offset = offset.next_multiple_of(FILE_OFFSET_ALIGNMENT); + + loop { + if offset >= used_bytes { + return Ok(None); + } + let buf_start = if offset >= *buf_file_offset { + (offset - *buf_file_offset) as usize + } else { + read_buf.clear(); + 0 + }; + + if buf_start >= read_buf.len() || read_buf.len() - buf_start < 8 { + read_into_at( + &self.file, + read_buf, + READ_BUFFER_SIZE, + size_of::() as u64 + offset, + ) + .map_err(Error::IOError)?; + *buf_file_offset = offset; + if read_buf.len() < 8 { + return Ok(None); + } + } + + let rel = (offset - *buf_file_offset) as usize; + let avail = &read_buf[rel..]; + + let header = u32::from_le_bytes(avail[0..4].try_into().unwrap()); + let magic_offset = (((offset / FILE_OFFSET_ALIGNMENT) as u32) + ^ DATA_ENTRY_OFFSET_MAGIC) + & DATA_ENTRY_OFFSET_MASK; + if header & DATA_ENTRY_OFFSET_MASK != magic_offset { + offset += FILE_OFFSET_ALIGNMENT; + continue; + } + + let ns = ((header >> 24) & ((1 << KEY_NAMESPACE_BITS) - 1)) as u8; + let entry_type = (header >> 30) & 0b11; + + match entry_type { + x if x == EntryType::Insert as u32 || x == EntryType::Update as u32 => { + let resolved_type = if x == EntryType::Insert as u32 { + EntryType::Insert + } else { + EntryType::Update + }; + let klen = u16::from_le_bytes(avail[4..6].try_into().unwrap()); + let vlen = u16::from_le_bytes(avail[6..8].try_into().unwrap()); + let entry_len = 4 + 4 + klen as usize + vlen as usize + 2; + + let Some(start) = self.ensure_verified_entry( + read_buf, + buf_file_offset, + rel, + entry_len, + offset, + )? 
+ else { + offset += FILE_OFFSET_ALIGNMENT; + continue; + }; + let buf = &read_buf[start..start + 8 + vlen as usize + klen as usize]; + + return Ok(Some(( + KVRef { + buf, + vlen, + header_len: 8, + ns, + entry_type: resolved_type, + }, + offset, + offset + entry_len as u64, + ))); + } + x if x == EntryType::Tombstone as u32 => { + let klen = u16::from_le_bytes(avail[4..6].try_into().unwrap()); + let entry_len = 4 + 2 + klen as usize + 2; + + let Some(start) = self.ensure_verified_entry( + read_buf, + buf_file_offset, + rel, + entry_len, + offset, + )? + else { + offset += FILE_OFFSET_ALIGNMENT; + continue; + }; + let buf = &read_buf[start..start + 6 + klen as usize]; + + return Ok(Some(( + KVRef { + buf, + vlen: 0, + header_len: 6, + ns, + entry_type: EntryType::Tombstone, + }, + offset, + offset + entry_len as u64, + ))); + } + _ => { + return Err(invalid_data_error("unknown data entry type")); + } + } + } + } +} diff --git a/src/hashing.rs b/src/hashing.rs deleted file mode 100644 index 0ed1505..0000000 --- a/src/hashing.rs +++ /dev/null @@ -1,100 +0,0 @@ -use siphasher::sip128::{Hash128, SipHasher24}; - -use crate::shard::NUM_ROWS; - -use bytemuck::{Pod, Zeroable}; - -pub type HashSeed = [u8; 16]; - -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Pod, Zeroable, Hash)] -#[repr(transparent)] -pub(crate) struct PartedHash(u64); - -// impl std::fmt::Display for PartedHash { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!( -// f, -// "{:04x}.{:04x}.{:08x}", -// self.shard_selector(), -// self.row_selector(), -// self.signature() -// ) -// } -// } - -pub(crate) const INVALID_SIG: u32 = 0; - -#[cfg(feature = "whitebox_testing")] -pub static mut HASH_BITS_TO_KEEP: u64 = u64::MAX; // which bits to keep from the hash - for testing collisions - -impl PartedHash { - pub fn new(seed: &HashSeed, buf: &[u8]) -> Self { - Self::from_hash(SipHasher24::new_with_key(&seed).hash(buf)) - } - - #[inline] - pub fn is_valid(&self) -> bool { - 
self.signature() != INVALID_SIG - } - - #[inline] - pub fn shard_selector(&self) -> u32 { - ((self.0 >> 48) & 0xffff) as u32 - } - - #[inline] - pub fn row_selector(&self) -> usize { - (((self.0 >> 32) as u16) as usize) % NUM_ROWS - } - - #[inline] - pub fn signature(&self) -> u32 { - self.0 as u32 - } - - #[allow(dead_code)] - pub fn as_u64(&self) -> u64 { - self.0 - } - - fn from_hash(h: Hash128) -> Self { - let mut sig = h.h1 as u32; - if sig == INVALID_SIG { - sig = h.h2 as u32; - if sig == INVALID_SIG { - sig = (h.h2 >> 32) as u32; - if sig == INVALID_SIG { - sig = 0x6052_c9b7; // this is so unlikely that it doesn't really matter - } - } - } - let shard = h.h1 & 0xffff_0000_0000_0000; - let row = h.h1 & 0x0000_ffff_0000_0000; - let val = shard | row | (sig as u64); - - #[cfg(feature = "whitebox_testing")] - let val = (val & unsafe { HASH_BITS_TO_KEEP }) | 1 /* make sure sig != 0 */; - - Self(val) - } -} - -#[test] -fn test_parted_hash() -> crate::Result<()> { - use bytemuck::{bytes_of, from_bytes}; - - let h1 = PartedHash::new(b"aaaabbbbccccdddd", b"hello world"); - assert_eq!(h1.0, 13445180190757400308,); - let h2 = PartedHash(13445180190757400308); - assert_eq!(PartedHash::new(b"aaaabbbbccccdddd", b"hello world"), h2); - - let h3 = PartedHash(0x1020304050607080); - assert_eq!( - bytes_of(&h3), - [0x80, 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10] - ); - let h4: PartedHash = *from_bytes(&[0x80, 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10]); - assert_eq!(h4, h3); - - Ok(()) -} diff --git a/src/index_file.rs b/src/index_file.rs new file mode 100644 index 0000000..47535bd --- /dev/null +++ b/src/index_file.rs @@ -0,0 +1,971 @@ +use memmap2::MmapMut; +use parking_lot::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use simd_itertools::PositionSimd; +use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout, TryFromBytes}; + +use std::{ + fs::File, + hash::Hasher, + mem::{offset_of, size_of}, + ops::{Deref, DerefMut}, + path::Path, + sync::{ + Arc, + atomic::{AtomicU32, 
AtomicU64, Ordering}, + }, + time::{Duration, Instant}, +}; + +use crate::internal::{ + FILE_OFFSET_ALIGNMENT, HashCoord, INDEX_FILE_SIGNATURE, INDEX_FILE_VERSION, MAX_DATA_FILES, + MIN_INITIAL_ROWS, MIN_SPLIT_LEVEL, PAGE_SIZE, ROW_WIDTH, SIZE_HINT_UNIT, index_file_path, + index_rows_file_path, invalid_data_error, read_available_at, unexpected_eof_error, +}; +use crate::types::{Config, Error, Result}; + +const CHECKPOINT_SLOT_COUNT: usize = 2; + +#[derive(Clone, Copy)] +struct CheckpointCursor { + generation: u64, + file_ordinal: u64, + offset: u64, +} + +#[derive(FromBytes, IntoBytes, KnownLayout)] +#[repr(C)] +pub(crate) struct CheckpointSlot { + generation: AtomicU64, + file_ordinal: AtomicU64, + offset: AtomicU64, + checksum: AtomicU64, +} + +const _: () = assert!(size_of::() == 32); + +fn checkpoint_slot_checksum(generation: u64, file_ordinal: u64, offset: u64) -> u64 { + let mut hasher = siphasher::sip::SipHasher13::new(); + hasher.write_u64(generation); + hasher.write_u64(file_ordinal); + hasher.write_u64(offset); + hasher.finish() +} + +#[derive(FromBytes, IntoBytes, KnownLayout)] +#[repr(C)] +pub(crate) struct IndexFileHeader { + pub(crate) signature: [u8; 8], + pub(crate) version: u32, + _padding16: u32, + pub(crate) hash_key_0: u64, + pub(crate) hash_key_1: u64, + _padding64: [u8; 64 - 4 * 8], + + /////////////////////////////////// + // runtime state + /////////////////////////////////// + pub(crate) global_split_level: AtomicU64, + _padding128: [u8; 64 - 8], + + /////////////////////////////////// + // rebuild state + /////////////////////////////////// + /// Persisted replay cursor, double-buffered so recovery can pick the + /// newest valid slot after crashes or torn writes. 
+ pub(crate) checkpoint_slots: [CheckpointSlot; CHECKPOINT_SLOT_COUNT], + _padding1024: [u8; 896 - CHECKPOINT_SLOT_COUNT * 32], + + /////////////////////////////////// + // stats + /////////////////////////////////// + pub(crate) committed_num_entries: AtomicU64, + + _trailer: [u8; PAGE_SIZE - 1024 - 8], +} + +const _: () = assert!(offset_of!(IndexFileHeader, global_split_level) == 64); +const _: () = assert!(offset_of!(IndexFileHeader, checkpoint_slots) == 128); +const _: () = assert!(offset_of!(IndexFileHeader, committed_num_entries) == 1024); +const _: () = assert!(size_of::() == PAGE_SIZE); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, FromBytes, IntoBytes, KnownLayout, Immutable)] +#[repr(transparent)] +pub(crate) struct EntryPointer(pub(crate) u64); + +impl EntryPointer { + pub(crate) const INVALID_POINTER: Self = Self(0); + + pub(crate) fn new( + file_idx: u16, + file_offset: u64, + size: usize, + masked_row_selector: u32, + ) -> Self { + debug_assert!(size > 0 && size <= u8::MAX as usize * SIZE_HINT_UNIT); + + let fi = (file_idx as u64) & ((1 << 12) - 1); + let fo = ((file_offset / FILE_OFFSET_ALIGNMENT) & ((1 << 26) - 1)) << 12; + let sh = (size.div_ceil(SIZE_HINT_UNIT) as u64) << (12 + 26); + let rs = (masked_row_selector as u64) << (12 + 26 + 8); + Self(fi | fo | sh | rs) + } + + pub(crate) fn file_idx(&self) -> u16 { + (self.0 & ((1 << 12) - 1)) as u16 + } + + pub(crate) fn file_offset(&self) -> u64 { + ((self.0 >> 12) & ((1 << 26) - 1)) * FILE_OFFSET_ALIGNMENT + } + + pub(crate) fn size_hint(&self) -> usize { + ((self.0 >> (12 + 26)) & ((1 << 8) - 1)) as usize * SIZE_HINT_UNIT + } + + pub(crate) fn masked_row_selector(&self) -> u32 { + (self.0 >> (12 + 26 + 8)) as u32 + } + + pub(crate) fn is_valid(&self) -> bool { + self.0 != Self::INVALID_POINTER.0 + } +} + +#[derive(FromBytes, IntoBytes, KnownLayout)] +#[repr(C)] +pub(crate) struct RowLayout { + pub(crate) split_level: AtomicU64, + _padding: [u8; 56], + pub(crate) signatures: [u32; ROW_WIDTH], + 
pub(crate) pointers: [EntryPointer; ROW_WIDTH], +} + +const _: () = assert!(size_of::() == PAGE_SIZE); +const _: () = assert!(offset_of!(RowLayout, signatures) % 8 == 0); +const _: () = assert!(offset_of!(RowLayout, pointers) % 8 == 0); + +impl RowLayout { + pub(crate) fn iter_matches(&self, hash_coord: HashCoord) -> RowMatchIterator<'_> { + RowMatchIterator { + row: self, + hash_coord, + offset: 0, + } + } + + pub(crate) fn find_free_slot(&self) -> Option { + self.signatures + .iter() + .position_simd(|&sig| sig == HashCoord::INVALID_SIG) + } + + pub(crate) fn insert(&mut self, idx: usize, sig: u32, ptr: EntryPointer) { + debug_assert!(self.signatures[idx] == HashCoord::INVALID_SIG); + self.signatures[idx] = sig; + crate::crash_point("insert_after_sig"); + self.pointers[idx] = ptr; + } + + pub(crate) fn remove(&mut self, idx: usize) { + self.signatures[idx] = HashCoord::INVALID_SIG; + self.pointers[idx] = EntryPointer::INVALID_POINTER; + } + + pub(crate) fn replace_pointer(&mut self, idx: usize, new_ptr: EntryPointer) { + self.pointers[idx] = new_ptr; + } + + pub(crate) fn set_split_level(&mut self, new_sl: u64) { + self.split_level.store(new_sl, Ordering::Release); + } +} + +pub(crate) struct RowMatchIterator<'a> { + row: &'a RowLayout, + hash_coord: HashCoord, + offset: usize, +} + +impl Iterator for RowMatchIterator<'_> { + type Item = (usize, EntryPointer); + + fn next(&mut self) -> Option { + while self.offset < ROW_WIDTH { + if let Some(idx) = self.row.signatures[self.offset..] 
+ .iter() + .position_simd(|&sig| sig == self.hash_coord.sig) + { + let real_idx = self.offset + idx; + self.offset = real_idx + 1; + let ptr = self.row.pointers[real_idx]; + if ptr.is_valid() + && ptr.masked_row_selector() == self.hash_coord.masked_row_selector() + { + return Some((real_idx, ptr)); + } + } else { + self.offset = ROW_WIDTH; + } + } + None + } +} + +#[derive(FromBytes, IntoBytes, KnownLayout)] +#[repr(C)] +pub(crate) struct IndexFileLayout { + pub(crate) header: IndexFileHeader, + // note: we don't keep committed and uncommitted waste_levels for space efficiency and because + // they only need be approximate + pub(crate) waste_levels: [AtomicU32; MAX_DATA_FILES as usize], +} + +const _: () = assert!(size_of::() == PAGE_SIZE * 5); + +fn row_count_for_len(len: usize) -> usize { + len / size_of::() +} + +fn row_offset(idx: usize) -> usize { + idx * size_of::() +} + +fn row_bytes(bytes: &[u8], idx: usize) -> &[u8] { + let start = row_offset(idx); + let end = start + size_of::(); + &bytes[start..end] +} + +fn row_bytes_mut(bytes: &mut [u8], idx: usize) -> &mut [u8] { + let start = row_offset(idx); + let end = start + size_of::(); + &mut bytes[start..end] +} + +fn row_ref_bytes(bytes: &[u8], idx: usize) -> &RowLayout { + unsafe { &*(row_bytes(bytes, idx).as_ptr() as *const RowLayout) } +} + +fn row_mut_bytes(bytes: &mut [u8], idx: usize) -> &mut RowLayout { + RowLayout::try_mut_from_bytes(row_bytes_mut(bytes, idx)) + .expect("row bytes should contain an aligned row") +} + +unsafe fn row_mut_ptr(base_ptr: *const u8, idx: usize) -> *mut RowLayout { + unsafe { base_ptr.add(row_offset(idx)) as *mut RowLayout } +} + +pub(crate) struct RowsTableReadGuard<'a> { + index_file: &'a IndexFile, + pub(crate) row_guard: RwLockReadGuard<'a, MmapMut>, +} + +impl<'a> RowsTableReadGuard<'a> { + pub(crate) fn row(&self, idx: usize) -> RowReadGuard<'_> { + let row_guard = self.index_file.row_locks[idx & self.index_file.row_locks_mask].read(); + let row_count = 
row_count_for_len(self.row_guard.len()); + assert!( + idx < row_count, + "row index out of bounds: {idx} >= {row_count}" + ); + let row = row_ref_bytes(&self.row_guard[..], idx); + RowReadGuard { + _row_guard: row_guard, + row, + } + } + + pub(crate) fn shard_id(&self, idx: usize) -> usize { + idx & self.index_file.row_locks_mask + } + + pub(crate) fn lock_shard(&self, shard_id: usize) -> RwLockWriteGuard<'_, ()> { + self.index_file.row_locks[shard_id].write() + } + + pub(crate) fn row_mut(&self, idx: usize) -> RowWriteGuard<'_> { + let shard_idx = idx & self.index_file.row_locks_mask; + let row_guard = self.index_file.row_locks[shard_idx].write(); + let row_count = row_count_for_len(self.row_guard.len()); + assert!( + idx < row_count, + "row index out of bounds: {idx} >= {row_count}" + ); + let row = unsafe { &mut *row_mut_ptr(self.row_guard.as_ptr(), idx) }; + RowWriteGuard { + _row_guard: row_guard, + row, + shard_idx, + } + } + + pub(crate) unsafe fn unlocked_row_ptr(&self, idx: usize) -> *mut RowLayout { + let row_count = row_count_for_len(self.row_guard.len()); + assert!( + idx < row_count, + "row index out of bounds: {idx} >= {row_count}" + ); + unsafe { row_mut_ptr(self.row_guard.as_ptr(), idx) } + } +} + +pub(crate) struct RowsTableWriteGuard<'a> { + pub(crate) row_guard: RwLockWriteGuard<'a, MmapMut>, +} + +impl RowsTableWriteGuard<'_> { + fn row_mut(&mut self, idx: usize) -> &mut RowLayout { + let row_count = row_count_for_len(self.row_guard.len()); + assert!( + idx < row_count, + "row index out of bounds: {idx} >= {row_count}" + ); + row_mut_bytes(&mut self.row_guard[..], idx) + } +} + +pub(crate) struct RowReadGuard<'a> { + _row_guard: RwLockReadGuard<'a, ()>, + row: &'a RowLayout, +} + +impl Deref for RowReadGuard<'_> { + type Target = RowLayout; + + fn deref(&self) -> &Self::Target { + self.row + } +} + +pub(crate) struct RowWriteGuard<'a> { + _row_guard: RwLockWriteGuard<'a, ()>, + row: &'a mut RowLayout, + pub(crate) shard_idx: usize, +} + +impl 
Deref for RowWriteGuard<'_> { + type Target = RowLayout; + + fn deref(&self) -> &Self::Target { + self.row + } +} + +impl DerefMut for RowWriteGuard<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.row + } +} + +pub(crate) struct IndexFile { + /// Kept on Windows so `sync_all` can call `FlushFileBuffers`. + /// On Linux the fd is closed after mmap; `msync` suffices for durability. + #[cfg(windows)] + header_file: File, + rows_file: File, + /// Fixed mapping covering the header + waste-level pages. Never remapped, + /// so `header_ref()` / `layout_prefix_ref()` are always stable without a lock. + header_mmap: MmapMut, + /// Growable mapping covering only the row pages. Remapped on grow/shrink/reset. + rows_mmap: RwLock, + row_locks: Vec>, + row_locks_mask: usize, + config: Arc, + /// Cached checkpoint state so concurrent readers (e.g. compaction candidate + /// selection) always see a consistent snapshot without going through the + /// double-buffer slot protocol. + cached_checkpoint_generation: AtomicU64, + cached_checkpoint_ordinal: AtomicU64, + cached_checkpoint_offset: AtomicU64, +} + +impl IndexFile { + fn read_checkpoint_slot(slot: &CheckpointSlot) -> Option { + let generation = slot.generation.load(Ordering::Acquire); + if generation == 0 { + return None; + } + + let file_ordinal = slot.file_ordinal.load(Ordering::Relaxed); + let offset = slot.offset.load(Ordering::Relaxed); + let checksum = slot.checksum.load(Ordering::Acquire); + if checksum != checkpoint_slot_checksum(generation, file_ordinal, offset) { + return None; + } + + Some(CheckpointCursor { + generation, + file_ordinal, + offset, + }) + } + + fn durable_checkpoint(&self) -> Option { + self.header_ref() + .checkpoint_slots + .iter() + .filter_map(Self::read_checkpoint_slot) + .max_by_key(|cursor| cursor.generation) + } + + pub(crate) fn checkpoint_cursor(&self) -> (u64, u64) { + let generation = self.cached_checkpoint_generation.load(Ordering::Acquire); + if generation == 0 { + return 
(0, 0); + } + let ordinal = self.cached_checkpoint_ordinal.load(Ordering::Relaxed); + let offset = self.cached_checkpoint_offset.load(Ordering::Relaxed); + (ordinal, offset) + } + + pub(crate) fn checkpoint_generation(&self) -> u64 { + self.cached_checkpoint_generation.load(Ordering::Acquire) + } + + pub(crate) fn persist_checkpoint_cursor(&self, ordinal: u64, offset: u64) { + let current_gen = self.cached_checkpoint_generation.load(Ordering::Relaxed); + let next_generation = current_gen + .checked_add(1) + .expect("checkpoint generation overflow"); + let slot = + &self.header_ref().checkpoint_slots[next_generation as usize % CHECKPOINT_SLOT_COUNT]; + + slot.checksum.store(0, Ordering::Release); + slot.generation.store(next_generation, Ordering::Relaxed); + slot.file_ordinal.store(ordinal, Ordering::Relaxed); + slot.offset.store(offset, Ordering::Relaxed); + slot.checksum.store( + checkpoint_slot_checksum(next_generation, ordinal, offset), + Ordering::Release, + ); + + // Update the cache so concurrent readers see the new values immediately. 
+ self.cached_checkpoint_ordinal + .store(ordinal, Ordering::Relaxed); + self.cached_checkpoint_offset + .store(offset, Ordering::Relaxed); + self.cached_checkpoint_generation + .store(next_generation, Ordering::Release); + } + + #[cfg(target_os = "linux")] + fn maybe_lock_mmap(config: &Config, mmap: &MmapMut) { + if config.mlock_index { + let _ = mmap.lock(); + } + } + + #[cfg(not(target_os = "linux"))] + fn maybe_lock_mmap(_config: &Config, _mmap: &MmapMut) {} + + fn read_existing_header(header_file: &File, header_len: usize) -> Result<((u64, u64), u64)> { + if header_len < size_of::() { + return Err(unexpected_eof_error("index file header too short")); + } + if header_len != size_of::() { + return Err(invalid_data_error("index header file has unexpected size")); + } + + let header = read_available_at(header_file, size_of::(), 0) + .map_err(Error::IOError)?; + if header.len() < size_of::() { + return Err(unexpected_eof_error("index file header too short")); + } + let header = IndexFileHeader::read_from_bytes(&header) + .map_err(|_| invalid_data_error("invalid index file header size"))?; + if &header.signature != INDEX_FILE_SIGNATURE || header.version != INDEX_FILE_VERSION { + return Err(invalid_data_error("invalid index file header")); + } + + Ok(( + (header.hash_key_0, header.hash_key_1), + header.global_split_level.load(Ordering::Relaxed), + )) + } + + pub(crate) fn existing_hash_key(base_path: &Path) -> Result> { + let header_path = index_file_path(base_path); + let header_file = match File::options().read(true).open(header_path) { + Ok(file) => file, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(err) => return Err(Error::IOError(err)), + }; + let header_len = header_file.metadata().map_err(Error::IOError)?.len() as usize; + if header_len == 0 { + return Ok(None); + } + + let (hash_key, _) = Self::read_existing_header(&header_file, header_len)?; + Ok(Some(hash_key)) + } + + fn validate_existing( + header_file: &File, + 
header_len: usize, + rows_len: usize, + hash_key: (u64, u64), + ) -> Result<()> { + if !rows_len.is_multiple_of(PAGE_SIZE) { + return Err(invalid_data_error( + "index rows file size is not page aligned", + )); + } + + let (stored_hash_key, gsl) = Self::read_existing_header(header_file, header_len)?; + if stored_hash_key != hash_key { + return Err(invalid_data_error("index hash key mismatch")); + } + + let row_count = row_count_for_len(rows_len); + if row_count < MIN_INITIAL_ROWS || !row_count.is_power_of_two() { + return Err(invalid_data_error("invalid index file row count")); + } + + if gsl < MIN_SPLIT_LEVEL as u64 { + return Err(invalid_data_error("invalid index global split level")); + } + + let uncommitted_rows = 1usize + .checked_shl(gsl as u32) + .ok_or_else(|| invalid_data_error("index global split level overflow"))?; + if uncommitted_rows > row_count { + return Err(invalid_data_error( + "index global split level exceeds file size", + )); + } + + Ok(()) + } + + pub(crate) fn flush_header(&self) -> Result<()> { + self.header_mmap.flush().map_err(Error::IOError) + } + + pub(crate) fn open(base_path: &Path, config: Arc) -> Result { + let hash_key = config.hash_key; + let num_rows = (config.initial_capacity / ROW_WIDTH) + .max(MIN_INITIAL_ROWS) + .next_power_of_two(); + let num_locks = config.max_concurrency.min(num_rows).next_power_of_two(); + let row_locks = (0..num_locks).map(|_| RwLock::new(())).collect::>(); + let row_locks_mask = num_locks - 1; + + let header_path = index_file_path(base_path); + let rows_path = index_rows_file_path(base_path); + + let header_file = File::options() + .create(true) + .truncate(false) + .read(true) + .write(true) + .open(header_path) + .map_err(Error::IOError)?; + let rows_file = File::options() + .create(true) + .truncate(false) + .read(true) + .write(true) + .open(rows_path) + .map_err(Error::IOError)?; + + let header_size = size_of::(); + let header_len = header_file.metadata().map_err(Error::IOError)?.len() as usize; + 
let rows_len = rows_file.metadata().map_err(Error::IOError)?.len() as usize; + let new_file = header_len == 0 && rows_len == 0; + let rows_size = num_rows * size_of::(); + + if new_file { + header_file + .set_len(header_size as u64) + .map_err(Error::IOError)?; + rows_file + .set_len(rows_size as u64) + .map_err(Error::IOError)?; + } else { + Self::validate_existing(&header_file, header_len, rows_len, config.hash_key)?; + } + + let actual_rows_size = if new_file { rows_size } else { rows_len }; + + let header_mmap = unsafe { + memmap2::MmapOptions::new() + .len(header_size) + .map_mut(&header_file) + } + .map_err(Error::IOError)?; + Self::maybe_lock_mmap(config.as_ref(), &header_mmap); + + let rows_mmap = unsafe { + memmap2::MmapOptions::new() + .len(actual_rows_size) + .map_mut(&rows_file) + } + .map_err(Error::IOError)?; + Self::maybe_lock_mmap(config.as_ref(), &rows_mmap); + + if new_file { + header_file.sync_all().map_err(Error::IOError)?; + } + + let inst = Self { + #[cfg(windows)] + header_file, + rows_file, + header_mmap, + rows_mmap: RwLock::new(rows_mmap), + row_locks, + row_locks_mask, + config, + cached_checkpoint_generation: AtomicU64::new(0), + cached_checkpoint_ordinal: AtomicU64::new(0), + cached_checkpoint_offset: AtomicU64::new(0), + }; + + if new_file { + let rows_table = inst.rows_table_mut(); + inst.init_header_and_rows(rows_table, hash_key)?; + } else if let Some(cursor) = inst.durable_checkpoint() { + inst.cached_checkpoint_generation + .store(cursor.generation, Ordering::Relaxed); + inst.cached_checkpoint_ordinal + .store(cursor.file_ordinal, Ordering::Relaxed); + inst.cached_checkpoint_offset + .store(cursor.offset, Ordering::Relaxed); + } + + Ok(inst) + } + + pub(crate) fn sync_all(&self) -> Result<()> { + // Persist row updates before any header state that claims those rows are durable. 
+ self.rows_mmap.write().flush().map_err(Error::IOError)?; + self.rows_file.sync_all().map_err(Error::IOError)?; + self.header_mmap.flush().map_err(Error::IOError)?; + #[cfg(windows)] + self.header_file.sync_all().map_err(Error::IOError)?; + Ok(()) + } + + pub(crate) fn file_size_bytes(&self) -> u64 { + let header = size_of::() as u64; + let rows = self.rows_file.metadata().map(|m| m.len()).unwrap_or(0); + header + rows + } + + pub(crate) fn rows_table(&self) -> RowsTableReadGuard<'_> { + RowsTableReadGuard { + index_file: self, + row_guard: self.rows_mmap.read(), + } + } + + pub(crate) fn rows_table_mut(&self) -> RowsTableWriteGuard<'_> { + RowsTableWriteGuard { + row_guard: self.rows_mmap.write(), + } + } + + /// Returns a direct reference to the header without acquiring any lock. + /// + /// Safe because the header mmap is never remapped and the header fields + /// used for stats are all atomics. + fn full_header_ref(&self) -> &IndexFileLayout { + unsafe { &*(self.header_mmap.as_ptr() as *const IndexFileLayout) } + } + + pub(crate) fn header_ref(&self) -> &IndexFileHeader { + &self.full_header_ref().header + } + + pub(crate) fn add_file_waste(&self, file_idx: u16, waste: u32) -> u32 { + self.full_header_ref().waste_levels[file_idx as usize].fetch_add(waste, Ordering::Relaxed) + + waste + } + + pub(crate) fn file_waste(&self, file_idx: u16) -> u32 { + self.full_header_ref().waste_levels[file_idx as usize].load(Ordering::Relaxed) + } + + /// Returns the combined waste across all file slots. 
+ pub(crate) fn total_waste(&self) -> u64 { + let ref_full = self.full_header_ref(); + let mut total = 0u64; + for waste in ref_full.waste_levels.iter() { + total += waste.load(Ordering::Relaxed) as u64; + } + total + } + + /// Takes the combined waste and resets it + pub(crate) fn take_file_waste(&self, file_idx: u16) -> u32 { + self.full_header_ref().waste_levels[file_idx as usize].swap(0, Ordering::Relaxed) + } + + pub(crate) fn grow(&self, nsl: u64) -> Result> { + let mut layout_mut = self.rows_table_mut(); + let gsl = self.header_ref().global_split_level.load(Ordering::Acquire); + if nsl <= gsl { + return Ok(None); + } + + let mut remap_dur = None; + let required_rows_size = (1usize << nsl) * size_of::(); + if layout_mut.row_guard.len() < required_rows_size { + let remap_start = Instant::now(); + let alloc_split = nsl + self.config.remap_scaler as u64; + let new_rows_size = (1usize << alloc_split) * size_of::(); + + self.rows_file + .set_len(new_rows_size as u64) + .map_err(Error::IOError)?; + + #[cfg(target_os = "linux")] + unsafe { + layout_mut + .row_guard + .remap(new_rows_size, memmap2::RemapOptions::new().may_move(true)) + } + .map_err(Error::IOError)?; + + #[cfg(not(target_os = "linux"))] + { + *layout_mut.row_guard = unsafe { + memmap2::MmapOptions::new() + .len(new_rows_size) + .map_mut(&self.rows_file) + } + .map_err(Error::IOError)?; + } + + Self::maybe_lock_mmap(self.config.as_ref(), &layout_mut.row_guard); + remap_dur = Some(remap_start.elapsed()); + } + + self.header_ref() + .global_split_level + .store(nsl, Ordering::Release); + Ok(remap_dur) + } + + pub(crate) fn num_rows(&self) -> usize { + let gsl = self.header_ref().global_split_level.load(Ordering::Acquire) as usize; + 1usize << gsl + } + + pub(crate) fn num_shards(&self) -> usize { + self.row_locks.len() + } + + pub(crate) fn shrink_with_rows_guard( + &self, + min_rows: usize, + mut row_table: RowsTableWriteGuard<'_>, + ) -> Result { + loop { + let global_split_level = 
self.header_ref().global_split_level.load(Ordering::Acquire); + let current_rows = 1usize << global_split_level; + if current_rows <= min_rows { + break; + } + + let next_level = global_split_level - 1; + let half_count = 1usize << next_level; + + let mut can_merge = true; + for idx in 0..half_count { + let row1 = row_ref_bytes(&row_table.row_guard[..], idx); + let row1_split = row1.split_level.load(Ordering::Acquire); + if row1_split != global_split_level { + continue; + } + + let row2 = row_ref_bytes(&row_table.row_guard[..], idx + half_count); + let count1 = row1 + .signatures + .iter() + .filter(|&&sig| sig != HashCoord::INVALID_SIG) + .count(); + let count2 = row2 + .signatures + .iter() + .filter(|&&sig| sig != HashCoord::INVALID_SIG) + .count(); + if count1 + count2 > ROW_WIDTH { + can_merge = false; + break; + } + } + + if !can_merge { + break; + } + + for idx in 0..half_count { + let row1 = unsafe { &mut *row_mut_ptr(row_table.row_guard.as_ptr(), idx) }; + let row2 = + unsafe { &mut *row_mut_ptr(row_table.row_guard.as_ptr(), idx + half_count) }; + + if row1.split_level.load(Ordering::Acquire) != global_split_level { + continue; + } + + let mut dest_idx = 0usize; + for src_idx in 0..ROW_WIDTH { + if row2.signatures[src_idx] == HashCoord::INVALID_SIG { + continue; + } + + while dest_idx < ROW_WIDTH + && row1.signatures[dest_idx] != HashCoord::INVALID_SIG + { + dest_idx += 1; + } + + if dest_idx >= ROW_WIDTH { + break; + } + + row1.insert(dest_idx, row2.signatures[src_idx], row2.pointers[src_idx]); + row2.remove(src_idx); + } + + row2.set_split_level(0); + row1.set_split_level(next_level); + } + + self.header_ref() + .global_split_level + .store(next_level, Ordering::Release); + } + + let final_level = self.header_ref().global_split_level.load(Ordering::Acquire); + let new_rows_size = (1usize << final_level) * size_of::(); + + if new_rows_size < row_table.row_guard.len() { + #[cfg(target_os = "linux")] + { + unsafe { + row_table + .row_guard + 
.remap(new_rows_size, memmap2::RemapOptions::new().may_move(true)) + } + .map_err(Error::IOError)?; + self.rows_file + .set_len(new_rows_size as u64) + .map_err(Error::IOError)?; + } + + #[cfg(not(target_os = "linux"))] + { + row_table.row_guard.flush().map_err(Error::IOError)?; + + #[cfg(windows)] + { + // On Windows we must unmap before truncating. + *row_table.row_guard = memmap2::MmapOptions::new() + .len(1) + .map_anon() + .map_err(Error::IOError)?; + } + + self.rows_file + .set_len(new_rows_size as u64) + .map_err(Error::IOError)?; + *row_table.row_guard = unsafe { + memmap2::MmapOptions::new() + .len(new_rows_size) + .map_mut(&self.rows_file) + } + .map_err(Error::IOError)?; + } + + Self::maybe_lock_mmap(self.config.as_ref(), &row_table.row_guard); + } + + Ok(1usize << final_level) + } + + fn init_header_and_rows( + &self, + mut rows_table: RowsTableWriteGuard, + hash_key: (u64, u64), + ) -> Result<()> { + // Zero both mmaps first, then populate. + rows_table.row_guard.fill(0); + // Safety: header_mmap is a contiguous MmapMut that we own; no other &mut exists yet. + unsafe { + std::ptr::write_bytes( + self.header_mmap.as_ptr() as *mut u8, + 0, + self.header_mmap.len(), + ); + } + + // Now create the mutable reference after zeroing is complete. + // Safety: only called during init (open) or reset, both single-threaded + // w.r.t. this store instance. 
+ let layout = unsafe { &mut *(self.header_mmap.as_ptr() as *mut IndexFileLayout) }; + + layout.header.signature = *INDEX_FILE_SIGNATURE; + layout.header.version = INDEX_FILE_VERSION; + layout.header.hash_key_0 = hash_key.0; + layout.header.hash_key_1 = hash_key.1; + layout + .header + .global_split_level + .store(MIN_SPLIT_LEVEL as u64, Ordering::Release); + + for row_idx in 0..MIN_INITIAL_ROWS { + rows_table + .row_mut(row_idx) + .set_split_level(MIN_SPLIT_LEVEL as u64); + } + + self.flush_header()?; + rows_table.row_guard.flush().map_err(Error::IOError)?; + self.rows_file.sync_all().map_err(Error::IOError)?; + Ok(()) + } + + pub(crate) fn reset(&self, mut row_table: RowsTableWriteGuard<'_>) -> Result<()> { + let min_rows_size = MIN_INITIAL_ROWS * size_of::(); + + #[cfg(target_os = "linux")] + unsafe { + self.rows_file + .set_len(min_rows_size as u64) + .map_err(Error::IOError)?; + row_table + .row_guard + .remap(min_rows_size, memmap2::RemapOptions::new().may_move(true)) + } + .map_err(Error::IOError)?; + + #[cfg(not(target_os = "linux"))] + { + row_table.row_guard.flush().map_err(Error::IOError)?; + + #[cfg(windows)] + { + *row_table.row_guard = memmap2::MmapOptions::new() + .len(1) + .map_anon() + .map_err(Error::IOError)?; + } + + self.rows_file + .set_len(min_rows_size as u64) + .map_err(Error::IOError)?; + *row_table.row_guard = unsafe { + memmap2::MmapOptions::new() + .len(min_rows_size) + .map_mut(&self.rows_file) + } + .map_err(Error::IOError)?; + } + + Self::maybe_lock_mmap(self.config.as_ref(), &row_table.row_guard); + + self.init_header_and_rows(row_table, self.config.hash_key) + } +} diff --git a/src/internal.rs b/src/internal.rs new file mode 100644 index 0000000..0972d2c --- /dev/null +++ b/src/internal.rs @@ -0,0 +1,405 @@ +use siphasher::sip128::{Hasher128, SipHasher13}; + +use std::{ + fs::File, + hash::Hasher, + path::{Path, PathBuf}, +}; + +use crate::types::{Error, Result}; + +pub(crate) const PAGE_SIZE: usize = 4096; +pub(crate) const 
ROW_WIDTH: usize = 16 * 21; +pub(crate) const MIN_SPLIT_LEVEL: usize = 3; +pub(crate) const MASKED_ROW_SELECTOR_BITS: u32 = 18; +pub(crate) const MIN_INITIAL_ROWS: usize = 1 << MIN_SPLIT_LEVEL; +pub(crate) const MAX_REPRESENTABLE_FILE_SIZE: u32 = + ((1u32 << 26) - 1) * FILE_OFFSET_ALIGNMENT as u32; +pub(crate) const ENTRY_TYPE_SHIFT: u32 = 14; +pub(crate) const MAX_INTERNAL_KEY_SIZE: usize = (1 << ENTRY_TYPE_SHIFT) - 1; +pub(crate) const MAX_INTERNAL_VALUE_SIZE: usize = (1 << 16) - 1; +pub(crate) const MAX_DATA_FILES: u16 = 1 << 12; +pub(crate) const MAX_DATA_FILE_IDX: u16 = MAX_DATA_FILES - 1; + +pub(crate) const INDEX_FILE_SIGNATURE: &[u8; 8] = b"CandyStr"; +pub(crate) const INDEX_FILE_VERSION: u32 = 0x0002_0009; +pub(crate) const DATA_FILE_SIGNATURE: &[u8; 8] = b"CandyDat"; +pub(crate) const DATA_FILE_VERSION: u32 = 0x0002_0002; +pub(crate) const FILE_OFFSET_ALIGNMENT: u64 = 16; +pub(crate) const SIZE_HINT_UNIT: usize = 512; +pub(crate) const DATA_ENTRY_OFFSET_MAGIC: u32 = 0x91c8_d7cd; +pub(crate) const DATA_ENTRY_OFFSET_MASK: u32 = (1 << 24) - 1; +pub(crate) const KEY_NAMESPACE_BITS: u8 = 6; +pub(crate) const MAX_KEY_NAMESPACE: u8 = (1 << KEY_NAMESPACE_BITS) - 1; +pub(crate) const READ_BUFFER_SIZE: usize = 128 * 1024; + +pub(crate) fn aligned_data_entry_waste(klen: usize, vlen: usize) -> u32 { + (10 + klen as u32 + vlen as u32).next_multiple_of(FILE_OFFSET_ALIGNMENT as u32) +} + +pub(crate) fn aligned_tombstone_entry_waste(klen: usize) -> u32 { + (8 + klen as u32).next_multiple_of(FILE_OFFSET_ALIGNMENT as u32) +} + +pub(crate) fn aligned_data_entry_size(klen: usize, vlen: usize) -> u64 { + (10 + klen as u64 + vlen as u64).next_multiple_of(FILE_OFFSET_ALIGNMENT) +} + +pub(crate) fn index_file_path(base_path: &Path) -> PathBuf { + base_path.join("index") +} + +pub(crate) fn index_rows_file_path(base_path: &Path) -> PathBuf { + base_path.join("rows") +} + +pub(crate) fn data_file_path(base_path: &Path, file_idx: u16) -> PathBuf { + 
base_path.join(format!("data_{file_idx:04}")) +} + +#[cfg(unix)] +pub(crate) fn sync_dir(path: &Path) -> Result<()> { + File::open(path) + .map_err(Error::IOError)? + .sync_all() + .map_err(Error::IOError) +} + +#[cfg(not(unix))] +pub(crate) fn sync_dir(_path: &Path) -> Result<()> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub(crate) fn sync_file_range(file: &File, offset: u64, len: u64) -> Result<()> { + use std::os::fd::AsRawFd; + + if len == 0 { + return Ok(()); + } + + let sync_offset = i64::try_from(offset) + .map_err(|_| Error::IOError(std::io::Error::other("sync offset overflow")))?; + let sync_len = i64::try_from(len) + .map_err(|_| Error::IOError(std::io::Error::other("sync length overflow")))?; + + let rc = unsafe { + libc::sync_file_range( + file.as_raw_fd(), + sync_offset, + sync_len, + libc::SYNC_FILE_RANGE_WAIT_BEFORE + | libc::SYNC_FILE_RANGE_WRITE + | libc::SYNC_FILE_RANGE_WAIT_AFTER, + ) + }; + if rc == 0 { + return Ok(()); + } + + let err = std::io::Error::last_os_error(); + match err.raw_os_error() { + Some(libc::EINVAL | libc::ENOSYS | libc::EOPNOTSUPP) => { + file.sync_data().map_err(Error::IOError) + } + _ => Err(Error::IOError(err)), + } +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn sync_file_range(file: &File, _offset: u64, len: u64) -> Result<()> { + if len == 0 { + return Ok(()); + } + file.sync_data().map_err(Error::IOError) +} + +pub(crate) fn parse_data_file_idx(path: &Path) -> Option { + let name = path.file_name()?.to_str()?; + let suffix = name.strip_prefix("data_")?; + if suffix.len() != 4 { + return None; + } + suffix.parse().ok() +} + +#[derive(Debug, Clone, Copy)] +pub(crate) struct RangeMetadata { + pub(crate) head: u64, + pub(crate) tail: u64, + pub(crate) count: u64, +} + +impl RangeMetadata { + pub(crate) fn new() -> Self { + Self { + head: 1u64 << 63, + tail: (1u64 << 63) - 1, + count: 0, + } + } + + pub(crate) fn to_bytes(self) -> [u8; 24] { + let mut buf = [0u8; 24]; + 
buf[0..8].copy_from_slice(&self.head.to_le_bytes()); + buf[8..16].copy_from_slice(&self.tail.to_le_bytes()); + buf[16..24].copy_from_slice(&self.count.to_le_bytes()); + buf + } + + pub(crate) fn from_bytes(bytes: &[u8]) -> Option { + if bytes.len() != 24 { + return None; + } + Some(Self { + head: u64::from_le_bytes(bytes[0..8].try_into().ok()?), + tail: u64::from_le_bytes(bytes[8..16].try_into().ok()?), + count: u64::from_le_bytes(bytes[16..24].try_into().ok()?), + }) + } +} + +#[repr(u16)] +pub(crate) enum EntryType { + Insert = 0, + Update = 1, + _Unused2 = 2, + Tombstone = 3, +} + +pub(crate) fn invalid_data_error(message: &'static str) -> Error { + Error::IOError(std::io::Error::new( + std::io::ErrorKind::InvalidData, + message, + )) +} + +pub(crate) fn unexpected_eof_error(message: &'static str) -> Error { + Error::IOError(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + message, + )) +} + +pub(crate) fn is_resettable_open_error(err: &Error) -> bool { + matches!( + err, + Error::IOError(io_err) + if matches!( + io_err.kind(), + std::io::ErrorKind::InvalidData | std::io::ErrorKind::UnexpectedEof + ) + ) +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[repr(u8)] +pub(crate) enum KeyNamespace { + User = 0, + QueueMeta = 1, + QueueData = 2, + BigMeta = 3, + BigData = 4, + ListMeta = 5, + ListIndex = 6, + ListData = 7, + Typed = 8, + TypedQueueMeta = 9, + TypedQueueData = 10, + TypedBigMeta = 11, + TypedBigData = 12, + TypedListMeta = 13, + TypedListIndex = 14, + TypedListData = 15, +} + +impl KeyNamespace { + pub(crate) fn from_u8(ns: u8) -> Option { + match ns { + x if x == Self::User as u8 => Some(Self::User), + x if x == Self::QueueMeta as u8 => Some(Self::QueueMeta), + x if x == Self::QueueData as u8 => Some(Self::QueueData), + x if x == Self::BigMeta as u8 => Some(Self::BigMeta), + x if x == Self::BigData as u8 => Some(Self::BigData), + x if x == Self::ListMeta as u8 => Some(Self::ListMeta), + x if x == Self::ListIndex as u8 => 
Some(Self::ListIndex), + x if x == Self::ListData as u8 => Some(Self::ListData), + x if x == Self::Typed as u8 => Some(Self::Typed), + x if x == Self::TypedQueueMeta as u8 => Some(Self::TypedQueueMeta), + x if x == Self::TypedQueueData as u8 => Some(Self::TypedQueueData), + x if x == Self::TypedBigMeta as u8 => Some(Self::TypedBigMeta), + x if x == Self::TypedBigData as u8 => Some(Self::TypedBigData), + x if x == Self::TypedListMeta as u8 => Some(Self::TypedListMeta), + x if x == Self::TypedListIndex as u8 => Some(Self::TypedListIndex), + x if x == Self::TypedListData as u8 => Some(Self::TypedListData), + _ => None, + } + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) struct HashCoord { + pub(crate) sig: u32, + pub(crate) row_selector: u32, +} + +impl HashCoord { + pub(crate) const INVALID_SIG: u32 = 0; + + pub(crate) fn new(ns: KeyNamespace, key: &[u8], hash_key: (u64, u64)) -> Self { + let mut hasher = SipHasher13::new_with_keys(hash_key.0, hash_key.1); + hasher.write_u8(ns as u8); + hasher.write(key); + let h = hasher.finish128(); + let row_selector = h.h1 as u32; + let mut sig = (h.h1 >> 32) as u32; + if sig == Self::INVALID_SIG { + sig = h.h2 as u32; + if sig == Self::INVALID_SIG { + sig = (h.h2 >> 32) as u32; + if sig == Self::INVALID_SIG { + sig = 0x6419_9a93; + } + } + } + + Self { sig, row_selector } + } + + pub(crate) fn masked_row_selector(&self) -> u32 { + (self.row_selector >> MIN_SPLIT_LEVEL) & ((1 << MASKED_ROW_SELECTOR_BITS) - 1) + } + + pub(crate) fn row_index(&self, split_level: u64) -> usize { + debug_assert!(split_level >= MIN_SPLIT_LEVEL as u64, "sl={split_level}"); + ((self.row_selector as u64) & ((1 << split_level) - 1)) as usize + } +} + +pub(crate) struct KVBuf { + pub(crate) buf: Vec, + pub(crate) vlen: u16, + pub(crate) header_len: u16, + #[allow(dead_code)] + pub(crate) ns: u8, + #[allow(dead_code)] + pub(crate) entry_type: EntryType, +} + +impl KVBuf { + pub(crate) fn value(&self) -> &[u8] { + let start = 
self.header_len as usize; + &self.buf[start..start + self.vlen as usize] + } + + pub(crate) fn key(&self) -> &[u8] { + &self.buf[self.header_len as usize + self.vlen as usize..] + } + + pub(crate) fn into_value(mut self) -> Vec { + let start = self.header_len as usize; + let vlen = self.vlen as usize; + if start > 0 { + self.buf.copy_within(start..start + vlen, 0); + } + self.buf.truncate(vlen); + self.buf + } +} + +pub(crate) struct KVRef<'a> { + pub(crate) buf: &'a [u8], + pub(crate) vlen: u16, + pub(crate) header_len: u16, + pub(crate) ns: u8, + pub(crate) entry_type: EntryType, +} + +impl KVRef<'_> { + pub(crate) fn value(&self) -> &[u8] { + let start = self.header_len as usize; + &self.buf[start..start + self.vlen as usize] + } + + pub(crate) fn key(&self) -> &[u8] { + &self.buf[self.header_len as usize + self.vlen as usize..] + } +} + +#[cfg(unix)] +pub(crate) fn read_into_at( + f: &File, + buf: &mut Vec, + count: usize, + file_offset: u64, +) -> std::io::Result<()> { + buf.resize(count, 0); + let mut offset = 0; + while offset < count { + let n = std::os::unix::fs::FileExt::read_at( + f, + &mut buf[offset..], + file_offset + offset as u64, + )?; + if n == 0 { + break; + } else { + offset += n; + } + } + buf.truncate(offset); + Ok(()) +} + +#[cfg(windows)] +pub(crate) fn read_into_at( + f: &File, + buf: &mut Vec, + count: usize, + file_offset: u64, +) -> std::io::Result<()> { + buf.resize(count, 0); + let mut offset = 0; + while offset < count { + let n = std::os::windows::fs::FileExt::seek_read( + f, + &mut buf[offset..], + file_offset + offset as u64, + )?; + if n == 0 { + break; + } else { + offset += n; + } + } + buf.truncate(offset); + Ok(()) +} + +pub(crate) fn read_available_at( + f: &File, + count: usize, + file_offset: u64, +) -> std::io::Result> { + let mut buf = Vec::new(); + read_into_at(f, &mut buf, count, file_offset)?; + Ok(buf) +} + +#[cfg(unix)] +pub(crate) fn write_all_at(f: &File, buf: &[u8], offset: u64) -> std::io::Result<()> { + 
std::os::unix::fs::FileExt::write_all_at(f, buf, offset) +} + +#[cfg(windows)] +pub(crate) fn write_all_at(f: &File, mut buf: &[u8], mut offset: u64) -> std::io::Result<()> { + while !buf.is_empty() { + let written = std::os::windows::fs::FileExt::seek_write(f, buf, offset)?; + if written == 0 { + return Err(std::io::Error::from(std::io::ErrorKind::UnexpectedEof)); + } + buf = &buf[written..]; + offset += written as u64; + } + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index a8ee29e..38468ba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,142 +1,50 @@ -//! A fast (*blazingly*, of course), persistent, in-process key-value store that relies on a novel sharding -//! algorithm. Since Candy does not rely on log-structured merge (LSM) trees or B-Trees, no journal/WAL is needed -//! and IOs go directly to file. -//! -//! The algorithm can be thought of as a "zero-overhead" extension to a hash table that's stored over files, -//! as it's designed to minimizes disk IO operations. Most operations add an overhead of 1-2 microseconds -//! to the disk IO latency, and operations generally require 1-4 disk IOs. -//! -//! The algorithm, for the most part, is crash-safe. That is, you can crash at any point and still be in a consistent -//! state. You might lose the ongoing operation, but we consider this acceptable. -//! -//! Candy is designed to consume very little memory: entries are written directly to the shard-file, and only a -//! table of ~380KB is kept `mmap`-ed (it is also file-backed, so can be evicted if needed). A shard-file can -//! hold around 30K entries, and more shard-files are created as needed. -//! -//! A unique feature of Candy is the support of *lists*, which allow creating cheap collections. -//! -//! Note: the file format is not yet stable! -//! -//! Example: -//! ``` -//! use candystore::{CandyStore, Config, Result}; -//! -//! fn main() -> Result<()> { -//! let db = CandyStore::open("/tmp/candy-dir", Config::default())?; -//! 
db.set("hello", "world")?; -//! assert_eq!(db.get("hello")?, Some("world".into())); -//! db.remove("hello")?; -//! assert_eq!(db.get("hello")?, None); -//! -//! // lists -//! db.set_in_list("italian", "bye", "arrivederci")?; -//! db.set_in_list("italian", "thanks", "grazie")?; -//! assert_eq!(db.get_from_list("italian", "bye")?, Some("arrivederci".into())); -//! -//! db.set_in_list("spanish", "bye", "adios")?; -//! db.set_in_list("spanish", "thanks", "gracias")?; -//! -//! let items = db.iter_list("spanish").map(|res| res.unwrap()).collect::>(); -//! assert_eq!(items, vec![("bye".into(), "adios".into()), ("thanks".into(), "gracias".into())]); -//! -//! Ok(()) -//! } -//! ``` - -mod hashing; -mod lists; -mod queues; -mod router; -mod shard; -mod stats; +mod data_file; +mod index_file; +mod internal; +mod pacer; mod store; -mod typed; - -pub use hashing::HashSeed; -pub use lists::{ListCompactionParams, ListIterator}; -pub use stats::Stats; -pub use store::{CandyStore, GetOrCreateStatus, ReplaceStatus, SetStatus}; -pub use typed::{CandyTypedDeque, CandyTypedKey, CandyTypedList, CandyTypedStore}; - -use std::fmt::{Display, Formatter}; - -#[cfg(feature = "whitebox_testing")] -pub use hashing::HASH_BITS_TO_KEEP; - -#[derive(Debug, PartialEq, Eq, Clone)] -pub enum CandyError { - KeyTooLong(usize), - ValueTooLong(usize), - EntryCannotFitInShard(usize, usize), -} - -impl Display for CandyError { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - match self { - Self::KeyTooLong(sz) => write!(f, "key too long {sz}"), - Self::ValueTooLong(sz) => write!(f, "value too long {sz}"), - Self::EntryCannotFitInShard(sz, max) => { - write!(f, "entry too big ({sz}) for a single shard file ({max})") - } - } +mod types; + +/// Named crash point for whitebox testing. 
+/// +/// When the `whitebox-testing` feature is enabled and the environment variable +/// `CANDYSTORE_CRASH_POINT` matches `name`, the process aborts after the number +/// of hits specified by `CANDYSTORE_CRASH_AFTER` (default 0 = immediate). +#[cfg(feature = "whitebox-testing")] +pub(crate) fn crash_point(name: &str) { + use std::sync::atomic::{AtomicU64, Ordering}; + static COUNTER: AtomicU64 = AtomicU64::new(0); + + let Ok(target) = std::env::var("CANDYSTORE_CRASH_POINT") else { + return; + }; + if target != name { + return; } -} - -impl std::error::Error for CandyError {} - -pub type Result = anyhow::Result; - -/// The configuration options for CandyStore. Comes with sane defaults, feel free to use them -#[derive(Debug, Clone)] -pub struct Config { - /// we don't want huge shards, because splitting would be expensive - pub max_shard_size: u32, - /// should be ~10% of max_shard_size - pub min_compaction_threashold: u32, - /// just some entropy, not so important unless you fear DoS - pub hash_seed: HashSeed, - /// hint for creating number of shards accordingly) - pub expected_number_of_keys: usize, - /// number of keyed locks for concurrent list ops - pub max_concurrent_list_ops: u32, - /// whether or not to truncate up shard files to their max size (spare files) - pub truncate_up: bool, - /// whether or not to clear the DB if the version is unsupported - pub clear_on_unsupported_version: bool, - /// whether or not to mlock the shard headers to RAM (POSIX only) - pub mlock_headers: bool, - /// number of background compaction threads - pub num_compaction_threads: usize, - /// optionally delay modifying operations before for the given duration before flushing data to disk, - /// to ensure reboot consistency - #[cfg(feature = "flush_aggregation")] - pub flush_aggregation_delay: Option, -} - -impl Default for Config { - fn default() -> Self { - Self { - max_shard_size: 64 * 1024 * 1024, - min_compaction_threashold: 8 * 1024 * 1024, - hash_seed: *b"kOYLu0xvq2WtzcKJ", 
- expected_number_of_keys: 0, - max_concurrent_list_ops: 64, - truncate_up: true, - clear_on_unsupported_version: false, - mlock_headers: false, - num_compaction_threads: 4, - #[cfg(feature = "flush_aggregation")] - flush_aggregation_delay: None, - } + let after: u64 = std::env::var("CANDYSTORE_CRASH_AFTER") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(0); + if COUNTER.fetch_add(1, Ordering::Relaxed) >= after { + std::process::abort(); } } -pub(crate) const MAX_TOTAL_KEY_SIZE: usize = 0x3fff; // 14 bits -pub(crate) const MAX_TOTAL_VALUE_SIZE: usize = 0xffff; // 16 bits -pub(crate) const NAMESPACING_RESERVED_SIZE: usize = 0xff; -pub(crate) const VALUE_RESERVED_SIZE: usize = 0xff; -pub const MAX_KEY_SIZE: usize = MAX_TOTAL_KEY_SIZE - NAMESPACING_RESERVED_SIZE; -pub const MAX_VALUE_SIZE: usize = MAX_TOTAL_VALUE_SIZE - VALUE_RESERVED_SIZE; - -const _: () = assert!(MAX_KEY_SIZE <= u16::MAX as usize); -const _: () = assert!(MAX_VALUE_SIZE <= u16::MAX as usize); +#[cfg(not(feature = "whitebox-testing"))] +#[inline(always)] +pub(crate) fn crash_point(_name: &str) {} + +/// The main untyped store API. +pub use crate::store::{ + CandyStore, CandyTypedDeque, CandyTypedKey, CandyTypedList, CandyTypedStore, KVPair, + ListIterator, +}; +/// Public configuration, error, and stats types. +pub use crate::types::*; + +/// Backward-compatible alias for the crate error type. +pub type CandyError = Error; +/// Maximum supported user key length in bytes. +pub const MAX_KEY_LEN: usize = MAX_USER_KEY_SIZE; +/// Maximum supported inline value length in bytes. 
+pub const MAX_VALUE_LEN: usize = MAX_USER_VALUE_SIZE; diff --git a/src/lists.rs b/src/lists.rs deleted file mode 100644 index 2e8c83e..0000000 --- a/src/lists.rs +++ /dev/null @@ -1,863 +0,0 @@ -use std::ops::Range; - -use crate::{ - hashing::PartedHash, - shard::{InsertMode, KVPair}, - store::{CHAIN_NAMESPACE, ITEM_NAMESPACE, LIST_NAMESPACE}, - CandyStore, GetOrCreateStatus, ReplaceStatus, Result, SetStatus, -}; - -use bytemuck::{bytes_of, from_bytes, Pod, Zeroable}; -use parking_lot::MutexGuard; - -#[derive(Clone, Copy, Pod, Zeroable)] -#[repr(C)] -struct List { - head_idx: u64, // inclusive - tail_idx: u64, // exclusive - num_items: u64, -} - -impl std::fmt::Debug for List { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "List(0x{:016x}..0x{:016x} items={})", - self.head_idx, self.tail_idx, self.num_items - ) - } -} - -impl List { - fn span_len(&self) -> u64 { - self.tail_idx - self.head_idx - } - fn holes(&self) -> u64 { - self.span_len() - self.num_items - } - fn is_empty(&self) -> bool { - self.head_idx == self.tail_idx - } -} - -#[derive(Debug, Clone, Copy, Pod, Zeroable)] -#[repr(C, packed)] -struct ChainKey { - list_ph: PartedHash, - idx: u64, - namespace: u8, -} - -#[derive(Debug)] -pub struct ListCompactionParams { - pub min_length: u64, - pub min_holes_ratio: f64, -} - -impl Default for ListCompactionParams { - fn default() -> Self { - Self { - min_length: 100, - min_holes_ratio: 0.25, - } - } -} - -pub struct ListIterator<'a> { - store: &'a CandyStore, - list_key: Vec, - list_ph: PartedHash, - range: Option>, - fwd: bool, -} - -impl<'a> Iterator for ListIterator<'a> { - type Item = Result; - - fn next(&mut self) -> Option { - if self.range.is_none() { - let _guard = self.store.lock_list(self.list_ph); - let list_bytes = match self.store.get_raw(&self.list_key) { - Ok(Some(list_bytes)) => list_bytes, - Ok(None) => return None, - Err(e) => return Some(Err(e)), - }; - let list = *from_bytes::(&list_bytes); - 
self.range = Some(list.head_idx..list.tail_idx); - } - - loop { - let idx = if self.fwd { - self.range.as_mut().unwrap().next() - } else { - self.range.as_mut().unwrap().next_back() - }; - let Some(idx) = idx else { - return None; - }; - - match self.store.get_from_list_at_index(self.list_ph, idx, true) { - Err(e) => return Some(Err(e)), - Ok(Some((_, k, v))) => return Some(Ok((k, v))), - Ok(None) => { - // try next index - } - } - } - } - - fn size_hint(&self) -> (usize, Option) { - if let Some(ref range) = self.range { - range.size_hint() - } else { - (0, None) - } - } -} - -#[derive(Debug)] -enum InsertToListStatus { - Created(Vec), - DoesNotExist, - WrongValue(Vec), - ExistingValue(Vec), - Replaced(Vec), -} - -impl CandyStore { - const FIRST_LIST_IDX: u64 = 0x8000_0000_0000_0000; - - fn make_list_key(&self, mut list_key: Vec) -> (PartedHash, Vec) { - list_key.extend_from_slice(LIST_NAMESPACE); - (PartedHash::new(&self.config.hash_seed, &list_key), list_key) - } - - fn make_item_key(&self, list_ph: PartedHash, mut item_key: Vec) -> (PartedHash, Vec) { - item_key.extend_from_slice(bytes_of(&list_ph)); - item_key.extend_from_slice(ITEM_NAMESPACE); - (PartedHash::new(&self.config.hash_seed, &item_key), item_key) - } - - pub(crate) fn lock_list(&self, list_ph: PartedHash) -> MutexGuard<'_, ()> { - self.keyed_locks[(list_ph.signature() & self.keyed_locks_mask) as usize].lock() - } - - fn _insert_to_list( - &self, - list_key: Vec, - item_key: Vec, - mut val: Vec, - mode: InsertMode, - ) -> Result { - let (list_ph, list_key) = self.make_list_key(list_key); - let (item_ph, item_key) = self.make_item_key(list_ph, item_key); - - let _guard = self.lock_list(list_ph); - - // if the item already exists, it's already part of the list. just update it and preserve the index - if let Some(mut existing_val) = self.get_raw(&item_key)? 
{ - match mode { - InsertMode::GetOrCreate => { - existing_val.truncate(existing_val.len() - size_of::()); - return Ok(InsertToListStatus::ExistingValue(existing_val)); - } - InsertMode::Replace(expected_val) => { - if let Some(expected_val) = expected_val { - if expected_val != &existing_val[existing_val.len() - size_of::()..] { - existing_val.truncate(existing_val.len() - size_of::()); - return Ok(InsertToListStatus::WrongValue(existing_val)); - } - } - // fall through - } - InsertMode::Set => { - // fall through - } - } - - val.extend_from_slice(&existing_val[existing_val.len() - size_of::()..]); - self.replace_raw(&item_key, &val, None)?; - existing_val.truncate(existing_val.len() - size_of::()); - return Ok(InsertToListStatus::Replaced(existing_val)); - } - - if matches!(mode, InsertMode::Replace(_)) { - // not allowed to create - return Ok(InsertToListStatus::DoesNotExist); - } - - // get of create the list - let res = self.get_or_create_raw( - &list_key, - bytes_of(&List { - head_idx: Self::FIRST_LIST_IDX, - tail_idx: Self::FIRST_LIST_IDX + 1, - num_items: 1, - }) - .to_owned(), - )?; - - match res { - crate::GetOrCreateStatus::CreatedNew(_) => { - // list was just created. 
create chain - self.set_raw( - bytes_of(&ChainKey { - list_ph, - idx: Self::FIRST_LIST_IDX, - namespace: CHAIN_NAMESPACE, - }), - bytes_of(&item_ph), - )?; - - // create item - val.extend_from_slice(bytes_of(&Self::FIRST_LIST_IDX)); - self.set_raw(&item_key, &val)?; - } - crate::GetOrCreateStatus::ExistingValue(list_bytes) => { - let mut list = *from_bytes::(&list_bytes); - - let idx = list.tail_idx; - list.tail_idx += 1; - - // update list - list.num_items += 1; - self.set_raw(&list_key, bytes_of(&list))?; - - // create chain - self.set_raw( - bytes_of(&ChainKey { - list_ph, - idx, - namespace: CHAIN_NAMESPACE, - }), - bytes_of(&item_ph), - )?; - - // create item - val.extend_from_slice(bytes_of(&idx)); - self.set_raw(&item_key, &val)?; - } - } - - val.truncate(val.len() - size_of::()); - Ok(InsertToListStatus::Created(val)) - } - - /// Inserts or updates an element `item_key` that belongs to list `list_key`. Returns [SetStatus::CreatedNew] if - /// the item did not exist, or [SetStatus::PrevValue] with the previous value of the item. - /// - /// See also [Self::set]. - pub fn set_in_list< - B1: AsRef<[u8]> + ?Sized, - B2: AsRef<[u8]> + ?Sized, - B3: AsRef<[u8]> + ?Sized, - >( - &self, - list_key: &B1, - item_key: &B2, - val: &B3, - ) -> Result { - self.owned_set_in_list( - list_key.as_ref().to_owned(), - item_key.as_ref().to_owned(), - val.as_ref().to_owned(), - false, - ) - } - - /// Like [Self::set_in_list] but "promotes" the element to the tail of the list: it's basically a - /// remove + insert operation. This can be usede to implement LRUs, where older elements are at the - /// beginning and newer ones at the end. 
- /// - /// Note: **not crash-safe** - pub fn set_in_list_promoting< - B1: AsRef<[u8]> + ?Sized, - B2: AsRef<[u8]> + ?Sized, - B3: AsRef<[u8]> + ?Sized, - >( - &self, - list_key: &B1, - item_key: &B2, - val: &B3, - ) -> Result { - self.owned_set_in_list( - list_key.as_ref().to_owned(), - item_key.as_ref().to_owned(), - val.as_ref().to_owned(), - true, - ) - } - - /// Owned version of [Self::set_in_list], which also takes promote as a parameter - pub fn owned_set_in_list( - &self, - list_key: Vec, - item_key: Vec, - val: Vec, - promote: bool, - ) -> Result { - if promote { - self.owned_remove_from_list(list_key.clone(), item_key.clone())?; - } - match self._insert_to_list(list_key, item_key, val, InsertMode::Set)? { - InsertToListStatus::Created(_v) => Ok(SetStatus::CreatedNew), - InsertToListStatus::Replaced(v) => Ok(SetStatus::PrevValue(v)), - _ => unreachable!(), - } - } - - /// Like [Self::set_in_list], but will only replace (update) an existing item, i.e., it will never create the - /// key - pub fn replace_in_list< - B1: AsRef<[u8]> + ?Sized, - B2: AsRef<[u8]> + ?Sized, - B3: AsRef<[u8]> + ?Sized, - >( - &self, - list_key: &B1, - item_key: &B2, - val: &B3, - expected_val: Option<&B3>, - ) -> Result { - self.owned_replace_in_list( - list_key.as_ref().to_owned(), - item_key.as_ref().to_owned(), - val.as_ref().to_owned(), - expected_val.map(|ev| ev.as_ref()), - ) - } - - /// Owned version of [Self::replace_in_list] - pub fn owned_replace_in_list( - &self, - list_key: Vec, - item_key: Vec, - val: Vec, - expected_val: Option<&[u8]>, - ) -> Result { - match self._insert_to_list(list_key, item_key, val, InsertMode::Replace(expected_val))? 
{ - InsertToListStatus::DoesNotExist => Ok(ReplaceStatus::DoesNotExist), - InsertToListStatus::Replaced(v) => Ok(ReplaceStatus::PrevValue(v)), - InsertToListStatus::WrongValue(v) => Ok(ReplaceStatus::WrongValue(v)), - _ => unreachable!(), - } - } - - /// Like [Self::set_in_list] but will not replace (update) the element if it already exists - it will only - /// create the element with the default value if it did not exist. - pub fn get_or_create_in_list< - B1: AsRef<[u8]> + ?Sized, - B2: AsRef<[u8]> + ?Sized, - B3: AsRef<[u8]> + ?Sized, - >( - &self, - list_key: &B1, - item_key: &B2, - default_val: &B3, - ) -> Result { - self.owned_get_or_create_in_list( - list_key.as_ref().to_owned(), - item_key.as_ref().to_owned(), - default_val.as_ref().to_owned(), - ) - } - - /// Owned version of [Self::get_or_create_in_list] - pub fn owned_get_or_create_in_list( - &self, - list_key: Vec, - item_key: Vec, - default_val: Vec, - ) -> Result { - match self._insert_to_list(list_key, item_key, default_val, InsertMode::GetOrCreate)? { - InsertToListStatus::ExistingValue(v) => Ok(GetOrCreateStatus::ExistingValue(v)), - InsertToListStatus::Created(v) => Ok(GetOrCreateStatus::CreatedNew(v)), - _ => unreachable!(), - } - } - - /// Gets a list element identified by `list_key` and `item_key`. This is an O(1) operation. - /// - /// See also: [Self::get] - pub fn get_from_list + ?Sized, B2: AsRef<[u8]> + ?Sized>( - &self, - list_key: &B1, - item_key: &B2, - ) -> Result>> { - self.owned_get_from_list(list_key.as_ref().to_owned(), item_key.as_ref().to_owned()) - } - - /// Owned version of [Self::get_from_list] - pub fn owned_get_from_list( - &self, - list_key: Vec, - item_key: Vec, - ) -> Result>> { - let (list_ph, _) = self.make_list_key(list_key); - let (_, item_key) = self.make_item_key(list_ph, item_key); - let Some(mut val) = self.get_raw(&item_key)? 
else { - return Ok(None); - }; - val.truncate(val.len() - size_of::()); - Ok(Some(val)) - } - - /// Removes a element from the list, identified by `list_key` and `item_key. The element can be - /// at any position in the list, not just the head or the tail, but in this case, it will create a "hole". - /// This means that iterations will go over the missing element's index every time, until the list is compacted. - /// - /// See also [Self::remove], [Self::compact_list_if_needed] - pub fn remove_from_list + ?Sized, B2: AsRef<[u8]> + ?Sized>( - &self, - list_key: &B1, - item_key: &B2, - ) -> Result>> { - self.owned_remove_from_list(list_key.as_ref().to_owned(), item_key.as_ref().to_owned()) - } - - /// Owned version of [Self::remove_from_list] - pub fn owned_remove_from_list( - &self, - list_key: Vec, - item_key: Vec, - ) -> Result>> { - let (list_ph, list_key) = self.make_list_key(list_key); - let (_, item_key) = self.make_item_key(list_ph, item_key); - - let _guard = self.lock_list(list_ph); - - let Some(mut existing_val) = self.get_raw(&item_key)? else { - return Ok(None); - }; - - let item_idx = u64::from_le_bytes( - (&existing_val[existing_val.len() - size_of::()..]) - .try_into() - .unwrap(), - ); - existing_val.truncate(existing_val.len() - size_of::()); - - // update list, if the item was the head/tail - if let Some(list_bytes) = self.get_raw(&list_key)? 
{ - let mut list = *from_bytes::(&list_bytes); - - list.num_items -= 1; - - if list.head_idx == item_idx || list.tail_idx == item_idx + 1 { - if list.head_idx == item_idx { - list.head_idx += 1; - } else if list.tail_idx == item_idx + 1 { - list.tail_idx -= 1; - } - } - if list.is_empty() { - self.remove_raw(&list_key)?; - } else { - self.set_raw(&list_key, bytes_of(&list))?; - } - } - - // remove chain - self.remove_raw(bytes_of(&ChainKey { - list_ph, - idx: item_idx, - namespace: CHAIN_NAMESPACE, - }))?; - - // remove item - self.remove_raw(&item_key)?; - - Ok(Some(existing_val)) - } - - const LIST_KEY_SUFFIX_LEN: usize = size_of::() + ITEM_NAMESPACE.len(); - - fn get_from_list_at_index( - &self, - list_ph: PartedHash, - idx: u64, - truncate: bool, - ) -> Result, Vec)>> { - let Some(item_ph_bytes) = self.get_raw(bytes_of(&ChainKey { - idx, - list_ph, - namespace: CHAIN_NAMESPACE, - }))? - else { - return Ok(None); - }; - let item_ph = *from_bytes::(&item_ph_bytes); - - let mut suffix = [0u8; Self::LIST_KEY_SUFFIX_LEN]; - suffix[0..size_of::()].copy_from_slice(bytes_of(&list_ph)); - suffix[size_of::()..].copy_from_slice(ITEM_NAMESPACE); - - for (mut k, mut v) in self.get_by_hash(item_ph)? { - if k.ends_with(&suffix) && v.ends_with(bytes_of(&idx)) { - if truncate { - v.truncate(v.len() - size_of::()); - k.truncate(k.len() - suffix.len()); - } - return Ok(Some((item_ph, k, v))); - } - } - - Ok(None) - } - - /// Compacts (rewrites) the list such that there will be no holes. Holes are created when removing an - /// element from the middle of the list (not the head or tail), which makes iteration less efficient. - /// You should call this function every so often if you're removing elements from lists at random locations. - /// The function takes parameters that control when to compact: the list has to be of a minimal length and - /// have a minimal holes-to-length ratio. The default values are expected to be okay for most use cases. 
- /// Returns true if the list was compacted, false otherwise. - /// - /// Note: **Not crash-safe** - pub fn compact_list_if_needed + ?Sized>( - &self, - list_key: &B, - params: ListCompactionParams, - ) -> Result { - let (list_ph, list_key) = self.make_list_key(list_key.as_ref().to_owned()); - let _guard = self.lock_list(list_ph); - - let Some(list_bytes) = self.get_raw(&list_key)? else { - return Ok(false); - }; - let list = *from_bytes::(&list_bytes); - if list.span_len() < params.min_length { - return Ok(false); - } - if (list.holes() as f64) < (list.span_len() as f64) * params.min_holes_ratio { - return Ok(false); - } - - let mut new_idx = list.tail_idx; - for idx in list.head_idx..list.tail_idx { - let Some((item_ph, full_k, mut full_v)) = - self.get_from_list_at_index(list_ph, idx, false)? - else { - continue; - }; - - // create new chain - self.set_raw( - bytes_of(&ChainKey { - idx: new_idx, - list_ph, - namespace: CHAIN_NAMESPACE, - }), - bytes_of(&item_ph), - )?; - - // update item's index suffix - let offset = full_v.len() - size_of::(); - full_v[offset..].copy_from_slice(bytes_of(&new_idx)); - self.set_raw(&full_k, &full_v)?; - - // remove old chain - self.remove_raw(bytes_of(&ChainKey { - idx, - list_ph, - namespace: CHAIN_NAMESPACE, - }))?; - - new_idx += 1; - } - - if list.tail_idx == new_idx { - // list is now empty - self.remove_raw(&list_key)?; - } else { - // update list head and tail, set holes=0 - self.set_raw( - &list_key, - bytes_of(&List { - head_idx: list.tail_idx, - tail_idx: new_idx, - num_items: new_idx - list.tail_idx, - }), - )?; - } - - Ok(true) - } - - /// Iterates over the elements of the list (identified by `list_key`) from the beginning (head) - /// to the end (tail). Note that if items are removed at random locations in the list, the iterator - /// will need to skip these holes. 
If you remove elements from the middle (not head/tail) of the list - /// frequently, and wish to use iteration, consider compacting the list every so often using - /// [Self::compact_list_if_needed] - pub fn iter_list + ?Sized>(&self, list_key: &B) -> ListIterator<'_> { - self.owned_iter_list(list_key.as_ref().to_owned()) - } - - /// Owned version of [Self::iter_list] - pub fn owned_iter_list(&self, list_key: Vec) -> ListIterator<'_> { - let (list_ph, list_key) = self.make_list_key(list_key); - ListIterator { - store: &self, - list_key, - list_ph, - range: None, - fwd: true, - } - } - - /// Same as [Self::iter_list] but iterates from the end (tail) to the beginning (head) - pub fn iter_list_backwards + ?Sized>(&self, list_key: &B) -> ListIterator<'_> { - self.owned_iter_list_backwards(list_key.as_ref().to_owned()) - } - - /// Owned version of [Self::iter_list_backwards] - pub fn owned_iter_list_backwards(&self, list_key: Vec) -> ListIterator<'_> { - let (list_ph, list_key) = self.make_list_key(list_key); - ListIterator { - store: &self, - list_key, - list_ph, - range: None, - fwd: false, - } - } - - /// Discards the given list, removing all elements it contains and dropping the list itself. - /// This is more efficient than iteration + removal of each element. - pub fn discard_list + ?Sized>(&self, list_key: &B) -> Result { - self.owned_discard_list(list_key.as_ref().to_owned()) - } - - /// Owned version of [Self::discard_list] - pub fn owned_discard_list(&self, list_key: Vec) -> Result { - let (list_ph, list_key) = self.make_list_key(list_key); - let _guard = self.lock_list(list_ph); - - let Some(list_bytes) = self.get_raw(&list_key)? else { - return Ok(false); - }; - let list = *from_bytes::(&list_bytes); - for idx in list.head_idx..list.tail_idx { - let Some((_, full_key, _)) = self.get_from_list_at_index(list_ph, idx, false)? 
else { - continue; - }; - self.remove_raw(bytes_of(&ChainKey { - list_ph, - idx, - namespace: CHAIN_NAMESPACE, - }))?; - self.remove_raw(&full_key)?; - } - self.remove_raw(&list_key)?; - - Ok(true) - } - - /// Returns the first (head) element of the list - pub fn peek_list_head + ?Sized>(&self, list_key: &B) -> Result> { - self.owned_peek_list_head(list_key.as_ref().to_owned()) - } - - /// Owned version of [Self::peek_list_head] - pub fn owned_peek_list_head(&self, list_key: Vec) -> Result> { - let Some(kv) = self.owned_iter_list(list_key).next() else { - return Ok(None); - }; - Ok(Some(kv?)) - } - - /// Returns the last (tail) element of the list - pub fn peek_list_tail + ?Sized>(&self, list_key: &B) -> Result> { - self.owned_peek_list_tail(list_key.as_ref().to_owned()) - } - - /// Owned version of [Self::peek_list_tail] - pub fn owned_peek_list_tail(&self, list_key: Vec) -> Result> { - for kv in self.owned_iter_list_backwards(list_key) { - return Ok(Some(kv?)); - } - Ok(None) - } - - /// Removes and returns the first (head) element of the list - pub fn pop_list_head + ?Sized>(&self, list_key: &B) -> Result> { - self.owned_pop_list_head(list_key.as_ref().to_owned()) - } - - fn _operate_on_list( - &self, - list_key: Vec, - default: T, - func: impl FnOnce(PartedHash, Vec, List) -> Result, - ) -> Result { - let (list_ph, list_key) = self.make_list_key(list_key); - let _guard = self.lock_list(list_ph); - let Some(list_bytes) = self.get_raw(&list_key)? else { - return Ok(default); - }; - let list = *from_bytes::(&list_bytes); - func(list_ph, list_key, list) - } - - fn _owned_pop_list(&self, list_key: Vec, fwd: bool) -> Result> { - self._operate_on_list(list_key, None, |list_ph, list_key, mut list| { - let range = list.head_idx..list.tail_idx; - - let mut pop = |idx| -> Result> { - let Some((_, mut untrunc_k, mut untrunc_v)) = - self.get_from_list_at_index(list_ph, idx, false)? 
- else { - return Ok(None); - }; - - if fwd { - list.head_idx = idx + 1; - } else { - list.tail_idx = idx - 1; - } - list.num_items -= 1; - if list.is_empty() { - self.remove_raw(&list_key)?; - } else { - self.set_raw(&list_key, bytes_of(&list))?; - } - - // remove chain - self.remove_raw(bytes_of(&ChainKey { - list_ph, - idx, - namespace: CHAIN_NAMESPACE, - }))?; - - // remove item - self.remove_raw(&untrunc_k)?; - - untrunc_v.truncate(untrunc_v.len() - size_of::()); - untrunc_k.truncate(untrunc_k.len() - Self::LIST_KEY_SUFFIX_LEN); - Ok(Some((untrunc_k, untrunc_v))) - }; - - if fwd { - for idx in range { - if let Some(kv) = pop(idx)? { - return Ok(Some(kv)); - } - } - } else { - for idx in range.rev() { - if let Some(kv) = pop(idx)? { - return Ok(Some(kv)); - } - } - } - - Ok(None) - }) - } - - /// Owned version of [Self::peek_list_tail] - pub fn owned_pop_list_head(&self, list_key: Vec) -> Result> { - self._owned_pop_list(list_key, true /* fwd */) - } - - /// Removes and returns the last (tail) element of the list - pub fn pop_list_tail + ?Sized>(&self, list_key: &B) -> Result> { - self.owned_pop_list_tail(list_key.as_ref().to_owned()) - } - - /// Owned version of [Self::peek_list_tail] - pub fn owned_pop_list_tail(&self, list_key: Vec) -> Result> { - self._owned_pop_list(list_key, false /* fwd */) - } - - /// Returns the estimated list length - pub fn list_len + ?Sized>(&self, list_key: &B) -> Result { - self.owned_list_len(list_key.as_ref().to_owned()) - } - pub fn owned_list_len(&self, list_key: Vec) -> Result { - let (_, list_key) = self.make_list_key(list_key); - - let Some(list_bytes) = self.get_raw(&list_key)? else { - return Ok(0); - }; - - Ok(from_bytes::(&list_bytes).num_items as usize) - } - - /// iterate over the given list and retain all elements for which the predicate returns `true`. In other - /// words, drop all other elements. 
This operation is not crash safe, and holds the list locked during the - /// whole iteration, so no other gets/sets/deletes can be done in by other threads on this list while - /// iterating over it. Beware of deadlocks. - /// - /// This operation will also compact the list, basically popping all elements and re-pushing the retained - /// ones at the end, so no holes will exist by the end. - pub fn retain_in_list + ?Sized>( - &self, - list_key: &B, - func: impl FnMut(&[u8], &[u8]) -> Result, - ) -> Result<()> { - self.owned_retain_in_list(list_key.as_ref().to_owned(), func) - } - - /// owned version of [Self::retain_in_list] - pub fn owned_retain_in_list( - &self, - list_key: Vec, - mut func: impl FnMut(&[u8], &[u8]) -> Result, - ) -> Result<()> { - self._operate_on_list(list_key, (), |list_ph, list_key, mut list| { - let range = list.head_idx..list.tail_idx; - - for idx in range { - list.head_idx = idx + 1; - let Some((item_ph, untrunc_k, mut untrunc_v)) = - self.get_from_list_at_index(list_ph, idx, false)? - else { - continue; - }; - - untrunc_v.truncate(untrunc_v.len() - size_of::()); - let mut v = untrunc_v; - let k = &untrunc_k[..untrunc_k.len() - Self::LIST_KEY_SUFFIX_LEN]; - - // remove chain - self.remove_raw(bytes_of(&ChainKey { - list_ph, - idx, - namespace: CHAIN_NAMESPACE, - }))?; - - if func(k, &v)? 
{ - let tail_idx = list.tail_idx; - list.tail_idx += 1; - - // create chain - self.set_raw( - bytes_of(&ChainKey { - list_ph, - idx: tail_idx, - namespace: CHAIN_NAMESPACE, - }), - bytes_of(&item_ph), - )?; - - // create new item - v.extend_from_slice(bytes_of(&tail_idx)); - self.set_raw(&untrunc_k, &v)?; - } else { - // drop from list - list.num_items -= 1; - - // remove item - self.remove_raw(&untrunc_k)?; - } - } - // defer updating the list to the very end to save on IOs - if list.is_empty() { - self.remove_raw(&list_key)?; - } else { - self.set_raw(&list_key, bytes_of(&list))?; - } - Ok(()) - }) - } -} diff --git a/src/pacer.rs b/src/pacer.rs new file mode 100644 index 0000000..2fb23b6 --- /dev/null +++ b/src/pacer.rs @@ -0,0 +1,253 @@ +use std::time::{Duration, Instant}; + +/// A token-bucket pacer. +/// +/// The pacer refills `tokens_per_unit` tokens every `time_unit`, up to `max_tokens`. +/// Calls to `consume` spend immediately available tokens and block until enough +/// tokens have accrued to satisfy the request. +pub struct Pacer { + time_unit: Duration, + tokens_per_unit: u64, + max_tokens: u64, // burst capacity + last_refill: Instant, + available_tokens: u64, +} + +impl Pacer { + /// Creates a new pacer. + /// + /// `tokens_per_unit` must be non-zero and `time_unit` must be non-zero. + /// `max_tokens` is promoted to at least `tokens_per_unit`, ensuring the + /// bucket can hold one full refill interval. 
+ pub fn new(tokens_per_unit: u64, time_unit: Duration, max_tokens: u64) -> Self { + assert!(tokens_per_unit > 0 && !time_unit.is_zero()); + let max_tokens = max_tokens.max(tokens_per_unit); + + Pacer { + time_unit, + tokens_per_unit, + max_tokens, + last_refill: Instant::now(), + available_tokens: max_tokens, + } + } + + fn added_tokens( + elapsed_ns: u128, + time_unit_ns: u128, + tokens_per_unit: u64, + capacity: u64, + ) -> u64 { + let produced_tokens = elapsed_ns.saturating_mul(tokens_per_unit as u128) / time_unit_ns; + produced_tokens.min(capacity as u128) as u64 + } + + fn duration_from_nanos_saturating(total_nanos: u128) -> Duration { + let secs = total_nanos / 1_000_000_000; + if secs > u64::MAX as u128 { + return Duration::MAX; + } + + Duration::new(secs as u64, (total_nanos % 1_000_000_000) as u32) + } + + fn refill(&mut self, now: Instant) { + if self.available_tokens == self.max_tokens { + self.last_refill = now; + return; + } + + let elapsed_ns = now.saturating_duration_since(self.last_refill).as_nanos(); + let time_unit_ns = self.time_unit.as_nanos(); + let capacity = self.max_tokens - self.available_tokens; + let added_tokens = + Self::added_tokens(elapsed_ns, time_unit_ns, self.tokens_per_unit, capacity); + if added_tokens == 0 { + return; + } + + self.available_tokens += added_tokens; + + if self.available_tokens == self.max_tokens { + self.last_refill = now; + } else { + // Advance last_refill by exact time accounted for by added_tokens + let time_advanced_ns = + (added_tokens as u128 * time_unit_ns) / self.tokens_per_unit as u128; + self.last_refill += Self::duration_from_nanos_saturating(time_advanced_ns); + } + } + + fn time_until_tokens(&self, now: Instant, tokens_needed: u64) -> Duration { + let elapsed_ns = now.saturating_duration_since(self.last_refill).as_nanos(); + let time_unit_ns = self.time_unit.as_nanos(); + let target_ns = (tokens_needed as u128) + .saturating_mul(time_unit_ns) + .div_ceil(self.tokens_per_unit as u128); + let 
remaining_ns = target_ns.saturating_sub(elapsed_ns); + + Self::duration_from_nanos_saturating(remaining_ns) + } + + /// Consumes `tokens`, sleeping through the provided callback while waiting for refills. + pub fn consume_with_sleep_fn(&mut self, mut tokens: u64, mut sleep: impl FnMut(Duration)) { + while tokens > 0 { + let now = Instant::now(); + self.refill(now); + + if self.available_tokens > 0 { + let consumed = self.available_tokens.min(tokens); + self.available_tokens -= consumed; + tokens -= consumed; + if tokens == 0 { + break; + } + } + + let tokens_to_wait = tokens.min(self.max_tokens); + sleep(self.time_until_tokens(now, tokens_to_wait)); + } + } + + /// Consumes `tokens`, blocking the current thread until enough tokens are available. + pub fn consume(&mut self, tokens: u64) { + self.consume_with_sleep_fn(tokens, std::thread::sleep); + } +} + +#[cfg(test)] +mod tests { + use super::Pacer; + use std::time::{Duration, Instant}; + + #[test] + fn test_consume_zero() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 40); + pacer.consume_with_sleep_fn(0, |_| unreachable!()); + } + + #[test] + fn test_consume_exact_burst() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 40); + pacer.consume_with_sleep_fn(40, |_| unreachable!()); + } + + #[test] + fn test_consume_burst_plus_one_sleeps() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 40); + let mut slept = false; + pacer.consume_with_sleep_fn(41, |d| { + std::thread::sleep(d); + slept = true; + }); + assert!(slept); + } + + #[test] + fn test_tokens_refill_after_idle() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 40); + pacer.consume_with_sleep_fn(40, |_| unreachable!()); + std::thread::sleep(Duration::from_millis(30)); + pacer.consume_with_sleep_fn(20, |_| unreachable!()); + } + + #[test] + fn test_rate_accuracy() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 10); + pacer.consume(10); + let t0 = Instant::now(); + pacer.consume(50); + let d 
= t0.elapsed(); + assert!(d >= Duration::from_millis(40), "Too fast: {d:?}"); + assert!(d < Duration::from_millis(150), "Too slow: {d:?}"); + } + + #[test] + fn test_many_small_consumes() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 10); + pacer.consume(10); + let t0 = Instant::now(); + for _ in 0..30 { + pacer.consume(1); + } + let d = t0.elapsed(); + assert!(d >= Duration::from_millis(20), "Too fast: {d:?}"); + assert!(d < Duration::from_millis(150), "Too slow: {d:?}"); + } + + #[test] + fn test_partial_bucket_refills_before_small_consume() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 40); + + pacer.consume_with_sleep_fn(5, |_| unreachable!()); + std::thread::sleep(Duration::from_millis(10)); + pacer.consume_with_sleep_fn(1, |_| unreachable!()); + + assert_eq!(pacer.available_tokens, 39); + } + + #[test] + fn test_waits_for_fractional_token_interval() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 10); + let mut requested_sleep = None; + + pacer.consume_with_sleep_fn(10, |_| unreachable!()); + pacer.consume_with_sleep_fn(1, |duration| { + requested_sleep = Some(duration); + std::thread::sleep(duration); + }); + + let requested_sleep = requested_sleep.expect("consume should need to sleep"); + assert!( + requested_sleep > Duration::ZERO, + "sleep duration should be positive" + ); + assert!( + requested_sleep < Duration::from_millis(5), + "expected to wait for a fractional token interval, got {requested_sleep:?}" + ); + } + + #[test] + fn test_burst_capacity_promotion() { + let mut pacer = Pacer::new(100, Duration::from_secs(1), 10); + let mut slept = false; + pacer.consume_with_sleep_fn(100, |_| slept = true); + assert!( + !slept, + "Should not sleep if burst capacity was correctly promoted to 100" + ); + } + + #[test] + fn test_large_consumes_are_batched() { + let mut pacer = Pacer::new(10, Duration::from_millis(10), 20); + let mut sleep_count = 0; + + pacer.consume_with_sleep_fn(20, |_| unreachable!()); + 
pacer.consume_with_sleep_fn(50, |duration| { + sleep_count += 1; + std::thread::sleep(duration); + }); + + assert!( + sleep_count <= 4, + "Should sleep in large batches (<= 4 sleeps), but slept {} times", + sleep_count + ); + } + + #[test] + fn test_added_tokens_caps_before_u64_cast() { + let added_tokens = Pacer::added_tokens(u128::MAX, 1, u64::MAX, 7); + assert_eq!(added_tokens, 7); + } + + #[test] + fn test_duration_from_nanos_saturates() { + assert_eq!( + Pacer::duration_from_nanos_saturating(u128::MAX), + Duration::MAX + ); + } +} diff --git a/src/queues.rs b/src/queues.rs deleted file mode 100644 index 7575a72..0000000 --- a/src/queues.rs +++ /dev/null @@ -1,455 +0,0 @@ -use std::ops::Range; - -use crate::{ - hashing::PartedHash, - store::{QUEUE_ITEM_NAMESPACE, QUEUE_NAMESPACE}, - CandyStore, -}; -use anyhow::Result; -use bytemuck::{bytes_of, checked::from_bytes_mut, from_bytes, Pod, Zeroable}; - -#[derive(Clone, Copy, Pod, Zeroable)] -#[repr(C)] -struct Queue { - head_idx: u64, // inclusive - tail_idx: u64, // exclusive - num_items: u64, -} - -impl Queue { - #[allow(dead_code)] - fn span_len(&self) -> u64 { - self.tail_idx - self.head_idx - } - #[allow(dead_code)] - fn holes(&self) -> u64 { - self.span_len() - self.num_items - } - fn is_empty(&self) -> bool { - self.head_idx == self.tail_idx - } -} - -enum QueuePos { - Head, - Tail, -} - -pub struct QueueIterator<'a> { - store: &'a CandyStore, - queue_key: Vec, - range: Option>, - fwd: bool, -} - -impl<'a> Iterator for QueueIterator<'a> { - type Item = Result<(usize, Vec)>; - fn next(&mut self) -> Option { - if self.range.is_none() { - match self.store.fetch_queue(&self.queue_key) { - Ok(queue) => match queue { - Some(queue) => { - self.range = Some(queue.head_idx..queue.tail_idx); - } - None => return None, - }, - Err(e) => return Some(Err(e)), - } - } - - loop { - let idx = if self.fwd { - self.range.as_mut().unwrap().next() - } else { - self.range.as_mut().unwrap().next_back() - }; - let Some(idx) = idx 
else { - return None; - }; - - match self - .store - .get_raw(&self.store.make_queue_item_key(&self.queue_key, idx)) - { - Ok(v) => { - match v { - Some(v) => return Some(Ok((idx as usize, v))), - None => { - // continue, we might have holes - } - } - } - Err(e) => return Some(Err(e)), - } - } - } - - fn size_hint(&self) -> (usize, Option) { - if let Some(ref range) = self.range { - range.size_hint() - } else { - (0, None) - } - } -} - -impl CandyStore { - const FIRST_QUEUE_IDX: u64 = 0x8000_0000_0000_0000; - - fn make_queue_key(&self, queue_key: &[u8]) -> (PartedHash, Vec) { - let mut full_queue_key = queue_key.to_owned(); - full_queue_key.extend_from_slice(QUEUE_NAMESPACE); - ( - PartedHash::new(&self.config.hash_seed, &queue_key), - full_queue_key, - ) - } - fn make_queue_item_key(&self, queue_key: &[u8], idx: u64) -> Vec { - let mut item_key = queue_key.to_owned(); - item_key.extend_from_slice(bytes_of(&idx)); - item_key.extend_from_slice(QUEUE_ITEM_NAMESPACE); - item_key - } - - fn _push_to_queue(&self, queue_key: &[u8], val: &[u8], pos: QueuePos) -> Result { - let (queue_ph, full_queue_key) = self.make_queue_key(queue_key); - let _guard = self.lock_list(queue_ph); - - let status = self.get_or_create_raw( - &full_queue_key, - bytes_of(&Queue { - head_idx: Self::FIRST_QUEUE_IDX, - tail_idx: Self::FIRST_QUEUE_IDX + 1, - num_items: 1, - }) - .to_owned(), - )?; - - let item_idx = match status { - crate::GetOrCreateStatus::CreatedNew(_) => Self::FIRST_QUEUE_IDX, - crate::GetOrCreateStatus::ExistingValue(mut queue_bytes) => { - let queue = from_bytes_mut::(&mut queue_bytes); - let item_idx = match pos { - QueuePos::Head => { - queue.head_idx -= 1; - queue.head_idx - } - QueuePos::Tail => { - let item_idx = queue.tail_idx; - queue.tail_idx += 1; - item_idx - } - }; - queue.num_items += 1; - self.set_raw(&full_queue_key, &queue_bytes)?; - item_idx - } - }; - - self.set_raw(&self.make_queue_item_key(queue_key, item_idx), val)?; - Ok(item_idx as usize) - } - - /// 
Pushed a new element at the front (head) of the queue, returning the element's index in the queue - pub fn push_to_queue_head + ?Sized, B2: AsRef<[u8]> + ?Sized>( - &self, - queue_key: &B1, - val: &B2, - ) -> Result { - self._push_to_queue(queue_key.as_ref(), val.as_ref(), QueuePos::Head) - } - - /// Pushed a new element at the end (tail) of the queue, returning the element's index in the queue - pub fn push_to_queue_tail + ?Sized, B2: AsRef<[u8]> + ?Sized>( - &self, - queue_key: &B1, - val: &B2, - ) -> Result { - self._push_to_queue(queue_key.as_ref(), val.as_ref(), QueuePos::Tail) - } - - fn _pop_queue(&self, queue_key: &[u8], pos: QueuePos) -> Result)>> { - let (queue_ph, full_queue_key) = self.make_queue_key(queue_key); - let _guard = self.lock_list(queue_ph); - - let Some(mut queue_bytes) = self.get_raw(&full_queue_key)? else { - return Ok(None); - }; - let queue = from_bytes_mut::(&mut queue_bytes); - let mut res = None; - - match pos { - QueuePos::Head => { - while queue.head_idx < queue.tail_idx { - let idx = queue.head_idx; - queue.head_idx += 1; - if let Some(v) = self.remove_raw(&self.make_queue_item_key(queue_key, idx))? { - res = Some((idx as usize, v)); - queue.num_items -= 1; - break; - } - } - } - QueuePos::Tail => { - while queue.tail_idx > queue.head_idx { - queue.tail_idx -= 1; - let idx = queue.tail_idx; - if let Some(v) = self.remove_raw(&self.make_queue_item_key(queue_key, idx))? 
{ - res = Some((idx as usize, v)); - queue.num_items -= 1; - break; - } - } - } - } - - if queue.is_empty() { - self.remove_raw(&full_queue_key)?; - } else { - self.set_raw(&full_queue_key, &queue_bytes)?; - } - - Ok(res) - } - - /// Removes and returns the head element and its index of the queue, or None if the queue is empty - pub fn pop_queue_head_with_idx + ?Sized>( - &self, - queue_key: &B, - ) -> Result)>> { - self._pop_queue(queue_key.as_ref(), QueuePos::Head) - } - - /// Removes and returns the head element of the queue, or None if the queue is empty - pub fn pop_queue_head + ?Sized>( - &self, - queue_key: &B, - ) -> Result>> { - Ok(self - .pop_queue_head_with_idx(queue_key.as_ref())? - .map(|iv| iv.1)) - } - - /// Removes and returns the tail element and its index of the queue, or None if the queue is empty - pub fn pop_queue_tail_with_idx + ?Sized>( - &self, - queue_key: &B, - ) -> Result)>> { - self._pop_queue(queue_key.as_ref(), QueuePos::Tail) - } - - /// Removes and returns the tail element of the queue, or None if the queue is empty - pub fn pop_queue_tail + ?Sized>( - &self, - queue_key: &B, - ) -> Result>> { - Ok(self.pop_queue_tail_with_idx(queue_key)?.map(|iv| iv.1)) - } - - /// Removes an element by index from the queue, returning the value it had or None if it did not exist (as well - /// as if the queue itself does not exist). - /// - /// This will leave a "hole" in the queue, which means we will skip over it in future iterations, but this could - /// lead to inefficienies as if you keep only the head and tail elements of a long queue, while removing elements - /// from the middle. - pub fn remove_from_queue + ?Sized>( - &self, - queue_key: &B, - idx: usize, - ) -> Result>> { - let idx = idx as u64; - let queue_key = queue_key.as_ref(); - let (queue_ph, full_queue_key) = self.make_queue_key(queue_key); - let _guard = self.lock_list(queue_ph); - - let Some(val) = self.remove_raw(&self.make_queue_item_key(queue_key, idx as u64))? 
else { - return Ok(None); - }; - - if let Some(mut queue_bytes) = self.get_raw(&full_queue_key)? { - let queue = from_bytes_mut::(&mut queue_bytes); - if queue.head_idx == idx { - queue.head_idx += 1; - } - if queue.tail_idx == idx + 1 { - queue.tail_idx -= 1; - } - queue.num_items -= 1; - if queue.is_empty() { - self.remove_raw(&full_queue_key)?; - } else { - self.set_raw(&full_queue_key, &queue_bytes)?; - } - } - - Ok(Some(val)) - } - - /// Discards the queue (dropping all elements in contains). Returns true if it had existed before, false otherwise - pub fn discard_queue + ?Sized>(&self, queue_key: &B) -> Result { - let queue_key = queue_key.as_ref(); - let (queue_ph, full_queue_key) = self.make_queue_key(queue_key); - let _guard = self.lock_list(queue_ph); - - let Some(queue_bytes) = self.get_raw(&full_queue_key)? else { - return Ok(false); - }; - let queue = from_bytes::(&queue_bytes); - - for i in queue.head_idx..queue.tail_idx { - self.remove_raw(&self.make_queue_item_key(queue_key, i as u64))?; - } - - self.remove_raw(&full_queue_key)?; - Ok(true) - } - - fn fetch_queue(&self, queue_key: &[u8]) -> Result> { - let queue_key = queue_key.as_ref(); - let (queue_ph, full_queue_key) = self.make_queue_key(queue_key); - let _guard = self.lock_list(queue_ph); - if let Some(queue_bytes) = self.get_raw(&full_queue_key)? { - Ok(Some(*from_bytes::(&queue_bytes))) - } else { - Ok(None) - } - } - - /// Extends the queue with elements from the given iterator. The queue will be created if it did not exist before, - /// and elements are pushed at the tail-end of the queue. This is more efficient than calling - /// [Self::push_to_queue_tail] in a loop - /// - /// Note: this is not an atomic (crash-safe) operation: if your program crashes while extending the queue, it - /// is possible that only some of the elements will have been appended. 
- /// - /// Returns the indices of the elements added (a range) - pub fn extend_queue<'a, B: AsRef<[u8]> + ?Sized>( - &self, - queue_key: &B, - items: impl Iterator>, - ) -> Result> { - let queue_key = queue_key.as_ref(); - let (queue_ph, full_queue_key) = self.make_queue_key(queue_key); - let _guard = self.lock_list(queue_ph); - - let mut queue_bytes = &mut self - .get_or_create_raw( - &full_queue_key, - bytes_of(&Queue { - head_idx: Self::FIRST_QUEUE_IDX, - tail_idx: Self::FIRST_QUEUE_IDX, - num_items: 0, - }) - .to_owned(), - )? - .value(); - - let queue = from_bytes_mut::(&mut queue_bytes); - - let first_idx = queue.tail_idx; - for item in items { - self.set_raw( - &self.make_queue_item_key(queue_key, queue.tail_idx), - item.as_ref(), - )?; - queue.tail_idx += 1; - queue.num_items += 1; - } - - let indices = first_idx as usize..queue.tail_idx as usize; - self.set_raw(&full_queue_key, &queue_bytes)?; - - Ok(indices) - } - - /// Returns (without removing) the head element of the queue and its index, or None if the queue is empty - pub fn peek_queue_head_with_idx + ?Sized>( - &self, - queue_key: &B, - ) -> Result)>> { - for res in self.iter_queue(queue_key) { - return Ok(Some(res?)); - } - Ok(None) - } - - /// Returns (without removing) the head element of the queue, or None if the queue is empty - pub fn peek_queue_head + ?Sized>( - &self, - queue_key: &B, - ) -> Result>> { - for res in self.iter_queue(queue_key) { - return Ok(Some(res?.1)); - } - Ok(None) - } - - /// Returns (without removing) the head element of the queue and its index, or None if the queue is empty - pub fn peek_queue_tail_with_idx + ?Sized>( - &self, - queue_key: &B, - ) -> Result)>> { - for res in self.iter_queue_backwards(queue_key) { - return Ok(Some(res?)); - } - Ok(None) - } - - /// Returns (without removing) the tail element of the queue, or None if the queue is empty - pub fn peek_queue_tail + ?Sized>( - &self, - queue_key: &B, - ) -> Result>> { - for res in 
self.iter_queue_backwards(queue_key) { - return Ok(Some(res?.1)); - } - Ok(None) - } - - /// Returns a forward iterator (head to tail) over the elements of the queue. If the queue does not exist, - /// this is an empty iterator. - /// - /// Note: the iterator will go over the indices that existed when it was created -- new elements that are - /// pushed afterwards will not be returned - pub fn iter_queue<'a, B: AsRef<[u8]> + ?Sized>(&'a self, queue_key: &B) -> QueueIterator<'a> { - QueueIterator { - store: &self, - queue_key: queue_key.as_ref().to_owned(), - range: None, - fwd: true, - } - } - - /// Returns a backward iterator (tail to head) over the elements of the queue. If the queue does not exist, - /// this is an empty iterator. - /// - /// Note: the iterator will go over the indices that existed when it was created -- new elements that are - /// pushed afterwards will not be returned - pub fn iter_queue_backwards<'a, B: AsRef<[u8]> + ?Sized>( - &'a self, - queue_key: &B, - ) -> QueueIterator<'a> { - QueueIterator { - store: &self, - queue_key: queue_key.as_ref().to_owned(), - range: None, - fwd: false, - } - } - - /// Returns a the length of the given queue (number of elements in the queue) or 0 if the queue does not exist - pub fn queue_len + ?Sized>(&self, queue_key: &B) -> Result { - let Some(queue) = self.fetch_queue(queue_key.as_ref())? else { - return Ok(0); - }; - Ok(queue.num_items as usize) - } - - /// Returns a the range (indices) of the given queue or an empty range if the queue does not exist - pub fn queue_range + ?Sized>(&self, queue_key: &B) -> Result> { - let Some(queue) = self.fetch_queue(queue_key.as_ref())? 
else { - return Ok(Self::FIRST_QUEUE_IDX as usize..Self::FIRST_QUEUE_IDX as usize); - }; - Ok(queue.head_idx as usize..queue.tail_idx as usize) - } -} diff --git a/src/router.rs b/src/router.rs deleted file mode 100644 index a53a26c..0000000 --- a/src/router.rs +++ /dev/null @@ -1,548 +0,0 @@ -use anyhow::ensure; -use parking_lot::RwLock; -use std::{ops::Range, sync::Arc}; - -use crate::shard::{CompactionThreadPool, InsertMode, InsertStatus, Shard}; -use crate::stats::InternalStats; -use crate::Result; -use crate::{hashing::PartedHash, store::InternalConfig}; - -fn consolidate_ranges(mut ranges: Vec>) -> (Vec>, Vec>) { - // we may encounter unfinished splits, where we have any combination of the bottom half, top half and - // original shard existing. in this case, we want to keep the largest of them, e.g, suppose we find - // [0..16, 0..32], we want to remove 0..16 and keep only 0..32. to do that, we sort by `start` - // followed by sorting by end, so [0..16, 16..32, 0..32] is sorted as [0..32, 0..16, 16..32], which means - // we'll encounter all over-arching ranges before smaller ones - ranges.sort_by(|a, b| { - if a.start == b.start { - b.end.cmp(&a.end) - } else { - a.start.cmp(&b.start) - } - }); - - let mut removed = vec![]; - let mut i = 1; - while i < ranges.len() { - if ranges[i].start >= ranges[i - 1].start && ranges[i].end <= ranges[i - 1].end { - removed.push(ranges.remove(i)); - } else { - i += 1; - } - } - (ranges, removed) -} - -#[test] -fn test_consolidate_ranges() { - assert_eq!(consolidate_ranges(vec![0..16]), (vec![0..16], vec![])); - assert_eq!( - consolidate_ranges(vec![16..32, 0..16]), - (vec![0..16, 16..32], vec![]) - ); - assert_eq!( - consolidate_ranges(vec![16..32, 0..16, 0..32]), - (vec![0..32], vec![0..16, 16..32]) - ); - assert_eq!( - consolidate_ranges(vec![16..32, 0..16, 0..32, 48..64, 32..48, 50..60]), - (vec![0..32, 32..48, 48..64], vec![0..16, 16..32, 50..60]) - ); -} - -#[derive(Clone)] -enum ShardNode { - Leaf(Arc), - Vertex(Arc, 
Arc), -} - -impl ShardNode { - fn span(&self) -> Range { - match self { - Self::Leaf(sh) => sh.span.clone(), - Self::Vertex(bottom, top) => bottom.span.start..top.span.end, - } - } - fn len(&self) -> u32 { - self.span().end - self.span().start - } -} - -pub(crate) struct ShardRouter { - span: Range, - config: Arc, - node: RwLock, - stats: Arc, - threadpool: Arc, -} - -impl ShardRouter { - pub(crate) const END_OF_SHARDS: u32 = 1u32 << 16; - - pub(crate) fn new( - config: Arc, - stats: Arc, - threadpool: Arc, - ) -> Result { - let mut shards = Self::load(&config, &stats, &threadpool)?; - if shards.is_empty() { - shards = Self::create_initial_shards(&config, &stats, &threadpool)?; - } - let root = Self::treeify(shards, &stats, &threadpool); - Ok(Self { - span: root.span(), - config, - node: RwLock::new(root), - stats, - threadpool, - }) - } - - fn load( - config: &Arc, - stats: &Arc, - threadpool: &Arc, - ) -> Result>> { - let mut found_shards = vec![]; - for res in std::fs::read_dir(&config.dir_path)? 
{ - let entry = res?; - let filename = entry.file_name(); - let Some(filename) = filename.to_str() else { - continue; - }; - let Ok(filetype) = entry.file_type() else { - continue; - }; - if !filetype.is_file() { - continue; - } - if filename.starts_with("bottom_") - || filename.starts_with("top_") - || filename.starts_with("merge_") - { - std::fs::remove_file(entry.path())?; - continue; - } else if !filename.starts_with("shard_") { - continue; - } - let Some((_, span)) = filename.split_once("_") else { - continue; - }; - let Some((start, end)) = span.split_once("-") else { - continue; - }; - let start = u32::from_str_radix(start, 16).expect(filename); - let end = u32::from_str_radix(end, 16).expect(filename); - - ensure!( - start < end && end <= Self::END_OF_SHARDS, - "Bad span for {filename}" - ); - - found_shards.push(start..end); - } - - let (shards_to_keep, shards_to_remove) = consolidate_ranges(found_shards); - for span in shards_to_remove { - std::fs::remove_file( - config - .dir_path - .join(format!("shard_{:04x}-{:04x}", span.start, span.end)), - )?; - } - - if shards_to_keep.is_empty() { - return Ok(vec![]); - } - - let mut shards = vec![]; - let mut current = 0; - - for span in shards_to_keep { - if span.start > current { - let mut gap_start = current; - let gap_end = span.start; - while gap_start < gap_end { - let mut size = 1; - while gap_start % (size * 2) == 0 && gap_start + (size * 2) <= gap_end { - size *= 2; - } - shards.push(Arc::new(Shard::open( - gap_start..gap_start + size, - true, - config.clone(), - stats.clone(), - threadpool.clone(), - )?)); - gap_start += size; - } - } - - shards.push(Arc::new(Shard::open( - span.clone(), - false, - config.clone(), - stats.clone(), - threadpool.clone(), - )?)); - current = span.end; - } - - if current < Self::END_OF_SHARDS { - let mut gap_start = current; - let gap_end = Self::END_OF_SHARDS; - while gap_start < gap_end { - let mut size = 1; - while gap_start % (size * 2) == 0 && gap_start + (size * 2) <= 
gap_end { - size *= 2; - } - shards.push(Arc::new(Shard::open( - gap_start..gap_start + size, - true, - config.clone(), - stats.clone(), - threadpool.clone(), - )?)); - gap_start += size; - } - } - - Ok(shards) - } - - fn calc_step(num_items: usize) -> u32 { - let step = (Self::END_OF_SHARDS as f64) - / (num_items as f64 / Shard::EXPECTED_CAPACITY as f64).max(1.0); - 1 << (step as u32).ilog2() - } - pub(crate) fn calc_num_shards(num_items: usize) -> u32 { - Self::END_OF_SHARDS / Self::calc_step(num_items) - } - - fn create_initial_shards( - config: &Arc, - stats: &Arc, - threadpool: &Arc, - ) -> Result>> { - let step = Self::calc_step(config.expected_number_of_keys); - - let mut shards = vec![]; - let mut start = 0; - while start < Self::END_OF_SHARDS { - let end = start + step; - shards.push(Arc::new(Shard::open( - start..end, - true, - config.clone(), - stats.clone(), - threadpool.clone(), - )?)); - start = end; - } - - Ok(shards) - } - - fn from_shardnode( - n: ShardNode, - stats: Arc, - threadpool: Arc, - ) -> Self { - let config = match n { - ShardNode::Leaf(ref sh) => sh.config.clone(), - ShardNode::Vertex(ref bottom, _) => bottom.config.clone(), - }; - Self { - config, - span: n.span(), - node: RwLock::new(n), - stats, - threadpool, - } - } - - fn treeify( - shards: Vec>, - stats: &Arc, - threadpool: &Arc, - ) -> ShardNode { - // algorithm: first find the smallest span, and let that be our base unit, say it's 1K. then go over - // 0..64K in 1K increments and pair up every consecutive pairs whose size is 1K. we count on the spans to be - // sorted, so we'll merge 0..1K with 1K..2K, and not 1K..3K with 2K..3K. - // then we double our base unit and repeat, until base unit = 64K. 
- - let mut nodes = vec![]; - let mut unit: u32 = Self::END_OF_SHARDS; - { - let mut spans_debug: Vec> = vec![]; - for sh in shards { - assert!( - spans_debug.is_empty() || spans_debug.last().unwrap().start != sh.span.start, - "two elements with the same start {spans_debug:?} {:?}", - sh.span - ); - spans_debug.push(sh.span.clone()); - let n = ShardNode::Leaf(sh); - if unit > n.len() { - unit = n.len(); - } - nodes.push(n); - } - assert!( - spans_debug.is_sorted_by(|a, b| a.start < b.start), - "not sorted {spans_debug:?}" - ); - - assert!(unit >= 1 && unit.is_power_of_two(), "unit={unit}"); - assert!(nodes.len() > 0, "No shards to merge"); - assert!(nodes.len() > 1 || unit == Self::END_OF_SHARDS); - } - - while unit < Self::END_OF_SHARDS { - let mut i = 0; - while i < nodes.len() - 1 { - if nodes[i].len() == unit && nodes[i + 1].len() == unit { - let n0 = nodes.remove(i); - let n1 = nodes.remove(i); - nodes.insert( - i, - ShardNode::Vertex( - Arc::new(Self::from_shardnode(n0, stats.clone(), threadpool.clone())), - Arc::new(Self::from_shardnode(n1, stats.clone(), threadpool.clone())), - ), - ); - } else { - i += 1; - } - } - - unit *= 2; - } - - assert_eq!(nodes.len(), 1); - nodes.remove(0) - } - - pub(crate) fn shared_op( - &self, - shard_selector: u32, - func: impl FnOnce(&Shard) -> Result, - ) -> Result { - match &*self.node.read() { - ShardNode::Leaf(sh) => func(sh), - ShardNode::Vertex(bottom, top) => { - if shard_selector < bottom.span.end { - bottom.shared_op(shard_selector, func) - } else { - top.shared_op(shard_selector, func) - } - } - } - } - - pub(crate) fn clear(&self) -> Result<()> { - let mut guard = self.node.write(); - - for res in std::fs::read_dir(&self.config.dir_path)? 
{ - let entry = res?; - let filename = entry.file_name(); - let Some(filename) = filename.to_str() else { - continue; - }; - let Ok(filetype) = entry.file_type() else { - continue; - }; - if !filetype.is_file() { - continue; - } - if filename.starts_with("shard_") - || filename.starts_with("compact_") - || filename.starts_with("bottom_") - || filename.starts_with("top_") - { - std::fs::remove_file(entry.path())?; - } - } - - let shards = Self::create_initial_shards(&self.config, &self.stats, &self.threadpool)?; - *guard = Self::treeify(shards, &self.stats, &self.threadpool); - - Ok(()) - } - - pub(crate) fn call_on_all_shards( - &self, - mut func: impl FnMut(&Shard) -> Result + Copy, - ) -> Result> { - match &*self.node.read() { - ShardNode::Leaf(sh) => Ok(vec![func(sh)?]), - ShardNode::Vertex(bottom, top) => { - let mut v = bottom.call_on_all_shards(func)?; - v.extend(top.call_on_all_shards(func)?); - Ok(v) - } - } - } - - pub(crate) fn insert( - &self, - ph: PartedHash, - full_key: &[u8], - val: &[u8], - mode: InsertMode, - ) -> Result { - loop { - let res = match &*self.node.read() { - ShardNode::Leaf(sh) => sh.insert(ph, full_key, val, mode)?, - ShardNode::Vertex(bottom, top) => { - if ph.shard_selector() < bottom.span.end { - bottom.insert(ph, full_key, val, mode)? - } else { - top.insert(ph, full_key, val, mode)? 
- } - } - }; - - match res { - InsertStatus::SplitNeeded => { - let mut guard = self.node.write(); - let ShardNode::Leaf(sh) = &*guard else { - // already split - continue; - }; - - let (bottom, top) = sh.split()?; - - *guard = ShardNode::Vertex( - Arc::new(ShardRouter { - span: bottom.span.clone(), - config: self.config.clone(), - node: RwLock::new(ShardNode::Leaf(Arc::new(bottom))), - stats: self.stats.clone(), - threadpool: self.threadpool.clone(), - }), - Arc::new(ShardRouter { - span: top.span.clone(), - config: self.config.clone(), - node: RwLock::new(ShardNode::Leaf(Arc::new(top))), - stats: self.stats.clone(), - threadpool: self.threadpool.clone(), - }), - ); - - // retry - } - _ => { - return Ok(res); - } - } - } - } - - fn _merge( - &self, - bottom: &ShardRouter, - top: &ShardRouter, - max_fill: usize, - shards_to_remove: &mut u32, - ) -> Result> { - if *shards_to_remove == 0 { - return Ok(None); - } - - let (bottom_node, top_node) = { - let bottom_guard = bottom.node.read(); - let top_guard = top.node.read(); - (bottom_guard.clone(), top_guard.clone()) - }; - - match (bottom_node, top_node) { - (ShardNode::Leaf(b), ShardNode::Leaf(t)) => { - if b.get_stats()?.num_items() > max_fill { - return Ok(None); - } - if t.get_stats()?.num_items() > max_fill { - return Ok(None); - } - if let Some(sh) = Shard::merge(&b, &t)? { - *shards_to_remove = *shards_to_remove - 1; - let span = sh.span.clone(); - Ok(Some(ShardRouter { - config: self.config.clone(), - node: RwLock::new(ShardNode::Leaf(Arc::new(sh))), - span, - stats: self.stats.clone(), - threadpool: self.threadpool.clone(), - })) - } else { - Ok(None) - } - } - (ShardNode::Leaf(_), ShardNode::Vertex(b, t)) => { - if let Some(merged_top) = self._merge(&b, &t, max_fill, shards_to_remove)? 
{ - self._merge(bottom, &merged_top, max_fill, shards_to_remove) - } else { - Ok(None) - } - } - (ShardNode::Vertex(b, t), ShardNode::Leaf(_)) => { - if let Some(merged_bottom) = self._merge(&b, &t, max_fill, shards_to_remove)? { - self._merge(&merged_bottom, top, max_fill, shards_to_remove) - } else { - Ok(None) - } - } - (ShardNode::Vertex(b1, t1), ShardNode::Vertex(b2, t2)) => { - let m1 = self._merge(&b1, &t1, max_fill, shards_to_remove)?; - let m2 = self._merge(&b2, &t2, max_fill, shards_to_remove)?; - match (m1, m2) { - (Some(m1), Some(m2)) => self._merge(&m1, &m2, max_fill, shards_to_remove), - (Some(m1), None) => self._merge(&m1, top, max_fill, shards_to_remove), - (None, Some(m2)) => self._merge(bottom, &m2, max_fill, shards_to_remove), - (None, None) => Ok(None), - } - } - } - } - - pub(crate) fn merge_small_shards(&self, max_fill_level: f32) -> Result { - ensure!(max_fill_level > 0.0 && max_fill_level < 0.5); - let max_fill = (Shard::EXPECTED_CAPACITY as f32 * max_fill_level) as usize; - - let mut num_items = 0usize; - let mut starting_num_shards = 0u32; - for count in self.call_on_all_shards(|sh| Ok(sh.get_stats()?.num_items()))? { - starting_num_shards += 1; - num_items += count; - } - - let needed_shards = - Self::calc_num_shards(num_items.max(self.config.expected_number_of_keys)); - - if starting_num_shards <= needed_shards { - return Ok(false); - } - let mut shards_to_remove = starting_num_shards - needed_shards; - - { - let mut guard = self.node.write(); - - match &*guard { - ShardNode::Leaf(_) => None, - ShardNode::Vertex(bottom, top) => { - self._merge(&bottom, &top, max_fill, &mut shards_to_remove)? 
- } - }; - - *guard = Self::treeify( - Self::load(&self.config, &self.stats, &self.threadpool)?, - &self.stats, - &self.threadpool, - ); - } - - let new_num_shards: u32 = self.call_on_all_shards(|_| Ok(1))?.iter().sum(); - - Ok(new_num_shards != starting_num_shards) - } -} diff --git a/src/shard.rs b/src/shard.rs deleted file mode 100644 index 95cffa8..0000000 --- a/src/shard.rs +++ /dev/null @@ -1,1172 +0,0 @@ -use anyhow::bail; -use bytemuck::{bytes_of_mut, Pod, Zeroable}; -use parking_lot::{Mutex, RwLock, RwLockWriteGuard}; -use std::{ - fs::{File, OpenOptions}, - io::Read, - ops::Range, - path::{Path, PathBuf}, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - thread::JoinHandle, - time::Instant, -}; - -use memmap::{MmapMut, MmapOptions}; - -use crate::Result; -use crate::{ - hashing::{PartedHash, INVALID_SIG}, - stats::InternalStats, - store::InternalConfig, -}; - -// -// these numbers were chosen according to the simulation, as they allow for 90% utilization of the shard with -// virtually zero chance of in-row collisions and "smallish" shard size: shards start at 384KB and -// can hold 32K entries, and since we're limited at 4GB file sizes, we can key-value pairs of up to 128KB -// (keys and values are limited to 64KB each anyway) -// -// other good combinations are 32/512, 32/1024, 64/256, 64/1024, 128/512, 256/256 -// -pub(crate) const NUM_ROWS: usize = 64; -pub(crate) const ROW_WIDTH: usize = 512; - -#[repr(C)] -struct ShardRow { - signatures: [u32; ROW_WIDTH], - offsets_and_sizes: [u64; ROW_WIDTH], // | key_size: 16 | val_size: 16 | file_offset: 32 | -} - -impl ShardRow { - #[inline] - fn lookup(&self, sig: u32, start_idx: &mut usize) -> Option { - use simd_itertools::PositionSimd; - if let Some(rel_idx) = self.signatures[*start_idx..] 
- .iter() - .position_simd(|x| *x == sig) - { - let abs_idx = rel_idx + *start_idx; - *start_idx = abs_idx + 1; - Some(abs_idx) - } else { - None - } - } -} - -#[test] -fn test_row_lookup() -> Result<()> { - let mut row = ShardRow { - signatures: [0; ROW_WIDTH], - offsets_and_sizes: [0; ROW_WIDTH], - }; - - row.signatures[7] = 123; - row.signatures[8] = 123; - row.signatures[9] = 123; - row.signatures[90] = 123; - row.signatures[ROW_WIDTH - 1] = 999; - - let mut start = 0; - assert_eq!(row.lookup(123, &mut start), Some(7)); - assert_eq!(start, 8); - assert_eq!(row.lookup(123, &mut start), Some(8)); - assert_eq!(start, 9); - assert_eq!(row.lookup(123, &mut start), Some(9)); - assert_eq!(start, 10); - assert_eq!(row.lookup(123, &mut start), Some(90)); - assert_eq!(start, 91); - assert_eq!(row.lookup(123, &mut start), None); - assert_eq!(start, 91); - - start = 0; - assert_eq!(row.lookup(0, &mut start), Some(0)); - assert_eq!(start, 1); - - start = 0; - assert_eq!(row.lookup(999, &mut start), Some(ROW_WIDTH - 1)); - assert_eq!(start, ROW_WIDTH); - - assert_eq!(row.lookup(999, &mut start), None); - assert_eq!(start, ROW_WIDTH); - - Ok(()) -} - -#[repr(C, align(4096))] -struct PageAligned(T); - -pub(crate) const SHARD_FILE_MAGIC: [u8; 8] = *b"CandyStr"; -pub(crate) const SHARD_FILE_VERSION: u64 = 11; - -#[derive(Clone, Copy, Default, Debug, Pod, Zeroable)] -#[repr(C)] -struct MetaHeader { - magic: [u8; 8], - version: u64, -} - -#[repr(C)] -struct ShardHeader { - metadata: MetaHeader, - wasted_bytes: AtomicU64, - write_offset: AtomicU64, - num_inserts: AtomicU64, - num_removals: AtomicU64, - compacted_up_to: AtomicUsize, - rows: PageAligned<[ShardRow; NUM_ROWS]>, -} - -pub(crate) const HEADER_SIZE: u64 = size_of::() as u64; -const _: () = assert!(HEADER_SIZE % 4096 == 0); - -#[derive(Debug)] -pub(crate) enum InsertStatus { - Added, - Replaced(Vec), - KeyDoesNotExist, - SplitNeeded, - AlreadyExists(Vec), -} - -#[derive(Debug, Clone, Copy)] -pub(crate) enum InsertMode<'a> 
{ - Set, - Replace(Option<&'a [u8]>), - GetOrCreate, -} - -enum TryReplaceStatus<'a> { - KeyDoesNotExist(RwLockWriteGuard<'a, ()>, bool), - KeyExistsNotReplaced(Vec), - KeyExistsReplaced(Vec), -} - -pub(crate) type KVPair = (Vec, Vec); - -struct MmapFile { - file: File, - mmap: MmapMut, -} - -#[cfg(unix)] -fn read_exact_at(f: &File, buf: &mut [u8], offset: u64) -> std::io::Result<()> { - std::os::unix::fs::FileExt::read_exact_at(f, buf, offset) -} - -#[cfg(unix)] -fn write_all_at(f: &File, buf: &[u8], offset: u64) -> std::io::Result<()> { - std::os::unix::fs::FileExt::write_all_at(f, buf, offset) -} - -#[cfg(windows)] -fn read_exact_at(f: &File, mut buf: &mut [u8], mut offset: u64) -> std::io::Result<()> { - while !buf.is_empty() { - match std::os::windows::fs::FileExt::seek_read(f, buf, offset) { - Ok(0) => break, - Ok(n) => { - let tmp = buf; - buf = &mut tmp[n..]; - offset += n as u64; - } - Err(e) => return Err(e), - } - } - if !buf.is_empty() { - Err(std::io::Error::from(std::io::ErrorKind::UnexpectedEof)) - } else { - Ok(()) - } -} - -#[cfg(windows)] -fn write_all_at(f: &File, mut buf: &[u8], mut offset: u64) -> std::io::Result<()> { - while !buf.is_empty() { - match std::os::windows::fs::FileExt::seek_write(f, buf, offset) { - Ok(0) => return Err(std::io::Error::from(std::io::ErrorKind::UnexpectedEof)), - Ok(n) => { - buf = &buf[n..]; - offset += n as u64; - } - Err(e) => return Err(e), - } - } - Ok(()) -} - -impl MmapFile { - fn new(file: File, mlock_headers: bool) -> Result { - let mmap = unsafe { MmapOptions::new().len(HEADER_SIZE as usize).map_mut(&file) }?; - - #[cfg(windows)] - let _ = mlock_headers; // Prevent unused variable warning on Windows - - #[cfg(unix)] - if mlock_headers { - unsafe { libc::mlock(mmap.as_ptr() as *const _, mmap.len()) }; - } - - // optimization, we don't care about the return code - #[cfg(all(unix, not(target_os = "macos")))] - unsafe { - libc::posix_fallocate( - std::os::fd::AsRawFd::as_raw_fd(&file), - 0, - HEADER_SIZE as 
i64, - ) - }; - - let header = unsafe { &mut *(mmap.as_ptr() as *mut ShardHeader) }; - header.metadata.magic = SHARD_FILE_MAGIC; - header.metadata.version = SHARD_FILE_VERSION; - - Ok(Self { file, mmap }) - } - - fn create(filename: impl AsRef, config: &InternalConfig) -> Result { - let file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) - .open(filename)?; - file.set_len( - HEADER_SIZE - + if config.truncate_up { - config.max_shard_size as u64 - } else { - 0 - }, - )?; - Self::new(file, config.mlock_headers) - } - - #[inline(always)] - fn header(&self) -> &ShardHeader { - unsafe { &*(self.mmap.as_ptr() as *const ShardHeader) } - } - #[inline(always)] - fn header_mut(&self) -> &mut ShardHeader { - unsafe { &mut *(self.mmap.as_ptr() as *mut ShardHeader) } - } - #[inline(always)] - fn row(&self, row_idx: usize) -> &ShardRow { - &self.header().rows.0[row_idx] - } - #[inline(always)] - fn row_mut(&self, row_idx: usize) -> &mut ShardRow { - &mut self.header_mut().rows.0[row_idx] - } - - // reading doesn't require holding any locks - we only ever extend the file, never overwrite data - fn _read_kv( - &self, - stats: &InternalStats, - offset_and_size: u64, - include_val: bool, - ) -> Result { - let klen = (offset_and_size >> 48) as usize; - debug_assert_eq!(klen >> 14, 0, "attempting to read a special key"); - let vlen = if include_val { - ((offset_and_size >> 32) & 0xffff) as usize - } else { - 0 - }; - let offset = (offset_and_size as u32) as u64; - let mut buf = vec![0u8; klen + vlen]; - read_exact_at(&self.file, &mut buf, HEADER_SIZE + offset)?; - - stats.num_read_bytes.fetch_add(buf.len(), Ordering::Relaxed); - stats.num_read_ops.fetch_add(1, Ordering::Relaxed); - - if include_val { - let val = buf[klen..klen + vlen].to_owned(); - buf.truncate(klen); - - Ok((buf, val)) - } else { - Ok((buf, vec![])) - } - } - - fn read_kv(&self, stats: &InternalStats, offset_and_size: u64) -> Result { - self._read_kv(stats, offset_and_size, true) 
- } - - // writing doesn't require holding any locks since we write with an offset - fn write_kv(&self, stats: &InternalStats, key: &[u8], val: &[u8]) -> Result { - let entry_size = key.len() + val.len(); - let mut buf = vec![0u8; entry_size]; - buf[..key.len()].copy_from_slice(key); - buf[key.len()..].copy_from_slice(val); - - // atomically allocate some area. it may leak if the IO below fails or if we crash before updating the - // offsets_and_size array, but we're okay with leaks - let write_offset = self - .header() - .write_offset - .fetch_add(buf.len() as u64, Ordering::SeqCst) as u64; - - // now writing can be non-atomic (pwrite) - write_all_at(&self.file, &buf, HEADER_SIZE + write_offset)?; - stats.add_entry(entry_size); - - Ok(((key.len() as u64) << 48) | ((val.len() as u64) << 32) | write_offset) - } -} - -struct TPHandle { - rx: crossbeam_channel::Receiver>, -} -impl TPHandle { - fn wait(&self) -> Result<()> { - self.rx.recv()? - } - fn finished(&self) -> bool { - !self.rx.is_empty() - } -} - -struct CompactionInfo { - config: Arc, - stats: Arc, - files: Arc)>>, - row_locks: Arc<[RwLock<()>; NUM_ROWS]>, - t0: Instant, - src_filename: PathBuf, - target_filename: PathBuf, -} - -pub(crate) struct CompactionThreadPool { - tx: crossbeam_channel::Sender>)>>, - threads: Vec>>, -} - -impl CompactionThreadPool { - pub fn new(num_threads: usize) -> Self { - let (tx, rx) = crossbeam_channel::unbounded::< - Option<(CompactionInfo, crossbeam_channel::Sender>)>, - >(); - let mut threads = Vec::with_capacity(num_threads); - for _ in 0..num_threads { - let rx = rx.clone(); - let handle = std::thread::spawn(move || { - for elem in rx.iter() { - let Some((info, handle_tx)) = elem else { - break; - }; - let res = Shard::background_compact(info); - handle_tx.send(res)?; - } - Ok(()) - }); - threads.push(handle); - } - - Self { tx, threads } - } - - fn submit(&self, info: CompactionInfo) -> Result { - let (tx, rx) = crossbeam_channel::bounded(1); - self.tx.send(Some((info, 
tx)))?; - Ok(TPHandle { rx }) - } - - #[allow(dead_code)] - pub fn terminate(self) -> Result<()> { - for _ in self.threads.iter() { - self.tx.send(None)?; - } - - for th in self.threads { - match th.join() { - Err(e) => std::panic::resume_unwind(e), - Ok(res) => res?, - } - } - Ok(()) - } -} - -#[derive(Debug, Clone)] -pub(crate) struct ShardStats { - pub write_offset: usize, - pub wasted_bytes: usize, - pub num_inserts: usize, - pub num_removals: usize, -} - -impl ShardStats { - pub(crate) fn num_items(&self) -> usize { - self.num_inserts - self.num_removals - } -} - -pub(crate) struct Shard { - pub(crate) span: Range, - pub(crate) config: Arc, - stats: Arc, - files: Arc)>>, - row_locks: Arc<[RwLock<()>; NUM_ROWS]>, - threadpool: Arc, - compaction_handle: Arc>>, - #[cfg(feature = "flush_aggregation")] - sync_agg_mutex: parking_lot::Mutex<()>, - #[cfg(feature = "flush_aggregation")] - in_sync_agg_delay: std::sync::atomic::AtomicBool, -} - -impl Shard { - pub(crate) const EXPECTED_CAPACITY: usize = (NUM_ROWS * ROW_WIDTH * 9) / 10; // ~ 29,500 - - pub(crate) fn open( - span: Range, - truncate: bool, - config: Arc, - stats: Arc, - threadpool: Arc, - ) -> Result { - let filename = config - .dir_path - .join(format!("shard_{:04x}-{:04x}", span.start, span.end)); - let mut file = OpenOptions::new() - .create(true) - .read(true) - .write(true) - .truncate(truncate) - .open(&filename)?; - - let mut file_size = file.metadata()?.len(); - if file_size != 0 { - let mut meta_header = MetaHeader::default(); - let sz = file.read(bytes_of_mut(&mut meta_header))?; - if sz != size_of::() - || meta_header.magic != SHARD_FILE_MAGIC - || meta_header.version != SHARD_FILE_VERSION - { - if config.clear_on_unsupported_version { - file.set_len(0)?; - file_size = 0; - } else { - bail!( - "{filename:?} unsupported magic={:?} version=0x{:016x} size={}", - meta_header.magic, - meta_header.version, - file_size, - ); - } - } - - if file_size != 0 && file_size < HEADER_SIZE { - if 
config.clear_on_unsupported_version { - file.set_len(0)?; - file_size = 0; - } else { - bail!("corrupt shard file (size={})", file_size); - } - } - } - - if file_size == 0 { - if config.truncate_up { - // when creating, set the file's length so that we won't need to extend it every time we write - // (saves on file metadata updates) - file.set_len(HEADER_SIZE + config.max_shard_size as u64)?; - } else { - file.set_len(HEADER_SIZE)?; - } - } - - let mut row_locks = Vec::with_capacity(NUM_ROWS); - for _ in 0..NUM_ROWS { - row_locks.push(RwLock::new(())); - } - let row_locks: [RwLock<()>; NUM_ROWS] = row_locks.try_into().unwrap(); - - let mut mmap_file = MmapFile::new(file, config.mlock_headers)?; - - let compacted_filename = config - .dir_path - .join(format!("compact_{:04x}-{:04x}", span.start, span.end)); - if truncate { - _ = std::fs::remove_file(compacted_filename); - } else { - if let Ok(compacted_file) = OpenOptions::new() - .read(true) - .write(true) - .open(&compacted_filename) - { - let target = MmapFile::new(compacted_file, config.mlock_headers)?; - Self::do_compaction(&row_locks, &mmap_file, &target, &stats, &config)?; - std::fs::rename(compacted_filename, filename)?; - mmap_file = target; - } - } - - Ok(Self { - span, - config, - stats, - files: Arc::new(RwLock::new((mmap_file, None))), - row_locks: Arc::new(row_locks), - threadpool, - compaction_handle: Arc::new(Mutex::new(None)), - #[cfg(feature = "flush_aggregation")] - sync_agg_mutex: parking_lot::Mutex::new(()), - #[cfg(feature = "flush_aggregation")] - in_sync_agg_delay: std::sync::atomic::AtomicBool::new(false), - }) - } - - fn new( - span: Range, - mmap_file: MmapFile, - config: Arc, - stats: Arc, - threadpool: Arc, - ) -> Result { - let mut row_locks = Vec::with_capacity(NUM_ROWS); - for _ in 0..NUM_ROWS { - row_locks.push(RwLock::new(())); - } - let row_locks: [RwLock<()>; NUM_ROWS] = row_locks.try_into().unwrap(); - - Ok(Self { - span, - config, - stats, - files: 
Arc::new(RwLock::new((mmap_file, None))), - row_locks: Arc::new(row_locks), - threadpool, - compaction_handle: Arc::new(Mutex::new(None)), - #[cfg(feature = "flush_aggregation")] - sync_agg_mutex: parking_lot::Mutex::new(()), - #[cfg(feature = "flush_aggregation")] - in_sync_agg_delay: std::sync::atomic::AtomicBool::new(false), - }) - } - - fn do_compaction( - row_locks: &[RwLock<()>; NUM_ROWS], - src: &MmapFile, - target: &MmapFile, - stats: &InternalStats, - config: &InternalConfig, - ) -> Result<()> { - let mut first_row = true; - loop { - let row_idx = target.header().compacted_up_to.load(Ordering::Acquire); - if row_idx >= NUM_ROWS { - break; - } - - let _row_guard = row_locks[row_idx].write(); - let src_row = src.row(row_idx); - let target_row = target.row_mut(row_idx); - let mut target_col = 0; - - for (src_col, &sig) in src_row.signatures.iter().enumerate() { - if sig == INVALID_SIG { - continue; - } - let (k, v) = src.read_kv(&stats, src_row.offsets_and_sizes[src_col])?; - - assert!( - first_row || target_row.signatures[target_col] == INVALID_SIG, - "row={row_idx} col={target_col} sig={}", - target_row.signatures[target_col] - ); - let ph = PartedHash::new(&config.hash_seed, &k); - assert_eq!(ph.row_selector(), row_idx); - target_row.offsets_and_sizes[target_col] = target.write_kv(&stats, &k, &v)?; - std::sync::atomic::fence(Ordering::SeqCst); - target_row.signatures[target_col] = ph.signature(); - target.header().num_inserts.fetch_add(1, Ordering::Relaxed); - target_col += 1; - } - - target - .header() - .compacted_up_to - .fetch_add(1, Ordering::Release); - first_row = false; - } - - Ok(()) - } - - pub(crate) fn flush(&self) -> Result<()> { - //self.mmap.flush()? 
-- fdatasync should take care of that as well - self.files.read().0.file.sync_data()?; - Ok(()) - } - - pub(crate) fn split(&self) -> Result<(Shard, Shard)> { - let mut handle_guard = self.compaction_handle.lock(); - if let Some(handle) = handle_guard.take() { - handle.wait()?; - } - - let files_guard = self.files.write(); - - let mid = (self.span.start + self.span.end) / 2; - - let t0 = Instant::now(); - - let bottom_filename = self - .config - .dir_path - .join(format!("bottom_{:04x}-{:04x}", self.span.start, mid)); - let top_filename = self - .config - .dir_path - .join(format!("top_{:04x}-{:04x}", mid, self.span.end)); - - let bottom_file = MmapFile::create(&bottom_filename, &self.config)?; - let top_file = MmapFile::create(&top_filename, &self.config)?; - - for (row_idx, src_row) in files_guard.0.header().rows.0.iter().enumerate() { - let mut bottom_col = 0; - let mut top_col = 0; - for (col, &sig) in src_row.signatures.iter().enumerate() { - if sig == INVALID_SIG { - continue; - } - let (k, v) = files_guard - .0 - .read_kv(&self.stats, src_row.offsets_and_sizes[col])?; - let ph = PartedHash::new(&self.config.hash_seed, &k); - assert_eq!(row_idx, ph.row_selector()); - - let (file, col) = if ph.shard_selector() < mid { - (&bottom_file, &mut bottom_col) - } else { - (&top_file, &mut top_col) - }; - - let target_row = file.row_mut(ph.row_selector()); - assert_eq!( - target_row.signatures[*col], INVALID_SIG, - "row={} col={} sig={}", - row_idx, *col, target_row.signatures[*col] - ); - target_row.offsets_and_sizes[*col] = file.write_kv(&self.stats, &k, &v)?; - std::sync::atomic::fence(Ordering::SeqCst); - target_row.signatures[*col] = ph.signature(); - file.header().num_inserts.fetch_add(1, Ordering::Relaxed); - *col += 1; - } - } - - std::fs::rename( - bottom_filename, - self.config - .dir_path - .join(format!("shard_{:04x}-{:04x}", self.span.start, mid,)), - )?; - std::fs::rename( - top_filename, - self.config - .dir_path - .join(format!("shard_{:04x}-{:04x}", 
mid, self.span.end)), - )?; - std::fs::remove_file(self.config.dir_path.join(format!( - "shard_{:04x}-{:04x}", - self.span.start, self.span.end - )))?; - - self.stats.report_split( - t0, - bottom_file.header().write_offset.load(Ordering::Relaxed), - top_file.header().write_offset.load(Ordering::Relaxed), - ); - - let bottom = Self::new( - self.span.start..mid, - bottom_file, - self.config.clone(), - self.stats.clone(), - self.threadpool.clone(), - )?; - let top = Self::new( - mid..self.span.end, - top_file, - self.config.clone(), - self.stats.clone(), - self.threadpool.clone(), - )?; - - Ok((bottom, top)) - } - - pub(crate) fn merge(bottom: &Shard, top: &Shard) -> Result> { - let bottom_files = bottom.files.write(); - let top_files = top.files.write(); - - let tmp_filename = bottom.config.dir_path.join(format!( - "merge_{:04x}-{:04x}", - bottom.span.start, top.span.end - )); - let mmap_file = MmapFile::create(&tmp_filename, &bottom.config)?; - - let combined = Shard::new( - bottom.span.start..top.span.end, - mmap_file, - bottom.config.clone(), - bottom.stats.clone(), - bottom.threadpool.clone(), - )?; - let combined_files = combined.files.write(); - - for row_idx in 0..NUM_ROWS { - let mut target_col = 0; - for files in [&bottom_files, &top_files] { - let src_row = &files.0.header().rows.0[row_idx]; - for (src_col, &sig) in src_row.signatures.iter().enumerate() { - if sig == INVALID_SIG { - continue; - } - let (k, v) = files - .0 - .read_kv(&combined.stats, src_row.offsets_and_sizes[src_col])?; - let ph = PartedHash::new(&combined.config.hash_seed, &k); - assert_eq!(row_idx, ph.row_selector()); - - let target_row = combined_files.0.row_mut(ph.row_selector()); - if target_col >= ROW_WIDTH { - // too many items fall in this row, we can't merge - std::fs::remove_file(tmp_filename)?; - return Ok(None); - } - assert_eq!( - target_row.signatures[target_col], INVALID_SIG, - "row={} target_col={} sig={}", - row_idx, target_col, target_row.signatures[target_col] - ); - 
target_row.offsets_and_sizes[target_col] = - combined_files.0.write_kv(&combined.stats, &k, &v)?; - std::sync::atomic::fence(Ordering::SeqCst); - target_row.signatures[target_col] = ph.signature(); - combined_files - .0 - .header() - .num_inserts - .fetch_add(1, Ordering::Relaxed); - target_col += 1; - } - } - } - - let dst_filename = combined.config.dir_path.join(format!( - "shard_{:04x}-{:04x}", - combined.span.start, combined.span.end - )); - let bottom_filename = combined.config.dir_path.join(format!( - "shard_{:04x}-{:04x}", - bottom.span.start, bottom.span.end - )); - let top_filename = combined - .config - .dir_path - .join(format!("shard_{:04x}-{:04x}", top.span.start, top.span.end)); - - std::fs::rename(tmp_filename, dst_filename)?; - std::fs::remove_file(bottom_filename)?; - std::fs::remove_file(top_filename)?; - - drop(combined_files); - - Ok(Some(combined)) - } - - fn operate_on_row( - &self, - row_idx: usize, - func: impl FnOnce(&MmapFile, &ShardRow) -> Result, - ) -> Result { - let files_guard = self.files.read(); - let _row_guard = self.row_locks[row_idx].read(); - let file = if let Some(ref target) = files_guard.1 { - if row_idx < target.header().compacted_up_to.load(Ordering::Acquire) { - target - } else { - &files_guard.0 - } - } else { - &files_guard.0 - }; - - func(file, file.row(row_idx)) - } - - fn operate_on_row_mut( - &self, - row_idx: usize, - func: impl FnOnce(&MmapFile, bool, RwLockWriteGuard<()>, &mut ShardRow) -> Result, - ) -> Result { - let files_guard = self.files.read(); - let row_guard = self.row_locks[row_idx].write(); - let file = if let Some(ref target) = files_guard.1 { - if row_idx < target.header().compacted_up_to.load(Ordering::Acquire) { - target - } else { - &files_guard.0 - } - } else { - &files_guard.0 - }; - - func( - &file, - files_guard.1.is_some(), - row_guard, - file.row_mut(row_idx), - ) - } - - pub(crate) fn read_at( - &self, - row_idx: usize, - entry_idx: usize, - include_val: bool, - ) -> Result> { - 
self.operate_on_row(row_idx, |file, row| { - if row.signatures[entry_idx] != INVALID_SIG { - Ok(Some(file._read_kv( - &self.stats, - row.offsets_and_sizes[entry_idx], - include_val, - )?)) - } else { - Ok(None) - } - }) - } - - pub(crate) fn get_by_hash(&self, ph: PartedHash) -> Result> { - self.operate_on_row(ph.row_selector(), |file, row| { - let mut first_time = true; - let mut kvs = Vec::with_capacity(1); - let mut start = 0; - while let Some(idx) = row.lookup(ph.signature(), &mut start) { - kvs.push(file.read_kv(&self.stats, row.offsets_and_sizes[idx])?); - if first_time { - self.stats - .num_positive_lookups - .fetch_add(1, Ordering::Relaxed); - first_time = false; - } - } - if kvs.is_empty() { - self.stats - .num_negative_lookups - .fetch_add(1, Ordering::Relaxed); - } - Ok(kvs) - }) - } - - pub(crate) fn get(&self, ph: PartedHash, key: &[u8]) -> Result>> { - self.operate_on_row(ph.row_selector(), |file, row| { - let mut start = 0; - while let Some(idx) = row.lookup(ph.signature(), &mut start) { - let (k, v) = file.read_kv(&self.stats, row.offsets_and_sizes[idx])?; - if key == k { - self.stats - .num_positive_lookups - .fetch_add(1, Ordering::Relaxed); - return Ok(Some(v)); - } - } - self.stats - .num_negative_lookups - .fetch_add(1, Ordering::Relaxed); - Ok(None) - }) - } - - #[cfg(feature = "flush_aggregation")] - fn flush_aggregation(&self, file: &MmapFile) -> Result<()> { - let Some(delay) = self.config.flush_aggregation_delay else { - return Ok(()); - }; - - let do_sync = || -> Result<()> { - self.in_sync_agg_delay.store(true, Ordering::SeqCst); - std::thread::sleep(delay); - self.in_sync_agg_delay.store(false, Ordering::SeqCst); - file.file.sync_data()?; - Ok(()) - }; - - if let Some(_guard) = self.sync_agg_mutex.try_lock() { - // we're the first ones here. wait for the aggregation duration and sync the file - do_sync()?; - } else { - // another thread is currently sync'ing, we're waiting in line. 
if the holder of the lock is in the - // sleep (aggregation) phase, we can just wait for it to finish and return -- the other thread will - // have sync'ed us by the time we got the lock. otherwise, we'll need to sync as well - let was_in_delay = self.in_sync_agg_delay.load(Ordering::Relaxed); - let _guard = self.sync_agg_mutex.lock(); - if !was_in_delay { - do_sync()?; - } - } - Ok(()) - } - - fn try_replace<'a>( - &'a self, - file: &MmapFile, - row_guard: RwLockWriteGuard<'a, ()>, - row: &mut ShardRow, - ph: PartedHash, - key: &[u8], - val: &[u8], - mode: InsertMode, - ) -> Result> { - let mut start = 0; - let mut had_collision = false; - while let Some(idx) = row.lookup(ph.signature(), &mut start) { - let (k, existing_val) = file.read_kv(&self.stats, row.offsets_and_sizes[idx])?; - if key != k { - had_collision = true; - continue; - } - match mode { - InsertMode::GetOrCreate => { - // no-op, key already exists - self.stats - .num_positive_lookups - .fetch_add(1, Ordering::Relaxed); - return Ok(TryReplaceStatus::KeyExistsNotReplaced(existing_val)); - } - InsertMode::Set => { - // fall through - } - InsertMode::Replace(expected_val) => { - if expected_val.is_some_and(|expected_val| expected_val != existing_val) { - return Ok(TryReplaceStatus::KeyExistsNotReplaced(existing_val)); - } - } - } - - // optimization - if val != existing_val { - row.offsets_and_sizes[idx] = file.write_kv(&self.stats, key, val)?; - file.header() - .wasted_bytes - .fetch_add((k.len() + existing_val.len()) as u64, Ordering::Relaxed); - self.stats.num_updates.fetch_add(1, Ordering::Relaxed); - #[cfg(feature = "flush_aggregation")] - { - drop(row_guard); - self.flush_aggregation(file)?; - } - } - return Ok(TryReplaceStatus::KeyExistsReplaced(existing_val)); - } - - Ok(TryReplaceStatus::KeyDoesNotExist(row_guard, had_collision)) - } - - fn wait_for_compaction(&self) -> Result<()> { - let mut handle_guard = self.compaction_handle.lock(); - if let Some(handle) = handle_guard.take() { - 
handle.wait()?; - } - Ok(()) - } - - fn begin_compaction(&self, min_write_offset: u64) -> Result<()> { - let mut handle_guard = self.compaction_handle.lock(); - let mut files_guard = self.files.write(); - - if files_guard.0.header().write_offset.load(Ordering::Relaxed) < min_write_offset { - // already compacted by someone else - return Ok(()); - } - - if files_guard.1.is_some() { - // if the compaction target exists and the thread is still running -- all good - if let Some(ref handle) = *handle_guard { - if !handle.finished() { - return Ok(()); - } - } else { - return Ok(()); - } - } - - // the thread could've crashed in the middle of a compaction, and here's the place to extract the error - if let Some(handle) = handle_guard.take() { - handle.wait()?; - } - - assert!(files_guard.1.is_none()); - - let t0 = Instant::now(); - let src_filename = self.config.dir_path.join(format!( - "shard_{:04x}-{:04x}", - self.span.start, self.span.end - )); - let target_filename = self.config.dir_path.join(format!( - "compact_{:04x}-{:04x}", - self.span.start, self.span.end - )); - let target = MmapFile::create(&target_filename, &self.config)?; - target.header().compacted_up_to.store(0, Ordering::Release); - files_guard.1 = Some(target); - - let handle = self.threadpool.submit(CompactionInfo { - files: self.files.clone(), - stats: self.stats.clone(), - row_locks: self.row_locks.clone(), - config: self.config.clone(), - t0, - src_filename, - target_filename, - })?; - *handle_guard = Some(handle); - - Ok(()) - } - - fn background_compact(info: CompactionInfo) -> Result<()> { - let mut files_guard = info.files.upgradable_read(); - let src = &files_guard.0; - let target = files_guard.1.as_ref().unwrap(); - - Self::do_compaction(&info.row_locks, src, target, &info.stats, &info.config)?; - - std::fs::rename(&info.target_filename, &info.src_filename)?; - - info.stats.report_compaction( - info.t0, - src.header().write_offset.load(Ordering::Relaxed), - 
target.header().write_offset.load(Ordering::Relaxed), - ); - - files_guard.with_upgraded(|files| { - files.0 = files.1.take().unwrap(); - }); - Ok(()) - } - - pub(crate) fn insert( - &self, - ph: PartedHash, - full_key: &[u8], - val: &[u8], - mode: InsertMode, - ) -> Result { - let mut should_compact = None; - - let status = - self.operate_on_row_mut(ph.row_selector(), |file, is_compacting, row_guard, row| { - if !is_compacting { - if file.header().wasted_bytes.load(Ordering::Relaxed) - >= self.config.min_compaction_threashold as u64 - { - should_compact = Some(file.header().write_offset.load(Ordering::Relaxed)); - } else if file.header().write_offset.load(Ordering::Relaxed) - + (full_key.len() + val.len()) as u64 - > self.config.max_shard_size as u64 - { - return Ok(InsertStatus::SplitNeeded); - } - } - - let status = self.try_replace(file, row_guard, row, ph, &full_key, val, mode)?; - match status { - TryReplaceStatus::KeyDoesNotExist(_guard, had_collision) => { - if matches!(mode, InsertMode::Replace(_)) { - return Ok(InsertStatus::KeyDoesNotExist); - } - - // find an empty slot - let mut start = 0; - if let Some(idx) = row.lookup(INVALID_SIG, &mut start) { - let new_off = file.write_kv(&self.stats, &full_key, val)?; - - // we don't want a reorder to happen here - first write the offset, then the signature - row.offsets_and_sizes[idx] = new_off; - std::sync::atomic::fence(Ordering::SeqCst); - row.signatures[idx] = ph.signature(); - if had_collision { - self.stats.num_collisions.fetch_add(1, Ordering::Relaxed); - } - file.header().num_inserts.fetch_add(1, Ordering::Relaxed); - #[cfg(feature = "flush_aggregation")] - { - drop(_guard); - self.flush_aggregation(file)?; - } - Ok(InsertStatus::Added) - } else { - // no room in this row, must split - Ok(InsertStatus::SplitNeeded) - } - } - TryReplaceStatus::KeyExistsNotReplaced(existing) => { - Ok(InsertStatus::AlreadyExists(existing)) - } - TryReplaceStatus::KeyExistsReplaced(existing) => { - 
Ok(InsertStatus::Replaced(existing)) - } - } - })?; - - if let Some(min_write_offset) = should_compact { - self.begin_compaction(min_write_offset)?; - } - Ok(status) - } - - pub(crate) fn remove(&self, ph: PartedHash, key: &[u8]) -> Result>> { - self.operate_on_row_mut(ph.row_selector(), |file, _, _guard, row| { - let mut start = 0; - - while let Some(idx) = row.lookup(ph.signature(), &mut start) { - let (k, v) = file.read_kv(&self.stats, row.offsets_and_sizes[idx])?; - if key == k { - row.signatures[idx] = INVALID_SIG; - // we managed to remove this key - file.header().num_removals.fetch_add(1, Ordering::Relaxed); - file.header() - .wasted_bytes - .fetch_add((k.len() + v.len()) as u64, Ordering::Relaxed); - #[cfg(feature = "flush_aggregation")] - { - drop(_guard); - self.flush_aggregation(file)?; - } - return Ok(Some(v)); - } - } - - Ok(None) - }) - } - - pub(crate) fn get_stats(&self) -> Result { - self.wait_for_compaction()?; - let files_guard = self.files.read(); - let hdr = files_guard.0.header(); - Ok(ShardStats { - write_offset: hdr.write_offset.load(Ordering::Relaxed) as usize, - wasted_bytes: hdr.wasted_bytes.load(Ordering::Relaxed) as usize, - num_inserts: hdr.num_inserts.load(Ordering::Relaxed) as usize, - num_removals: hdr.num_removals.load(Ordering::Relaxed) as usize, - }) - } -} - -impl Drop for Shard { - fn drop(&mut self) { - _ = self.wait_for_compaction(); - } -} diff --git a/src/stats.rs b/src/stats.rs deleted file mode 100644 index 5f9c24e..0000000 --- a/src/stats.rs +++ /dev/null @@ -1,245 +0,0 @@ -use std::{ - fmt::Display, - sync::atomic::{AtomicUsize, Ordering}, - time::{Duration, Instant}, -}; - -use parking_lot::Mutex; - -use crate::{router::ShardRouter, shard::HEADER_SIZE}; - -#[derive(Default, Debug, Clone)] -pub struct Stats { - pub num_shards: usize, - pub num_splits: usize, - pub num_compactions: usize, - pub last_split_stats: Vec<(Duration, u64, u64)>, - pub last_compaction_stats: Vec<(Duration, u64, u64)>, - - pub occupied_bytes: 
usize, - pub wasted_bytes: usize, - - pub num_inserts: usize, - pub num_updates: usize, - pub num_positive_lookups: usize, - pub num_negative_lookups: usize, - pub num_removals: usize, - pub num_collisions: usize, - - pub num_read_ops: usize, - pub num_read_bytes: usize, - pub num_write_ops: usize, - pub num_write_bytes: usize, - - pub entries_under_128: usize, - pub entries_under_1k: usize, - pub entries_under_8k: usize, - pub entries_under_32k: usize, - pub entries_over_32k: usize, -} - -impl Stats { - pub const FILE_HEADER_SIZE: usize = HEADER_SIZE as usize; - - pub fn data_bytes(&self) -> usize { - self.occupied_bytes - self.wasted_bytes - } - pub fn total_occupied_bytes(&self) -> usize { - self.num_shards * Self::FILE_HEADER_SIZE + self.occupied_bytes - } - pub fn num_entries(&self) -> usize { - self.num_inserts - self.num_removals - } - pub fn average_entry_size(&self) -> usize { - self.data_bytes() - .checked_div(self.num_entries()) - .unwrap_or(0) - } - - pub fn required_num_shards(&self) -> usize { - ShardRouter::calc_num_shards(self.num_entries()) as usize - } - pub fn should_merge_small_shards(&self) -> bool { - self.num_shards > self.required_num_shards() * 2 - } -} - -impl Display for Stats { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, - "sh={} [sp={} com={}] [occ={} wst={}] [ins={} updt={} +lkup={} -lkup={} rem={} coll={}] R={}/{}b W={}/{}b", - self.num_shards, self.num_splits, self.num_compactions, self.occupied_bytes, self.wasted_bytes, - self.num_inserts, self.num_updates, self.num_positive_lookups, self.num_negative_lookups, - self.num_removals, self.num_collisions, self.num_read_ops, self.num_read_bytes, self.num_write_ops, - self.num_write_bytes) - } -} - -#[derive(Debug, Clone)] -pub(crate) struct CyclicArr { - idx: usize, - arr: [T; N], -} -impl Default for CyclicArr { - fn default() -> Self { - Self { - idx: 0, - arr: [T::default(); N], - } - } -} -impl CyclicArr { - pub(crate) fn push(&mut self, val: T) 
{ - self.arr[self.idx % N] = val; - self.idx += 1; - } - pub(crate) fn clear(&mut self) { - self.idx = 0; - for i in 0..N { - self.arr[i] = T::default(); - } - } - fn iter<'a>(&'a self) -> impl Iterator { - (self.idx.checked_sub(N).unwrap_or(0)..self.idx).map(|idx| &self.arr[idx % N]) - } -} - -#[test] -fn test_cyclic_arr() { - let mut arr = CyclicArr::::default(); - assert!(arr.iter().collect::>().is_empty()); - arr.push(1); - arr.push(2); - arr.push(3); - assert_eq!(arr.iter().collect::>(), vec![&1,&2,&3]); - arr.push(4); - arr.push(5); - arr.push(6); - arr.push(7); - arr.push(8); - assert_eq!(arr.iter().collect::>(), vec![&1,&2,&3,&4,&5,&6,&7,&8]); - arr.push(9); - arr.push(10); - arr.push(11); - assert_eq!(arr.iter().collect::>(), vec![&4,&5,&6,&7,&8,&9,&10,&11]); - arr.clear(); - arr.push(12); - arr.push(13); - arr.push(14); - assert_eq!(arr.iter().collect::>(), vec![&12,&13,&14]); - for i in 15u32..1000 { - arr.push(i); - } - assert_eq!(arr.iter().collect::>(), vec![&992,&993,&994,&995,&996,&997,&998,&999]); -} - -#[derive(Debug, Default)] -pub struct InternalStats { - pub(crate) num_splits: AtomicUsize, - pub(crate) num_compactions: AtomicUsize, - pub(crate) last_compaction_stats: Mutex>, - pub(crate) last_split_stats: Mutex>, - - pub(crate) num_updates: AtomicUsize, - pub(crate) num_positive_lookups: AtomicUsize, - pub(crate) num_negative_lookups: AtomicUsize, - pub(crate) num_collisions: AtomicUsize, - - pub(crate) num_read_ops: AtomicUsize, - pub(crate) num_read_bytes: AtomicUsize, - pub(crate) num_write_ops: AtomicUsize, - pub(crate) num_write_bytes: AtomicUsize, - - pub(crate) entries_under_128: AtomicUsize, - pub(crate) entries_under_1k: AtomicUsize, - pub(crate) entries_under_8k: AtomicUsize, - pub(crate) entries_under_32k: AtomicUsize, - pub(crate) entries_over_32k: AtomicUsize, -} - -impl InternalStats { - pub(crate) fn add_entry(&self, sz: usize) { - self.num_write_bytes.fetch_add(sz, Ordering::Relaxed); - self.num_write_ops.fetch_add(1, 
Ordering::Relaxed); - match sz { - 0..128 => self.entries_under_128.fetch_add(1, Ordering::Relaxed), - 128..1024 => self.entries_under_1k.fetch_add(1, Ordering::Relaxed), - 1024..8192 => self.entries_under_8k.fetch_add(1, Ordering::Relaxed), - 8192..32768 => self.entries_under_32k.fetch_add(1, Ordering::Relaxed), - _ => self.entries_over_32k.fetch_add(1, Ordering::Relaxed), - }; - } - - pub(crate) fn report_split(&self, t0: Instant, bottom_size: u64, top_size: u64) { - let dur = Instant::now().duration_since(t0); - self.num_splits.fetch_add(1, Ordering::Relaxed); - self.last_split_stats - .lock() - .push((dur, bottom_size, top_size)); - } - - pub(crate) fn report_compaction(&self, t0: Instant, prev_size: u64, new_size: u64) { - let dur = Instant::now().duration_since(t0); - self.num_compactions.fetch_add(1, Ordering::Relaxed); - self.last_compaction_stats - .lock() - .push((dur, prev_size, new_size)); - } - - pub(crate) fn clear(&self) { - // store 0 in every stats... - - self.num_splits.store(0, Ordering::SeqCst); - self.num_compactions.store(0, Ordering::SeqCst); - self.last_split_stats.lock().clear(); - self.last_compaction_stats.lock().clear(); - - self.num_updates.store(0, Ordering::SeqCst); - self.num_positive_lookups.store(0, Ordering::SeqCst); - self.num_negative_lookups.store(0, Ordering::SeqCst); - self.num_collisions.store(0, Ordering::SeqCst); - - self.num_read_ops.store(0, Ordering::SeqCst); - self.num_read_bytes.store(0, Ordering::SeqCst); - self.num_write_ops.store(0, Ordering::SeqCst); - self.num_write_bytes.store(0, Ordering::SeqCst); - - self.entries_under_128.store(0, Ordering::SeqCst); - self.entries_under_1k.store(0, Ordering::SeqCst); - self.entries_under_8k.store(0, Ordering::SeqCst); - self.entries_under_32k.store(0, Ordering::SeqCst); - self.entries_over_32k.store(0, Ordering::SeqCst); - } - - pub(crate) fn fill_stats(&self, stats: &mut Stats) { - stats.num_splits = self.num_splits.load(Ordering::Relaxed); - stats.num_compactions = 
self.num_compactions.load(Ordering::Relaxed); - - { - let mut guard = self.last_split_stats.lock(); - stats.last_split_stats = guard.iter().copied().collect::>(); - guard.clear(); - } - { - let mut guard = self.last_compaction_stats.lock(); - stats.last_compaction_stats = guard.iter().copied().collect::>(); - guard.clear(); - } - - stats.num_updates = self.num_updates.load(Ordering::Relaxed); - stats.num_positive_lookups = self.num_positive_lookups.load(Ordering::Relaxed); - stats.num_negative_lookups = self.num_negative_lookups.load(Ordering::Relaxed); - stats.num_collisions = self.num_collisions.load(Ordering::Relaxed); - - stats.num_read_ops = self.num_read_ops.load(Ordering::Relaxed); - stats.num_read_bytes = self.num_read_bytes.load(Ordering::Relaxed); - stats.num_write_ops = self.num_write_ops.load(Ordering::Relaxed); - stats.num_write_bytes = self.num_write_bytes.load(Ordering::Relaxed); - - stats.entries_under_128 = self.entries_under_128.load(Ordering::Relaxed); - stats.entries_under_1k = self.entries_under_1k.load(Ordering::Relaxed); - stats.entries_under_8k = self.entries_under_8k.load(Ordering::Relaxed); - stats.entries_under_32k = self.entries_under_32k.load(Ordering::Relaxed); - stats.entries_over_32k = self.entries_over_32k.load(Ordering::Relaxed); - } -} diff --git a/src/store.rs b/src/store.rs index 00e2d73..95dbd13 100644 --- a/src/store.rs +++ b/src/store.rs @@ -1,622 +1,1696 @@ -use anyhow::{anyhow, bail, ensure}; -use bytemuck::{bytes_of, from_bytes}; -use fslock::LockFile; -use parking_lot::Mutex; +mod checkpoint; +mod compaction; +mod list; +mod open; +mod queue; +mod recovery; +mod typed; + +use parking_lot::{Condvar, Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use siphasher::sip::SipHasher13; + use std::{ + collections::HashMap, + hash::Hasher, path::{Path, PathBuf}, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicBool, AtomicI64, AtomicU16, AtomicU32, AtomicU64, Ordering}, + }, + time::Duration, }; use crate::{ - hashing::{HashSeed, 
PartedHash}, - router::ShardRouter, - shard::{CompactionThreadPool, InsertMode, InsertStatus, KVPair}, - Stats, MAX_KEY_SIZE, MAX_TOTAL_VALUE_SIZE, + data_file::{DataFile, InflightTracker}, + index_file::{EntryPointer, IndexFile, RowLayout, RowReadGuard, RowWriteGuard}, + internal::{ + EntryType, HashCoord, KeyNamespace, MAX_DATA_FILE_IDX, MAX_DATA_FILES, MIN_SPLIT_LEVEL, + ROW_WIDTH, RangeMetadata, aligned_data_entry_size, aligned_data_entry_waste, + aligned_tombstone_entry_waste, index_file_path, index_rows_file_path, sync_dir, + }, + types::{ + Config, Error, GetOrCreateStatus, INITIAL_DATA_FILE_ORDINAL, ReplaceStatus, Result, Stats, + }, }; -use crate::{ - shard::{NUM_ROWS, ROW_WIDTH}, - stats::InternalStats, -}; - -use crate::{CandyError, Config, Result, MAX_TOTAL_KEY_SIZE, MAX_VALUE_SIZE}; -pub(crate) const USER_NAMESPACE: &[u8] = &[1]; -pub(crate) const TYPED_NAMESPACE: &[u8] = &[2]; -pub(crate) const LIST_NAMESPACE: &[u8] = &[3]; -pub(crate) const ITEM_NAMESPACE: &[u8] = &[4]; -pub(crate) const CHAIN_NAMESPACE: u8 = 5; -pub(crate) const QUEUE_NAMESPACE: &[u8] = &[6]; -pub(crate) const QUEUE_ITEM_NAMESPACE: &[u8] = &[7]; +#[derive(Default)] +struct CompactionState { + wake_requested: bool, +} +// this is needed because std::io::Error is not clone() #[derive(Debug, Clone)] -pub(crate) struct InternalConfig { - pub dir_path: PathBuf, - pub max_shard_size: u32, - pub min_compaction_threashold: u32, - pub hash_seed: HashSeed, - pub expected_number_of_keys: usize, - pub max_concurrent_list_ops: u32, - pub truncate_up: bool, - pub clear_on_unsupported_version: bool, - pub mlock_headers: bool, - pub num_compaction_threads: usize, - #[cfg(feature = "flush_aggregation")] - pub flush_aggregation_delay: Option, +enum CheckpointFailure { + IO(std::io::ErrorKind, String), + MissingDataFile(u16), + Other(String), } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum ReplaceStatus { - PrevValue(Vec), - WrongValue(Vec), - DoesNotExist, -} -impl ReplaceStatus { - pub fn 
was_replaced(&self) -> bool { - matches!(*self, Self::PrevValue(_)) - } - pub fn failed(&self) -> bool { - !matches!(*self, Self::PrevValue(_)) - } - pub fn is_key_missing(&self) -> bool { - matches!(*self, Self::DoesNotExist) +impl CheckpointFailure { + fn from_error(err: Error) -> Self { + match err { + Error::IOError(io_err) => Self::IO(io_err.kind(), io_err.to_string()), + Error::MissingDataFile(file_idx) => Self::MissingDataFile(file_idx), + other => Self::Other(other.to_string()), + } } - pub fn is_wrong_value(&self) -> bool { - matches!(*self, Self::WrongValue(_)) + + fn to_error(&self) -> Error { + match self { + Self::IO(kind, message) => Error::IOError(std::io::Error::new(*kind, message.clone())), + Self::MissingDataFile(file_idx) => Error::MissingDataFile(*file_idx), + Self::Other(message) => Error::IOError(std::io::Error::other(message.clone())), + } } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SetStatus { - PrevValue(Vec), - CreatedNew, +#[derive(Default)] +struct CheckpointState { + requested_epoch: u64, + handled_epoch: u64, + completed_epoch: u64, + last_failure_epoch: u64, + last_failure: Option, + last_checkpoint_dur_ms: u64, } -impl SetStatus { - pub fn was_created(&self) -> bool { - matches!(*self, Self::CreatedNew) - } - pub fn was_replaced(&self) -> bool { - matches!(*self, Self::PrevValue(_)) - } + +#[derive(Clone, Copy)] +struct CheckpointSnapshot { + checkpoint_ordinal: u64, + checkpoint_offset: u64, + checkpointed_delta: i64, + last_commit_ordinal: u64, + last_commit_offset: u64, } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum GetOrCreateStatus { - ExistingValue(Vec), - CreatedNew(Vec), +#[derive(Default)] +struct InnerStats { + num_compactions: AtomicU64, + compaction_errors: AtomicU64, + checkpoint_errors: AtomicU64, + num_positive_lookups: AtomicU64, + num_negative_lookups: AtomicU64, + num_collisions: AtomicU64, + last_remap_dur_ms: AtomicU64, + last_compaction_dur_ms: AtomicU64, + last_compaction_reclaimed_bytes: 
AtomicU32, + last_compaction_moved_bytes: AtomicU32, + num_read_ops: AtomicU64, + num_read_bytes: AtomicU64, + num_write_ops: AtomicU64, + num_write_bytes: AtomicU64, + num_inserted: AtomicU64, + num_updated: AtomicU64, + num_removed: AtomicU64, + num_rebuilt_entries: AtomicU64, + num_rebuild_purged_bytes: AtomicU64, + size_histogram: [AtomicU64; 6], } -impl GetOrCreateStatus { - pub fn was_created(&self) -> bool { - matches!(*self, Self::CreatedNew(_)) - } - pub fn already_exists(&self) -> bool { - matches!(*self, Self::ExistingValue(_)) - } - pub fn value(self) -> Vec { - match self { - Self::CreatedNew(val) => val, - Self::ExistingValue(val) => val, + +impl InnerStats { + fn reset(&self) { + self.num_compactions.store(0, Ordering::Relaxed); + self.compaction_errors.store(0, Ordering::Relaxed); + self.checkpoint_errors.store(0, Ordering::Relaxed); + self.num_positive_lookups.store(0, Ordering::Relaxed); + self.num_negative_lookups.store(0, Ordering::Relaxed); + self.num_collisions.store(0, Ordering::Relaxed); + self.last_remap_dur_ms.store(0, Ordering::Relaxed); + self.last_compaction_dur_ms.store(0, Ordering::Relaxed); + self.last_compaction_reclaimed_bytes + .store(0, Ordering::Relaxed); + self.last_compaction_moved_bytes.store(0, Ordering::Relaxed); + self.num_read_ops.store(0, Ordering::Relaxed); + self.num_read_bytes.store(0, Ordering::Relaxed); + self.num_write_ops.store(0, Ordering::Relaxed); + self.num_write_bytes.store(0, Ordering::Relaxed); + self.num_inserted.store(0, Ordering::Relaxed); + self.num_removed.store(0, Ordering::Relaxed); + self.num_updated.store(0, Ordering::Relaxed); + self.num_rebuilt_entries.store(0, Ordering::Relaxed); + self.num_rebuild_purged_bytes.store(0, Ordering::Relaxed); + for bucket in &self.size_histogram { + bucket.store(0, Ordering::Relaxed); } } } -/// The CandyStore object. 
Note that it's fully sync'ed, so can be shared between threads using `Arc` + +struct StoreInner { + base_path: PathBuf, + config: Arc, + index_file: IndexFile, + list_meta_locks: Vec>, + list_meta_locks_mask: usize, + data_files: RwLock>>, + inflight_tracker: InflightTracker, + active_file_idx: AtomicU16, + active_file_ordinal: AtomicU64, + uncommitted_entries_delta: AtomicI64, + checkpoint_state: Mutex, + checkpoint_condvar: Condvar, + checkpoint_shutting_down: AtomicBool, + rotation_lock: Mutex<()>, + compaction_state: Mutex, + compaction_condvar: Condvar, + compaction_shutting_down: AtomicBool, + stats: InnerStats, +} + +struct ExistingEntryUpdate<'a> { + files: &'a HashMap>, + ns: KeyNamespace, + key: &'a [u8], + val: &'a [u8], + hc: HashCoord, + col: usize, + shard_idx: usize, + src_file_idx: u16, + old_klen: usize, + old_vlen: usize, + crash_point_name: Option<&'a str>, +} + +/// A persistent key-value store backed by append-only data files and a mutable index. pub struct CandyStore { - pub(crate) root: ShardRouter, - pub(crate) config: Arc, - // locks for complicated operations - pub(crate) keyed_locks_mask: u32, - pub(crate) keyed_locks: Vec>, - _lockfile: LockFile, - stats: Arc, - //threadpool: Arc, + inner: Arc, + _lockfile: fslock::LockFile, + compaction_thd: Mutex>>, + checkpoint_thd: Mutex>>, + allow_clean_shutdown: AtomicBool, } -/// An iterator over a CandyStore. Note that it's safe to modify (insert/delete) keys while iterating, -/// but the results of the iteration may or may not include these changes. This is considered a -/// well-defined behavior of the store. 
-pub struct CandyStoreIterator<'a> { - store: &'a CandyStore, - shard_selector: u32, - row_idx: usize, - entry_idx: usize, - raw: bool, - include_val: bool, +pub use list::{KVPair, ListIterator}; +pub use typed::{CandyTypedDeque, CandyTypedKey, CandyTypedList, CandyTypedStore}; + +pub(super) struct OpenState { + index_file: IndexFile, + data_files: HashMap>, + active_file_idx: u16, + active_file_ordinal: u64, } -impl<'a> CandyStoreIterator<'a> { - fn new(store: &'a CandyStore, raw: bool, include_val: bool) -> Self { +impl StoreInner { + fn new( + base_path: PathBuf, + config: Arc, + state: OpenState, + num_logical_locks: usize, + ) -> Self { + let num_shards = state.index_file.num_shards(); Self { - store, - shard_selector: 0, - row_idx: 0, - entry_idx: 0, - raw, - include_val, + base_path, + config, + index_file: state.index_file, + list_meta_locks: (0..num_logical_locks).map(|_| RwLock::new(())).collect(), + list_meta_locks_mask: num_logical_locks - 1, + data_files: RwLock::new(state.data_files), + inflight_tracker: InflightTracker::new(num_shards), + active_file_idx: AtomicU16::new(state.active_file_idx), + active_file_ordinal: AtomicU64::new(state.active_file_ordinal), + uncommitted_entries_delta: AtomicI64::new(0), + checkpoint_state: Mutex::new(CheckpointState::default()), + checkpoint_condvar: Condvar::new(), + checkpoint_shutting_down: AtomicBool::new(false), + rotation_lock: Mutex::new(()), + compaction_state: Mutex::new(CompactionState::default()), + compaction_condvar: Condvar::new(), + compaction_shutting_down: AtomicBool::new(false), + stats: InnerStats::default(), } } - /// Returns the cookie of the next item in the store. This can be used later to construct an iterator - /// that starts at the given point. 
- pub fn cookie(&self) -> u64 { - ((self.shard_selector as u64 & 0xffff) << 32) - | ((self.row_idx as u64 & 0xffff) << 16) - | (self.entry_idx as u64 & 0xffff) + fn reset(&self) -> Result<()> { + let _logical_guards = self + .list_meta_locks + .iter() + .map(|lock| lock.write()) + .collect::>(); + let row_table = self.index_file.rows_table_mut(); + let _rotation_lock = self.rotation_lock.lock(); + let mut data_files = self.data_files.write(); + + data_files.clear(); + self.inflight_tracker.clear_all(); + self.index_file.reset(row_table)?; + + let index_path = index_file_path(self.base_path.as_path()); + let rows_path = index_rows_file_path(self.base_path.as_path()); + let mut removed_any = false; + for entry in std::fs::read_dir(&self.base_path).map_err(Error::IOError)? { + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + if path.file_name().and_then(|name| name.to_str()) == Some(".lockfile") + || path == index_path + || path == rows_path + { + continue; + } + + let file_type = entry.file_type().map_err(Error::IOError)?; + if file_type.is_dir() { + std::fs::remove_dir_all(&path).map_err(Error::IOError)?; + removed_any = true; + } else if file_type.is_file() || file_type.is_symlink() { + std::fs::remove_file(&path).map_err(Error::IOError)?; + removed_any = true; + } + } + if removed_any { + sync_dir(self.base_path.as_path())?; + } + + let active_file_idx = 0; + let active_file_ordinal = INITIAL_DATA_FILE_ORDINAL; + let data_file = Arc::new(DataFile::create( + self.base_path.as_path(), + self.config.clone(), + active_file_idx, + active_file_ordinal, + )?); + data_files.insert(active_file_idx, data_file); + self.active_file_idx + .store(active_file_idx, Ordering::Release); + self.active_file_ordinal + .store(active_file_ordinal, Ordering::Release); + self.uncommitted_entries_delta.store(0, Ordering::Relaxed); + *self.checkpoint_state.lock() = CheckpointState::default(); + self.stats.reset(); + + Ok(()) } - // Constructs an iterator starting at 
the given cookie - pub fn from_cookie(store: &'a CandyStore, cookie: u64, raw: bool, include_val: bool) -> Self { - Self { - store, - shard_selector: ((cookie >> 32) & 0xffff) as u32, - row_idx: ((cookie >> 16) & 0xffff) as usize, - entry_idx: (cookie & 0xffff) as usize, - raw, - include_val, + fn record_lookup(&self, found: bool) { + if found { + self.stats + .num_positive_lookups + .fetch_add(1, Ordering::Relaxed); + } else { + self.stats + .num_negative_lookups + .fetch_add(1, Ordering::Relaxed); } } -} -impl<'a> Iterator for CandyStoreIterator<'a> { - type Item = Result; + fn record_read(&self, bytes: u64) { + self.stats.num_read_ops.fetch_add(1, Ordering::Relaxed); + self.stats + .num_read_bytes + .fetch_add(bytes, Ordering::Relaxed); + } - fn next(&mut self) -> Option { - while self.shard_selector < ShardRouter::END_OF_SHARDS { - let res = self.store.root.shared_op(self.shard_selector, |sh| { - while self.row_idx < NUM_ROWS { - let row_idx = self.row_idx; - let entry_idx = self.entry_idx; + fn record_write(&self, offset: u64, bytes: u64) { + self.stats.num_write_ops.fetch_add(1, Ordering::Relaxed); + self.stats + .num_write_bytes + .fetch_add(bytes, Ordering::Relaxed); + self.note_checkpoint_write(offset + bytes); + } - self.entry_idx += 1; - if self.entry_idx >= ROW_WIDTH { - self.entry_idx = 0; - self.row_idx += 1; - } + fn signal_compaction_scan(&self) { + let mut state = self.compaction_state.lock(); + if state.wake_requested { + return; + } + state.wake_requested = true; + self.compaction_condvar.notify_one(); + } - let Some((mut k, v)) = sh.read_at(row_idx, entry_idx, self.include_val)? 
else { - continue; - }; - if self.raw { - return Ok((sh.span.start, Some((k, v)))); - } else if k.ends_with(USER_NAMESPACE) { - k.truncate(k.len() - USER_NAMESPACE.len()); - return Ok((sh.span.start, Some((k, v)))); - } - } + fn maybe_signal_compaction_threshold_crossing( + &self, + file_idx: u16, + previous_waste: u32, + new_waste: u32, + ) { + if file_idx == self.active_file_idx.load(Ordering::Acquire) { + return; + } - self.entry_idx = 0; - self.row_idx = 0; - Ok((sh.span.end, None)) - }); + let threshold = self.config.compaction_min_threshold; + if previous_waste <= threshold && new_waste > threshold { + self.signal_compaction_scan(); + } + } - match res { - Ok((shard_selector, kv)) => { - self.shard_selector = shard_selector; - if let Some(kv) = kv { - return Some(Ok(kv)); - } - // continue + fn next_compaction_candidates(&self, max_candidates: usize) -> Vec<(u16, u64)> { + let active_file_idx = self.active_file_idx.load(Ordering::Acquire); + let commit_file_ordinal = self.index_file.checkpoint_cursor().0; + let files = self.data_files.read(); + let mut candidates = files + .iter() + .filter_map(|(&file_idx, data_file)| { + if file_idx == active_file_idx + || data_file.file_ordinal >= commit_file_ordinal + || self.index_file.file_waste(file_idx) <= self.config.compaction_min_threshold + { + return None; } - Err(e) => return Some(Err(e)), - } - } + Some(( + file_idx, + data_file.file_ordinal, + self.index_file.file_waste(file_idx), + )) + }) + .collect::>(); + candidates.sort_by(|left, right| { + right + .2 + .cmp(&left.2) + .then_with(|| left.1.cmp(&right.1)) + .then_with(|| left.0.cmp(&right.0)) + }); + candidates + .into_iter() + .take(max_candidates) + .map(|(file_idx, file_ordinal, _)| (file_idx, file_ordinal)) + .collect() + } - None + fn logical_lock_index(&self, ns: KeyNamespace, key: &[u8]) -> usize { + let mut hasher = SipHasher13::new_with_keys(0x1701_0a66_2024_6b90, 0x284f_fa2e_3e02_3e2a); + hasher.write_u8(ns as u8); + hasher.write(key); + 
(hasher.finish() as usize) & self.list_meta_locks_mask } -} -impl CandyStore { - /// Opens or creates a new CandyStore. - /// * dir_path - the directory where shards will be kept - /// * config - the configuration options for the store - pub fn open(dir_path: impl AsRef, config: Config) -> Result { - let config = Arc::new(InternalConfig { - dir_path: dir_path.as_ref().to_path_buf(), - expected_number_of_keys: config.expected_number_of_keys, - hash_seed: config.hash_seed, - max_concurrent_list_ops: config.max_concurrent_list_ops, - max_shard_size: config.max_shard_size, - min_compaction_threashold: config.min_compaction_threashold, - truncate_up: config.truncate_up, - clear_on_unsupported_version: config.clear_on_unsupported_version, - mlock_headers: config.mlock_headers, - num_compaction_threads: config.num_compaction_threads, - #[cfg(feature = "flush_aggregation")] - flush_aggregation_delay: config.flush_aggregation_delay, - }); + fn data_file(&self, file_idx: u16) -> Result> { + self.data_files + .read() + .get(&file_idx) + .cloned() + .ok_or(Error::MissingDataFile(file_idx)) + } - std::fs::create_dir_all(dir_path)?; - let lockfilename = config.dir_path.join(".lock"); - let mut lockfile = LockFile::open(&lockfilename)?; - if !lockfile.try_lock_with_pid()? { - let (pid, comm, stat) = if let Ok(mut pid) = std::fs::read_to_string(&lockfilename) { - // this may fail on non-linux OSs, but we default to "?" 
anyway - pid = pid.trim().to_owned(); - let exe: String = std::fs::read_link(format!("/proc/{pid}/exe")) - .unwrap_or("?".into()) - .to_string_lossy() - .to_string() - .to_owned(); - - let stat: String = std::fs::read_link(format!("/proc/{pid}/stat")) - .unwrap_or("?".into()) - .to_string_lossy() - .to_string() - .to_owned(); - - (pid, exe, stat) - } else { - ("?".into(), "?".into(), "?".into()) - }; + fn ordered_data_files(&self) -> Vec> { + let mut files = self.data_files.read().values().cloned().collect::>(); + files.sort_by_key(|data_file| data_file.file_ordinal); + files + } - bail!( - "Lock file {lockfilename:?} is held by pid {:?} exe={:?} stat {:?}", - pid, - comm, - stat - ); + fn bump_histogram(&self, entry_size: u64) { + // Buckets: [<64, <256, <1K, <4K, <16K, >=16K] + // Boundaries at ilog2 = 6, 8, 10, 12, 14 → bucket = ((ilog2 - 4) / 2).clamp(0, 5) + let bucket = ((entry_size.max(1).ilog2() as usize).saturating_sub(4) / 2).min(5); + self.stats.size_histogram[bucket].fetch_add(1, Ordering::Relaxed); + } + + fn add_uncommitted_num_entries(&self, delta: i64) { + self.uncommitted_entries_delta + .fetch_add(delta, Ordering::Relaxed); + } + + /// Applies `delta` to the persisted committed entry count, clamping at + /// zero. Returns the actual change applied (which may differ from `delta` + /// when the count would underflow). 
+ fn advance_committed_num_entries(&self, delta: i64) -> i64 { + if delta == 0 { + return 0; } - let mut num_keyed_locks = config.max_concurrent_list_ops.max(4); - if !num_keyed_locks.is_power_of_two() { - num_keyed_locks = 1 << (num_keyed_locks.ilog2() + 1); + let committed = &self.index_file.header_ref().committed_num_entries; + let mut current = committed.load(Ordering::Relaxed); + loop { + let updated = current.saturating_add_signed(delta); + match committed.compare_exchange_weak( + current, + updated, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => return updated as i64 - current as i64, + Err(observed) => current = observed, + } } + } - let mut keyed_locks = vec![]; - for _ in 0..num_keyed_locks { - keyed_locks.push(Mutex::new(())); + /// Folds a checkpointed delta into the persisted committed count and + /// adjusts the runtime uncommitted delta so that + /// `committed + uncommitted == live_count` is preserved. + /// + /// When many inserts and removes of the same keys happen within one + /// checkpoint window, the drained delta can be more negative than + /// `committed` can absorb (since it is unsigned). In that case only + /// the clamped portion is applied and the remainder stays in + /// `uncommitted_entries_delta`. 
+ fn fold_checkpointed_num_entries(&self, delta: i64) { + if delta == 0 { + return; } - let stats = Arc::new(InternalStats::default()); - let threadpool = Arc::new(CompactionThreadPool::new(config.num_compaction_threads)); - let root = ShardRouter::new(config.clone(), stats.clone(), threadpool.clone())?; + let actual = self.advance_committed_num_entries(delta); + self.uncommitted_entries_delta + .fetch_add(-actual, Ordering::Relaxed); + } - Ok(Self { - config, - root, - keyed_locks_mask: num_keyed_locks - 1, - keyed_locks, - _lockfile: lockfile, - stats, - //threadpool, + fn persist_checkpoint_cursor(&self, ordinal: u64, offset: u64) { + self.index_file.persist_checkpoint_cursor(ordinal, offset); + } + + fn perform_checkpoint(&self) -> Result<()> { + let snapshot = self.snapshot_checkpoint_progress()?; + let current_cursor = self.index_file.checkpoint_cursor(); + if snapshot.checkpoint_ordinal == current_cursor.0 + && snapshot.checkpoint_offset == current_cursor.1 + && snapshot.checkpointed_delta == 0 + { + return Ok(()); + } + self.sync_checkpoint(snapshot) + } + + fn snapshot_checkpoint_progress(&self) -> Result { + let files = self.data_files.read(); + let active_idx = self.active_file_idx.load(Ordering::Acquire); + let active_file = files + .get(&active_idx) + .cloned() + .ok_or(Error::MissingDataFile(active_idx))?; + let (checkpoint_ordinal, checkpoint_offset, checkpointed_delta) = + self.inflight_tracker.checkpoint_progress(&active_file); + let (last_commit_ordinal, last_commit_offset) = self.index_file.checkpoint_cursor(); + Ok(CheckpointSnapshot { + checkpoint_ordinal, + checkpoint_offset, + checkpointed_delta, + last_commit_ordinal, + last_commit_offset, }) } - /// returns the directory where shards are kept - pub fn get_shards_directory(&self) -> &Path { - &self.config.dir_path + fn sync_checkpoint(&self, snap: CheckpointSnapshot) -> Result<()> { + let files = self.data_files.read(); + for data_file in files.values() { + if data_file.file_ordinal > 
snap.last_commit_ordinal { + data_file.sync_to_current()?; + } else if data_file.file_ordinal == snap.last_commit_ordinal { + data_file.sync_data(snap.last_commit_offset, data_file.used_bytes())?; + } + } + drop(files); + + self.fold_checkpointed_num_entries(snap.checkpointed_delta); + self.persist_checkpoint_cursor(snap.checkpoint_ordinal, snap.checkpoint_offset); + self.index_file.sync_all()?; + sync_dir(&self.base_path) } - /// Syncs all in-memory changes of all shards to disk. Concurrent changes are allowed while - /// flushing, and may result in partially-sync'ed store. Use sparingly, as this is a costly operaton. - pub fn flush(&self) -> Result<()> { - self.root.call_on_all_shards(|sh| sh.flush())?; - Ok(()) + fn perform_checkpoint_with_logical_locks(&self) -> Result<()> { + let _logical_guards = self + .list_meta_locks + .iter() + .map(|lock| lock.write()) + .collect::>(); + self.perform_checkpoint() } + fn _split_row(&self, hc: HashCoord, sl: u64, gsl: u64) -> Result<()> { + let nsl = sl + 1; + let low_row_idx = hc.row_index(sl); + let high_row_idx = low_row_idx | (1 << sl); + + if nsl > gsl + && let Some(remap_dur) = self.index_file.grow(nsl)? 
+ { + self.stats.last_remap_dur_ms.store( + u64::try_from(remap_dur.as_millis()).unwrap_or(u64::MAX), + Ordering::Relaxed, + ); + } + + let rows_table = self.index_file.rows_table(); + + let low_shard = rows_table.shard_id(low_row_idx); + let high_shard = rows_table.shard_id(high_row_idx); + debug_assert!( + low_shard <= high_shard, + "high_row_idx sets a higher bit, so high_shard >= low_shard" + ); + + let mut low_row = rows_table.row_mut(low_row_idx); + + let _high_guard = if low_shard < high_shard { + Some(rows_table.lock_shard(high_shard)) + } else { + None + }; - /// Clears the store (erasing all keys), and removing all shard files - pub fn clear(&self) -> Result<()> { - self.root.clear()?; - self.stats.clear(); + if low_row.split_level.load(Ordering::Acquire) != sl { + return Ok(()); + } + // SAFETY: the high row (being created) has a split_level of 0, making it unusable by anyone. + // We properly hold the high_row shard lock if it differs from the low_row shard. + let high_row = unsafe { &mut *rows_table.unlocked_row_ptr(high_row_idx) }; + debug_assert_eq!(high_row.split_level.load(Ordering::Acquire), 0); + let split_bit = 1 << (sl - MIN_SPLIT_LEVEL as u64); + for col in 0..ROW_WIDTH { + let entry = low_row.pointers[col]; + if low_row.signatures[col] != HashCoord::INVALID_SIG + && entry.is_valid() + && (entry.masked_row_selector() as u64) & split_bit != 0 + { + high_row.insert(col, low_row.signatures[col], entry); + low_row.remove(col); + } + } + + low_row.set_split_level(nsl); + high_row.set_split_level(nsl); Ok(()) } - pub(crate) fn ensure_sizes(key: &[u8], val: &[u8]) -> Result<()> { - ensure!(key.len() <= MAX_KEY_SIZE, CandyError::KeyTooLong(key.len())); - ensure!( - val.len() <= MAX_VALUE_SIZE, - CandyError::ValueTooLong(val.len()) - ); + /// Rotate to a new data file when the active one is full. 
+ /// + /// The `rotation_lock` serializes concurrent rotations, so the read-then-write + /// on `data_files` (find a free index, then insert) is not a TOCTOU race. + /// `compact_file` also writes to `data_files` (removing files) but only + /// touches non-active indices, so there is no conflict. + fn _rotate_data_file(&self, active_idx: u16) -> Result<()> { + { + let _rot_lock = self.rotation_lock.lock(); + + if self.active_file_idx.load(Ordering::Acquire) != active_idx { + return Ok(()); + } + + let active_file = self.data_file(active_idx)?; + let active_ordinal = active_file.file_ordinal; + + let mut next_idx = + (self.active_file_idx.load(Ordering::Relaxed) + 1) & MAX_DATA_FILE_IDX; + let mut attempts = 0; + { + let files = self.data_files.read(); + while files.contains_key(&next_idx) { + next_idx = (next_idx + 1) & MAX_DATA_FILE_IDX; + attempts += 1; + if attempts > MAX_DATA_FILES { + return Err(Error::TooManyDataFiles); + } + } + } + + let ordinal = self.active_file_ordinal.fetch_add(1, Ordering::Relaxed) + 1; + let data_file = Arc::new(DataFile::create( + self.base_path.as_path(), + self.config.clone(), + next_idx, + ordinal, + )?); + + active_file.seal_for_rotation(); + + self.data_files.write().insert(next_idx, data_file); + self.active_file_idx.store(next_idx, Ordering::Release); + + if active_ordinal != 0 + && self.index_file.file_waste(active_idx) > self.config.compaction_min_threshold + { + self.signal_compaction_scan(); + } + } + _ = self.request_checkpoint_epoch(); Ok(()) } - pub(crate) fn make_user_key(&self, mut key: Vec) -> Vec { - key.extend_from_slice(USER_NAMESPACE); - key + fn _mut_op( + &self, + ns: KeyNamespace, + key: &[u8], + val: &[u8], + mut op: impl FnMut(HashCoord, RowWriteGuard, &[u8], &[u8]) -> Result, + ) -> Result { + let entry_size = aligned_data_entry_size(key.len(), val.len()) as usize; + if key.len() > crate::types::MAX_USER_KEY_SIZE + || val.len() > crate::types::MAX_USER_VALUE_SIZE + || entry_size > 
self.config.max_data_file_size as usize + { + return Err(Error::PayloadTooLarge(entry_size)); + } + + let hc = HashCoord::new(ns, key, self.config.hash_key); + + loop { + let res = { + let row_table = self.index_file.rows_table(); + let gsl = self + .index_file + .header_ref() + .global_split_level + .load(Ordering::Acquire); + let mut sl = gsl; + let mut res = None; + + loop { + let row = row_table.row_mut(hc.row_index(sl)); + let row_sl = row.split_level.load(Ordering::Acquire); + if row_sl == 0 { + // nonexistent row + sl -= 1; + continue; + } + if row_sl > sl { + // split happened, retry + break; + } + + res = Some(op(hc, row, key, val)); + break; + } + + res + }; + + let Some(res) = res else { + continue; + }; + + match res { + Ok(res) => return Ok(res), + Err(Error::SplitRow(sl)) => { + let gsl = self + .index_file + .header_ref() + .global_split_level + .load(Ordering::Acquire); + // note: it is critical we do not hold the row's lock here + self._split_row(hc, sl, gsl)?; + } + Err(Error::RotateDataFile(active_idx)) => { + self._rotate_data_file(active_idx)?; + } + Err(err) => return Err(err), + } + } } +} - pub(crate) fn get_by_hash(&self, ph: PartedHash) -> Result> { - debug_assert!(ph.is_valid()); - self.root - .shared_op(ph.shard_selector(), |sh| sh.get_by_hash(ph)) +impl CandyStore { + pub fn get_db_path(&self) -> &Path { + &self.inner.base_path } - pub(crate) fn get_raw(&self, full_key: &[u8]) -> Result>> { - let ph = PartedHash::new(&self.config.hash_seed, full_key); - self.root - .shared_op(ph.shard_selector(), |sh| sh.get(ph, &full_key)) + fn list_read_guard(&self, ns: KeyNamespace, key: &[u8]) -> RwLockReadGuard<'_, ()> { + self.inner.list_meta_locks[self.inner.logical_lock_index(ns, key)].read() } - /// Gets the value of a key from the store. If the key does not exist, `None` will be returned. - /// The data is fully-owned, no references are returned. 
- pub fn get + ?Sized>(&self, key: &B) -> Result>> { - self.owned_get(key.as_ref().to_owned()) + fn list_write_guard(&self, ns: KeyNamespace, key: &[u8]) -> RwLockWriteGuard<'_, ()> { + self.inner.list_meta_locks[self.inner.logical_lock_index(ns, key)].write() } - /// Same as [Self::get] but takes an owned key - pub fn owned_get(&self, key: Vec) -> Result>> { - self.get_raw(&self.make_user_key(key)) + fn try_heal_range_head( + &self, + meta_ns: KeyNamespace, + range_key: &[u8], + initial_next_idx: u64, + new_head: u64, + mut get_meta: GetMeta, + mut set_meta: SetMeta, + ) -> Result<()> + where + GetMeta: FnMut(&CandyStore, &[u8]) -> Result, + SetMeta: FnMut(&CandyStore, &[u8], RangeMetadata) -> Result<()>, + { + let _lock = self.list_write_guard(meta_ns, range_key); + let mut meta = get_meta(self, range_key)?; + if meta.head >= initial_next_idx && meta.head < new_head { + meta.head = new_head; + set_meta(self, range_key, meta)?; + } + Ok(()) } - /// Checks whether the given key exists in the store - pub fn contains + ?Sized>(&self, key: &B) -> Result { - self.owned_contains(key.as_ref().to_owned()) + fn try_heal_range_tail( + &self, + meta_ns: KeyNamespace, + range_key: &[u8], + initial_end_idx: u64, + new_tail: u64, + mut get_meta: GetMeta, + mut set_meta: SetMeta, + ) -> Result<()> + where + GetMeta: FnMut(&CandyStore, &[u8]) -> Result, + SetMeta: FnMut(&CandyStore, &[u8], RangeMetadata) -> Result<()>, + { + let _lock = self.list_write_guard(meta_ns, range_key); + let mut meta = get_meta(self, range_key)?; + if meta.tail <= initial_end_idx && meta.tail > new_tail { + meta.tail = new_tail; + set_meta(self, range_key, meta)?; + } + Ok(()) } - /// Same as [Self::contains] but takes an owned key - pub fn owned_contains(&self, key: Vec) -> Result { - Ok(self.get_raw(&self.make_user_key(key))?.is_some()) + fn _immut_op( + &self, + ns: KeyNamespace, + key: &[u8], + mut op: impl FnMut(HashCoord, RowReadGuard, &[u8]) -> Result, + ) -> Result { + let hc = 
HashCoord::new(ns, key, self.inner.config.hash_key); + loop { + let row_table = self.inner.index_file.rows_table(); + let gsl = self + .inner + .index_file + .header_ref() + .global_split_level + .load(Ordering::Acquire); + let mut sl = gsl; + loop { + let row = row_table.row(hc.row_index(sl)); + let row_sl = row.split_level.load(Ordering::Acquire); + if row_sl == 0 { + // nonexistent row + sl -= 1; + continue; + } + if row_sl > sl { + // split happened, retry + break; + } + return op(hc, row, key); + } + } } - pub(crate) fn remove_raw(&self, full_key: &[u8]) -> Result>> { - let ph = PartedHash::new(&self.config.hash_seed, full_key); - self.root - .shared_op(ph.shard_selector(), |sh| sh.remove(ph, &full_key)) + fn get_ns(&self, ns: KeyNamespace, key: &[u8]) -> Result>> { + self._immut_op(ns, key, |hc, row, key| { + let files = self.inner.data_files.read(); + for (_, entry) in row.iter_matches(hc) { + let Some(file) = files.get(&entry.file_idx()) else { + continue; + }; + self.inner.record_read(entry.size_hint() as u64); + let kv = match file.read_kv(entry.file_offset(), entry.size_hint()) { + Ok(kv) => kv, + Err(Error::IOError(e)) + if e.kind() == std::io::ErrorKind::UnexpectedEof + || e.kind() == std::io::ErrorKind::InvalidData => + { + continue; + } + Err(e) => return Err(e), + }; + if kv.key() == key { + return Ok(Some(kv.value().to_vec())); + } else { + self.inner + .stats + .num_collisions + .fetch_add(1, Ordering::Relaxed); + } + } + Ok(None) + }) } - /// Removes a key-value pair from the store, returning `None` if the key did not exist, - /// or `Some(old_value)` if it did - pub fn remove + ?Sized>(&self, key: &B) -> Result>> { - self.owned_remove(key.as_ref().to_owned()) + /// Returns the current value for `key`, if it exists. 
+ pub fn get(&self, key: impl AsRef<[u8]>) -> Result>> { + let value = self.get_ns(KeyNamespace::User, key.as_ref())?; + self.inner.record_lookup(value.is_some()); + Ok(value) } - /// Same as [Self::remove] but takes an owned key - pub fn owned_remove(&self, key: Vec) -> Result>> { - self.remove_raw(&self.make_user_key(key)) + /// Returns `true` if `key` currently exists. + pub fn contains(&self, key: impl AsRef<[u8]>) -> Result { + self.get(key).map(|value| value.is_some()) } - pub(crate) fn insert_internal( + fn get_or_create_ns( &self, - full_key: &[u8], - val: &[u8], - mode: InsertMode, - ) -> Result { - let ph = PartedHash::new(&self.config.hash_seed, full_key); + ns: KeyNamespace, + key: &[u8], + default_val: &[u8], + ) -> Result { + self.inner + ._mut_op(ns, key, default_val, |hc, mut row, key, val| { + let files = self.inner.data_files.read(); + for (_, entry) in row.iter_matches(hc) { + let Some(file) = files.get(&entry.file_idx()) else { + continue; + }; + self.inner.record_read(entry.size_hint() as u64); + let kv = file.read_kv(entry.file_offset(), entry.size_hint())?; + if kv.key() == key { + return Ok(GetOrCreateStatus::ExistingValue(kv.into_value())); + } + } - ensure!( - full_key.len() <= MAX_TOTAL_KEY_SIZE, - CandyError::KeyTooLong(full_key.len()) - ); - ensure!( - val.len() <= MAX_TOTAL_VALUE_SIZE, - CandyError::ValueTooLong(val.len()) + if let Some(col) = row.find_free_slot() { + let active_idx = self.inner.active_file_idx.load(Ordering::Acquire); + let active_file = files + .get(&active_idx) + .ok_or(Error::MissingDataFile(active_idx))?; + let (file_off, size, inflight_guard) = active_file.append_kv( + EntryType::Insert, + ns, + key, + val, + row.shard_idx, + &self.inner.inflight_tracker, + )?; + self.inner.record_write(file_off, size as u64); + row.insert( + col, + hc.sig, + EntryPointer::new(active_idx, file_off, size, hc.masked_row_selector()), + ); + self.record_write_stats(key.len(), val.len()); + inflight_guard.complete(); + 
Ok(GetOrCreateStatus::CreatedNew(val.to_vec())) + } else { + Err(Error::SplitRow(row.split_level.load(Ordering::Relaxed))) + } + }) + } + + /// Returns the existing value for `key`, or inserts `default_val` and returns it. + pub fn get_or_create + ?Sized, B2: AsRef<[u8]> + ?Sized>( + &self, + key: &B1, + default_val: &B2, + ) -> Result { + self.get_or_create_ns(KeyNamespace::User, key.as_ref(), default_val.as_ref()) + } + + fn track_update_waste(&self, file_idx: u16, klen: usize, vlen: usize) { + let added_waste = aligned_data_entry_waste(klen, vlen); + let new_waste = self.inner.index_file.add_file_waste(file_idx, added_waste); + self.inner.maybe_signal_compaction_threshold_crossing( + file_idx, + new_waste.saturating_sub(added_waste), + new_waste, ); + } - if full_key.len() + val.len() > self.config.max_shard_size as usize { - return Err(anyhow!(CandyError::EntryCannotFitInShard( - full_key.len() + val.len(), - self.config.max_shard_size as usize - ))); - } + fn record_write_stats(&self, klen: usize, vlen: usize) { + let entry_size = aligned_data_entry_size(klen, vlen); + self.inner.add_uncommitted_num_entries(1); + self.inner + .stats + .num_inserted + .fetch_add(1, Ordering::Relaxed); + self.inner.bump_histogram(entry_size); + } - self.root.insert(ph, full_key, val, mode) + fn record_replace_stats(&self, new_klen: usize, new_vlen: usize) { + let new_entry_size = aligned_data_entry_size(new_klen, new_vlen); + self.inner.stats.num_updated.fetch_add(1, Ordering::Relaxed); + self.inner.bump_histogram(new_entry_size); } - pub(crate) fn set_raw(&self, full_key: &[u8], val: &[u8]) -> Result { - match self.insert_internal(full_key, val, InsertMode::Set)? 
{ - InsertStatus::Added => Ok(SetStatus::CreatedNew), - InsertStatus::Replaced(v) => Ok(SetStatus::PrevValue(v)), - InsertStatus::AlreadyExists(v) => Ok(SetStatus::PrevValue(v)), - InsertStatus::KeyDoesNotExist => unreachable!(), - InsertStatus::SplitNeeded => unreachable!(), - } + fn record_remove_stats(&self) { + self.inner.add_uncommitted_num_entries(-1); + self.inner.stats.num_removed.fetch_add(1, Ordering::Relaxed); } - /// Inserts a key-value pair, creating it or replacing an existing pair. Note that if the program crashed - /// while or "right after" this operation, or if the operating system is unable to flush the page cache, - /// you may lose some data. However, you will still be in a consistent state, where you will get a previous - /// version of the state. - /// - /// While this method is O(1) amortized, every so often it will trigger either a shard compaction or a - /// shard split, which requires rewriting the whole shard. However, unlike LSM trees, this operation is - /// constant in size - pub fn set + ?Sized, B2: AsRef<[u8]> + ?Sized>( + fn apply_update_to_existing_entry( &self, - key: &B1, - val: &B2, - ) -> Result { - self.owned_set(key.as_ref().to_owned(), val.as_ref()) + row: &mut RowWriteGuard<'_>, + update: ExistingEntryUpdate<'_>, + ) -> Result<()> { + let active_idx = self.inner.active_file_idx.load(Ordering::Acquire); + let active_file = update + .files + .get(&active_idx) + .ok_or(Error::MissingDataFile(active_idx))?; + let (file_off, size, inflight_guard) = active_file.append_kv( + EntryType::Update, + update.ns, + update.key, + update.val, + update.shard_idx, + &self.inner.inflight_tracker, + )?; + self.inner.record_write(file_off, size as u64); + if let Some(name) = update.crash_point_name { + crate::crash_point(name); + } + + row.replace_pointer( + update.col, + EntryPointer::new(active_idx, file_off, size, update.hc.masked_row_selector()), + ); + self.track_update_waste(update.src_file_idx, update.old_klen, update.old_vlen); + 
self.record_replace_stats(update.key.len(), update.val.len()); + inflight_guard.complete(); + Ok(()) + } + + fn set_ns(&self, ns: KeyNamespace, key: &[u8], val: &[u8]) -> Result>> { + self.inner._mut_op(ns, key, val, |hc, mut row, key, val| { + let files = self.inner.data_files.read(); + for (col, entry) in row.iter_matches(hc) { + let Some(file) = files.get(&entry.file_idx()) else { + continue; + }; + self.inner.record_read(entry.size_hint() as u64); + let kv = file.read_kv(entry.file_offset(), entry.size_hint())?; + if kv.key() == key { + // optimization + if kv.value() == val { + return Ok(Some(kv.into_value())); + } + let klen = kv.key().len(); + let vlen = kv.value().len(); + let old_val = kv.into_value(); + let src_file_idx = file.file_idx; + + let shard_idx = row.shard_idx; + self.apply_update_to_existing_entry( + &mut row, + ExistingEntryUpdate { + files: &files, + ns, + key, + val, + hc, + col, + shard_idx, + src_file_idx, + old_klen: klen, + old_vlen: vlen, + crash_point_name: Some("set_after_write_before_update"), + }, + )?; + return Ok(Some(old_val)); + } else { + self.inner + .stats + .num_collisions + .fetch_add(1, Ordering::Relaxed); + } + } + + if let Some(col) = row.find_free_slot() { + let active_idx = self.inner.active_file_idx.load(Ordering::Acquire); + let active_file = files + .get(&active_idx) + .ok_or(Error::MissingDataFile(active_idx))?; + let (file_off, size, inflight_guard) = active_file.append_kv( + EntryType::Insert, + ns, + key, + val, + row.shard_idx, + &self.inner.inflight_tracker, + )?; + self.inner.record_write(file_off, size as u64); + crate::crash_point("set_after_write_before_insert"); + row.insert( + col, + hc.sig, + EntryPointer::new(active_idx, file_off, size, hc.masked_row_selector()), + ); + self.record_write_stats(key.len(), val.len()); + inflight_guard.complete(); + Ok(None) + } else { + Err(Error::SplitRow(row.split_level.load(Ordering::Relaxed))) + } + }) } - /// Same as [Self::set], but the key passed owned to this 
function - pub fn owned_set(&self, key: Vec, val: &[u8]) -> Result { - Self::ensure_sizes(&key, &val)?; - self.set_raw(&self.make_user_key(key), val) + /// Inserts or replaces `key` with `val`. + pub fn set(&self, key: impl AsRef<[u8]>, val: impl AsRef<[u8]>) -> Result { + Ok( + match self.set_ns(KeyNamespace::User, key.as_ref(), val.as_ref())? { + Some(previous) => crate::SetStatus::PrevValue(previous), + None => crate::SetStatus::CreatedNew, + }, + ) } - pub(crate) fn replace_raw( + fn replace_ns( &self, - full_key: &[u8], + ns: KeyNamespace, + key: &[u8], val: &[u8], expected_val: Option<&[u8]>, ) -> Result { - match self.insert_internal(full_key, val, InsertMode::Replace(expected_val))? { - InsertStatus::Added => unreachable!(), - InsertStatus::Replaced(v) => Ok(ReplaceStatus::PrevValue(v)), - InsertStatus::AlreadyExists(v) => Ok(ReplaceStatus::WrongValue(v)), - InsertStatus::KeyDoesNotExist => Ok(ReplaceStatus::DoesNotExist), - InsertStatus::SplitNeeded => unreachable!(), - } + self.inner._mut_op(ns, key, val, |hc, mut row, key, val| { + let files = self.inner.data_files.read(); + for (col, entry) in row.iter_matches(hc) { + let Some(file) = files.get(&entry.file_idx()) else { + continue; + }; + self.inner.record_read(entry.size_hint() as u64); + let kv = file.read_kv(entry.file_offset(), entry.size_hint())?; + if kv.key() == key { + if let Some(expected) = expected_val + && kv.value() != expected + { + return Ok(ReplaceStatus::WrongValue(kv.into_value())); + } + // optimization + if kv.value() == val { + return Ok(ReplaceStatus::PrevValue(kv.into_value())); + } + + let klen = kv.key().len(); + let vlen = kv.value().len(); + let old_val = kv.into_value(); + let src_file_idx = file.file_idx; + + let shard_idx = row.shard_idx; + self.apply_update_to_existing_entry( + &mut row, + ExistingEntryUpdate { + files: &files, + ns, + key, + val, + hc, + col, + shard_idx, + src_file_idx, + old_klen: klen, + old_vlen: vlen, + crash_point_name: None, + }, + )?; + return 
Ok(ReplaceStatus::PrevValue(old_val)); + } + } + Ok(ReplaceStatus::DoesNotExist) + }) } - /// Replaces the value of an existing key with a new value. If the key existed, returns - /// `PrevValue(value)` with its old value, and if it did not, returns `DoesNotExist` but - /// does not create the key. - /// - /// See [Self::set] for more details - pub fn replace + ?Sized, B2: AsRef<[u8]> + ?Sized>( + /// Replaces `key` with `val` only if the current value matches `expected_val` when provided. + pub fn replace + ?Sized, B2: AsRef<[u8]> + ?Sized, B3: AsRef<[u8]> + ?Sized>( &self, key: &B1, val: &B2, - expected_val: Option<&B2>, + expected_val: Option<&B3>, ) -> Result { - self.owned_replace( - key.as_ref().to_owned(), + self.replace_ns( + KeyNamespace::User, + key.as_ref(), val.as_ref(), - expected_val.map(|ev| ev.as_ref()), + expected_val.map(|expected| expected.as_ref()), ) } - /// Same as [Self::replace], but the key passed owned to this function - pub fn owned_replace( - &self, - key: Vec, - val: &[u8], - expected_val: Option<&[u8]>, - ) -> Result { - Self::ensure_sizes(&key, &val)?; - self.replace_raw(&self.make_user_key(key), val, expected_val) + fn track_tombstone_waste(&self, file_idx: u16, klen: usize, vlen: usize) { + let active_idx = self.inner.active_file_idx.load(Ordering::Relaxed); + if file_idx == active_idx { + self.inner.index_file.add_file_waste( + file_idx, + aligned_data_entry_waste(klen, vlen) + aligned_tombstone_entry_waste(klen), + ); + } else { + let old_entry_waste = aligned_data_entry_waste(klen, vlen); + let new_waste = self + .inner + .index_file + .add_file_waste(file_idx, old_entry_waste); + self.inner.maybe_signal_compaction_threshold_crossing( + file_idx, + new_waste.saturating_sub(old_entry_waste), + new_waste, + ); + self.inner + .index_file + .add_file_waste(active_idx, aligned_tombstone_entry_waste(klen)); + } } - pub(crate) fn get_or_create_raw( - &self, - full_key: &[u8], - default_val: Vec, - ) -> Result { - match 
self.insert_internal(full_key, &default_val, InsertMode::GetOrCreate)? { - InsertStatus::Added => Ok(GetOrCreateStatus::CreatedNew(default_val)), - InsertStatus::AlreadyExists(v) => Ok(GetOrCreateStatus::ExistingValue(v)), - InsertStatus::Replaced(_) => unreachable!(), - InsertStatus::KeyDoesNotExist => unreachable!(), - InsertStatus::SplitNeeded => unreachable!(), + fn remove_ns(&self, ns: KeyNamespace, key: &[u8]) -> Result>> { + self.inner._mut_op(ns, key, &[], |hc, mut row, key, _| { + let files = self.inner.data_files.read(); + for (col, entry) in row.iter_matches(hc) { + let Some(file) = files.get(&entry.file_idx()) else { + continue; + }; + self.inner.record_read(entry.size_hint() as u64); + let kv = file.read_kv(entry.file_offset(), entry.size_hint())?; + + if kv.key() == key { + let klen = kv.key().len(); + let vlen = kv.value().len(); + let old_val = kv.into_value(); + let src_file_idx = file.file_idx; + + let active_idx = self.inner.active_file_idx.load(Ordering::Acquire); + let active_file = files + .get(&active_idx) + .ok_or(Error::MissingDataFile(active_idx))?; + let (file_off, tombstone_size, inflight_guard) = active_file.append_tombstone( + ns, + key, + row.shard_idx, + &self.inner.inflight_tracker, + )?; + self.inner.record_write(file_off, tombstone_size as u64); + + row.remove(col); + self.track_tombstone_waste(src_file_idx, klen, vlen); + self.record_remove_stats(); + inflight_guard.complete(); + return Ok(Some(old_val)); + } + } + + Ok(None) + }) + } + + /// Removes `key` and returns its previous value if it existed. + pub fn remove(&self, key: impl AsRef<[u8]>) -> Result>> { + self.remove_ns(KeyNamespace::User, key.as_ref()) + } + + /// Iterates over all currently live user key/value pairs. 
+ pub fn iter_items(&self) -> impl Iterator, Vec)>> + '_ { + let mut row_idx = 0usize; + let mut row_entries: Vec = Vec::with_capacity(ROW_WIDTH); + let mut batch_files = None::>>>; + let mut scratch_buf = Vec::new(); + let mut ptr_idx = 0usize; + + std::iter::from_fn(move || { + loop { + if ptr_idx < row_entries.len() { + let ptr = row_entries[ptr_idx]; + ptr_idx += 1; + let files = batch_files + .as_ref() + .expect("row entries should only be drained with a file map guard"); + let Some(file) = files.get(&ptr.file_idx()) else { + continue; + }; + self.inner.record_read(ptr.size_hint() as u64); + let kv = match file.read_kv_into( + ptr.file_offset(), + ptr.size_hint(), + &mut scratch_buf, + ) { + Ok(kv) => kv, + Err(Error::IOError(e)) + if e.kind() == std::io::ErrorKind::UnexpectedEof + || e.kind() == std::io::ErrorKind::InvalidData => + { + continue; + } + Err(e) => return Some(Err(e)), + }; + if kv.ns != KeyNamespace::User as u8 { + continue; + } + let key = kv.key().to_vec(); + let value = kv.value().to_vec(); + return Some(Ok((key, value))); + } + + row_entries.clear(); + batch_files = None; + ptr_idx = 0; + + loop { + let row_table = self.inner.index_file.rows_table(); + let gsl = self + .inner + .index_file + .header_ref() + .global_split_level + .load(Ordering::Acquire); + let active_rows = 1usize << gsl; + + if row_idx >= active_rows { + break; + } + + let idx = row_idx; + row_idx += 1; + + let row = row_table.row(idx); + if row.split_level.load(Ordering::Acquire) == 0 { + continue; + } + for col in 0..ROW_WIDTH { + if row.signatures[col] != HashCoord::INVALID_SIG + && row.pointers[col].is_valid() + { + row_entries.push(row.pointers[col]); + } + } + batch_files = Some(self.inner.data_files.read()); + break; + } + + if row_entries.is_empty() { + return None; + } + } + }) + } + + /// Flushes index and data files to stable storage. 
+ pub fn flush(&self) -> Result<()> { + self.inner.index_file.sync_all()?; + let files = self.inner.data_files.read(); + for data_file in files.values() { + data_file.sync_to_current()?; } + sync_dir(&self.inner.base_path) } - /// Gets the value of the given key or creates it with the given default value. If the key did not exist, - /// returns `CreatedNew(default_val)`, and if it did, returns `ExistingValue(value)`. - /// This is done atomically, so it can be used to create a key only if it did not exist before, - /// like `open` with `O_EXCL`. + /// Establishes a durable recovery checkpoint. /// - /// See [Self::set] for more details - pub fn get_or_create + ?Sized, B2: AsRef<[u8]> + ?Sized>( - &self, - key: &B1, - default_val: &B2, - ) -> Result { - self.owned_get_or_create(key.as_ref().to_owned(), default_val.as_ref().to_owned()) + /// Reads the earliest in-flight `(file_ordinal, offset)` tuple across all + /// shards to determine the first position that may still require replay. + /// If no writes are in flight, the checkpoint targets the active file tail. + /// Syncs the data and index files and advances the persisted replay cursor + /// so the next open can resume from this point without replaying earlier + /// writes. + /// + /// This waits for the background checkpoint worker to establish a checkpoint + /// after taking all logical list/queue locks, so compound operations are + /// checkpointed only at well-defined boundaries. + pub fn checkpoint(&self) -> Result<()> { + let target_epoch = self.inner.request_checkpoint_epoch(); + self.inner.wait_for_checkpoint_epoch(target_epoch) } - /// Same as [Self::get_or_create], but the `key` and `default_val` are passed owned to this function - pub fn owned_get_or_create( - &self, - key: Vec, - default_val: Vec, - ) -> Result { - Self::ensure_sizes(&key, &default_val)?; - self.get_or_create_raw(&self.make_user_key(key), default_val) + /// Returns the number of background compaction errors observed since open. 
+ pub fn compaction_errors(&self) -> u64 { + self.inner.stats.compaction_errors.load(Ordering::Relaxed) } - /// Returns an iterator over the whole store (skipping lists or typed items) - pub fn iter(&self) -> CandyStoreIterator<'_> { - CandyStoreIterator::new(self, false, true) + /// Returns the number of currently live entries. + pub fn num_items(&self) -> usize { + let committed = self + .inner + .index_file + .header_ref() + .committed_num_entries + .load(Ordering::Relaxed); + let uncommitted = self.inner.uncommitted_entries_delta.load(Ordering::Relaxed); + let count = committed.saturating_add_signed(uncommitted); + debug_assert!( + (committed as i128 + uncommitted as i128) >= 0, + "live entry count underflow: committed={committed}, uncommitted={uncommitted}" + ); + count as usize } - /// Returns an iterator of keys only over the whole store (skipping lists or typed items) - pub fn iter_keys(&self) -> impl Iterator>> + use<'_> { - CandyStoreIterator::new(self, false, true).map(|res| match res { - Ok(kv) => Ok(kv.0), - Err(e) => Err(e), - }) + /// Returns the current index capacity in entries. + pub fn capacity(&self) -> usize { + let row_table = self.inner.index_file.rows_table(); + let row_count = row_table.row_guard.len() / std::mem::size_of::(); + row_count * ROW_WIDTH } - pub fn iter_raw(&self) -> CandyStoreIterator<'_> { - CandyStoreIterator::new(self, true, true) + /// Shrinks the index when the reclaimable row ratio is at least `min_wasted_ratio`. 
+ pub fn shrink_to_fit_blocking(&self, min_wasted_ratio: f64) -> Result { + let _logical_guards = self + .inner + .list_meta_locks + .iter() + .map(|lock| lock.write()) + .collect::>(); + let row_table = self.inner.index_file.rows_table_mut(); + + let min_wasted_ratio = min_wasted_ratio.clamp(0.0, 1.0); + let current_rows = row_table.row_guard.len() / std::mem::size_of::(); + if current_rows == 0 { + return Ok(0); + } + + let required_rows = self.num_items().div_ceil(ROW_WIDTH * 8 / 10).max(1); + let min_rows_cfg = (self.inner.config.initial_capacity / ROW_WIDTH) + .max(1usize << MIN_SPLIT_LEVEL) + .max(1); + let min_rows = required_rows.max(min_rows_cfg); + + let reclaimable_rows = current_rows.saturating_sub(min_rows); + let reclaimable_ratio = reclaimable_rows as f64 / current_rows as f64; + if reclaimable_ratio < min_wasted_ratio { + return Ok(current_rows); + } + + self.inner + .index_file + .shrink_with_rows_guard(min_rows, row_table) } - /// Returns an iterator starting from the specified cookie (obtained via [CandyStoreIterator::cookie]) - pub fn iter_from_cookie(&self, cookie: u64) -> CandyStoreIterator<'_> { - CandyStoreIterator::from_cookie(self, cookie, false, true) + /// Returns a snapshot of store statistics and accounting counters. + pub fn stats(&self) -> Stats { + let num_rows = self.inner.index_file.num_rows() as u64; + + // Derive data_bytes and waste_bytes from file sizes and per-file + // waste levels rather than maintaining them as persistent counters. 
+ let (total_bytes, num_data_files) = { + let data_files = self.inner.data_files.read(); + + ( + data_files.values().map(|df| df.used_bytes()).sum(), + data_files.len() as u64, + ) + }; + let waste_bytes = self.inner.index_file.total_waste(); + let s = &self.inner.stats; + let checkpoint_state = self.inner.checkpoint_state.lock(); + let checkpoint_generation = self.inner.index_file.checkpoint_generation(); + let checkpoint_epoch = checkpoint_state.completed_epoch; + let uncheckpointed_bytes = self.inner.approx_uncheckpointed_bytes(); + let last_checkpoint_dur = Duration::from_millis(checkpoint_state.last_checkpoint_dur_ms); + + Stats { + num_rows, + num_items: self.num_items() as u64, + index_size_bytes: self.inner.index_file.file_size_bytes(), + num_data_files, + + total_bytes, + waste_bytes, + + num_compactions: s.num_compactions.load(Ordering::Relaxed), + checkpoint_errors: s.checkpoint_errors.load(Ordering::Relaxed), + + last_remap_dur: Duration::from_millis(s.last_remap_dur_ms.load(Ordering::Relaxed)), + checkpoint_generation, + checkpoint_epoch, + uncheckpointed_bytes, + last_checkpoint_dur, + last_compaction_dur: Duration::from_millis( + s.last_compaction_dur_ms.load(Ordering::Relaxed), + ), + last_compaction_reclaimed_bytes: s + .last_compaction_reclaimed_bytes + .load(Ordering::Relaxed), + last_compaction_moved_bytes: s.last_compaction_moved_bytes.load(Ordering::Relaxed), + + num_read_ops: s.num_read_ops.load(Ordering::Relaxed), + num_read_bytes: s.num_read_bytes.load(Ordering::Relaxed), + num_write_ops: s.num_write_ops.load(Ordering::Relaxed), + num_write_bytes: s.num_write_bytes.load(Ordering::Relaxed), + + num_inserted: s.num_inserted.load(Ordering::Relaxed), + num_updated: s.num_updated.load(Ordering::Relaxed), + num_removed: s.num_removed.load(Ordering::Relaxed), + num_positive_lookups: s.num_positive_lookups.load(Ordering::Relaxed), + num_negative_lookups: s.num_negative_lookups.load(Ordering::Relaxed), + num_collisions: 
s.num_collisions.load(Ordering::Relaxed), + + num_rebuilt_entries: s.num_rebuilt_entries.load(Ordering::Relaxed), + num_rebuild_purged_bytes: s.num_rebuild_purged_bytes.load(Ordering::Relaxed), + + entries_under_64: s.size_histogram[0].load(Ordering::Relaxed), + entries_under_256: s.size_histogram[1].load(Ordering::Relaxed), + entries_under_1024: s.size_histogram[2].load(Ordering::Relaxed), + entries_under_4096: s.size_histogram[3].load(Ordering::Relaxed), + entries_under_16384: s.size_histogram[4].load(Ordering::Relaxed), + entries_over_16384: s.size_histogram[5].load(Ordering::Relaxed), + } } - /// Returns an iterator of keys only starting from the specified cookie (obtained via [CandyStoreIterator::cookie]) - pub fn iter_keys_from_cookie( - &self, - cookie: u64, - ) -> impl Iterator>> + use<'_> { - CandyStoreIterator::from_cookie(self, cookie, false, true).map(|res| match res { - Ok(kv) => Ok(kv.0), - Err(e) => Err(e), - }) + /// Simulates a crash by dropping the instance without performing clean shutdown operations (e.g. marking the index as clean). 
+ pub fn _abort_for_testing(self) { + self.allow_clean_shutdown.store(false, Ordering::Relaxed); + drop(self); } +} - /// Returns useful stats about the store - pub fn stats(&self) -> Stats { - let shard_stats = self.root.call_on_all_shards(|sh| sh.get_stats()).unwrap(); +#[cfg(test)] +mod tests { + use super::*; + + use std::{thread, time::Instant}; + + use tempfile::tempdir; + + #[test] + fn test_compaction_errors_reports_counter() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + assert_eq!(db.compaction_errors(), 0); + + db.inner.stats.compaction_errors.store(7, Ordering::Relaxed); + + assert_eq!(db.compaction_errors(), 7); + + Ok(()) + } - let mut stats = Stats::default(); - self.stats.fill_stats(&mut stats); + #[test] + fn test_stats_reports_transient_collision_counter() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; - for stats2 in shard_stats { - stats.num_shards += 1; - stats.occupied_bytes += stats2.write_offset; - stats.wasted_bytes += stats2.wasted_bytes; - stats.num_inserts += stats2.num_inserts; - stats.num_removals += stats2.num_removals; + db.inner.stats.num_collisions.store(11, Ordering::Relaxed); + + assert_eq!(db.stats().num_collisions, 11); + + Ok(()) + } + + #[test] + fn test_stats_reports_last_remap_duration() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.inner + .stats + .last_remap_dur_ms + .store(17, Ordering::Relaxed); + + assert_eq!(db.stats().last_remap_dur, Duration::from_millis(17)); + + Ok(()) + } + + #[test] + fn test_stats_reports_last_compaction_stats() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.inner + .stats + .last_compaction_dur_ms + .store(23, Ordering::Relaxed); + db.inner + .stats + .last_compaction_reclaimed_bytes + .store(1234, Ordering::Relaxed); + db.inner + 
.stats + .last_compaction_moved_bytes + .store(5678, Ordering::Relaxed); + + let stats = db.stats(); + assert_eq!(stats.last_compaction_dur, Duration::from_millis(23)); + assert_eq!(stats.last_compaction_reclaimed_bytes, 1234); + assert_eq!(stats.last_compaction_moved_bytes, 5678); + + Ok(()) + } + + #[test] + fn test_stats_reports_rebuild_counters() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.inner + .stats + .num_rebuilt_entries + .store(11, Ordering::Relaxed); + db.inner + .stats + .num_rebuild_purged_bytes + .store(96, Ordering::Relaxed); + + let stats = db.stats(); + assert_eq!(stats.num_rebuilt_entries, 11); + assert_eq!(stats.num_rebuild_purged_bytes, 96); + + Ok(()) + } + + #[test] + fn test_stats_reports_checkpoint_state() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.stop_compaction(); + db.set("checkpoint-stats", vec![b'z'; 512])?; + + let active_idx = db.inner.active_file_idx.load(Ordering::Acquire); + let active_ordinal = db + .inner + .data_files + .read() + .get(&active_idx) + .expect("active data file should exist") + .file_ordinal; + db.inner.persist_checkpoint_cursor(active_ordinal, 0); + + { + let mut checkpoint_state = db.inner.checkpoint_state.lock(); + checkpoint_state.completed_epoch = 13; + checkpoint_state.last_checkpoint_dur_ms = 29; } - stats + + let expected_dirty = db + .inner + .data_files + .read() + .get(&active_idx) + .expect("active data file should exist") + .used_bytes(); + + let stats = db.stats(); + assert!(stats.checkpoint_generation > 0); + assert_eq!(stats.checkpoint_epoch, 13); + assert_eq!(stats.uncheckpointed_bytes, expected_dirty); + assert_eq!(stats.last_checkpoint_dur, Duration::from_millis(29)); + + Ok(()) } - /// Merges small shards (shards with a used capacity of less than `max_fill_level`), `max_fill_level` should - /// be a number between 0 and 0.5, the reasonable choice is 
0.25. - /// - /// Note 1: this is an expensive operation that takes a global lock on the store (no other operations can - /// take place while merging is in progress). Only use it if you expect the number of items to be at half or - /// less than what it was (i.e., after a peak period) - /// - /// Note 2: merging will stop once we reach the number of shards required for [Config::expected_number_of_keys], - /// if configured - /// - /// Returns true if any shards were merged, false otherwise - pub fn merge_small_shards(&self, max_fill_level: f32) -> Result { - self.root.merge_small_shards(max_fill_level) + #[test] + fn test_checkpoint_does_not_join_compaction_thread() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.stop_compaction(); + *db.compaction_thd.lock() = Some(thread::spawn(|| { + thread::sleep(Duration::from_millis(400)); + })); + + let t0 = Instant::now(); + db.checkpoint()?; + assert!( + t0.elapsed() < Duration::from_millis(200), + "checkpoint should not wait for the compaction thread handle" + ); + + db.compaction_thd + .lock() + .take() + .expect("test compaction thread should still be present") + .join() + .expect("test compaction thread panicked"); + + Ok(()) } - /// Sets a big item, whose value is unlimited in size. Behind the scenes the value is split into chunks - /// and stored as a list. This makes this API non-atomic, i.e., crashing while writing a big value may later - /// allow you to retrieve a partial result. It is up to the caller to add a length field or a checksum to make - /// sure the value is correct. 
- /// - /// Returns true if the value had existed before (thus it was replaced), false otherwise - pub fn set_big + ?Sized, B2: AsRef<[u8]> + ?Sized>( - &self, - key: &B1, - val: &B2, - ) -> Result { - let existed = self.discard_queue(key)?; - self.extend_queue(key, val.as_ref().chunks(MAX_VALUE_SIZE))?; - self.push_to_queue_tail(key, bytes_of(&val.as_ref().len()))?; - Ok(existed) - } - - /// Returns a big item, collecting all the underlying chunks into a single value that's returned to the - /// caller. - pub fn get_big(&self, key: &[u8]) -> Result>> { - let mut val = vec![]; - let range = self.queue_range(key)?; - for res in self.iter_queue(key) { - let (idx, chunk) = res?; - // last element should encode the byte length of the item - if it's missing or encodes a different length, - // consider it corrupt and ignore this element - if idx + 1 == range.end { - if chunk.len() == size_of::() && *from_bytes::(&chunk) == val.len() { - return Ok(Some(val)); - } - } else { - val.extend_from_slice(&chunk); - } + #[test] + fn test_rotation_schedules_background_checkpoint() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open( + dir.path(), + Config { + max_data_file_size: 2048, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }, + )?; + + db.stop_compaction(); + while db.stats().num_data_files < 2 { + let idx = db.stats().num_write_ops; + db.set( + format!("rotate-{idx}"), + format!("payload-{}", "x".repeat(768)), + )?; } - Ok(None) + + let t0 = Instant::now(); + while db.inner.index_file.checkpoint_cursor() == (0, 0) { + assert!( + t0.elapsed() < Duration::from_secs(2), + "rotation should enqueue a checkpoint that advances the replay cursor" + ); + thread::sleep(Duration::from_millis(10)); + } + + Ok(()) } - /// Removes a big item by key. Returns true if the key had existed, false otherwise. 
- /// See also [Self::set_big] - pub fn remove_big(&self, key: &[u8]) -> Result { - self.discard_queue(key) + #[test] + fn test_checkpoint_without_new_bytes_skips_io_and_advances_epoch() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open( + dir.path(), + Config { + checkpoint_interval: None, + checkpoint_delta_bytes: None, + compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }, + )?; + + db.stop_compaction(); + let cursor_before = db.inner.index_file.checkpoint_cursor(); + let requested_before = db.inner.checkpoint_state.lock().requested_epoch; + db.checkpoint()?; + let state = db.inner.checkpoint_state.lock(); + assert_eq!(state.requested_epoch, requested_before + 1); + assert_eq!(state.completed_epoch, requested_before + 1); + assert_eq!(db.inner.index_file.checkpoint_cursor(), cursor_before); + + Ok(()) } -} -// impl Drop for CandyStore { -// fn drop(&mut self) { -// _ = self.threadpool.terminate(); -// } -// } + #[test] + fn test_checkpoint_delta_bytes_schedules_background_checkpoint() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open( + dir.path(), + Config { + checkpoint_interval: None, + checkpoint_delta_bytes: Some(512), + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }, + )?; + + db.stop_compaction(); + db.set("delta-threshold", vec![b'x'; 1024])?; + + let t0 = Instant::now(); + while db.inner.index_file.checkpoint_cursor() == (0, 0) { + assert!( + t0.elapsed() < Duration::from_secs(2), + "checkpoint_delta_bytes should schedule a background checkpoint" + ); + thread::sleep(Duration::from_millis(10)); + } + + Ok(()) + } + + #[test] + fn test_checkpoint_interval_schedules_background_checkpoint() -> Result<()> { + let dir = tempdir().unwrap(); + let db = CandyStore::open( + dir.path(), + Config { + checkpoint_interval: Some(Duration::from_millis(50)), + checkpoint_delta_bytes: None, + compaction_min_threshold: u32::MAX, + 
compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }, + )?; + + db.stop_compaction(); + db.set("interval-threshold", vec![b'y'; 256])?; + + let t0 = Instant::now(); + while db.inner.index_file.checkpoint_cursor() == (0, 0) { + assert!( + t0.elapsed() < Duration::from_secs(2), + "checkpoint_interval should checkpoint dirty bytes even without explicit requests" + ); + thread::sleep(Duration::from_millis(10)); + } + + Ok(()) + } +} diff --git a/src/store/checkpoint.rs b/src/store/checkpoint.rs new file mode 100644 index 0000000..0598eda --- /dev/null +++ b/src/store/checkpoint.rs @@ -0,0 +1,343 @@ +use std::sync::Arc; +use std::sync::atomic::Ordering; +use std::time::{Duration, Instant}; + +use crate::types::{Error, Result}; + +use super::{CandyStore, CheckpointFailure, CheckpointSnapshot, StoreInner}; + +/// RAII guard that marks the checkpoint worker as shut down and wakes all +/// waiters when the worker thread exits — whether it returns normally or +/// panics. This prevents `wait_for_checkpoint_epoch` from blocking forever +/// if the worker encounters an unexpected panic. 
+struct WorkerShutdownGuard<'a> { + inner: &'a StoreInner, +} + +impl Drop for WorkerShutdownGuard<'_> { + fn drop(&mut self) { + let _state = self.inner.checkpoint_state.lock(); + self.inner + .checkpoint_shutting_down + .store(true, Ordering::Release); + self.inner.checkpoint_condvar.notify_all(); + } +} + +enum CheckpointRun { + Shutdown, + Idle { + reset_timer: bool, + }, + Ready { + target_epoch: Option, + snapshot: Result, + }, +} + +impl StoreInner { + fn request_checkpoint_epoch_locked(state: &mut super::CheckpointState) -> u64 { + state.requested_epoch = state + .requested_epoch + .checked_add(1) + .expect("checkpoint epoch overflow"); + state.requested_epoch + } + + pub(super) fn approx_uncheckpointed_bytes(&self) -> u64 { + let active_idx = self.active_file_idx.load(Ordering::Acquire); + let files = self.data_files.read(); + let Some(active_file) = files.get(&active_idx) else { + return 0; + }; + + let (commit_file_ordinal, commit_offset) = self.index_file.checkpoint_cursor(); + let commit_offset = if commit_file_ordinal == active_file.file_ordinal { + commit_offset + } else { + 0 + }; + + active_file.used_bytes().saturating_sub(commit_offset) + } + + pub(super) fn request_checkpoint_epoch(&self) -> u64 { + let mut state = self.checkpoint_state.lock(); + let target_epoch = Self::request_checkpoint_epoch_locked(&mut state); + self.checkpoint_condvar.notify_all(); + target_epoch + } + + pub(super) fn note_checkpoint_write(&self, end_offset: u64) { + let Some(threshold) = self.config.checkpoint_delta_bytes else { + return; + }; + + // this is not atomic but it's fine -- rotation triggers checkpointing so if the last checkpoint + // happened before this file, we know a checkpoint is in progress. skip. 
+ let (ordinal, commit_offset) = self.index_file.checkpoint_cursor(); + if self.active_file_ordinal.load(Ordering::Relaxed) != ordinal { + return; + } + + if end_offset <= commit_offset + threshold as u64 { + return; + } + + let mut state = self.checkpoint_state.lock(); + if state.requested_epoch > state.completed_epoch { + return; + } + + Self::request_checkpoint_epoch_locked(&mut state); + self.checkpoint_condvar.notify_all(); + } + + pub(super) fn wait_for_checkpoint_epoch(&self, target_epoch: u64) -> Result<()> { + let mut state = self.checkpoint_state.lock(); + loop { + if state.completed_epoch >= target_epoch { + return Ok(()); + } + if state.handled_epoch >= target_epoch && state.last_failure_epoch >= target_epoch { + return Err(state + .last_failure + .as_ref() + .map(CheckpointFailure::to_error) + .unwrap_or_else(|| { + Error::CheckpointShutdown( + "checkpoint worker stopped before completing request".into(), + ) + })); + } + if self.checkpoint_shutting_down.load(Ordering::Acquire) { + return Err(Error::CheckpointShutdown( + "checkpoint worker is shutting down".into(), + )); + } + self.checkpoint_condvar.wait(&mut state); + } + } + + fn wait_for_checkpoint_trigger( + &self, + interval: Option, + last_checkpoint_at: Instant, + ) -> Option { + let mut state = self.checkpoint_state.lock(); + loop { + if self.checkpoint_shutting_down.load(Ordering::Acquire) { + self.checkpoint_condvar.notify_all(); + return None; + } + + if state.handled_epoch < state.requested_epoch { + return Some(false); + } + + if let Some(interval) = interval { + let remaining = interval.saturating_sub(last_checkpoint_at.elapsed()); + if remaining.is_zero() { + return Some(true); + } + let wait_result = self.checkpoint_condvar.wait_for(&mut state, remaining); + if wait_result.timed_out() { + return Some(true); + } + } else { + self.checkpoint_condvar.wait(&mut state); + } + } + } + + fn complete_checkpoint_noop( + &self, + state: &mut super::CheckpointState, + target_epoch: Option, + ) { 
+ if let Some(target_epoch) = target_epoch { + state.handled_epoch = target_epoch; + state.completed_epoch = target_epoch; + state.last_checkpoint_dur_ms = 0; + if state.last_failure_epoch <= state.completed_epoch { + state.last_failure_epoch = 0; + state.last_failure = None; + } + self.checkpoint_condvar.notify_all(); + } + } + + fn prepare_checkpoint_run(&self, interval_elapsed: bool) -> CheckpointRun { + let _logical_guards = self + .list_meta_locks + .iter() + .map(|lock| lock.write()) + .collect::>(); + let mut state = self.checkpoint_state.lock(); + if self.checkpoint_shutting_down.load(Ordering::Acquire) { + self.checkpoint_condvar.notify_all(); + return CheckpointRun::Shutdown; + } + + let target_epoch = + (state.handled_epoch < state.requested_epoch).then_some(state.requested_epoch); + if target_epoch.is_none() && !interval_elapsed { + return CheckpointRun::Idle { reset_timer: false }; + } + + let current_cursor = self.index_file.checkpoint_cursor(); + match self.snapshot_checkpoint_progress() { + Ok(snapshot) + if snapshot.checkpoint_ordinal == current_cursor.0 + && snapshot.checkpoint_offset == current_cursor.1 + && snapshot.checkpointed_delta == 0 => + { + self.complete_checkpoint_noop(&mut state, target_epoch); + CheckpointRun::Idle { reset_timer: true } + } + Ok(snapshot) => CheckpointRun::Ready { + target_epoch, + snapshot: Ok(snapshot), + }, + Err(err) => CheckpointRun::Ready { + target_epoch, + snapshot: Err(err), + }, + } + } + + fn checkpoint_needs_follow_up(&self, threshold: u64, snapshot: CheckpointSnapshot) -> bool { + let active_idx = self.active_file_idx.load(Ordering::Acquire); + let files = self.data_files.read(); + match files.get(&active_idx) { + Some(active_file) => { + active_file.file_ordinal == snapshot.checkpoint_ordinal + && active_file + .used_bytes() + .saturating_sub(snapshot.checkpoint_offset) + >= threshold + } + None => false, + } + } + + fn finish_checkpoint_run( + &self, + target_epoch: Option, + snapshot: Result, + 
threshold: Option, + started_at: Instant, + ) { + let snapshot_for_follow_up = snapshot.as_ref().ok().copied(); + let result = snapshot.and_then(|snap| self.sync_checkpoint(snap)); + let mut should_signal_compaction = false; + + let mut state = self.checkpoint_state.lock(); + match result { + Ok(()) => { + should_signal_compaction = true; + state.last_checkpoint_dur_ms = + u64::try_from(started_at.elapsed().as_millis()).unwrap_or(u64::MAX); + if let Some(target_epoch) = target_epoch { + state.handled_epoch = state.handled_epoch.max(target_epoch); + state.completed_epoch = state.completed_epoch.max(target_epoch); + } + if state.last_failure_epoch <= state.completed_epoch { + state.last_failure_epoch = 0; + state.last_failure = None; + } + + let should_request_follow_up = match (threshold, snapshot_for_follow_up) { + (Some(threshold), Some(snapshot)) => { + self.checkpoint_needs_follow_up(threshold, snapshot) + } + _ => false, + }; + if should_request_follow_up && state.handled_epoch >= state.requested_epoch { + Self::request_checkpoint_epoch_locked(&mut state); + } + } + Err(err) => { + self.stats.checkpoint_errors.fetch_add(1, Ordering::Relaxed); + let failure_epoch = target_epoch + .unwrap_or_else(|| Self::request_checkpoint_epoch_locked(&mut state)); + state.handled_epoch = state.handled_epoch.max(failure_epoch); + state.last_failure_epoch = failure_epoch; + state.last_failure = Some(CheckpointFailure::from_error(err)); + } + } + self.checkpoint_condvar.notify_all(); + drop(state); + + if should_signal_compaction { + self.signal_compaction_scan(); + } + } + + fn run_checkpoint_worker(self: &Arc) { + let _shutdown_guard = WorkerShutdownGuard { inner: self }; + let interval = self.config.checkpoint_interval; + let threshold = self.config.checkpoint_delta_bytes.map(|value| value as u64); + let mut last_checkpoint_at = Instant::now(); + + loop { + let Some(interval_elapsed) = + self.wait_for_checkpoint_trigger(interval, last_checkpoint_at) + else { + return; + }; + + 
let (target_epoch, snapshot) = match self.prepare_checkpoint_run(interval_elapsed) { + CheckpointRun::Shutdown => return, + CheckpointRun::Idle { reset_timer } => { + if reset_timer { + last_checkpoint_at = Instant::now(); + } + continue; + } + CheckpointRun::Ready { + target_epoch, + snapshot, + } => (target_epoch, snapshot), + }; + + let started_at = Instant::now(); + self.finish_checkpoint_run(target_epoch, snapshot, threshold, started_at); + last_checkpoint_at = Instant::now(); + } + } +} + +impl CandyStore { + pub(super) fn start_checkpoint_worker(&self) { + let mut checkpoint_thd = self.checkpoint_thd.lock(); + if checkpoint_thd.is_some() { + return; + } + + self.inner + .checkpoint_shutting_down + .store(false, Ordering::Release); + let ctx = Arc::clone(&self.inner); + let thd = std::thread::Builder::new() + .name("candy_checkpoint".into()) + .spawn(move || { + ctx.run_checkpoint_worker(); + }) + .unwrap(); + *checkpoint_thd = Some(thd); + } + + pub(super) fn stop_checkpoint_worker(&self) { + { + let _state = self.inner.checkpoint_state.lock(); + self.inner + .checkpoint_shutting_down + .store(true, Ordering::Release); + self.inner.checkpoint_condvar.notify_all(); + } + if let Some(thd) = self.checkpoint_thd.lock().take() { + let _ = thd.join(); + } + } +} diff --git a/src/store/compaction.rs b/src/store/compaction.rs new file mode 100644 index 0000000..34d3c1c --- /dev/null +++ b/src/store/compaction.rs @@ -0,0 +1,655 @@ +use std::sync::{Arc, atomic::Ordering}; + +use crate::{ + data_file::DataFile, + index_file::EntryPointer, + internal::{KeyNamespace, data_file_path, invalid_data_error, sync_dir}, + pacer::Pacer, + types::{Error, Result}, +}; + +use super::{CandyStore, StoreInner}; + +pub(super) struct CompactionOutcome { + pub(super) compacted_files: u64, + pub(super) reclaimed_bytes: u32, + pub(super) moved_bytes: u32, +} + +type CompactionSources = Vec<(u16, Arc)>; + +struct CompactionEntry { + row_idx: usize, + col: usize, + entry: EntryPointer, + 
source_file: Arc, +} + +type CompactionRowSnapshot = Vec; + +impl StoreInner { + fn empty_compaction_outcome() -> CompactionOutcome { + CompactionOutcome { + compacted_files: 0, + reclaimed_bytes: 0, + moved_bytes: 0, + } + } + + fn collect_compaction_sources( + &self, + candidates: &[(u16, u64)], + active_file_idx: u16, + ) -> CompactionSources { + let files = self.data_files.read(); + candidates + .iter() + .filter_map(|&(file_idx, expected_ordinal)| { + if file_idx == active_file_idx { + return None; + } + + let data_file = files.get(&file_idx)?.clone(); + if data_file.file_ordinal != expected_ordinal { + return None; + } + + Some((file_idx, data_file)) + }) + .collect() + } + + fn snapshot_compaction_row( + &self, + row_idx: usize, + sources: &CompactionSources, + ) -> Option { + let rows = self.index_file.rows_table(); + let active_rows = self.index_file.num_rows(); + if row_idx >= active_rows { + return None; + } + + let row = rows.row(row_idx); + Some( + row.pointers + .iter() + .enumerate() + .filter_map(|(col, &entry)| { + if !entry.is_valid() { + return None; + } + let (_, source_file) = + sources.iter().find(|(idx, _)| *idx == entry.file_idx())?; + Some(CompactionEntry { + row_idx, + col, + entry, + source_file: source_file.clone(), + }) + }) + .collect(), + ) + } + + fn rewrite_compacted_entry( + &self, + task: &CompactionEntry, + ns: KeyNamespace, + key: &[u8], + value: &[u8], + moved_bytes: &mut u64, + ) -> Result<()> { + let mut rotate_idx_req = None; + loop { + if let Some(rotate_idx) = rotate_idx_req.take() { + self._rotate_data_file(rotate_idx)?; + } + + let rows = self.index_file.rows_table(); + let active_rows = self.index_file.num_rows(); + if task.row_idx >= active_rows { + return Ok(()); + } + + let mut row = rows.row_mut(task.row_idx); + if row.pointers[task.col] != task.entry { + return Ok(()); + } + + let active_idx = self.active_file_idx.load(Ordering::Acquire); + let active_file = self + .data_files + .read() + .get(&active_idx) + 
.cloned() + .ok_or(Error::MissingDataFile(active_idx))?; + + match active_file.append_kv( + crate::internal::EntryType::Update, + ns, + key, + value, + row.shard_idx, + &self.inflight_tracker, + ) { + Ok((file_off, size, inflight_guard)) => { + self.record_write(file_off, size as u64); + *moved_bytes = moved_bytes.saturating_add(size as u64); + row.replace_pointer( + task.col, + EntryPointer::new( + active_idx, + file_off, + size, + task.entry.masked_row_selector(), + ), + ); + inflight_guard.complete(); + return Ok(()); + } + Err(Error::RotateDataFile(rotate_idx)) => { + drop(row); + rotate_idx_req = Some(rotate_idx); + } + Err(err) => return Err(err), + } + } + } + + fn compact_snapshot_entry( + &self, + task: CompactionEntry, + pacer: &mut Pacer, + read_buf: &mut Vec, + moved_bytes: &mut u64, + ) -> Result<()> { + self.record_read(task.entry.size_hint() as u64); + pacer.consume(task.entry.size_hint() as u64); + + let kv = task.source_file.read_kv_into( + task.entry.file_offset(), + task.entry.size_hint(), + read_buf, + )?; + + let Some(ns) = KeyNamespace::from_u8(kv.ns) else { + return Err(invalid_data_error("unknown key namespace in data file")); + }; + + self.rewrite_compacted_entry(&task, ns, kv.key(), kv.value(), moved_bytes) + } + + pub(super) fn compact_files( + &self, + candidates: &[(u16, u64)], + pacer: &mut Pacer, + #[cfg(windows)] pending_deletions: &mut Vec, + ) -> Result { + if candidates.is_empty() { + return Ok(Self::empty_compaction_outcome()); + } + + let active_file_idx = self.active_file_idx.load(Ordering::Acquire); + let sources = self.collect_compaction_sources(candidates, active_file_idx); + + if sources.is_empty() { + return Ok(Self::empty_compaction_outcome()); + } + + let mut moved_bytes = 0u64; + let mut read_buf = Vec::new(); + + let mut row_idx = 0; + loop { + if self.compaction_shutting_down.load(Ordering::Acquire) { + return Ok(Self::empty_compaction_outcome()); + } + + let Some(snapshot) = self.snapshot_compaction_row(row_idx, 
&sources) else { + break; + }; + + for task in snapshot { + self.compact_snapshot_entry(task, pacer, &mut read_buf, &mut moved_bytes)?; + } + + row_idx += 1; + } + + let removed = { + let mut files = self.data_files.write(); + sources + .iter() + .filter_map(|(file_idx, _)| { + files + .remove(file_idx) + .map(|data_file| (*file_idx, data_file)) + }) + .collect::>() + }; + let compacted_files = removed.len() as u64; + drop(sources); + + // Durability barrier: ensure all moved entries are durable in the active + // file and the updated index pointers are persisted before we delete the + // source files. Without this, a crash after deletion could leave the + // persisted index pointing at files that no longer exist. + if !removed.is_empty() { + let active_idx = self.active_file_idx.load(Ordering::Acquire); + if let Some(active_file) = self.data_files.read().get(&active_idx).cloned() { + let _ = active_file.sync_to_current(); + } + let _ = self.index_file.sync_all(); + } + + let mut reclaimed_bytes = 0u64; + for (file_idx, data_file) in removed { + drop(data_file); + + reclaimed_bytes = + reclaimed_bytes.saturating_add(self.index_file.take_file_waste(file_idx) as u64); + + let file_path = data_file_path(self.base_path.as_path(), file_idx); + match std::fs::remove_file(&file_path) { + Ok(()) => sync_dir(self.base_path.as_path())?, + #[cfg(windows)] + Err(_) => pending_deletions.push(file_path), + #[cfg(not(windows))] + Err(err) => return Err(Error::IOError(err)), + } + } + + let _ = self.index_file.flush_header(); + + Ok(CompactionOutcome { + compacted_files, + reclaimed_bytes: reclaimed_bytes.min(u64::from(u32::MAX)) as u32, + moved_bytes: moved_bytes.min(u64::from(u32::MAX)) as u32, + }) + } +} + +impl CandyStore { + pub(super) fn stop_compaction(&self) { + self.inner + .compaction_shutting_down + .store(true, Ordering::Release); + { + let mut state = self.inner.compaction_state.lock(); + state.wake_requested = true; + self.inner.compaction_condvar.notify_all(); + } 
+ if let Some(thd) = self.compaction_thd.lock().take() { + let _ = thd.join(); + } + } + + #[cfg(windows)] + fn retry_pending_deletions(ctx: &StoreInner, pending: &mut Vec) { + let before = pending.len(); + pending.retain(|path| std::fs::remove_file(path).is_err()); + if pending.len() < before { + let _ = sync_dir(ctx.base_path.as_path()); + } + } + + pub(super) fn start_compaction(&self) { + let mut compaction_thd = self.compaction_thd.lock(); + if compaction_thd.is_some() { + return; + } + + self.inner + .compaction_shutting_down + .store(false, Ordering::Release); + let ctx = Arc::clone(&self.inner); + let thd = std::thread::Builder::new() + .name("candy_compact".into()) + .spawn(move || { + if ctx.config.compaction_throughput_bytes_per_sec == 0 { + // Compaction disabled — park until shutdown. + let mut state = ctx.compaction_state.lock(); + while !ctx.compaction_shutting_down.load(Ordering::Acquire) { + ctx.compaction_condvar.wait(&mut state); + } + return; + } + + let throughput_bytes_per_sec = + ctx.config.compaction_throughput_bytes_per_sec as u64; + let tokens_per_unit = (throughput_bytes_per_sec / 10).max(1); + let burst_size = tokens_per_unit.saturating_mul(2); + let mut pacer = Pacer::new( + tokens_per_unit, + std::time::Duration::from_millis(100), + burst_size, + ); + + #[cfg(windows)] + let mut pending_deletions = Vec::::new(); + loop { + { + let mut state = ctx.compaction_state.lock(); + while !state.wake_requested + && !ctx.compaction_shutting_down.load(Ordering::Acquire) + { + ctx.compaction_condvar.wait(&mut state); + } + + if ctx.compaction_shutting_down.load(Ordering::Acquire) { + break; + } + + state.wake_requested = false; + } + loop { + let candidates = ctx.next_compaction_candidates(4); + if candidates.is_empty() { + break; + } + if ctx.compaction_shutting_down.load(Ordering::Acquire) { + return; + } + #[cfg(windows)] + Self::retry_pending_deletions(&ctx, &mut pending_deletions); + let t0 = std::time::Instant::now(); + let res = 
ctx.compact_files( + &candidates, + &mut pacer, + #[cfg(windows)] + &mut pending_deletions, + ); + let compaction_millis = + u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX); + match res { + Ok(outcome) => { + ctx.stats + .num_compactions + .fetch_add(outcome.compacted_files, Ordering::Relaxed); + ctx.stats + .last_compaction_dur_ms + .store(compaction_millis, Ordering::Relaxed); + ctx.stats + .last_compaction_reclaimed_bytes + .store(outcome.reclaimed_bytes, Ordering::Relaxed); + ctx.stats + .last_compaction_moved_bytes + .store(outcome.moved_bytes, Ordering::Relaxed); + } + Err(_e) => { + ctx.stats.compaction_errors.fetch_add(1, Ordering::Relaxed); + } + } + } + #[cfg(windows)] + Self::retry_pending_deletions(&ctx, &mut pending_deletions); + } + }) + .unwrap(); + + *compaction_thd = Some(thd); + self.inner.signal_compaction_scan(); + } +} + +impl Drop for CandyStore { + fn drop(&mut self) { + self.stop_compaction(); + + let should_checkpoint = self.allow_clean_shutdown.load(Ordering::Relaxed); + self.stop_checkpoint_worker(); + + if !should_checkpoint { + return; + } + + let _ = self.inner.perform_checkpoint_with_logical_locks(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::{sync::Arc, thread, time::Duration}; + + use crate::{CandyStore, Config}; + + fn count_live_entries_in_file(store: &CandyStore, file_idx: u16) -> u64 { + let rows = store.inner.index_file.rows_table(); + let num_rows = store.inner.index_file.num_rows(); + let mut count = 0u64; + + for row_idx in 0..num_rows { + let row = rows.row(row_idx); + for entry in row.pointers.iter() { + if entry.is_valid() && entry.file_idx() == file_idx { + count += 1; + } + } + } + + count + } + + #[test] + fn test_compaction_reads_only_live_entries_for_target_file() -> Result<()> { + let dir = tempfile::tempdir().map_err(Error::IOError)?; + let db = CandyStore::open( + dir.path(), + Config { + max_data_file_size: 8192, + compaction_min_threshold: u32::MAX, + ..Default::default() + }, + 
)?; + db.stop_compaction(); + + for idx in 0..24 { + db.set("hot", format!("hot-value-{idx:02}-{}", "x".repeat(48)))?; + } + + let mut filler_idx = 0u64; + while db.inner.data_files.read().len() == 1 { + db.set( + format!("filler-{filler_idx}"), + format!("filler-value-{}", "y".repeat(48)), + )?; + filler_idx += 1; + } + + let active_idx = db.inner.active_file_idx.load(Ordering::Acquire); + let (target_idx, target_ordinal) = { + let files = db.inner.data_files.read(); + let (&target_idx, target_file) = files + .iter() + .find(|(idx, _)| **idx != active_idx) + .expect("expected a non-active file to compact"); + (target_idx, target_file.file_ordinal) + }; + + let live_entries = count_live_entries_in_file(&db, target_idx); + assert!( + live_entries > 0, + "expected live entries in the compacted file" + ); + + let before_read_ops = db.stats().num_read_ops; + let mut pacer = Pacer::new(u64::MAX / 4, Duration::from_secs(1), u64::MAX / 4); + db.inner + .compaction_shutting_down + .store(false, Ordering::Release); + let outcome = db.inner.compact_files( + &[(target_idx, target_ordinal)], + &mut pacer, + #[cfg(windows)] + &mut Vec::new(), + )?; + + assert_eq!(outcome.compacted_files, 1); + assert_eq!(db.stats().num_read_ops - before_read_ops, live_entries); + assert_eq!(count_live_entries_in_file(&db, target_idx), 0); + + Ok(()) + } + + #[test] + fn test_compaction_batch_reads_live_entries_for_all_target_files() -> Result<()> { + let dir = tempfile::tempdir().map_err(Error::IOError)?; + let db = CandyStore::open( + dir.path(), + Config { + max_data_file_size: 2048, + compaction_min_threshold: u32::MAX, + ..Default::default() + }, + )?; + db.stop_compaction(); + + for idx in 0..32 { + db.set(format!("hot-{idx}"), format!("seed-{}", "x".repeat(48)))?; + } + + while db.inner.data_files.read().len() < 5 { + let idx = db.stats().num_write_ops; + db.set( + format!("roll-{idx}"), + format!("roll-value-{}", "y".repeat(48)), + )?; + } + + let pre_compaction_active_idx = 
db.inner.active_file_idx.load(Ordering::Acquire); + db.inner._rotate_data_file(pre_compaction_active_idx)?; + + let active_idx = db.inner.active_file_idx.load(Ordering::Acquire); + let targets = { + let files = db.inner.data_files.read(); + let mut target_files = files + .iter() + .filter(|(file_idx, _)| **file_idx != active_idx) + .map(|(&file_idx, data_file)| (file_idx, data_file.file_ordinal)) + .collect::>(); + target_files.sort_by_key(|(file_idx, _)| *file_idx); + target_files.truncate(4); + target_files + }; + assert_eq!(targets.len(), 4); + + let live_entries = targets + .iter() + .map(|(file_idx, _)| count_live_entries_in_file(&db, *file_idx)) + .sum::(); + assert!(live_entries > 0); + + let before_read_ops = db.stats().num_read_ops; + let mut pacer = Pacer::new(u64::MAX / 4, Duration::from_secs(1), u64::MAX / 4); + db.inner + .compaction_shutting_down + .store(false, Ordering::Release); + let outcome = db.inner.compact_files( + &targets, + &mut pacer, + #[cfg(windows)] + &mut Vec::new(), + )?; + + assert_eq!(outcome.compacted_files, 4); + assert_eq!(db.stats().num_read_ops - before_read_ops, live_entries); + for (file_idx, _) in targets { + assert_eq!(count_live_entries_in_file(&db, file_idx), 0); + } + + Ok(()) + } + + #[test] + fn test_compaction_allows_concurrent_index_growth() -> Result<()> { + let dir = tempfile::tempdir().map_err(Error::IOError)?; + let db = Arc::new(CandyStore::open( + dir.path(), + Config { + initial_capacity: 16, + remap_scaler: 1, + max_data_file_size: 64 * 1024, + compaction_min_threshold: u32::MAX, + ..Default::default() + }, + )?); + db.stop_compaction(); + + let mut expected = Vec::new(); + for idx in 0..64u32 { + let key = format!("seed-{idx:04}"); + let value = format!("seed-value-{idx:04}-{}", "x".repeat(768)); + db.set(&key, &value)?; + expected.push((key, value.into_bytes())); + } + + let target_idx = db.inner.active_file_idx.load(Ordering::Acquire); + let target_ordinal = { + let files = db.inner.data_files.read(); + 
files + .get(&target_idx) + .expect("target file should exist before rotation") + .file_ordinal + }; + db.inner._rotate_data_file(target_idx)?; + + let live_entries = count_live_entries_in_file(&db, target_idx); + assert!( + live_entries >= 8, + "expected a file with enough live entries to slow compaction" + ); + + let rows_before = db.inner.index_file.num_rows(); + let db_for_compaction = Arc::clone(&db); + let compaction_handle = thread::spawn(move || { + let mut pacer = Pacer::new(256, Duration::from_millis(10), 256); + db_for_compaction + .inner + .compaction_shutting_down + .store(false, Ordering::Release); + db_for_compaction.inner.compact_files( + &[(target_idx, target_ordinal)], + &mut pacer, + #[cfg(windows)] + &mut Vec::new(), + ) + }); + + let mut grew = false; + for idx in 0..20_000u32 { + let key = format!("grow-{idx:04}"); + let value = format!("grow-value-{idx:04}-{}", "y".repeat(96)); + db.set(&key, &value)?; + expected.push((key, value.into_bytes())); + + if db.inner.index_file.num_rows() > rows_before { + grew = true; + break; + } + } + + let outcome = compaction_handle + .join() + .expect("compaction thread panicked")?; + assert_eq!(outcome.compacted_files, 1); + assert!( + grew, + "expected concurrent writes to force index growth during compaction" + ); + assert!(db.inner.index_file.num_rows() > rows_before); + assert_eq!(count_live_entries_in_file(&db, target_idx), 0); + + for (key, value) in expected { + assert_eq!( + db.get(&key)?, + Some(value), + "key {key} should remain readable" + ); + } + + Ok(()) + } +} diff --git a/src/store/list.rs b/src/store/list.rs new file mode 100644 index 0000000..171af32 --- /dev/null +++ b/src/store/list.rs @@ -0,0 +1,862 @@ +use siphasher::sip::SipHasher13; +use smallvec::SmallVec; + +use std::{hash::Hasher, ops::Range}; + +use crate::{ + internal::{KeyNamespace, RangeMetadata, aligned_data_entry_size}, + store::CandyStore, + types::{ + Error, GetOrCreateStatus, ListCompactionParams, MAX_USER_KEY_SIZE, 
MAX_USER_VALUE_SIZE, + ReplaceStatus, Result, SetStatus, + }, +}; + +/// A list item as `(item_key, value)`. +pub type KVPair = (Vec, Vec); + +#[derive(Clone, Copy)] +pub(super) struct ListNamespaces { + pub(super) meta: KeyNamespace, + pub(super) index: KeyNamespace, + pub(super) data: KeyNamespace, +} + +const LIST_NS: ListNamespaces = ListNamespaces { + meta: KeyNamespace::ListMeta, + index: KeyNamespace::ListIndex, + data: KeyNamespace::ListData, +}; + +/// Double-ended iterator over live list items in logical order. +pub struct ListIterator<'a> { + store: &'a CandyStore, + list: Vec, + ns: ListNamespaces, + next_idx: u64, + end_idx: u64, + initial_next_idx: u64, + initial_end_idx: u64, +} + +type ListMetadata = RangeMetadata; + +impl ListIterator<'_> { + fn try_heal_head(&self, new_head: u64) -> Result<()> { + self.store.try_heal_range_head( + self.ns.meta, + &self.list, + self.initial_next_idx, + new_head, + |store, list| get_list_meta(store, self.ns, list), + |store, list, meta| set_list_meta(store, self.ns, list, meta), + ) + } + + fn try_heal_tail(&self, new_tail: u64) -> Result<()> { + self.store.try_heal_range_tail( + self.ns.meta, + &self.list, + self.initial_end_idx, + new_tail, + |store, list| get_list_meta(store, self.ns, list), + |store, list, meta| set_list_meta(store, self.ns, list, meta), + ) + } +} + +impl Iterator for ListIterator<'_> { + type Item = Result; + + fn next(&mut self) -> Option { + while self.next_idx <= self.end_idx { + let idx = self.next_idx; + self.next_idx += 1; + + if idx > self.initial_next_idx + 1000 { + let _ = self.try_heal_head(idx); + self.initial_next_idx = idx; + } + + let idx_key = make_list_index_key(&self.list, idx); + let key = match self.store.get_ns(self.ns.index, &idx_key) { + Ok(Some(key)) => key, + Ok(None) => continue, + Err(err) => return Some(Err(err)), + }; + + let data_key = make_list_data_key(&self.list, &key); + let value = match self.store.get_ns(self.ns.data, &data_key) { + Ok(Some(value)) => value, 
+ Ok(None) => continue, + Err(err) => return Some(Err(err)), + }; + + return Some(Ok((key, strip_idx_suffix(value)))); + } + + None + } +} + +impl DoubleEndedIterator for ListIterator<'_> { + fn next_back(&mut self) -> Option<::Item> { + while self.next_idx <= self.end_idx { + let idx = self.end_idx; + if self.end_idx == 0 { + self.next_idx = 1; + } else { + self.end_idx -= 1; + } + + if idx + 1000 < self.initial_end_idx { + let _ = self.try_heal_tail(idx); + self.initial_end_idx = idx; + } + + let idx_key = make_list_index_key(&self.list, idx); + let key = match self.store.get_ns(self.ns.index, &idx_key) { + Ok(Some(key)) => key, + Ok(None) => continue, + Err(err) => return Some(Err(err)), + }; + + let data_key = make_list_data_key(&self.list, &key); + let value = match self.store.get_ns(self.ns.data, &data_key) { + Ok(Some(value)) => value, + Ok(None) => continue, + Err(err) => return Some(Err(err)), + }; + + return Some(Ok((key, strip_idx_suffix(value)))); + } + + None + } +} + +impl CandyStore { + /// Inserts or replaces `item_key` in `list_key`, placing the item at the tail. + pub fn set_in_list< + B1: AsRef<[u8]> + ?Sized, + B2: AsRef<[u8]> + ?Sized, + B3: AsRef<[u8]> + ?Sized, + >( + &self, + list_key: &B1, + item_key: &B2, + val: &B3, + ) -> Result { + let previous = self.list_set_at_tail_with_ns( + LIST_NS, + list_key.as_ref(), + item_key.as_ref(), + val.as_ref(), + )?; + Ok(match previous { + Some(previous) => SetStatus::PrevValue(previous), + None => SetStatus::CreatedNew, + }) + } + + /// Inserts or replaces `item_key` in `list_key`, moving it to the logical tail. 
+ pub fn set_in_list_promoting< + B1: AsRef<[u8]> + ?Sized, + B2: AsRef<[u8]> + ?Sized, + B3: AsRef<[u8]> + ?Sized, + >( + &self, + list_key: &B1, + item_key: &B2, + val: &B3, + ) -> Result { + let previous = + self.list_promote_with_ns(LIST_NS, list_key.as_ref(), item_key.as_ref(), val.as_ref())?; + Ok(match previous { + Some(previous) => SetStatus::PrevValue(previous), + None => SetStatus::CreatedNew, + }) + } + + /// Replaces a list item only if its current value matches `expected_val` when provided. + pub fn replace_in_list< + B1: AsRef<[u8]> + ?Sized, + B2: AsRef<[u8]> + ?Sized, + B3: AsRef<[u8]> + ?Sized, + B4: AsRef<[u8]> + ?Sized, + >( + &self, + list_key: &B1, + item_key: &B2, + val: &B3, + expected_val: Option<&B4>, + ) -> Result { + self.list_replace_with_ns( + LIST_NS, + list_key.as_ref(), + item_key.as_ref(), + val.as_ref(), + expected_val.map(|expected| expected.as_ref()), + ) + } + + /// Returns the current list item value, or inserts `default_val` if the item is missing. + pub fn get_or_create_in_list< + B1: AsRef<[u8]> + ?Sized, + B2: AsRef<[u8]> + ?Sized, + B3: AsRef<[u8]> + ?Sized, + >( + &self, + list_key: &B1, + item_key: &B2, + default_val: &B3, + ) -> Result { + self.list_get_or_create_with_ns( + LIST_NS, + list_key.as_ref(), + item_key.as_ref(), + default_val.as_ref(), + ) + } + + /// Returns the current value for `item_key` in `list_key`, if present. + pub fn get_from_list + ?Sized, B2: AsRef<[u8]> + ?Sized>( + &self, + list_key: &B1, + item_key: &B2, + ) -> Result>> { + self.list_get_with_ns(LIST_NS, list_key.as_ref(), item_key.as_ref()) + } + + /// Removes `item_key` from `list_key` and returns its previous value if it existed. + pub fn remove_from_list + ?Sized, B2: AsRef<[u8]> + ?Sized>( + &self, + list_key: &B1, + item_key: &B2, + ) -> Result>> { + self.list_remove_with_ns(LIST_NS, list_key.as_ref(), item_key.as_ref()) + } + + /// Compacts list storage when `params` indicate enough holes exist to justify rewriting it. 
+ pub fn compact_list_if_needed + ?Sized>( + &self, + list_key: &B, + params: ListCompactionParams, + ) -> Result { + self.list_compact_with_ns(LIST_NS, list_key.as_ref(), params) + } + + /// Iterates over live items in `list_key` from head to tail. + pub fn iter_list + ?Sized>(&self, list_key: &B) -> ListIterator<'_> { + self.list_iter_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Removes all items in `list_key`. + pub fn discard_list + ?Sized>(&self, list_key: &B) -> Result { + self.list_discard_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Returns the head item of `list_key` without removing it. + pub fn peek_list_head + ?Sized>(&self, list_key: &B) -> Result> { + self.peek_list_head_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Returns the tail item of `list_key` without removing it. + pub fn peek_list_tail + ?Sized>(&self, list_key: &B) -> Result> { + self.peek_list_tail_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Removes and returns the head item of `list_key`. + pub fn pop_list_head + ?Sized>(&self, list_key: &B) -> Result> { + self.pop_list_head_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Removes and returns the tail item of `list_key`. + pub fn pop_list_tail + ?Sized>(&self, list_key: &B) -> Result> { + self.pop_list_tail_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Returns the number of live items in `list_key`. + pub fn list_len + ?Sized>(&self, list_key: &B) -> Result { + self.list_len_with_ns(LIST_NS, list_key.as_ref()) + } + + /// Retains only items for which `func` returns `true`, preserving list order. 
+ pub fn retain_in_list + ?Sized>( + &self, + list_key: &B, + func: impl FnMut(&[u8], &[u8]) -> Result, + ) -> Result<()> { + self.list_retain_with_ns(LIST_NS, list_key.as_ref(), func) + } + + pub(super) fn list_retain_with_ns( + &self, + ns: ListNamespaces, + list_key: &[u8], + mut func: impl FnMut(&[u8], &[u8]) -> Result, + ) -> Result<()> { + let _lock = self.list_write_guard(ns.meta, list_key); + let mut meta = get_list_meta(self, ns, list_key)?; + if meta.count == 0 { + return Ok(()); + } + + let original_head = meta.head; + let original_tail = meta.tail; + let mut new_tail = meta.tail; + let mut retained_count = 0u64; + + for idx in original_head..=original_tail { + let idx_key = make_list_index_key(list_key, idx); + let key = match self.get_ns(ns.index, &idx_key)? { + Some(key) => key, + None => continue, + }; + + let data_key = make_list_data_key(list_key, &key); + let val_with_idx = match self.get_ns(ns.data, &data_key)? { + Some(value) => value, + None => { + self.remove_ns(ns.index, &idx_key)?; + continue; + } + }; + let value = strip_idx_suffix(val_with_idx); + + self.remove_ns(ns.index, &idx_key)?; + + if func(&key, &value)? 
{ + new_tail += 1; + let new_value = append_idx_suffix(&value, new_tail); + self.set_ns(ns.data, &data_key, &new_value)?; + + let new_idx_key = make_list_index_key(list_key, new_tail); + self.set_ns(ns.index, &new_idx_key, &key)?; + retained_count += 1; + } else { + self.remove_ns(ns.data, &data_key)?; + } + } + + if retained_count == 0 { + meta = ListMetadata::new(); + } else { + meta.head = original_tail + 1; + meta.tail = new_tail; + meta.count = retained_count; + } + + set_list_meta(self, ns, list_key, meta)?; + Ok(()) + } + + pub(super) fn list_set_at_tail_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + value: &[u8], + ) -> Result>> { + self.validate_list_item_sizes(list, key, value)?; + let _lock = self.list_write_guard(ns.meta, list); + + let mut meta = get_list_meta(self, ns, list)?; + let data_key = make_list_data_key(list, key); + + if let Some(existing) = self.get_ns(ns.data, &data_key)? { + let idx = extract_idx_suffix(&existing); + let new_value = append_idx_suffix(value, idx); + let old_with_idx = self.set_ns(ns.data, &data_key, &new_value)?; + + // Always write the index entry: after a crash the OS may have + // flushed the metadata update but not the corresponding index + // write, leaving the entry invisible in the list. 
+ let idx_key = make_list_index_key(list, idx); + self.set_ns(ns.index, &idx_key, key)?; + + if meta.count == 0 || idx > meta.tail { + if meta.count == 0 { + meta.head = idx; + } + if idx > meta.tail { + meta.tail = idx; + } + meta.count += 1; + set_list_meta(self, ns, list, meta)?; + } + + return Ok(old_with_idx.map(strip_idx_suffix)); + } + + let new_idx = meta.tail + 1; + let value_with_idx = append_idx_suffix(value, new_idx); + self.set_ns(ns.data, &data_key, &value_with_idx)?; + + let idx_key = make_list_index_key(list, new_idx); + self.set_ns(ns.index, &idx_key, key)?; + + if meta.count == 0 { + meta.head = new_idx; + } + meta.tail = new_idx; + meta.count += 1; + set_list_meta(self, ns, list, meta)?; + + Ok(None) + } + + pub(super) fn list_replace_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + value: &[u8], + expected: Option<&[u8]>, + ) -> Result { + self.validate_list_item_sizes(list, key, value)?; + let _lock = self.list_write_guard(ns.meta, list); + + let data_key = make_list_data_key(list, key); + let Some(existing_value) = self.get_ns(ns.data, &data_key)? else { + return Ok(ReplaceStatus::DoesNotExist); + }; + + let previous = strip_idx_suffix(existing_value.clone()); + if let Some(expected) = expected + && previous != expected + { + return Ok(ReplaceStatus::WrongValue(previous)); + } + + let idx = extract_idx_suffix(&existing_value); + let new_value = append_idx_suffix(value, idx); + self.set_ns(ns.data, &data_key, &new_value)?; + Ok(ReplaceStatus::PrevValue(previous)) + } + + pub(super) fn list_get_or_create_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + value: &[u8], + ) -> Result { + self.validate_list_item_sizes(list, key, value)?; + let _lock = self.list_write_guard(ns.meta, list); + + let data_key = make_list_data_key(list, key); + if let Some(existing) = self.get_ns(ns.data, &data_key)? 
{ + return Ok(GetOrCreateStatus::ExistingValue(strip_idx_suffix(existing))); + } + + let mut meta = get_list_meta(self, ns, list)?; + let new_idx = meta.tail + 1; + let value_with_idx = append_idx_suffix(value, new_idx); + self.set_ns(ns.data, &data_key, &value_with_idx)?; + + let idx_key = make_list_index_key(list, new_idx); + self.set_ns(ns.index, &idx_key, key)?; + + if meta.count == 0 { + meta.head = new_idx; + } + meta.tail = new_idx; + meta.count += 1; + set_list_meta(self, ns, list, meta)?; + + Ok(GetOrCreateStatus::CreatedNew(value.to_vec())) + } + + pub(super) fn list_promote_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + value: &[u8], + ) -> Result>> { + self.validate_list_item_sizes(list, key, value)?; + let _lock = self.list_write_guard(ns.meta, list); + let mut meta = get_list_meta(self, ns, list)?; + let data_key = make_list_data_key(list, key); + + let mut old_value = None; + let mut old_idx_key = None; + if let Some(existing) = self.get_ns(ns.data, &data_key)? 
{ + let idx = extract_idx_suffix(&existing); + old_idx_key = Some(make_list_index_key(list, idx)); + old_value = Some(strip_idx_suffix(existing)); + } else { + meta.count += 1; + } + + let new_idx = meta.tail + 1; + let value_with_idx = append_idx_suffix(value, new_idx); + self.set_ns(ns.data, &data_key, &value_with_idx)?; + + let idx_key = make_list_index_key(list, new_idx); + self.set_ns(ns.index, &idx_key, key)?; + + if let Some(old_idx_key) = old_idx_key { + self.remove_ns(ns.index, &old_idx_key)?; + } + + if meta.count == 1 { + meta.head = new_idx; + } + meta.tail = new_idx; + set_list_meta(self, ns, list, meta)?; + + Ok(old_value) + } + + pub(super) fn list_get_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + ) -> Result>> { + let _lock = self.list_read_guard(ns.meta, list); + let data_key = make_list_data_key(list, key); + Ok(self.get_ns(ns.data, &data_key)?.map(strip_idx_suffix)) + } + + pub(super) fn list_remove_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + ) -> Result>> { + let _lock = self.list_write_guard(ns.meta, list); + self._list_remove_with_ns(ns, list, key) + } + + fn _list_remove_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + key: &[u8], + ) -> Result>> { + let mut meta = get_list_meta(self, ns, list)?; + let data_key = make_list_data_key(list, key); + let removed = match self.remove_ns(ns.data, &data_key)? 
{ + Some(value) => value, + None => return Ok(None), + }; + + let idx = extract_idx_suffix(&removed); + let idx_key = make_list_index_key(list, idx); + self.remove_ns(ns.index, &idx_key)?; + + let old_value = Some(strip_idx_suffix(removed)); + meta.count = meta.count.saturating_sub(1); + + if meta.count == 0 { + meta = ListMetadata::new(); + } else { + let mut check_head = idx == meta.head; + if !check_head { + let head_key = make_list_index_key(list, meta.head); + if self.get_ns(ns.index, &head_key)?.is_none() { + check_head = true; + } + } + + if check_head { + let mut new_head = meta.head; + loop { + if new_head > meta.tail { + meta = ListMetadata::new(); + break; + } + if new_head == idx { + new_head += 1; + continue; + } + let probe_idx_key = make_list_index_key(list, new_head); + if self.get_ns(ns.index, &probe_idx_key)?.is_some() { + meta.head = new_head; + break; + } + new_head += 1; + } + } + + if meta.count > 0 { + let mut check_tail = idx == meta.tail; + if !check_tail { + let tail_key = make_list_index_key(list, meta.tail); + if self.get_ns(ns.index, &tail_key)?.is_none() { + check_tail = true; + } + } + + if check_tail { + let mut new_tail = meta.tail; + loop { + if new_tail < meta.head { + meta = ListMetadata::new(); + break; + } + if new_tail == idx { + if new_tail == 0 { + break; + } + new_tail -= 1; + continue; + } + let probe_idx_key = make_list_index_key(list, new_tail); + if self.get_ns(ns.index, &probe_idx_key)?.is_some() { + meta.tail = new_tail; + break; + } + if new_tail == 0 { + meta = ListMetadata::new(); + break; + } + new_tail -= 1; + } + } + } + } + + set_list_meta(self, ns, list, meta)?; + Ok(old_value) + } + + pub(super) fn list_discard_with_ns(&self, ns: ListNamespaces, list: &[u8]) -> Result { + let _lock = self.list_write_guard(ns.meta, list); + let meta = get_list_meta(self, ns, list)?; + if meta.count == 0 { + return Ok(false); + } + + for idx in meta.head..=meta.tail { + let idx_key = make_list_index_key(list, idx); + if let 
Some(key) = self.remove_ns(ns.index, &idx_key)? { + let data_key = make_list_data_key(list, &key); + self.remove_ns(ns.data, &data_key)?; + } + } + + self.remove_ns(ns.meta, list)?; + Ok(true) + } + + pub(super) fn list_compact_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + params: ListCompactionParams, + ) -> Result { + let _lock = self.list_write_guard(ns.meta, list); + let mut meta = get_list_meta(self, ns, list)?; + if meta.count == 0 { + return Ok(false); + } + + let span = if meta.tail >= meta.head { + meta.tail - meta.head + 1 + } else { + 0 + }; + if span == 0 || span < params.min_length { + return Ok(false); + } + + let holes_ratio = (span - meta.count) as f64 / span as f64; + if holes_ratio < params.min_holes_ratio { + return Ok(false); + } + + let limit = meta.tail; + while meta.head <= limit { + let idx_key = make_list_index_key(list, meta.head); + if let Some(key) = self.get_ns(ns.index, &idx_key)? { + let data_key = make_list_data_key(list, &key); + if let Some(value_with_idx) = self.get_ns(ns.data, &data_key)? 
{ + let value = strip_idx_suffix(value_with_idx); + + meta.tail += 1; + let new_idx = meta.tail; + let new_value_with_idx = append_idx_suffix(&value, new_idx); + // Overwrite data in-place (not remove+set) to avoid data + // loss if a crash occurs between the two operations + self.set_ns(ns.data, &data_key, &new_value_with_idx)?; + + // Write new index before removing old so the entry is + // always reachable via at least one index position + let new_idx_key = make_list_index_key(list, new_idx); + self.set_ns(ns.index, &new_idx_key, &key)?; + } + self.remove_ns(ns.index, &idx_key)?; + } + meta.head += 1; + set_list_meta(self, ns, list, meta)?; + } + + Ok(true) + } + + fn peek_list_head_with_ns( + &self, + ns: ListNamespaces, + list_key: &[u8], + ) -> Result> { + self.list_iter_with_ns(ns, list_key).next().transpose() + } + + fn peek_list_tail_with_ns( + &self, + ns: ListNamespaces, + list_key: &[u8], + ) -> Result> { + self.list_iter_with_ns(ns, list_key).next_back().transpose() + } + + pub(super) fn pop_list_head_with_ns( + &self, + ns: ListNamespaces, + list_key: &[u8], + ) -> Result> { + let _lock = self.list_write_guard(ns.meta, list_key); + let head = self.peek_list_head_with_ns(ns, list_key)?; + if let Some((key, _)) = head + && let Some(value) = self._list_remove_with_ns(ns, list_key, &key)? + { + return Ok(Some((key, value))); + } + Ok(None) + } + + pub(super) fn pop_list_tail_with_ns( + &self, + ns: ListNamespaces, + list_key: &[u8], + ) -> Result> { + let _lock = self.list_write_guard(ns.meta, list_key); + let tail = self.peek_list_tail_with_ns(ns, list_key)?; + if let Some((key, _)) = tail + && let Some(value) = self._list_remove_with_ns(ns, list_key, &key)? 
+ { + return Ok(Some((key, value))); + } + Ok(None) + } + + pub(super) fn list_iter_with_ns<'a>( + &'a self, + ns: ListNamespaces, + list: &[u8], + ) -> ListIterator<'a> { + let meta = get_list_meta(self, ns, list).unwrap_or_else(|_| ListMetadata::new()); + ListIterator { + store: self, + list: list.to_vec(), + ns, + next_idx: meta.head, + end_idx: meta.tail, + initial_next_idx: meta.head, + initial_end_idx: meta.tail, + } + } + + pub(super) fn list_len_with_ns(&self, ns: ListNamespaces, list: &[u8]) -> Result { + Ok(get_list_meta(self, ns, list)?.count as usize) + } + + pub(super) fn list_range_with_ns( + &self, + ns: ListNamespaces, + list: &[u8], + ) -> Result> { + let meta = get_list_meta(self, ns, list)?; + if meta.head > meta.tail { + return Ok(0..0); + } + Ok(meta.head as usize..meta.tail.saturating_add(1) as usize) + } + + fn validate_list_item_sizes(&self, list: &[u8], key: &[u8], value: &[u8]) -> Result<()> { + let data_key_len = make_list_data_key(list, key).len(); + let data_value_len = value.len() + size_of::(); + validate_internal_entry(self, data_key_len, data_value_len)?; + + let index_key_len = make_list_index_key(list, 0).len(); + validate_internal_entry(self, index_key_len, key.len()) + } +} + +fn validate_internal_entry(store: &CandyStore, key_len: usize, value_len: usize) -> Result<()> { + let entry_size = aligned_data_entry_size(key_len, value_len) as usize; + if key_len > MAX_USER_KEY_SIZE + || value_len > MAX_USER_VALUE_SIZE + || entry_size > store.inner.config.max_data_file_size as usize + { + return Err(Error::PayloadTooLarge(entry_size)); + } + Ok(()) +} + +fn get_list_meta(store: &CandyStore, ns: ListNamespaces, list: &[u8]) -> Result { + if let Some(value) = store.get_ns(ns.meta, list)? 
+ && let Some(meta) = ListMetadata::from_bytes(&value) + { + return Ok(meta); + } + Ok(ListMetadata::new()) +} + +fn set_list_meta( + store: &CandyStore, + ns: ListNamespaces, + list: &[u8], + meta: ListMetadata, +) -> Result<()> { + store.set_ns(ns.meta, list, &meta.to_bytes())?; + Ok(()) +} + +fn hash_list_key(list: &[u8]) -> u64 { + let mut hasher = SipHasher13::new_with_keys(0x7ac1485be800c70e, 0x22ac1dcc7992c592); + hasher.write(list); + hasher.finish() +} + +fn make_list_data_key(list: &[u8], key: &[u8]) -> SmallVec<[u8; 128]> { + let hash = hash_list_key(list); + let mut out = SmallVec::<[u8; 128]>::with_capacity(8 + key.len()); + out.extend_from_slice(&hash.to_le_bytes()); + out.extend_from_slice(key); + out +} + +fn make_list_index_key(list: &[u8], idx: u64) -> SmallVec<[u8; 16]> { + let hash = hash_list_key(list); + let mut out = SmallVec::<[u8; 16]>::with_capacity(16); + out.extend_from_slice(&hash.to_le_bytes()); + out.extend_from_slice(&idx.to_be_bytes()); + out +} + +fn append_idx_suffix(value: &[u8], idx: u64) -> SmallVec<[u8; 128]> { + let mut out = SmallVec::<[u8; 128]>::with_capacity(value.len() + size_of::()); + out.extend_from_slice(value); + out.extend_from_slice(&idx.to_le_bytes()); + out +} + +fn strip_idx_suffix(mut value: Vec) -> Vec { + if value.len() >= size_of::() { + value.truncate(value.len() - size_of::()); + } + value +} + +fn extract_idx_suffix(value: &[u8]) -> u64 { + let n = value.len(); + if n < size_of::() { + return 0; + } + u64::from_le_bytes(value[n - size_of::()..n].try_into().unwrap()) +} diff --git a/src/store/open.rs b/src/store/open.rs new file mode 100644 index 0000000..e854a79 --- /dev/null +++ b/src/store/open.rs @@ -0,0 +1,334 @@ +use std::{ + collections::{HashMap, HashSet}, + path::Path, + sync::Arc, + time::Duration, +}; + +use crate::{ + data_file::DataFile, + index_file::IndexFile, + internal::{ + DATA_FILE_SIGNATURE, DATA_FILE_VERSION, FILE_OFFSET_ALIGNMENT, INDEX_FILE_SIGNATURE, + INDEX_FILE_VERSION, 
MAX_REPRESENTABLE_FILE_SIZE, index_file_path, index_rows_file_path, + is_resettable_open_error, parse_data_file_idx, read_available_at, sync_dir, + }, + types::{Config, Error, INITIAL_DATA_FILE_ORDINAL, Result}, +}; + +use super::{CandyStore, OpenState, StoreInner}; + +impl CandyStore { + fn recreate_index_files(base_path: &Path) -> Result<()> { + let mut removed_any = false; + for path in [index_file_path(base_path), index_rows_file_path(base_path)] { + match std::fs::remove_file(&path) { + Ok(()) => removed_any = true, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => return Err(Error::IOError(err)), + } + } + if removed_any { + sync_dir(base_path)?; + } + Ok(()) + } + + fn existing_version_if_signature_matches( + path: &Path, + signature: &[u8; 8], + ) -> Result> { + let file = match std::fs::File::options().read(true).open(path) { + Ok(file) => file, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None), + Err(err) => return Err(Error::IOError(err)), + }; + + let header = read_available_at(&file, 12, 0).map_err(Error::IOError)?; + if header.len() < 12 || &header[0..8] != signature { + return Ok(None); + } + + Ok(Some(u32::from_le_bytes(header[8..12].try_into().unwrap()))) + } + + fn has_unrecognized_index_version(base_path: &Path) -> Result { + Ok(matches!( + Self::existing_version_if_signature_matches( + &index_file_path(base_path), + INDEX_FILE_SIGNATURE, + )?, + Some(version) if version != INDEX_FILE_VERSION + )) + } + + fn data_files_use_recognized_versions(base_path: &Path) -> Result { + let mut found_any = false; + + for entry in std::fs::read_dir(base_path).map_err(Error::IOError)? { + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + if parse_data_file_idx(&path).is_none() { + continue; + } + + let Some(version) = + Self::existing_version_if_signature_matches(&path, DATA_FILE_SIGNATURE)? 
+ else { + return Ok(false); + }; + + if version != DATA_FILE_VERSION { + return Ok(false); + } + + found_any = true; + } + + Ok(found_any) + } + + fn should_port_to_current_format(base_path: &Path) -> Result { + if !Self::has_unrecognized_index_version(base_path)? { + return Ok(false); + } + + Self::data_files_use_recognized_versions(base_path) + } + + fn build_store( + base_path: std::path::PathBuf, + config: Arc, + lockfile: fslock::LockFile, + ) -> Result { + let state = Self::open_or_reset_state(&base_path, config.clone())?; + let num_logical_locks = config.max_concurrency.max(8).next_power_of_two(); + + Ok(Self { + inner: Arc::new(StoreInner::new(base_path, config, state, num_logical_locks)), + _lockfile: lockfile, + compaction_thd: parking_lot::Mutex::new(None), + checkpoint_thd: parking_lot::Mutex::new(None), + allow_clean_shutdown: std::sync::atomic::AtomicBool::new(true), + }) + } + + fn clear_db_files(base_path: &Path) -> Result<()> { + let mut removed_any = false; + for entry in std::fs::read_dir(base_path).map_err(Error::IOError)? 
{ + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + if path.file_name().and_then(|name| name.to_str()) == Some(".lockfile") { + continue; + } + + let file_type = entry.file_type().map_err(Error::IOError)?; + if file_type.is_dir() { + std::fs::remove_dir_all(&path).map_err(Error::IOError)?; + removed_any = true; + } else if file_type.is_file() || file_type.is_symlink() { + std::fs::remove_file(&path).map_err(Error::IOError)?; + removed_any = true; + } + } + if removed_any { + sync_dir(base_path)?; + } + Ok(()) + } + + fn open_state(base_path: &Path, config: Arc) -> Result { + let index_file = IndexFile::open(base_path, config.clone())?; + let mut data_files = HashMap::new(); + let mut file_ordinals = Vec::new(); + let mut seen_ordinals = HashSet::new(); + let mut active_file_idx = 0; + let mut active_file_ordinal = INITIAL_DATA_FILE_ORDINAL; + + for entry in std::fs::read_dir(base_path).map_err(Error::IOError)? { + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + let Some(file_idx) = parse_data_file_idx(&path) else { + continue; + }; + let file_ordinal = DataFile::read_ordinal(base_path, file_idx)?; + if !seen_ordinals.insert(file_ordinal) { + return Err(crate::internal::invalid_data_error( + "duplicate data file ordinal", + )); + } + if file_ordinals.is_empty() || file_ordinal > active_file_ordinal { + active_file_idx = file_idx; + active_file_ordinal = file_ordinal; + } + file_ordinals.push((file_idx, file_ordinal)); + } + + for (file_idx, file_ordinal) in file_ordinals { + let validate_tail = file_ordinal == active_file_ordinal; + let data_file = Arc::new(DataFile::open( + base_path, + config.clone(), + file_idx, + validate_tail, + )?); + data_files.insert(file_idx, data_file); + } + + if data_files.is_empty() { + let data_file = Arc::new(DataFile::create( + base_path, + config.clone(), + active_file_idx, + active_file_ordinal, + )?); + data_files.insert(active_file_idx, data_file); + } + + Ok(OpenState { + 
index_file, + data_files, + active_file_idx, + active_file_ordinal, + }) + } + + fn acquire_lockfile(base_path: &Path) -> Result { + let lockfile_path = base_path.join(".lockfile"); + let mut lockfile = fslock::LockFile::open(&lockfile_path).map_err(Error::IOError)?; + if !lockfile.try_lock().unwrap_or(false) { + let content = + String::from_utf8_lossy(&std::fs::read(&lockfile_path).unwrap_or("".into())) + .into_owned(); + + return Err(Error::LockfileTaken(lockfile_path, content)); + } + + let content = format!( + "[{}] {}", + std::process::id(), + std::env::args().collect::>().join(" ") + ); + _ = std::fs::write(&lockfile_path, content).map_err(Error::IOError); + Ok(lockfile) + } + + fn open_or_reset_state(base_path: &Path, config: Arc) -> Result { + match Self::open_state(base_path, config.clone()) { + Ok(state) => Ok(state), + Err(err) => { + if config.port_to_current_format && Self::should_port_to_current_format(base_path)? + { + Self::recreate_index_files(base_path)?; + return Self::open_state(base_path, config); + } + + if config.reset_on_invalid_data && is_resettable_open_error(&err) { + Self::clear_db_files(base_path)?; + return Self::open_state(base_path, config); + } + + Err(err) + } + } + } + + fn normalize_config_for_path(base_path: &Path, config: Config) -> Result> { + let max_data_file_size = config.max_data_file_size.min(MAX_REPRESENTABLE_FILE_SIZE); + let mut normalized = Config { + max_data_file_size, + compaction_min_threshold: config + .compaction_min_threshold + .min((max_data_file_size as f64 * 0.8) as u32), + remap_scaler: config.remap_scaler.clamp(1, 4), + checkpoint_interval: config.checkpoint_interval.map(|d| { + if d.is_zero() { + Duration::from_millis(100) + } else { + d + } + }), + checkpoint_delta_bytes: config + .checkpoint_delta_bytes + .map(|b| b.max(FILE_OFFSET_ALIGNMENT as usize)), + ..config + }; + + match IndexFile::existing_hash_key(base_path) { + Ok(Some(hash_key)) => normalized.hash_key = hash_key, + Ok(None) => {} + Err(err) 
if is_resettable_open_error(&err) => {} + Err(err) => return Err(err), + } + + Ok(Arc::new(normalized)) + } + + /// Opens a store at `path`, creating it if needed. + /// + /// If `config.port_to_current_format` is enabled, opening may recreate the + /// index files when their format is outdated but the data files are still + /// recognized. If `config.reset_on_invalid_data` is enabled, opening may + /// remove all contents and recreate fresh store files when the on-disk + /// data is corrupt. While the store is open, the active `.lockfile` is + /// preserved so the directory remains locked against concurrent opens. + pub fn open(path: impl AsRef, config: Config) -> Result { + let base_path = path.as_ref().to_path_buf(); + std::fs::create_dir_all(&base_path).map_err(Error::IOError)?; + + let lockfile = Self::acquire_lockfile(&base_path)?; + let config = Self::normalize_config_for_path(&base_path, config)?; + + let store = Self::build_store(base_path.clone(), config.clone(), lockfile)?; + match store.recover_index() { + Ok(()) => { + store.start_checkpoint_worker(); + store.start_compaction(); + Ok(store) + } + Err(err) if config.reset_on_invalid_data && is_resettable_open_error(&err) => { + let store = std::mem::ManuallyDrop::new(store); + let inner = unsafe { std::ptr::read(&store.inner) }; + let lockfile = unsafe { std::ptr::read(&store._lockfile) }; + let compaction_thd = unsafe { std::ptr::read(&store.compaction_thd) }; + let checkpoint_thd = unsafe { std::ptr::read(&store.checkpoint_thd) }; + let _allow_clean_shutdown = unsafe { std::ptr::read(&store.allow_clean_shutdown) }; + drop(compaction_thd); + drop(checkpoint_thd); + drop(inner); + + Self::clear_db_files(&base_path)?; + + let recovered = Self::build_store(base_path, config, lockfile)?; + recovered.recover_index()?; + recovered.start_checkpoint_worker(); + recovered.start_compaction(); + Ok(recovered) + } + Err(err) => Err(err), + } + } + + /// Clears the store and recreates a fresh empty database in the 
same + /// directory. + /// + /// This removes all directory contents, including unrelated files and + /// subdirectories, before recreating the store files. While the store is + /// open, the active `.lockfile` is preserved so the directory remains + /// locked against concurrent opens. + pub fn clear(&self) -> Result<()> { + // stop bg thread + self.stop_compaction(); + self.stop_checkpoint_worker(); + + // now we're single-threaded. take all locks and clear state + self.inner.reset()?; + + self.allow_clean_shutdown + .store(true, std::sync::atomic::Ordering::Relaxed); + self.start_checkpoint_worker(); + self.start_compaction(); + + Ok(()) + } +} diff --git a/src/store/queue.rs b/src/store/queue.rs new file mode 100644 index 0000000..18760c7 --- /dev/null +++ b/src/store/queue.rs @@ -0,0 +1,637 @@ +use siphasher::sip::SipHasher13; + +use std::{hash::Hasher, mem::size_of, ops::Range}; + +use crate::{ + internal::{KeyNamespace, RangeMetadata, aligned_data_entry_size}, + store::CandyStore, + types::{Error, MAX_USER_VALUE_SIZE, Result}, +}; + +#[derive(Clone, Copy)] +pub(super) struct QueueNamespaces { + pub(super) meta: KeyNamespace, + pub(super) data: KeyNamespace, +} + +const QUEUE_NS: QueueNamespaces = QueueNamespaces { + meta: KeyNamespace::QueueMeta, + data: KeyNamespace::QueueData, +}; + +const BIG_NS: QueueNamespaces = QueueNamespaces { + meta: KeyNamespace::BigMeta, + data: KeyNamespace::BigData, +}; + +/// Double-ended iterator over live queue items and their logical indices. 
+pub struct QueueIterator<'a> { + store: &'a CandyStore, + queue: Vec, + ns: QueueNamespaces, + next_idx: u64, + end_idx: u64, + initial_next_idx: u64, + initial_end_idx: u64, +} + +type QueueMetadata = RangeMetadata; + +impl<'a> QueueIterator<'a> { + fn try_heal_head(&self, new_head: u64) -> Result<()> { + self.store.try_heal_range_head( + self.ns.meta, + &self.queue, + self.initial_next_idx, + new_head, + |store, queue| get_queue_meta(store, self.ns, queue), + |store, queue, meta| set_queue_meta(store, self.ns, queue, meta), + ) + } + + fn try_heal_tail(&self, new_tail: u64) -> Result<()> { + self.store.try_heal_range_tail( + self.ns.meta, + &self.queue, + self.initial_end_idx, + new_tail, + |store, queue| get_queue_meta(store, self.ns, queue), + |store, queue, meta| set_queue_meta(store, self.ns, queue, meta), + ) + } +} + +impl Iterator for QueueIterator<'_> { + type Item = Result<(usize, Vec)>; + + fn next(&mut self) -> Option { + while self.next_idx <= self.end_idx { + let idx = self.next_idx; + self.next_idx += 1; + + if idx > self.initial_next_idx + 1000 { + let _ = self.try_heal_head(idx); + self.initial_next_idx = idx; + } + + let key = make_queue_data_key(&self.queue, idx); + match self.store.get_ns(self.ns.data, &key) { + Ok(Some(v)) => return Some(Ok((idx as usize, v))), + Ok(None) => continue, + Err(e) => return Some(Err(e)), + } + } + None + } +} + +impl DoubleEndedIterator for QueueIterator<'_> { + fn next_back(&mut self) -> Option<::Item> { + while self.next_idx <= self.end_idx { + let idx = self.end_idx; + if self.end_idx == 0 { + self.next_idx = 1; + } else { + self.end_idx -= 1; + } + + if idx + 1000 < self.initial_end_idx { + let _ = self.try_heal_tail(idx); + self.initial_end_idx = idx; + } + + let key = make_queue_data_key(&self.queue, idx); + match self.store.get_ns(self.ns.data, &key) { + Ok(Some(v)) => return Some(Ok((idx as usize, v))), + Ok(None) => continue, + Err(e) => return Some(Err(e)), + } + } + None + } +} + +impl CandyStore { + 
/// Pushes `val` to the head of `queue_key` and returns its logical index. + pub fn push_to_queue_head + ?Sized, B2: AsRef<[u8]> + ?Sized>( + &self, + queue_key: &B1, + val: &B2, + ) -> Result { + self.queue_push_head_with_ns(QUEUE_NS, queue_key.as_ref(), val.as_ref()) + .map(|idx| idx as usize) + } + + /// Pushes `val` to the tail of `queue_key` and returns its logical index. + pub fn push_to_queue_tail + ?Sized, B2: AsRef<[u8]> + ?Sized>( + &self, + queue_key: &B1, + val: &B2, + ) -> Result { + self.queue_push_tail_with_ns(QUEUE_NS, queue_key.as_ref(), val.as_ref()) + .map(|idx| idx as usize) + } + + /// Removes and returns the head value of `queue_key`. + pub fn pop_queue_head + ?Sized>( + &self, + queue_key: &B, + ) -> Result>> { + Ok(self + .queue_pop_head_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(_, value)| value)) + } + + /// Removes and returns the head item of `queue_key` together with its logical index. + pub fn pop_queue_head_with_idx + ?Sized>( + &self, + queue_key: &B, + ) -> Result)>> { + Ok(self + .queue_pop_head_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(idx, value)| (idx as usize, value))) + } + + /// Removes and returns the tail value of `queue_key`. + pub fn pop_queue_tail + ?Sized>( + &self, + queue_key: &B, + ) -> Result>> { + Ok(self + .queue_pop_tail_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(_, value)| value)) + } + + /// Removes and returns the tail item of `queue_key` together with its logical index. + pub fn pop_queue_tail_with_idx + ?Sized>( + &self, + queue_key: &B, + ) -> Result)>> { + Ok(self + .queue_pop_tail_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(idx, value)| (idx as usize, value))) + } + + /// Returns the head value of `queue_key` without removing it. + pub fn peek_queue_head + ?Sized>( + &self, + queue_key: &B, + ) -> Result>> { + Ok(self + .queue_peek_head_with_ns(QUEUE_NS, queue_key.as_ref())? 
+ .map(|(_, value)| value)) + } + + /// Returns the head item of `queue_key` and its logical index without removing it. + pub fn peek_queue_head_with_idx + ?Sized>( + &self, + queue_key: &B, + ) -> Result)>> { + Ok(self + .queue_peek_head_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(idx, value)| (idx as usize, value))) + } + + /// Returns the tail value of `queue_key` without removing it. + pub fn peek_queue_tail + ?Sized>( + &self, + queue_key: &B, + ) -> Result>> { + Ok(self + .queue_peek_tail_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(_, value)| value)) + } + + /// Returns the tail item of `queue_key` and its logical index without removing it. + pub fn peek_queue_tail_with_idx + ?Sized>( + &self, + queue_key: &B, + ) -> Result)>> { + Ok(self + .queue_peek_tail_with_ns(QUEUE_NS, queue_key.as_ref())? + .map(|(idx, value)| (idx as usize, value))) + } + + /// Removes and returns the item at logical index `idx`, if it exists. + pub fn remove_from_queue + ?Sized>( + &self, + queue_key: &B, + idx: usize, + ) -> Result>> { + self.queue_remove_with_ns(QUEUE_NS, queue_key.as_ref(), idx as u64) + } + + /// Removes all items from `queue_key`. + pub fn discard_queue + ?Sized>(&self, queue_key: &B) -> Result { + self.queue_discard_with_ns(QUEUE_NS, queue_key.as_ref()) + } + + /// Appends all provided values to the tail of `queue_key`. + pub fn extend_queue + ?Sized>( + &self, + queue_key: &B, + items: impl IntoIterator>, + ) -> Result> { + let mut start = None; + let mut end = None; + + for item in items { + let idx = self.push_to_queue_tail(queue_key, &item)?; + if start.is_none() { + start = Some(idx); + } + end = Some(idx + 1); + } + + Ok(match (start, end) { + (Some(start), Some(end)) => start..end, + _ => { + let range = self.queue_range(queue_key)?; + range.start..range.start + } + }) + } + + /// Returns the number of live items in `queue_key`. 
+ pub fn queue_len + ?Sized>(&self, queue_key: &B) -> Result { + Ok(self.queue_len_with_ns(QUEUE_NS, queue_key.as_ref())? as usize) + } + + /// Returns the current inclusive-exclusive logical index span for `queue_key`. + pub fn queue_range + ?Sized>(&self, queue_key: &B) -> Result> { + self.queue_range_with_ns(QUEUE_NS, queue_key.as_ref()) + } + + /// Iterates over live items in `queue_key` from head to tail. + pub fn iter_queue<'a, B: AsRef<[u8]> + ?Sized>(&'a self, queue_key: &B) -> QueueIterator<'a> { + self.queue_iter_with_ns(QUEUE_NS, queue_key.as_ref()) + } + + /// Stores a large value under `key`, chunking it across queue-backed data entries if needed. + pub fn set_big + ?Sized, B2: AsRef<[u8]> + ?Sized>( + &self, + key: &B1, + value: &B2, + ) -> Result { + self.queue_set_big_with_ns(BIG_NS, key.as_ref(), value.as_ref()) + } + + /// Loads a value previously stored with [`CandyStore::set_big`]. + pub fn get_big + ?Sized>(&self, key: &B) -> Result>> { + self.queue_get_big_with_ns(BIG_NS, key.as_ref()) + } + + /// Removes a value previously stored with [`CandyStore::set_big`]. 
+ pub fn remove_big + ?Sized>(&self, key: &B) -> Result { + self.queue_discard_with_ns(BIG_NS, key.as_ref()) + } + + pub(super) fn queue_push_tail_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + value: &[u8], + ) -> Result { + let _lock = self.list_write_guard(ns.meta, queue); + self._queue_push_tail_with_ns(ns, queue, value) + } + + fn _queue_push_tail_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + value: &[u8], + ) -> Result { + let mut meta = get_queue_meta(self, ns, queue)?; + let new_tail = meta.tail + 1; + let key = make_queue_data_key(queue, new_tail); + self.set_ns(ns.data, &key, value)?; + meta.tail = new_tail; + meta.count += 1; + if meta.head > meta.tail { + meta.head = new_tail; + } + set_queue_meta(self, ns, queue, meta)?; + Ok(new_tail) + } + + pub(super) fn queue_push_head_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + value: &[u8], + ) -> Result { + let _lock = self.list_write_guard(ns.meta, queue); + let mut meta = get_queue_meta(self, ns, queue)?; + let new_head = meta.head - 1; + let key = make_queue_data_key(queue, new_head); + self.set_ns(ns.data, &key, value)?; + meta.head = new_head; + meta.count += 1; + if meta.tail < meta.head { + meta.tail = new_head; + } + set_queue_meta(self, ns, queue, meta)?; + Ok(new_head) + } + + pub(super) fn queue_pop_head_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + ) -> Result)>> { + let _lock = self.list_write_guard(ns.meta, queue); + let mut meta = get_queue_meta(self, ns, queue)?; + loop { + if meta.head > meta.tail { + return Ok(None); + } + + let idx = meta.head; + let key = make_queue_data_key(queue, idx); + let value = self.remove_ns(ns.data, &key)?; + meta.head += 1; + + if let Some(value) = value { + meta.count = meta.count.saturating_sub(1); + if meta.head > meta.tail { + meta = QueueMetadata::new(); + } + set_queue_meta(self, ns, queue, meta)?; + return Ok(Some((idx, value))); + } + + set_queue_meta(self, ns, queue, meta)?; + } + } + + pub(super) fn 
queue_pop_tail_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + ) -> Result)>> { + let _lock = self.list_write_guard(ns.meta, queue); + let mut meta = get_queue_meta(self, ns, queue)?; + loop { + if meta.head > meta.tail { + return Ok(None); + } + + let idx = meta.tail; + let key = make_queue_data_key(queue, idx); + let value = self.remove_ns(ns.data, &key)?; + meta.tail = meta.tail.saturating_sub(1); + + if let Some(value) = value { + meta.count = meta.count.saturating_sub(1); + if meta.head > meta.tail { + meta = QueueMetadata::new(); + } + set_queue_meta(self, ns, queue, meta)?; + return Ok(Some((idx, value))); + } + + set_queue_meta(self, ns, queue, meta)?; + } + } + + pub(super) fn queue_peek_head_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + ) -> Result)>> { + let _lock = self.list_read_guard(ns.meta, queue); + let meta = get_queue_meta(self, ns, queue)?; + if meta.head > meta.tail { + return Ok(None); + } + for idx in meta.head..=meta.tail { + let key = make_queue_data_key(queue, idx); + if let Some(value) = self.get_ns(ns.data, &key)? { + return Ok(Some((idx, value))); + } + } + Ok(None) + } + + pub(super) fn queue_peek_tail_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + ) -> Result)>> { + let _lock = self.list_read_guard(ns.meta, queue); + let meta = get_queue_meta(self, ns, queue)?; + if meta.head > meta.tail { + return Ok(None); + } + for idx in (meta.head..=meta.tail).rev() { + let key = make_queue_data_key(queue, idx); + if let Some(value) = self.get_ns(ns.data, &key)? 
{ + return Ok(Some((idx, value))); + } + } + Ok(None) + } + + pub(super) fn queue_len_with_ns(&self, ns: QueueNamespaces, queue: &[u8]) -> Result { + Ok(get_queue_meta(self, ns, queue)?.count) + } + + pub(super) fn queue_discard_with_ns(&self, ns: QueueNamespaces, queue: &[u8]) -> Result { + let _lock = self.list_write_guard(ns.meta, queue); + self._queue_discard_with_ns(ns, queue) + } + + fn _queue_discard_with_ns(&self, ns: QueueNamespaces, queue: &[u8]) -> Result { + let mut meta = get_queue_meta(self, ns, queue)?; + let had_items = meta.head <= meta.tail; + while meta.head <= meta.tail { + let key = make_queue_data_key(queue, meta.head); + _ = self.remove_ns(ns.data, &key)?; + meta.head += 1; + } + + self.remove_ns(ns.meta, queue)?; + Ok(had_items) + } + + fn queue_remove_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + idx: u64, + ) -> Result>> { + let _lock = self.list_write_guard(ns.meta, queue); + let mut meta = get_queue_meta(self, ns, queue)?; + let key = make_queue_data_key(queue, idx); + let removed = match self.remove_ns(ns.data, &key)? 
{ + Some(value) => value, + None => return Ok(None), + }; + + meta.count = meta.count.saturating_sub(1); + + if idx == meta.head { + meta.head += 1; + } + + if meta.tail == idx { + meta.tail = meta.tail.saturating_sub(1); + } + + if meta.head > meta.tail { + meta = QueueMetadata::new(); + } + + set_queue_meta(self, ns, queue, meta)?; + Ok(Some(removed)) + } + + pub(super) fn queue_iter_with_ns<'a>( + &'a self, + ns: QueueNamespaces, + queue: &[u8], + ) -> QueueIterator<'a> { + let meta = get_queue_meta(self, ns, queue).unwrap_or_else(|_| QueueMetadata::new()); + QueueIterator { + store: self, + queue: queue.to_vec(), + ns, + next_idx: meta.head, + end_idx: meta.tail, + initial_next_idx: meta.head, + initial_end_idx: meta.tail, + } + } + + pub(super) fn queue_range_with_ns( + &self, + ns: QueueNamespaces, + queue: &[u8], + ) -> Result> { + let meta = get_queue_meta(self, ns, queue)?; + if meta.count == 0 || meta.head > meta.tail { + return Ok(0..0); + } + Ok(meta.head as usize..meta.tail.saturating_add(1) as usize) + } + + pub(super) fn queue_set_big_with_ns( + &self, + ns: QueueNamespaces, + key: &[u8], + value: &[u8], + ) -> Result { + let _lock = self.list_write_guard(ns.meta, key); + let existed = self._queue_discard_with_ns(ns, key)?; + + let max_chunk_len = self.max_big_chunk_len(key)?; + + for chunk in value.chunks(max_chunk_len) { + self._queue_push_tail_with_ns(ns, key, chunk)?; + } + + self._queue_push_tail_with_ns(ns, key, &value.len().to_le_bytes())?; + Ok(existed) + } + + pub(super) fn queue_get_big_with_ns( + &self, + ns: QueueNamespaces, + key: &[u8], + ) -> Result>> { + let _lock = self.list_read_guard(ns.meta, key); + let meta = get_queue_meta(self, ns, key)?; + let expected_chunks = meta.count; + if expected_chunks == 0 { + return Ok(None); + } + + let mut collected = Vec::new(); + let mut seen = 0u64; + for idx in meta.head..=meta.tail { + let item_key = make_queue_data_key(key, idx); + let Some(chunk) = self.get_ns(ns.data, &item_key)? 
else { + continue; + }; + + seen += 1; + if seen == expected_chunks && chunk.len() == size_of::() { + let recorded_len = usize::from_le_bytes(chunk.as_slice().try_into().unwrap()); + if recorded_len == collected.len() { + return Ok(Some(collected)); + } + return Ok(None); + } + + collected.extend_from_slice(&chunk); + if seen == expected_chunks { + return Ok(None); + } + } + + Ok(None) + } + + fn max_big_chunk_len(&self, key: &[u8]) -> Result { + let data_key_len = make_queue_data_key(key, 0).len(); + if aligned_data_entry_size(data_key_len, size_of::()) as usize + > self.inner.config.max_data_file_size as usize + { + return Err(Error::PayloadTooLarge(aligned_data_entry_size( + data_key_len, + size_of::(), + ) as usize)); + } + + let mut max_chunk_len = MAX_USER_VALUE_SIZE; + while max_chunk_len > 0 + && aligned_data_entry_size(data_key_len, max_chunk_len) as usize + > self.inner.config.max_data_file_size as usize + { + max_chunk_len -= 1; + } + + if max_chunk_len == 0 { + return Err(Error::PayloadTooLarge( + aligned_data_entry_size(data_key_len, 1) as usize, + )); + } + + Ok(max_chunk_len) + } +} + +fn get_queue_meta(store: &CandyStore, ns: QueueNamespaces, queue: &[u8]) -> Result { + if let Some(value) = store.get_ns(ns.meta, queue)? 
+ && let Some(meta) = QueueMetadata::from_bytes(&value) + { + return Ok(meta); + } + Ok(QueueMetadata::new()) +} + +fn set_queue_meta( + store: &CandyStore, + ns: QueueNamespaces, + queue: &[u8], + meta: QueueMetadata, +) -> Result<()> { + store.set_ns(ns.meta, queue, &meta.to_bytes())?; + Ok(()) +} + +fn hash_queue_key(queue: &[u8]) -> u64 { + let mut hasher = SipHasher13::new_with_keys(0xb1ccc559a9924eaa, 0x1b1a682059c2d599); + hasher.write(queue); + hasher.finish() +} + +fn make_queue_data_key(queue: &[u8], seq: u64) -> [u8; 16] { + let hash = hash_queue_key(queue); + + let mut key = [0u8; 16]; + key[..8].copy_from_slice(&hash.to_le_bytes()); + key[8..].copy_from_slice(&seq.to_be_bytes()); + key +} diff --git a/src/store/recovery.rs b/src/store/recovery.rs new file mode 100644 index 0000000..8cdd3e5 --- /dev/null +++ b/src/store/recovery.rs @@ -0,0 +1,371 @@ +use std::sync::{Arc, atomic::Ordering}; + +use crate::{ + crash_point, + data_file::DataFile, + index_file::EntryPointer, + internal::{ + EntryType, FILE_OFFSET_ALIGNMENT, HashCoord, KVRef, KeyNamespace, ROW_WIDTH, + aligned_data_entry_size, aligned_tombstone_entry_waste, invalid_data_error, + }, + types::{Error, MAX_USER_KEY_SIZE, MAX_USER_VALUE_SIZE, Result}, +}; + +use super::CandyStore; + +#[derive(Clone, Copy)] +enum RebuildMode { + TailFrom(u64), + FullFile, +} + +impl CandyStore { + /// How many bytes of replayed data between progressive checkpoints. 
+ const REBUILD_CHECKPOINT_INTERVAL_BYTES: u64 = 256 * 1024; + + pub(super) fn recover_index(&self) -> Result<()> { + let ordered_files = self.inner.ordered_data_files(); + let Some(last_file) = ordered_files.last().cloned() else { + return Ok(()); + }; + + let (commit_file_ordinal, commit_offset) = self.inner.index_file.checkpoint_cursor(); + + let start_idx = ordered_files + .iter() + .position(|data_file| data_file.file_ordinal >= commit_file_ordinal) + .unwrap_or(ordered_files.len() - 1); + let start_file = &ordered_files[start_idx]; + let rebuild_mode = if start_file.file_ordinal == commit_file_ordinal { + self.validated_commit_offset(start_file, commit_offset)? + } else { + RebuildMode::TailFrom(0) + }; + + // Recompute the runtime-only delta from the persisted replay cursor. + self.inner + .uncommitted_entries_delta + .store(0, Ordering::Relaxed); + let mut match_scratch = Vec::new(); + let mut bytes_since_checkpoint = 0u64; + let mut pending_committed_delta = 0i64; + + let mut final_cursor = (last_file.file_ordinal, last_file.used_bytes()); + for (idx, data_file) in ordered_files.iter().enumerate().skip(start_idx) { + let file_mode = if idx == start_idx { + rebuild_mode + } else { + RebuildMode::TailFrom(0) + }; + let durable_extent = self.rebuild_file_from( + data_file, + file_mode, + &mut bytes_since_checkpoint, + &mut pending_committed_delta, + &mut match_scratch, + )?; + final_cursor = (data_file.file_ordinal, durable_extent); + } + + self.persist_rebuild_checkpoint(final_cursor.0, final_cursor.1, pending_committed_delta)?; + debug_assert_eq!( + self.inner.uncommitted_entries_delta.load(Ordering::Relaxed), + 0 + ); + + Ok(()) + } + + fn rebuild_file_from( + &self, + data_file: &Arc, + rebuild_mode: RebuildMode, + bytes_since_checkpoint: &mut u64, + pending_committed_delta: &mut i64, + match_scratch: &mut Vec, + ) -> Result { + let start_offset = match rebuild_mode { + RebuildMode::TailFrom(offset) => offset, + RebuildMode::FullFile => 0, + }; + + // 
Pre-purge any index entries that point past the file's durable + // extent. This handles the case where the data file was truncated + // (e.g. disk-full or corruption) and ensures the replay loop won't + // encounter stale pointers when comparing existing entries. + let pre_rebuild_tail_upper_bound = data_file.recovery_tail_upper_bound(); + let pre_purge_extent = pre_rebuild_tail_upper_bound.next_multiple_of(FILE_OFFSET_ALIGNMENT); + self.apply_recovery_delta( + self.purge_uncommitted_file_entries(data_file.file_idx, pre_purge_extent)?, + pending_committed_delta, + ); + + if matches!(rebuild_mode, RebuildMode::FullFile) { + // The saved checkpoint within this file is no longer trustworthy. + // Remove every pointer into it and rebuild its contribution from 0. + self.apply_recovery_delta( + self.purge_uncommitted_file_entries(data_file.file_idx, 0)?, + pending_committed_delta, + ); + } + + let mut offset = start_offset; + let mut read_buf = Vec::new(); + let mut buf_file_offset = 0u64; + let mut last_durable_offset = start_offset; + loop { + let Some((kv, entry_offset, next_offset)) = + data_file.read_next_entry_ref(offset, &mut read_buf, &mut buf_file_offset)? 
+ else { + break; + }; + let entry_bytes = next_offset - offset; + offset = next_offset; + + let Some(ns) = KeyNamespace::from_u8(kv.ns) else { + return Err(invalid_data_error("unknown key namespace in data file")); + }; + + self.apply_recovery_delta( + self.recover_entry(data_file, ns, kv, entry_offset, match_scratch)?, + pending_committed_delta, + ); + self.inner + .stats + .num_rebuilt_entries + .fetch_add(1, Ordering::Relaxed); + last_durable_offset = next_offset; + crash_point("rebuild_entry"); + + *bytes_since_checkpoint += entry_bytes; + if *bytes_since_checkpoint >= Self::REBUILD_CHECKPOINT_INTERVAL_BYTES { + self.persist_rebuild_checkpoint( + data_file.file_ordinal, + offset, + *pending_committed_delta, + )?; + *pending_committed_delta = 0; + *bytes_since_checkpoint = 0; + } + } + + let durable_extent = last_durable_offset.next_multiple_of(FILE_OFFSET_ALIGNMENT); + + if durable_extent < pre_rebuild_tail_upper_bound { + self.inner.stats.num_rebuild_purged_bytes.fetch_add( + pre_rebuild_tail_upper_bound - durable_extent, + Ordering::Relaxed, + ); + data_file.truncate_to_offset(durable_extent)?; + } + + self.apply_recovery_delta( + self.purge_uncommitted_file_entries(data_file.file_idx, durable_extent)?, + pending_committed_delta, + ); + Ok(durable_extent) + } + + fn validated_commit_offset( + &self, + active_file: &Arc, + checkpoint_offset: u64, + ) -> Result { + if checkpoint_offset == 0 { + return Ok(RebuildMode::TailFrom(0)); + } + + let used_bytes = active_file.used_bytes(); + if checkpoint_offset > used_bytes { + return Ok(RebuildMode::FullFile); + } + if checkpoint_offset == used_bytes { + return Ok(RebuildMode::TailFrom(checkpoint_offset)); + } + + let mut probe_buf = Vec::new(); + let mut probe_file_offset = 0u64; + match active_file.read_next_entry_ref( + checkpoint_offset, + &mut probe_buf, + &mut probe_file_offset, + )? 
{ + Some((_, entry_offset, _)) if entry_offset == checkpoint_offset => { + Ok(RebuildMode::TailFrom(checkpoint_offset)) + } + _ => Ok(RebuildMode::FullFile), + } + } + + fn persist_rebuild_checkpoint(&self, ordinal: u64, offset: u64, delta: i64) -> Result<()> { + let resume_offset = offset.next_multiple_of(FILE_OFFSET_ALIGNMENT); + + self.inner.fold_checkpointed_num_entries(delta); + self.inner.persist_checkpoint_cursor(ordinal, resume_offset); + + self.inner.index_file.sync_all() + } + + /// Remove index entries pointing to the active file at or beyond `durable_extent`. + fn purge_uncommitted_file_entries(&self, file_idx: u16, min_offset: u64) -> Result { + let row_table = self.inner.index_file.rows_table(); + let num_rows = self.inner.index_file.num_rows(); + let mut removed = 0i64; + + for row_idx in 0..num_rows { + let mut row = row_table.row_mut(row_idx); + if row.split_level.load(Ordering::Acquire) == 0 { + continue; + } + for col in 0..ROW_WIDTH { + if row.signatures[col] == HashCoord::INVALID_SIG { + continue; + } + let ptr = row.pointers[col]; + if !ptr.is_valid() { + continue; + } + if ptr.file_idx() == file_idx && ptr.file_offset() >= min_offset { + row.remove(col); + removed += 1; + } + } + } + Ok(-removed) + } + + fn recover_entry( + &self, + data_file: &Arc, + ns: KeyNamespace, + kv: KVRef<'_>, + entry_offset: u64, + match_scratch: &mut Vec, + ) -> Result { + match kv.entry_type { + EntryType::Insert | EntryType::Update => { + self.recover_data_entry(data_file, ns, kv, entry_offset, match_scratch) + } + EntryType::Tombstone => self.recover_tombstone_entry(data_file, ns, kv, match_scratch), + _ => Ok(0), + } + } + + fn apply_recovery_delta(&self, delta: i64, pending_committed_delta: &mut i64) { + if delta == 0 { + return; + } + + self.inner.add_uncommitted_num_entries(delta); + *pending_committed_delta += delta; + } + + /// Fix index pointers for a data/update entry and return its live-entry delta. 
+ fn recover_data_entry( + &self, + data_file: &Arc, + ns: KeyNamespace, + kv: KVRef<'_>, + entry_offset: u64, + match_scratch: &mut Vec, + ) -> Result { + let key = kv.key(); + let val = kv.value(); + self.validate_recovered_data_entry(key, val)?; + let entry_len = 4 + 4 + key.len() + val.len() + 2; + let aligned_len = entry_len.next_multiple_of(FILE_OFFSET_ALIGNMENT as usize); + let hc = HashCoord::new(ns, key, self.inner.config.hash_key); + let ptr = EntryPointer::new( + data_file.file_idx, + entry_offset, + aligned_len, + hc.masked_row_selector(), + ); + + self.inner._mut_op(ns, key, &[], |hc, mut row, key, _| { + let files = self.inner.data_files.read(); + for (col, entry) in row.iter_matches(hc) { + let file = files + .get(&entry.file_idx()) + .ok_or(Error::MissingDataFile(entry.file_idx()))?; + let existing_kv = + file.read_kv_into(entry.file_offset(), entry.size_hint(), match_scratch)?; + if existing_kv.key() == key { + if entry == ptr { + // Already points at this entry — nothing to fix. + return Ok(0); + } + if entry.file_idx() == data_file.file_idx + && entry.file_offset() > ptr.file_offset() + { + // A newer active-file entry already exists — skip. + return Ok(0); + } + // Older pointer — replace with this newer one. + row.replace_pointer(col, ptr); + return Ok(0); + } + } + // Key not in index — insert it. + if let Some(col) = row.find_free_slot() { + row.insert(col, hc.sig, ptr); + Ok(1) + } else { + Err(Error::SplitRow(row.split_level.load(Ordering::Relaxed))) + } + }) + } + + /// Fix index pointers for a tombstone entry and return its live-entry delta. 
// impl CandyStore (recovery): tombstone replay and entry validation.
    fn recover_tombstone_entry(
        &self,
        _data_file: &Arc<DataFile>,
        ns: KeyNamespace,
        kv: KVRef<'_>,
        match_scratch: &mut Vec<u8>,
    ) -> Result<i64> {
        let key = kv.key();
        self.validate_recovered_tombstone_entry(key)?;

        self.inner._mut_op(ns, key, &[], |hc, mut row, key, _| {
            let files = self.inner.data_files.read();
            for (col, entry) in row.iter_matches(hc) {
                let file = files
                    .get(&entry.file_idx())
                    .ok_or(Error::MissingDataFile(entry.file_idx()))?;
                let existing_kv =
                    file.read_kv_into(entry.file_offset(), entry.size_hint(), match_scratch)?;
                if existing_kv.key() == key {
                    // The key is live in the index: the tombstone removes it.
                    row.remove(col);
                    return Ok(-1);
                }
            }
            // Key not indexed: the tombstone changes nothing.
            Ok(0)
        })
    }

    /// Rejects a replayed data entry whose key, value or aligned entry size
    /// exceeds the configured limits.
    fn validate_recovered_data_entry(&self, key: &[u8], val: &[u8]) -> Result<()> {
        let entry_size = aligned_data_entry_size(key.len(), val.len()) as usize;
        if key.len() > MAX_USER_KEY_SIZE
            || val.len() > MAX_USER_VALUE_SIZE
            || entry_size > self.inner.config.max_data_file_size as usize
        {
            return Err(invalid_data_error(
                "recovered data entry exceeds configured limits",
            ));
        }
        Ok(())
    }

    /// Rejects a replayed tombstone whose key or aligned entry size exceeds
    /// the configured limits.
    fn validate_recovered_tombstone_entry(&self, key: &[u8]) -> Result<()> {
        let entry_size = aligned_tombstone_entry_waste(key.len()) as usize;
        if key.len() > MAX_USER_KEY_SIZE
            || entry_size > self.inner.config.max_data_file_size as usize
        {
            return Err(invalid_data_error(
                "recovered tombstone entry exceeds configured limits",
            ));
        }
        Ok(())
    }
}

// diff --git a/src/store/typed.rs b/src/store/typed.rs
// new file mode 100644
// index 0000000..7375745
// --- /dev/null
// +++ b/src/store/typed.rs
// @@ -0,0 +1,835 @@
use std::{borrow::Borrow, marker::PhantomData, ops::Range, sync::Arc};

use serde::{Serialize, de::DeserializeOwned};
use smallvec::SmallVec;

use crate::{
    internal::KeyNamespace,
    store::CandyStore,
    types::{Error, ListCompactionParams, Result},
};

/// Namespace pair used for typed big values.
#[derive(Clone, Copy)]
struct TypedBigNamespaces {
    meta: KeyNamespace,
    data: KeyNamespace,
}

const TYPED_BIG_NS: TypedBigNamespaces = TypedBigNamespaces {
    meta: KeyNamespace::TypedBigMeta,
    data: KeyNamespace::TypedBigData,
};

const TYPED_QUEUE_NS: super::queue::QueueNamespaces = super::queue::QueueNamespaces {
    meta: KeyNamespace::TypedQueueMeta,
    data: KeyNamespace::TypedQueueData,
};

const TYPED_LIST_NS: super::list::ListNamespaces = super::list::ListNamespaces {
    meta: KeyNamespace::TypedListMeta,
    index: KeyNamespace::TypedListIndex,
    data: KeyNamespace::TypedListData,
};

/// Inline capacity, in bytes, of the `SmallVec` buffers used for encoded keys and values.
const INLINE_TYPED_BUF_SIZE: usize = 128;

type InlineBytes = SmallVec<[u8; INLINE_TYPED_BUF_SIZE]>;

/// Marker trait for typed keys and collection identifiers used by the typed wrappers.
pub trait CandyTypedKey: Serialize + DeserializeOwned {
    /// Identifier appended to the encoded key by the typed wrappers.
    const TYPE_ID: u32;
}

/// Implements [`CandyTypedKey`] for a built-in type with a fixed `TYPE_ID`.
macro_rules! typed_builtin {
    ($ty:ty, $type_id:literal) => {
        impl CandyTypedKey for $ty {
            const TYPE_ID: u32 = $type_id;
        }
    };
}

typed_builtin!(u8, 1);
typed_builtin!(u16, 2);
typed_builtin!(u32, 3);
typed_builtin!(u64, 4);
typed_builtin!(u128, 5);
typed_builtin!(i8, 6);
typed_builtin!(i16, 7);
typed_builtin!(i32, 8);
typed_builtin!(i64, 9);
typed_builtin!(i128, 10);
typed_builtin!(bool, 11);
typed_builtin!(usize, 12);
typed_builtin!(isize, 13);
typed_builtin!(char, 14);
typed_builtin!(String, 15);
// NOTE(review): the element type was lost in extraction; `u8` is assumed — confirm.
typed_builtin!(Vec<u8>, 16);
typed_builtin!(uuid::Bytes, 17);

/// Typed wrapper over the store key-value API.
pub struct CandyTypedStore<K, V> {
    store: Arc<CandyStore>,
    _phantom: PhantomData<(K, V)>,
}

/// Typed wrapper over the queue API.
pub struct CandyTypedDeque<L, V> {
    store: Arc<CandyStore>,
    _phantom: PhantomData<(L, V)>,
}

/// Typed wrapper over the ordered map/list API.
pub struct CandyTypedList<L, K, V> {
    store: Arc<CandyStore>,
    _phantom: PhantomData<(L, K, V)>,
}

// `Clone` is implemented by hand so the wrappers are cloneable without
// requiring the type parameters themselves to be `Clone`.
impl<L, V> Clone for CandyTypedDeque<L, V> {
    fn clone(&self) -> Self {
        Self {
            store: Arc::clone(&self.store),
            _phantom: PhantomData,
        }
    }
}

impl<K, V> Clone for CandyTypedStore<K, V> {
    fn clone(&self) -> Self {
        Self {
            store: Arc::clone(&self.store),
            _phantom: PhantomData,
        }
    }
}

impl<L, K, V> Clone for CandyTypedList<L, K, V> {
    fn clone(&self) -> Self {
        Self {
            store: Arc::clone(&self.store),
            _phantom: PhantomData,
        }
    }
}

impl<K, V> CandyTypedStore<K, V>
where
    K: CandyTypedKey + Serialize,
    V: Serialize + DeserializeOwned,
{
    /// Creates a typed key-value view over `store`.
    pub fn new(store: Arc<CandyStore>) -> Self {
        Self {
            store,
            _phantom: PhantomData,
        }
    }

    /// Encodes `key` and appends `K::TYPE_ID` to form the raw store key.
    fn make_key<Q: ?Sized + Serialize>(key: &Q) -> InlineBytes
    where
        K: Borrow<Q>,
    {
        append_type_id(encode_to_smallvec(key), K::TYPE_ID)
    }

    /// Returns the decoded value for `key`, if present.
    pub fn get<Q: ?Sized + Serialize>(&self, key: &Q) -> Result<Option<V>>
    where
        K: Borrow<Q>,
    {
        let key_bytes = Self::make_key(key);
        self.store
            .get_ns(KeyNamespace::Typed, &key_bytes)?
            .map(|bytes| decode_from_bytes::<V>(&bytes))
            .transpose()
    }

    /// Inserts or replaces `key` with `val`.
    ///
    /// Returns the previous decoded value, if there was one.
    pub fn set<Q1: ?Sized + Serialize, Q2: ?Sized + Serialize>(
        &self,
        key: &Q1,
        val: &Q2,
    ) -> Result<Option<V>>
    where
        K: Borrow<Q1>,
        V: Borrow<Q2>,
    {
        let key_bytes = Self::make_key(key);
        let value_bytes = encode_to_smallvec(val);
        self.store
            .set_ns(KeyNamespace::Typed, &key_bytes, &value_bytes)?
            .map(|prev| decode_from_bytes::<V>(&prev))
            .transpose()
    }

    /// Removes `key` and returns its previous decoded value if it existed.
    pub fn remove<Q: ?Sized + Serialize>(&self, key: &Q) -> Result<Option<V>>
    where
        K: Borrow<Q>,
    {
        let key_bytes = Self::make_key(key);
        self.store
            .remove_ns(KeyNamespace::Typed, &key_bytes)?
            .map(|prev| decode_from_bytes::<V>(&prev))
            .transpose()
    }

    /// Returns `true` if `key` currently exists.
+ pub fn contains(&self, key: &Q) -> Result + where + K: Borrow, + { + let key_bytes = Self::make_key(key); + self.store + .get_ns(KeyNamespace::Typed, &key_bytes) + .map(|value| value.is_some()) + } + + /// Returns the current value for `key`, or inserts and returns `val` if the key is missing. + pub fn get_or_create( + &self, + key: &Q1, + val: &Q2, + ) -> Result + where + K: Borrow, + V: Borrow, + { + let key_bytes = Self::make_key(key); + let value_bytes = encode_to_smallvec(val); + let status = self + .store + .get_or_create_ns(KeyNamespace::Typed, &key_bytes, &value_bytes)?; + match status { + crate::GetOrCreateStatus::ExistingValue(value) + | crate::GetOrCreateStatus::CreatedNew(value) => decode_from_bytes::(&value), + } + } + + /// Replaces `key` with `val` only if the current value matches `expected_val` when provided. + pub fn replace( + &self, + key: &Q1, + val: &Q2, + expected_val: Option<&Q3>, + ) -> Result> + where + K: Borrow, + V: Borrow, + { + let key_bytes = Self::make_key(key); + let value_bytes = encode_to_smallvec(val); + let expected_bytes = expected_val.map(encode_to_smallvec); + match self.store.replace_ns( + KeyNamespace::Typed, + &key_bytes, + &value_bytes, + expected_bytes.as_deref(), + )? { + crate::ReplaceStatus::PrevValue(prev) => decode_from_bytes::(&prev).map(Some), + crate::ReplaceStatus::WrongValue(_) | crate::ReplaceStatus::DoesNotExist => Ok(None), + } + } + + /// Stores a large typed value under `key`. + pub fn set_big( + &self, + key: &Q1, + val: &Q2, + ) -> Result + where + K: Borrow, + V: Borrow, + { + let key_bytes = Self::make_key(key); + let value_bytes = encode_to_smallvec(val); + self.store.queue_set_big_with_ns( + super::queue::QueueNamespaces { + meta: TYPED_BIG_NS.meta, + data: TYPED_BIG_NS.data, + }, + &key_bytes, + &value_bytes, + ) + } + + /// Loads a large typed value previously stored with [`CandyTypedStore::set_big`]. 
+ pub fn get_big(&self, key: &Q) -> Result> + where + K: Borrow, + { + let key_bytes = Self::make_key(key); + self.store + .queue_get_big_with_ns( + super::queue::QueueNamespaces { + meta: TYPED_BIG_NS.meta, + data: TYPED_BIG_NS.data, + }, + &key_bytes, + )? + .map(|value| decode_from_bytes::(&value)) + .transpose() + } + + /// Removes a large typed value previously stored with [`CandyTypedStore::set_big`]. + pub fn remove_big(&self, key: &Q) -> Result + where + K: Borrow, + { + let key_bytes = Self::make_key(key); + self.store.queue_discard_with_ns( + super::queue::QueueNamespaces { + meta: TYPED_BIG_NS.meta, + data: TYPED_BIG_NS.data, + }, + &key_bytes, + ) + } +} + +impl CandyTypedDeque +where + L: CandyTypedKey + Serialize, + V: Serialize + DeserializeOwned, +{ + /// Creates a typed queue view over `store`. + pub fn new(store: Arc) -> Self { + Self { + store, + _phantom: PhantomData, + } + } + + fn make_queue_key(queue_key: &Q) -> InlineBytes + where + L: Borrow, + { + append_type_id(encode_to_smallvec(queue_key), L::TYPE_ID) + } + + /// Pushes `val` to the tail of `queue_key`. + pub fn push_tail( + &self, + queue_key: &Q, + val: &QV, + ) -> Result<()> + where + L: Borrow, + V: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + let vbytes = encode_to_smallvec(val); + self.store + .queue_push_tail_with_ns(TYPED_QUEUE_NS, &qkey, &vbytes) + .map(|_| ()) + } + + /// Pushes `val` to the head of `queue_key`. + pub fn push_head( + &self, + queue_key: &Q, + val: &QV, + ) -> Result<()> + where + L: Borrow, + V: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + let vbytes = encode_to_smallvec(val); + self.store + .queue_push_head_with_ns(TYPED_QUEUE_NS, &qkey, &vbytes) + .map(|_| ()) + } + + /// Removes and returns the head item of `queue_key` together with its logical index. 
+ pub fn pop_head_with_idx( + &self, + queue_key: &Q, + ) -> Result> + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + match self.store.queue_pop_head_with_ns(TYPED_QUEUE_NS, &qkey)? { + Some((idx, value)) => { + decode_from_bytes::(&value).map(|value| Some((idx as usize, value))) + } + None => Ok(None), + } + } + + /// Removes and returns the head value of `queue_key`. + pub fn pop_head(&self, queue_key: &Q) -> Result> + where + L: Borrow, + { + Ok(self.pop_head_with_idx(queue_key)?.map(|(_, value)| value)) + } + + /// Removes and returns the tail item of `queue_key` together with its logical index. + pub fn pop_tail_with_idx( + &self, + queue_key: &Q, + ) -> Result> + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + match self.store.queue_pop_tail_with_ns(TYPED_QUEUE_NS, &qkey)? { + Some((idx, value)) => { + decode_from_bytes::(&value).map(|value| Some((idx as usize, value))) + } + None => Ok(None), + } + } + + /// Removes and returns the tail value of `queue_key`. + pub fn pop_tail(&self, queue_key: &Q) -> Result> + where + L: Borrow, + { + Ok(self.pop_tail_with_idx(queue_key)?.map(|(_, value)| value)) + } + + /// Returns the head item of `queue_key` and its logical index without removing it. + pub fn peek_head_with_idx( + &self, + queue_key: &Q, + ) -> Result> + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + match self.store.queue_peek_head_with_ns(TYPED_QUEUE_NS, &qkey)? { + Some((idx, value)) => { + decode_from_bytes::(&value).map(|value| Some((idx as usize, value))) + } + None => Ok(None), + } + } + + /// Returns the head value of `queue_key` without removing it. + pub fn peek_head(&self, queue_key: &Q) -> Result> + where + L: Borrow, + { + Ok(self.peek_head_with_idx(queue_key)?.map(|(_, value)| value)) + } + + /// Returns the tail item of `queue_key` and its logical index without removing it. 
+ pub fn peek_tail_with_idx( + &self, + queue_key: &Q, + ) -> Result> + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + match self.store.queue_peek_tail_with_ns(TYPED_QUEUE_NS, &qkey)? { + Some((idx, value)) => { + decode_from_bytes::(&value).map(|value| Some((idx as usize, value))) + } + None => Ok(None), + } + } + + /// Returns the tail value of `queue_key` without removing it. + pub fn peek_tail(&self, queue_key: &Q) -> Result> + where + L: Borrow, + { + Ok(self.peek_tail_with_idx(queue_key)?.map(|(_, value)| value)) + } + + /// Returns the number of live items in `queue_key`. + pub fn len(&self, queue_key: &Q) -> Result + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + self.store + .queue_len_with_ns(TYPED_QUEUE_NS, &qkey) + .map(|len| len as usize) + } + + /// Returns the current inclusive-exclusive logical index span for `queue_key`. + pub fn range(&self, queue_key: &Q) -> Result> + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + self.store.queue_range_with_ns(TYPED_QUEUE_NS, &qkey) + } + + /// Returns `true` when `queue_key` has no live items. + pub fn is_empty(&self, queue_key: &Q) -> Result + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + self.store + .queue_len_with_ns(TYPED_QUEUE_NS, &qkey) + .map(|len| len == 0) + } + + /// Removes all items from `queue_key`. + pub fn discard(&self, queue_key: &Q) -> Result + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + self.store.queue_discard_with_ns(TYPED_QUEUE_NS, &qkey) + } + + /// Iterates over live items in `queue_key` from head to tail. 
+ pub fn iter<'a, Q: ?Sized + Serialize>( + &'a self, + queue_key: &Q, + ) -> impl DoubleEndedIterator> + 'a + where + L: Borrow, + { + let qkey = Self::make_queue_key(queue_key); + self.store + .queue_iter_with_ns(TYPED_QUEUE_NS, &qkey) + .map(|res| { + res.and_then(|(idx, value)| { + decode_from_bytes::(&value).map(|value| (idx, value)) + }) + }) + } +} + +impl CandyTypedList +where + L: CandyTypedKey + Serialize, + K: Serialize + DeserializeOwned, + V: Serialize + DeserializeOwned, +{ + /// Creates a typed ordered-map/list view over `store`. + pub fn new(store: Arc) -> Self { + Self { + store, + _phantom: PhantomData, + } + } + + fn make_list_key(list_key: &Q) -> InlineBytes + where + L: Borrow, + { + append_type_id(encode_to_smallvec(list_key), L::TYPE_ID) + } + + fn make_item_key(item_key: &Q) -> InlineBytes + where + K: Borrow, + { + encode_to_smallvec(item_key) + } + + /// Returns `true` if `item_key` exists in `list_key`. + pub fn contains( + &self, + list_key: &Q1, + item_key: &Q2, + ) -> Result + where + L: Borrow, + K: Borrow, + { + self.get(list_key, item_key).map(|value| value.is_some()) + } + + /// Inserts or replaces `item_key` in `list_key`, placing it at the logical tail. + pub fn set( + &self, + list_key: &Q1, + item_key: &Q2, + val: &Q3, + ) -> Result> + where + L: Borrow, + K: Borrow, + V: Borrow, + { + let lkey = Self::make_list_key(list_key); + let ikey = Self::make_item_key(item_key); + let vbytes = encode_to_smallvec(val); + self.store + .list_set_at_tail_with_ns(TYPED_LIST_NS, &lkey, &ikey, &vbytes)? + .map(|prev| decode_from_bytes::(&prev)) + .transpose() + } + + /// Returns the current value for `item_key`, or inserts `default_val` if it is missing. 
+ pub fn get_or_create( + &self, + list_key: &Q1, + item_key: &Q2, + default_val: &Q3, + ) -> Result + where + L: Borrow, + K: Borrow, + { + let lkey = Self::make_list_key(list_key); + let ikey = Self::make_item_key(item_key); + let vbytes = encode_to_smallvec(default_val); + match self + .store + .list_get_or_create_with_ns(TYPED_LIST_NS, &lkey, &ikey, &vbytes)? + { + crate::GetOrCreateStatus::ExistingValue(value) + | crate::GetOrCreateStatus::CreatedNew(value) => decode_from_bytes::(&value), + } + } + + /// Replaces `item_key` only if its current value matches `expected_val` when provided. + pub fn replace< + Q1: ?Sized + Serialize, + Q2: ?Sized + Serialize, + Q3: ?Sized + Serialize, + Q4: ?Sized + Serialize, + >( + &self, + list_key: &Q1, + item_key: &Q2, + val: &Q3, + expected_val: Option<&Q4>, + ) -> Result> + where + L: Borrow, + K: Borrow, + V: Borrow, + { + let lkey = Self::make_list_key(list_key); + let ikey = Self::make_item_key(item_key); + let vbytes = encode_to_smallvec(val); + let expected_bytes = expected_val.map(encode_to_smallvec); + match self.store.list_replace_with_ns( + TYPED_LIST_NS, + &lkey, + &ikey, + &vbytes, + expected_bytes.as_deref(), + )? { + crate::ReplaceStatus::PrevValue(prev) => decode_from_bytes::(&prev).map(Some), + crate::ReplaceStatus::WrongValue(_) | crate::ReplaceStatus::DoesNotExist => Ok(None), + } + } + + /// Returns the decoded value for `item_key`, if present. + pub fn get( + &self, + list_key: &Q1, + item_key: &Q2, + ) -> Result> + where + L: Borrow, + K: Borrow, + { + let lkey = Self::make_list_key(list_key); + let ikey = Self::make_item_key(item_key); + self.store + .list_get_with_ns(TYPED_LIST_NS, &lkey, &ikey)? + .map(|value| decode_from_bytes::(&value)) + .transpose() + } + + /// Removes `item_key` and returns its previous decoded value if it existed. 
+ pub fn remove( + &self, + list_key: &Q1, + item_key: &Q2, + ) -> Result> + where + L: Borrow, + K: Borrow, + { + let lkey = Self::make_list_key(list_key); + let ikey = Self::make_item_key(item_key); + self.store + .list_remove_with_ns(TYPED_LIST_NS, &lkey, &ikey)? + .map(|value| decode_from_bytes::(&value)) + .transpose() + } + + /// Returns the number of live items in `list_key`. + pub fn len(&self, list_key: &Q) -> Result + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + self.store.list_len_with_ns(TYPED_LIST_NS, &lkey) + } + + /// Returns the current inclusive-exclusive logical span for `list_key`. + pub fn range(&self, list_key: &Q) -> Result> + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + self.store.list_range_with_ns(TYPED_LIST_NS, &lkey) + } + + /// Returns `true` when `list_key` has no live items. + pub fn is_empty(&self, list_key: &Q) -> Result + where + L: Borrow, + { + self.len(list_key).map(|len| len == 0) + } + + /// Removes all items from `list_key`. + pub fn discard(&self, list_key: &Q) -> Result + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + self.store.list_discard_with_ns(TYPED_LIST_NS, &lkey) + } + + /// Compacts `list_key` when `params` indicate enough holes exist to justify rewriting it. + pub fn compact_if_needed( + &self, + list_key: &Q, + params: ListCompactionParams, + ) -> Result + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + self.store + .list_compact_with_ns(TYPED_LIST_NS, &lkey, params) + } + + /// Inserts or replaces `item_key`, moving it to the logical tail and returning the previous value when present. 
+ pub fn set_promoting( + &self, + list_key: &Q1, + item_key: &Q2, + value: &Q3, + ) -> Result> + where + L: Borrow, + K: Borrow, + V: Borrow, + { + let lkey = Self::make_list_key(list_key); + let ikey = Self::make_item_key(item_key); + let vbytes = encode_to_smallvec(value); + self.store + .list_promote_with_ns(TYPED_LIST_NS, &lkey, &ikey, &vbytes)? + .map(|prev| decode_from_bytes::(&prev)) + .transpose() + } + + /// Iterates over live items in `list_key` from head to tail. + pub fn iter<'a, Q: ?Sized + Serialize>( + &'a self, + list_key: &Q, + ) -> impl DoubleEndedIterator> + 'a + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + self.store + .list_iter_with_ns(TYPED_LIST_NS, &lkey) + .map(|res| { + res.and_then(|(key, value)| { + Ok(( + decode_from_bytes::(&key)?, + decode_from_bytes::(&value)?, + )) + }) + }) + } + + /// Removes and returns the tail item of `list_key`. + pub fn pop_tail(&self, list_key: &Q) -> Result> + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + match self.store.pop_list_tail_with_ns(TYPED_LIST_NS, &lkey)? { + Some((key, value)) => Ok(Some(( + decode_from_bytes::(&key)?, + decode_from_bytes::(&value)?, + ))), + None => Ok(None), + } + } + + /// Removes and returns the head item of `list_key`. + pub fn pop_head(&self, list_key: &Q) -> Result> + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + match self.store.pop_list_head_with_ns(TYPED_LIST_NS, &lkey)? { + Some((key, value)) => Ok(Some(( + decode_from_bytes::(&key)?, + decode_from_bytes::(&value)?, + ))), + None => Ok(None), + } + } + + /// Returns the tail item of `list_key` without removing it. + pub fn peek_tail(&self, list_key: &Q) -> Result> + where + L: Borrow, + { + let mut iter = self.iter(list_key); + match iter.next_back() { + Some(Ok(pair)) => Ok(Some(pair)), + Some(Err(err)) => Err(err), + None => Ok(None), + } + } + + /// Returns the head item of `list_key` without removing it. 
+ pub fn peek_head(&self, list_key: &Q) -> Result> + where + L: Borrow, + { + match self.iter(list_key).next() { + Some(Ok(pair)) => Ok(Some(pair)), + Some(Err(err)) => Err(err), + None => Ok(None), + } + } + + /// Retains only items for which `func` returns `true`, preserving list order. + pub fn retain( + &self, + list_key: &Q, + mut func: impl FnMut(&K, &V) -> Result, + ) -> Result<()> + where + L: Borrow, + { + let lkey = Self::make_list_key(list_key); + self.store + .list_retain_with_ns(TYPED_LIST_NS, &lkey, |k_bytes, v_bytes| { + let key = decode_from_bytes::(k_bytes)?; + let value = decode_from_bytes::(v_bytes)?; + func(&key, &value) + }) + } +} + +fn decode_from_bytes(bytes: &[u8]) -> Result { + postcard::from_bytes(bytes).map_err(Error::PostcardError) +} + +fn encode_to_smallvec(value: &T) -> InlineBytes { + let mut buf = InlineBytes::new(); + postcard::to_io(value, &mut buf).unwrap(); + buf +} + +fn append_type_id(mut bytes: InlineBytes, type_id: u32) -> InlineBytes { + bytes.extend_from_slice(&type_id.to_le_bytes()); + bytes +} diff --git a/src/typed.rs b/src/typed.rs deleted file mode 100644 index 8afc13d..0000000 --- a/src/typed.rs +++ /dev/null @@ -1,759 +0,0 @@ -use anyhow::anyhow; -use bytemuck::bytes_of; -use std::{borrow::Borrow, marker::PhantomData, ops::Range, sync::Arc}; - -use crate::{ - store::{ReplaceStatus, SetStatus, TYPED_NAMESPACE}, - CandyStore, ListCompactionParams, -}; - -use crate::Result; -use databuf::{config::num::LE, DecodeOwned, Encode}; - -pub trait CandyTypedKey: Encode + DecodeOwned { - /// a random number that remains consistent (unlike [std::any::TypeId]), so that `MyPair(u32, u32)` - /// is different from `YourPair(u32, u32)` - const TYPE_ID: u32; -} - -macro_rules! 
typed_builtin { - ($t:ty, $v:literal) => { - impl CandyTypedKey for $t { - const TYPE_ID: u32 = $v; - } - }; -} - -typed_builtin!(u8, 1); -typed_builtin!(u16, 2); -typed_builtin!(u32, 3); -typed_builtin!(u64, 4); -typed_builtin!(u128, 5); -typed_builtin!(i8, 6); -typed_builtin!(i16, 7); -typed_builtin!(i32, 8); -typed_builtin!(i64, 9); -typed_builtin!(i128, 10); -typed_builtin!(bool, 11); -typed_builtin!(usize, 12); -typed_builtin!(isize, 13); -typed_builtin!(char, 14); -typed_builtin!(String, 15); -typed_builtin!(Vec, 16); -typed_builtin!(uuid::Bytes, 17); - -fn from_bytes(bytes: &[u8]) -> Result { - T::from_bytes::(bytes).map_err(|e| anyhow!(e)) -} - -/// Typed stores are wrappers around an underlying [CandyStore], that serialize keys and values (using [databuf]). -/// These are but thin wrappers, and multiple such wrappers can exist over the same store. -/// -/// The keys and values must support [Encode] and [DecodeOwned], with the addition that keys also provide -/// a `TYPE_ID` const, via the [CandyTypedKey] trait. 
-/// -/// Notes: -/// * All APIs take keys and values by-ref, because they will serialize them, so taking owned values doesn't -/// make sense -/// * [CandyStore::iter] will skip typed items, since it's meaningless to interpret them without the wrapper -pub struct CandyTypedStore { - store: Arc, - _phantom: PhantomData<(K, V)>, -} - -impl Clone for CandyTypedStore { - fn clone(&self) -> Self { - Self { - store: self.store.clone(), - _phantom: Default::default(), - } - } -} - -impl CandyTypedStore -where - K: CandyTypedKey, - V: Encode + DecodeOwned, -{ - /// Constructs a typed wrapper over a CandyStore - pub fn new(store: Arc) -> Self { - Self { - store, - _phantom: Default::default(), - } - } - - fn make_key(key: &Q) -> Vec - where - K: Borrow, - { - let mut kbytes = key.to_bytes::(); - kbytes.extend_from_slice(bytes_of(&K::TYPE_ID)); - kbytes.extend_from_slice(TYPED_NAMESPACE); - kbytes - } - - /// Same as [CandyStore::contains] but serializes the key - pub fn contains(&self, key: &Q) -> Result - where - K: Borrow, - { - Ok(self.store.get_raw(&Self::make_key(key))?.is_some()) - } - - /// Same as [CandyStore::get] but serializes the key and deserializes the value - pub fn get(&self, key: &Q) -> Result> - where - K: Borrow, - { - let kbytes = Self::make_key(key); - if let Some(vbytes) = self.store.get_raw(&kbytes)? { - Ok(Some(from_bytes::(&vbytes)?)) - } else { - Ok(None) - } - } - - /// Same as [CandyStore::replace] but serializes the key and the value - pub fn replace( - &self, - key: &Q1, - val: &Q2, - expected_val: Option<&Q2>, - ) -> Result> - where - K: Borrow, - V: Borrow, - { - let kbytes = Self::make_key(key); - let vbytes = val.to_bytes::(); - let ebytes = expected_val.map(|ev| ev.to_bytes::()).unwrap_or(vec![]); - match self - .store - .replace_raw(&kbytes, &vbytes, expected_val.map(|_| &*ebytes))? 
- { - ReplaceStatus::DoesNotExist => Ok(None), - ReplaceStatus::PrevValue(v) => Ok(Some(from_bytes::(&v)?)), - ReplaceStatus::WrongValue(_) => Ok(None), - } - } - - /// Same as [CandyStore::set] but serializes the key and the value. - pub fn set( - &self, - key: &Q1, - val: &Q2, - ) -> Result> - where - K: Borrow, - V: Borrow, - { - let kbytes = Self::make_key(key); - let vbytes = val.to_bytes::(); - match self.store.set_raw(&kbytes, &vbytes)? { - SetStatus::CreatedNew => Ok(None), - SetStatus::PrevValue(v) => Ok(Some(from_bytes::(&v)?)), - } - } - - /// Same as [CandyStore::get_or_create] but serializes the key and the default value - pub fn get_or_create( - &self, - key: &Q1, - default_val: &Q2, - ) -> Result - where - K: Borrow, - V: Borrow, - { - let kbytes = Self::make_key(key); - Ok(from_bytes::( - &self - .store - .get_or_create_raw(&kbytes, default_val.to_bytes::())? - .value(), - )?) - } - - /// Same as [CandyStore::remove] but serializes the key - pub fn remove(&self, k: &Q) -> Result> - where - K: Borrow, - { - let kbytes = Self::make_key(k); - if let Some(vbytes) = self.store.remove_raw(&kbytes)? { - Ok(Some(from_bytes::(&vbytes)?)) - } else { - Ok(None) - } - } - - /// Same as [CandyStore::get_big] but serializes the key and deserializes the value - pub fn get_big(&self, key: &Q) -> Result> - where - K: Borrow, - { - let kbytes = Self::make_key(key); - if let Some(vbytes) = self.store.get_big(&kbytes)? { - Ok(Some(from_bytes::(&vbytes)?)) - } else { - Ok(None) - } - } - - /// Same as [CandyStore::set_big] but serializes the key and the value. 
- pub fn set_big( - &self, - key: &Q1, - val: &Q2, - ) -> Result - where - K: Borrow, - V: Borrow, - { - let kbytes = Self::make_key(key); - let vbytes = val.to_bytes::(); - self.store.set_big(&kbytes, &vbytes) - } - - /// Same as [CandyStore::remove_big] but serializes the key - pub fn remove_big(&self, k: &Q) -> Result - where - K: Borrow, - { - let kbytes = Self::make_key(k); - self.store.remove_big(&kbytes) - } -} - -/// A wrapper around [CandyStore] that exposes the list API in a typed manner. See [CandyTypedStore] for more -/// info -pub struct CandyTypedList { - store: Arc, - _phantom: PhantomData<(L, K, V)>, -} - -impl Clone for CandyTypedList { - fn clone(&self) -> Self { - Self { - store: self.store.clone(), - _phantom: Default::default(), - } - } -} - -impl CandyTypedList -where - L: CandyTypedKey, - K: Encode + DecodeOwned, - V: Encode + DecodeOwned, -{ - /// Constructs a [CandyTypedList] over an existing [CandyStore] - pub fn new(store: Arc) -> Self { - Self { - store, - _phantom: PhantomData, - } - } - - fn make_list_key(list_key: &Q) -> Vec - where - L: Borrow, - { - let mut kbytes = list_key.to_bytes::(); - kbytes.extend_from_slice(bytes_of(&L::TYPE_ID)); - kbytes - } - - /// Tests if the given typed `item_key` exists in this list (identified by `list_key`) - pub fn contains( - &self, - list_key: &Q1, - item_key: &Q2, - ) -> Result - where - L: Borrow, - K: Borrow, - { - let list_key = Self::make_list_key(list_key); - let item_key = item_key.to_bytes::(); - Ok(self - .store - .owned_get_from_list(list_key, item_key)? - .is_some()) - } - - /// Same as [CandyStore::get_from_list], but `list_key` and `item_key` are typed - pub fn get( - &self, - list_key: &Q1, - item_key: &Q2, - ) -> Result> - where - L: Borrow, - K: Borrow, - { - let list_key = Self::make_list_key(list_key); - let item_key = item_key.to_bytes::(); - if let Some(vbytes) = self.store.owned_get_from_list(list_key, item_key)? 
{ - Ok(Some(from_bytes::(&vbytes)?)) - } else { - Ok(None) - } - } - - fn _set( - &self, - list_key: &Q1, - item_key: &Q2, - val: &Q3, - promote: bool, - ) -> Result> - where - L: Borrow, - K: Borrow, - V: Borrow, - { - let list_key = Self::make_list_key(list_key); - let item_key = item_key.to_bytes::(); - let val = val.to_bytes::(); - match self - .store - .owned_set_in_list(list_key, item_key, val, promote)? - { - SetStatus::CreatedNew => Ok(None), - SetStatus::PrevValue(v) => Ok(Some(from_bytes::(&v)?)), - } - } - - /// Same as [CandyStore::set_in_list], but `list_key`, `item_key` and `val` are typed - pub fn set( - &self, - list_key: &Q1, - item_key: &Q2, - val: &Q3, - ) -> Result> - where - L: Borrow, - K: Borrow, - V: Borrow, - { - self._set(list_key, item_key, val, false) - } - - /// Same as [CandyStore::set_in_list_promoting], but `list_key`, `item_key` and `val` are typed - pub fn set_promoting( - &self, - list_key: &Q1, - item_key: &Q2, - val: &Q3, - ) -> Result> - where - L: Borrow, - K: Borrow, - V: Borrow, - { - self._set(list_key, item_key, val, true) - } - - /// Same as [CandyStore::get_or_create_in_list], but `list_key`, `item_key` and `default_val` are typed - pub fn get_or_create( - &self, - list_key: &Q1, - item_key: &Q2, - default_val: &Q3, - ) -> Result - where - L: Borrow, - K: Borrow, - { - let list_key = Self::make_list_key(list_key); - let item_key = item_key.to_bytes::(); - let default_val = default_val.to_bytes::(); - let vbytes = self - .store - .owned_get_or_create_in_list(list_key, item_key, default_val)? 
- .value(); - from_bytes::(&vbytes) - } - - /// Same as [CandyStore::replace_in_list], but `list_key`, `item_key` and `val` are typed - pub fn replace( - &self, - list_key: &Q1, - item_key: &Q2, - val: &Q3, - expected_val: Option<&Q3>, - ) -> Result> - where - L: Borrow, - K: Borrow, - V: Borrow, - { - let list_key = Self::make_list_key(list_key); - let item_key = item_key.to_bytes::(); - let val = val.to_bytes::(); - let ebytes = expected_val - .map(|ev| ev.to_bytes::()) - .unwrap_or_default(); - match self.store.owned_replace_in_list( - list_key, - item_key, - val, - expected_val.map(|_| &*ebytes), - )? { - ReplaceStatus::DoesNotExist => Ok(None), - ReplaceStatus::PrevValue(v) => Ok(Some(from_bytes::(&v)?)), - ReplaceStatus::WrongValue(_) => Ok(None), - } - } - - /// Same as [CandyStore::remove_from_list], but `list_key` and `item_key` are typed - pub fn remove( - &self, - list_key: &Q1, - item_key: &Q2, - ) -> Result> - where - L: Borrow, - K: Borrow, - { - let list_key = Self::make_list_key(list_key); - let item_key = item_key.to_bytes::(); - if let Some(vbytes) = self.store.owned_remove_from_list(list_key, item_key)? 
{ - Ok(Some(from_bytes::(&vbytes)?)) - } else { - Ok(None) - } - } - - /// Same as [CandyStore::iter_list], but `list_key` is typed - pub fn iter<'a, Q: ?Sized + Encode>( - &'a self, - list_key: &Q, - ) -> impl Iterator> + 'a - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - self.store.owned_iter_list(list_key).map(|res| match res { - Err(e) => Err(e), - Ok((k, v)) => { - let key = from_bytes::(&k)?; - let val = from_bytes::(&v)?; - Ok((key, val)) - } - }) - } - - /// Same as [CandyStore::iter_list_backwards], but `list_key` is typed - pub fn iter_backwards<'a, Q: ?Sized + Encode>( - &'a self, - list_key: &Q, - ) -> impl Iterator> + 'a - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - self.store - .owned_iter_list_backwards(list_key) - .map(|res| match res { - Err(e) => Err(e), - Ok((k, v)) => { - let key = from_bytes::(&k)?; - let val = from_bytes::(&v)?; - Ok((key, val)) - } - }) - } - - /// Same as [CandyStore::discard_list], but `list_key` is typed - pub fn discard(&self, list_key: &Q) -> Result - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - self.store.owned_discard_list(list_key) - } - - /// Same as [CandyStore::compact_list_if_needed], but `list_key` is typed - pub fn compact_if_needed( - &self, - list_key: &Q, - params: ListCompactionParams, - ) -> Result - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - self.store.compact_list_if_needed(&list_key, params) - } - - /// Same as [CandyStore::pop_list_tail], but `list_key` is typed - pub fn pop_tail(&self, list_key: &Q) -> Result> - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - let Some((k, v)) = self.store.owned_pop_list_tail(list_key)? 
else { - return Ok(None); - }; - Ok(Some((from_bytes::(&k)?, from_bytes::(&v)?))) - } - - /// Same as [CandyStore::pop_list_head], but `list_key` is typed - pub fn pop_head(&self, list_key: &Q) -> Result> - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - let Some((k, v)) = self.store.owned_pop_list_head(list_key)? else { - return Ok(None); - }; - Ok(Some((from_bytes::(&k)?, from_bytes::(&v)?))) - } - - /// Same as [CandyStore::peek_list_tail], but `list_key` is typed - pub fn peek_tail(&self, list_key: &Q) -> Result> - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - let Some((k, v)) = self.store.owned_peek_list_tail(list_key)? else { - return Ok(None); - }; - Ok(Some((from_bytes::(&k)?, from_bytes::(&v)?))) - } - - /// Same as [CandyStore::peek_list_head], but `list_key` is typed - pub fn peek_head(&self, list_key: &Q) -> Result> - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - let Some((k, v)) = self.store.owned_peek_list_head(list_key)? else { - return Ok(None); - }; - Ok(Some((from_bytes::(&k)?, from_bytes::(&v)?))) - } - - /// Same as [CandyStore::list_len], but `list_key` is typed - pub fn len(&self, list_key: &Q) -> Result - where - L: Borrow, - { - self.store.owned_list_len(Self::make_list_key(list_key)) - } - - /// Same as [CandyStore::retain_in_list], but `list_key` is typed - pub fn retain( - &self, - list_key: &Q, - mut func: impl FnMut(&K, &V) -> Result, - ) -> Result<()> - where - L: Borrow, - { - let list_key = Self::make_list_key(list_key); - self.store.owned_retain_in_list(list_key, |k, v| { - let tk = from_bytes::(&k)?; - let tv = from_bytes::(&v)?; - func(&tk, &tv) - }) - } -} - -/// A wrapper around [CandyStore] that exposes the queue API in a typed manner. 
See [CandyTypedStore] for more -/// info -pub struct CandyTypedDeque { - store: Arc, - _phantom: PhantomData<(L, V)>, -} - -impl Clone for CandyTypedDeque { - fn clone(&self) -> Self { - Self { - store: self.store.clone(), - _phantom: Default::default(), - } - } -} - -impl CandyTypedDeque -where - L: CandyTypedKey, - V: Encode + DecodeOwned, -{ - pub fn new(store: Arc) -> Self { - Self { - store, - _phantom: Default::default(), - } - } - - /// Pushes a value at the beginning (head) of the queue - pub fn push_head( - &self, - queue_key: &Q1, - val: &Q2, - ) -> Result<()> - where - L: Borrow, - V: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - let val = val.to_bytes::(); - self.store.push_to_queue_head(&queue_key, &val)?; - Ok(()) - } - - /// Pushes a value at the end (tail) of the queue - pub fn push_tail( - &self, - queue_key: &Q1, - val: &Q2, - ) -> Result<()> - where - L: Borrow, - V: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - let val = val.to_bytes::(); - self.store.push_to_queue_tail(&queue_key, &val)?; - Ok(()) - } - - /// Pops a value from the beginning (head) of the queue - pub fn pop_head_with_idx(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - let Some((idx, v)) = self.store.pop_queue_head_with_idx(&queue_key)? else { - return Ok(None); - }; - Ok(Some((idx, from_bytes::(&v)?))) - } - - /// Pops a value from the beginning (head) of the queue - pub fn pop_head(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - Ok(self.pop_head_with_idx(queue_key)?.map(|iv| iv.1)) - } - - /// Pops a value from the end (tail) of the queue - pub fn pop_tail_with_idx(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - let Some((idx, v)) = self.store.pop_queue_tail_with_idx(&queue_key)? 
else { - return Ok(None); - }; - Ok(Some((idx, from_bytes::(&v)?))) - } - - /// Pops a value from the end (tail) of the queue - pub fn pop_tail(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - Ok(self.pop_tail_with_idx(queue_key)?.map(|iv| iv.1)) - } - - /// Peek at the value from the beginning (head) of the queue and its index - pub fn peek_head_with_idx( - &self, - queue_key: &Q, - ) -> Result> - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - let Some((idx, v)) = self.store.peek_queue_head_with_idx(&queue_key)? else { - return Ok(None); - }; - Ok(Some((idx, from_bytes::(&v)?))) - } - - /// Peek at the value from the beginning (head) of the queue - pub fn peek_head(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - Ok(self.peek_head_with_idx(queue_key)?.map(|iv| iv.1)) - } - - /// Peek at the value from the end (tail) of the queue - pub fn peek_tail_with_idx( - &self, - queue_key: &Q, - ) -> Result> - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - let Some((idx, v)) = self.store.peek_queue_tail_with_idx(&queue_key)? 
else { - return Ok(None); - }; - Ok(Some((idx, from_bytes::(&v)?))) - } - - /// Peek at the value from the end (tail) of the queue - pub fn peek_tail(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - Ok(self.peek_tail_with_idx(queue_key)?.map(|iv| iv.1)) - } - - /// See [CandyTypedList::iter] - pub fn iter<'a, Q: ?Sized + Encode>( - &'a self, - queue_key: &Q, - ) -> impl Iterator> + 'a - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - self.store.iter_queue(&queue_key).map(|res| match res { - Err(e) => Err(e), - Ok((idx, v)) => Ok((idx, from_bytes::(&v).unwrap())), - }) - } - - /// See [CandyTypedList::iter_backwards] - pub fn iter_backwards<'a, Q: ?Sized + Encode>( - &'a self, - queue_key: &Q, - ) -> impl Iterator> + 'a - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - self.store - .iter_queue_backwards(&queue_key) - .map(|res| match res { - Err(e) => Err(e), - Ok((idx, v)) => Ok((idx, from_bytes::(&v).unwrap())), - }) - } - - pub fn len(&self, queue_key: &Q) -> Result - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - self.store.queue_len(&queue_key) - } - - pub fn range(&self, queue_key: &Q) -> Result> - where - L: Borrow, - { - let queue_key = CandyTypedList::::make_list_key(queue_key); - self.store.queue_range(&queue_key) - } -} diff --git a/src/types.rs b/src/types.rs new file mode 100644 index 0000000..dcf63a7 --- /dev/null +++ b/src/types.rs @@ -0,0 +1,293 @@ +/// Maximum supported data-file size after internal encoding overhead limits. +pub const MAX_FILE_SIZE: usize = (1 << 30) - (1 << 24); +/// Maximum supported user key length in bytes. +pub const MAX_USER_KEY_SIZE: usize = crate::internal::MAX_INTERNAL_KEY_SIZE - 16; +/// Maximum supported inline value length in bytes. 
+pub const MAX_USER_VALUE_SIZE: usize = crate::internal::MAX_INTERNAL_VALUE_SIZE - 64; + +pub(crate) const ROW_WIDTH: usize = crate::internal::ROW_WIDTH; +pub(crate) const INITIAL_DATA_FILE_ORDINAL: u64 = 0x00bd_38a0_2a35_1cdf; + +use crate::internal::MIN_INITIAL_ROWS; +use std::time::Duration; + +#[derive(Debug, Clone, Copy)] +/// Runtime configuration for opening a store. +pub struct Config { + /// SipHash keys used for row selection and signatures. + /// + /// When a store is created or fully reset, this key is written into the + /// index header. Reopening an existing store reuses the persisted hash key + /// from disk even if a different value is provided here. + pub hash_key: (u64, u64), + /// Whether to try to lock index mmaps into memory. + pub mlock_index: bool, + /// Growth factor used when remapping index structures. + pub remap_scaler: u8, + /// Initial target capacity in number of key/value entries. + pub initial_capacity: usize, + /// Maximum size of a single data file in bytes. + pub max_data_file_size: u32, + /// Minimum per-file waste threshold before background compaction considers it. + pub compaction_min_threshold: u32, + /// Maximum logical concurrency used to size internal lock tables, defaults to num_cpus*2 + pub max_concurrency: usize, + /// Recreate index files from recognized data files when only the index format is outdated. + pub port_to_current_format: bool, + /// Reset the database if opening encounters invalid on-disk data. + pub reset_on_invalid_data: bool, + /// Target background compaction throughput in bytes per second. 
+ pub compaction_throughput_bytes_per_sec: usize, + /// perform a checkpoint (for crash-consistency) every this much time (`None` to disable) + pub checkpoint_interval: Option, + /// perform a checkpoint (for crash-consistency) every this many bytes written (`None` to disable) + pub checkpoint_delta_bytes: Option, +} + +impl Default for Config { + fn default() -> Self { + Self { + hash_key: (0x7c2b_23a8_12c2_005f, 0x1f6a_4035_386e_c891), + mlock_index: false, + remap_scaler: 1, + initial_capacity: MIN_INITIAL_ROWS * ROW_WIDTH, + max_data_file_size: 64 * 1024 * 1024, + compaction_min_threshold: 24 * 1024 * 1024, + max_concurrency: (2 * num_cpus::get()).clamp(16, 64), + port_to_current_format: true, + reset_on_invalid_data: false, + compaction_throughput_bytes_per_sec: 4 * 1024 * 1024, + checkpoint_interval: Some(Duration::from_secs(5)), + checkpoint_delta_bytes: Some(128 * 1024), + } + } +} + +#[derive(thiserror::Error, Debug)] +/// Errors returned by store operations and open/recovery flows. +pub enum Error { + #[error("IO error: {0}")] + IOError(std::io::Error), + + #[error("Missing data file: {0}")] + MissingDataFile(u16), + + #[error("Data file {0} reached size limit")] + RotateDataFile(u16), + + #[error("Row needs splitting at split level {0}")] + SplitRow(u64), + + #[error("Too many data files")] + TooManyDataFiles, + + #[error("Lockfile {0} is taken by {1}")] + LockfileTaken(std::path::PathBuf, String), + + #[error("Payload {0} too large")] + PayloadTooLarge(usize), + + #[error("Checkpoint shutdown: {0}")] + CheckpointShutdown(String), + + #[error("Postcard error: {0}")] + PostcardError(postcard::Error), +} + +/// Convenience result type used by the crate. +pub type Result = std::result::Result; + +#[derive(Debug, Clone, PartialEq, Eq)] +/// Outcome of a conditional replace operation. +pub enum ReplaceStatus { + /// The key existed and the previous value was replaced. 
+ PrevValue(Vec), + /// The key existed, but its current value did not match the expected value. + WrongValue(Vec), + /// The key did not exist. + DoesNotExist, +} + +impl ReplaceStatus { + /// Returns `true` when the value was replaced. + pub fn was_replaced(&self) -> bool { + matches!(self, Self::PrevValue(_)) + } + + /// Returns `true` when the replace operation did not update the value. + pub fn failed(&self) -> bool { + !self.was_replaced() + } + + /// Returns `true` when the target key was missing. + pub fn is_key_missing(&self) -> bool { + matches!(self, Self::DoesNotExist) + } + + /// Returns `true` when the expected value check failed. + pub fn is_wrong_value(&self) -> bool { + matches!(self, Self::WrongValue(_)) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +/// Outcome of a set operation. +pub enum SetStatus { + /// The key existed and the previous value was returned. + PrevValue(Vec), + /// The key was newly inserted. + CreatedNew, +} + +impl SetStatus { + /// Returns `true` when the key did not previously exist. + pub fn was_created(&self) -> bool { + matches!(self, Self::CreatedNew) + } + + /// Returns `true` when the key previously existed and was overwritten. + pub fn was_replaced(&self) -> bool { + matches!(self, Self::PrevValue(_)) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +/// Outcome of a get-or-create operation. +pub enum GetOrCreateStatus { + /// The key already existed and its current value was returned. + ExistingValue(Vec), + /// The key was created with the provided default value. + CreatedNew(Vec), +} + +impl GetOrCreateStatus { + /// Returns `true` when the key was inserted by the operation. + pub fn was_created(&self) -> bool { + matches!(self, Self::CreatedNew(_)) + } + + /// Returns `true` when the key already existed. + pub fn already_exists(&self) -> bool { + matches!(self, Self::ExistingValue(_)) + } + + /// Returns the resulting value regardless of whether it was created or already existed. 
+ pub fn value(self) -> Vec { + match self { + Self::ExistingValue(value) | Self::CreatedNew(value) => value, + } + } +} + +#[derive(Debug, Clone, Copy)] +/// Heuristics controlling list compaction. +pub struct ListCompactionParams { + /// Minimum list span length before compaction is considered. + pub min_length: u64, + /// Minimum hole ratio required to trigger compaction. + pub min_holes_ratio: f64, +} + +impl Default for ListCompactionParams { + fn default() -> Self { + Self { + min_length: 100, + min_holes_ratio: 0.25, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Snapshot of store-level counters and size statistics. +pub struct Stats { + /// Number of allocated index rows. + pub num_rows: u64, + /// Number of currently live entries. + pub num_items: u64, + /// Number of data files currently present. + pub num_data_files: u64, + + /// Total bytes occupied by index metadata files. + pub index_size_bytes: u64, + + /// Time spent in the most recent grow remap operation. + pub last_remap_dur: Duration, + /// Persisted checkpoint slot generation visible to recovery. + pub checkpoint_generation: u64, + /// Most recent completed runtime checkpoint epoch handled by the checkpoint worker. + pub checkpoint_epoch: u64, + /// Approximate bytes written since the last completed checkpoint. + /// + /// This is a best-effort runtime metric intended for monitoring rather + /// than an exact durable boundary. + pub uncheckpointed_bytes: u64, + /// Time spent in the most recent successful checkpoint operation. + pub last_checkpoint_dur: Duration, + /// Number of completed background compactions. + pub num_compactions: u64, + /// Number of background checkpoint errors since open. + pub checkpoint_errors: u64, + /// Time spent in the most recent successful file compaction. + pub last_compaction_dur: Duration, + /// Bytes reclaimed by the most recent successful file compaction. 
+ pub last_compaction_reclaimed_bytes: u32, + /// Bytes rewritten by the most recent successful file compaction. + pub last_compaction_moved_bytes: u32, + + /// Number of entry creations recorded since open. + pub num_inserted: u64, + /// Number of entry removals recorded since open. + pub num_removed: u64, + /// Number of entry replacements recorded since open. + pub num_updated: u64, + /// Number of successful key lookups. + pub num_positive_lookups: u64, + /// Number of failed key lookups. + pub num_negative_lookups: u64, + /// Number of probes that had to inspect a second matching index entry. + pub num_collisions: u64, + + /// Number of read operations performed against data files. + pub num_read_ops: u64, + /// Total bytes read from data files. + pub num_read_bytes: u64, + /// Number of write operations performed against data files. + pub num_write_ops: u64, + /// Total bytes written to data files. + pub num_write_bytes: u64, + + /// Number of entries replayed during the most recent recovery rebuild. + pub num_rebuilt_entries: u64, + /// Number of trailing data-file bytes discarded during the most recent recovery rebuild. + pub num_rebuild_purged_bytes: u64, + + /// Total bytes currently occupied by the data files (including waste) + pub total_bytes: u64, + /// Total bytes currently accounted as unreclaimed waste. + pub waste_bytes: u64, + + /// Approximate histogram bucket for entries under 64 bytes since open. + pub entries_under_64: u64, + /// Approximate histogram bucket for entries under 256 bytes since open. + pub entries_under_256: u64, + /// Approximate histogram bucket for entries under 1024 bytes since open. + pub entries_under_1024: u64, + /// Approximate histogram bucket for entries under 4096 bytes since open. + pub entries_under_4096: u64, + /// Approximate histogram bucket for entries under 16384 bytes since open. + pub entries_under_16384: u64, + /// Approximate histogram bucket for entries of 16384 bytes or larger since open. 
+ pub entries_over_16384: u64, +} + +impl Stats { + /// Theoretical maximum number of entries at the current row count. + pub fn index_capacity(&self) -> u64 { + self.num_rows.saturating_mul(ROW_WIDTH as u64) + } + + /// Bytes used for live data entries. + pub fn data_bytes(&self) -> u64 { + self.total_bytes.saturating_sub(self.waste_bytes) + } +} diff --git a/tests/basic_ops.rs b/tests/basic_ops.rs new file mode 100644 index 0000000..e4c9cba --- /dev/null +++ b/tests/basic_ops.rs @@ -0,0 +1,212 @@ +mod common; + +use candystore::{ + CandyStore, Config, Error, GetOrCreateStatus, MAX_KEY_LEN, MAX_VALUE_LEN, ReplaceStatus, + SetStatus, +}; +use std::sync::{Arc, Barrier}; +use std::thread; +use tempfile::tempdir; + +#[test] +fn test_basic() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + assert!(db.get("hello")?.is_none()); + + assert!(matches!(db.set("hello", "world")?, SetStatus::CreatedNew)); + assert_eq!(db.get("hello")?, Some("world".into())); + + assert!( + matches!(db.set("hello", "earth")?, SetStatus::PrevValue(ref value) if value == b"world") + ); + assert_eq!(db.get("hello")?, Some("earth".into())); + + assert_eq!(db.remove("hello")?, Some("earth".into())); + assert!(db.get("hello")?.is_none()); + assert!(db.remove("hello")?.is_none()); + + Ok(()) +} + +#[test] +fn test_reopen_existing_db() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + { + let db = CandyStore::open(dir.path(), Config::default())?; + assert!(matches!(db.set("hello", "world")?, SetStatus::CreatedNew)); + assert!(matches!(db.set("goodbye", "earth")?, SetStatus::CreatedNew)); + } + + let db = CandyStore::open(dir.path(), Config::default())?; + assert_eq!(db.get("hello")?, Some("world".into())); + assert_eq!(db.get("goodbye")?, Some("earth".into())); + + Ok(()) +} + +#[test] +fn test_reopen_with_different_hash_key_uses_persisted_key() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + let original_config =
Config { + hash_key: (1, 2), + ..Config::default() + }; + let different_config = Config { + hash_key: (3, 4), + ..original_config + }; + + { + let db = CandyStore::open(dir.path(), original_config)?; + db.set("hello", "world")?; + } + + let db = CandyStore::open(dir.path(), different_config)?; + assert_eq!(db.get("hello")?, Some("world".into())); + db.set("goodbye", "earth")?; + drop(db); + + let db = CandyStore::open(dir.path(), original_config)?; + assert_eq!(db.get("hello")?, Some("world".into())); + assert_eq!(db.get("goodbye")?, Some("earth".into())); + + Ok(()) +} +#[test] +fn test_oversized_value_rejected() { + let dir = tempfile::tempdir().unwrap(); + let config = candystore::Config { + max_data_file_size: 1024 * 1024, + ..Default::default() + }; + let db = candystore::CandyStore::open(dir.path(), config).unwrap(); + + let large_value = vec![0u8; 2 * 1024 * 1024]; // 2MB + + // Should gracefully reject the oversized key/value pair + let result = db.set("key", &large_value); + assert!(result.is_err()); +} + +#[test] +fn test_max_key_len() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let key = vec![b'k'; MAX_KEY_LEN]; + db.set(&key, b"value")?; + assert_eq!(db.get(&key)?, Some(b"value".to_vec())); + + Ok(()) +} + +#[test] +fn test_key_too_long() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let key = vec![b'k'; MAX_KEY_LEN + 1]; + assert!(db.set(&key, b"value").is_err()); + + Ok(()) +} + +#[test] +fn test_max_value_len() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let value = vec![b'v'; MAX_VALUE_LEN]; + db.set(b"key", &value)?; + assert_eq!(db.get(b"key")?, Some(value)); + + Ok(()) +} + +#[test] +fn test_value_too_long() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + 
+ let value = vec![b'v'; MAX_VALUE_LEN + 1]; + assert!(db.set(b"key", &value).is_err()); + + Ok(()) +} + +#[test] +fn test_empty_key_and_value() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.set(b"", b"value")?; + assert_eq!(db.get(b"")?, Some(b"value".to_vec())); + + db.set(b"empty", b"")?; + assert_eq!(db.get(b"empty")?, Some(Vec::new())); + + Ok(()) +} + +#[test] +fn test_get_or_create_and_replace() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let created = db.get_or_create("hello", "world")?; + assert!(matches!(created, GetOrCreateStatus::CreatedNew(ref value) if value == b"world")); + + let existing = db.get_or_create("hello", "other")?; + assert!(matches!(existing, GetOrCreateStatus::ExistingValue(ref value) if value == b"world")); + + let wrong = db.replace("hello", "earth", Some(&"wrong"))?; + assert!(matches!(wrong, ReplaceStatus::WrongValue(ref value) if value == b"world")); + assert_eq!(db.get("hello")?, Some(b"world".to_vec())); + + let replaced = db.replace("hello", "earth", Some(&"world"))?; + assert!(matches!(replaced, ReplaceStatus::PrevValue(ref value) if value == b"world")); + assert_eq!(db.get("hello")?, Some(b"earth".to_vec())); + + let missing = db.replace("missing", "value", Option::<&str>::None)?; + assert!(matches!(missing, ReplaceStatus::DoesNotExist)); + + Ok(()) +} + +#[test] +fn test_get_or_create_is_atomic_under_contention() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default())?); + let barrier = Arc::new(Barrier::new(8)); + + let mut handles = Vec::new(); + for idx in 0..8 { + let db = Arc::clone(&db); + let barrier = Arc::clone(&barrier); + handles.push(thread::spawn(move || { + let value = format!("value-{idx}"); + barrier.wait(); + db.get_or_create("shared", &value).unwrap() + })); + } + + let mut created_values = 
Vec::new(); + let mut seen_values = Vec::new(); + for handle in handles { + match handle.join().unwrap() { + GetOrCreateStatus::CreatedNew(value) => created_values.push(value), + GetOrCreateStatus::ExistingValue(value) => seen_values.push(value), + } + } + + assert_eq!(created_values.len(), 1); + let winning_value = created_values.pop().unwrap(); + assert_eq!(db.get("shared")?, Some(winning_value.clone())); + assert!(seen_values.into_iter().all(|value| value == winning_value)); + + Ok(()) +} diff --git a/tests/big_items.rs b/tests/big_items.rs new file mode 100644 index 0000000..f6e3980 --- /dev/null +++ b/tests/big_items.rs @@ -0,0 +1,71 @@ +use candystore::{CandyStore, Config, Error, MAX_USER_VALUE_SIZE}; +use tempfile::tempdir; + +fn patterned_bytes(len: usize) -> Vec { + (0..len).map(|idx| (idx % 251) as u8).collect() +} + +#[test] +fn test_set_get_remove_big() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let value = patterned_bytes(MAX_USER_VALUE_SIZE + 4096); + + assert!(!db.set_big("blob", &value)?); + assert_eq!(db.get_big("blob")?, Some(value.clone())); + assert!(db.remove_big("blob")?); + assert_eq!(db.get_big("blob")?, None); + assert!(!db.remove_big("blob")?); + + Ok(()) +} + +#[test] +fn test_set_big_reports_replacement() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let first = patterned_bytes(MAX_USER_VALUE_SIZE + 17); + let second = patterned_bytes(MAX_USER_VALUE_SIZE * 2 + 33); + + assert!(!db.set_big("blob", &first)?); + assert!(db.set_big("blob", &second)?); + assert_eq!(db.get_big("blob")?, Some(second)); + + Ok(()) +} + +#[test] +fn test_big_persists_across_reopen() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let value = patterned_bytes(MAX_USER_VALUE_SIZE * 2 + 123); + + { + let db = CandyStore::open(dir.path(), Config::default())?; + db.set_big("blob", &value)?; + } + + let reopened 
= CandyStore::open(dir.path(), Config::default())?; + assert_eq!(reopened.get_big("blob")?, Some(value)); + + Ok(()) +} + +#[test] +fn test_big_can_exceed_single_value_limit() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open( + dir.path(), + Config { + max_data_file_size: 16 * 1024, + ..Config::default() + }, + )?; + + let value = patterned_bytes(MAX_USER_VALUE_SIZE * 3 + 777); + db.set_big("rotating_blob", &value)?; + assert_eq!(db.get_big("rotating_blob")?, Some(value)); + + Ok(()) +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 13a58a6..74ce6d3 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,16 +1,49 @@ -use candystore::Result; -use rand::random; +use std::hash::Hasher; +use std::io::{Read, Seek, SeekFrom}; -pub fn run_in_tempdir(f: impl FnOnce(&str) -> Result<()>) -> Result<()> { - let rand: u64 = random(); - let dir = format!("/tmp/candy-{rand}"); - _ = std::fs::remove_dir_all(&dir); +use candystore::Config; - f(&dir)?; +#[allow(dead_code)] +pub fn small_file_config() -> Config { + Config { + max_data_file_size: 16 * 1024, + ..Config::default() + } +} - _ = std::fs::remove_dir_all(&dir); - Ok(()) +#[allow(dead_code)] +pub fn checkpoint_slot_checksum(generation: u64, ordinal: u64, offset: u64) -> u64 { + let mut hasher = siphasher::sip::SipHasher13::new(); + hasher.write_u64(generation); + hasher.write_u64(ordinal); + hasher.write_u64(offset); + hasher.finish() } #[allow(dead_code)] -pub const LONG_VAL: &str = "a very long valueeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee"; +pub fn logical_data_len(path: &std::path::Path) -> u64 { + const HEADER_LEN: u64 = 4096; + const ALIGNMENT: u64 = 16; + const CHUNK_LEN: usize = 64 * 1024; + + let mut file = std::fs::File::open(path).unwrap(); + let total_len = file.metadata().unwrap().len().saturating_sub(HEADER_LEN); + if total_len == 0 { + return 0; + } + + let mut end = total_len; + let mut buf = 
vec![0u8; CHUNK_LEN]; + while end > 0 { + let start = end.saturating_sub(CHUNK_LEN as u64); + let chunk_len = (end - start) as usize; + file.seek(SeekFrom::Start(HEADER_LEN + start)).unwrap(); + file.read_exact(&mut buf[..chunk_len]).unwrap(); + if let Some(rel) = buf[..chunk_len].iter().rposition(|byte| *byte != 0) { + return (start + rel as u64 + 1).next_multiple_of(ALIGNMENT); + } + end = start; + } + + 0 +} diff --git a/tests/compaction.rs b/tests/compaction.rs new file mode 100644 index 0000000..65cc792 --- /dev/null +++ b/tests/compaction.rs @@ -0,0 +1,438 @@ +mod common; + +use std::collections::BTreeSet; +use std::sync::{Arc, Barrier}; +use std::thread; + +use candystore::{CandyStore, Config, Error}; +use tempfile::tempdir; + +#[test] +fn test_background_compaction() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + let config = Config { + max_data_file_size: 1024, + compaction_min_threshold: 256, + ..Config::default() + }; + + let db = CandyStore::open(dir.path(), config)?; + + let data_files = || -> BTreeSet { + std::fs::read_dir(dir.path()) + .unwrap() + .filter_map(|entry| { + entry + .ok() + .and_then(|entry| entry.file_name().into_string().ok()) + .filter(|name| name.starts_with("data_")) + }) + .collect() + }; + + for i in 0..100 { + db.set(format!("key{i:04}"), format!("value{i:04}"))?; + } + + let initial_files = data_files(); + assert!(initial_files.len() > 1, "should have multiple data files"); + + for i in 0..100 { + db.set(format!("key{i:04}"), format!("updated{i:04}"))?; + } + + let mut files_after = data_files(); + for _ in 0..100 { + std::thread::sleep(std::time::Duration::from_millis(10)); + files_after = data_files(); + if initial_files.iter().any(|file| !files_after.contains(file)) { + break; + } + } + + assert!( + initial_files.iter().any(|file| !files_after.contains(file)), + "compaction should have removed at least one initial data file: initial={initial_files:?}, current={files_after:?}" + ); + + for i in 0..100 { + let key = 
format!("key{i:04}"); + let expected = format!("updated{i:04}"); + assert_eq!( + db.get(&key)?, + Some(expected.into_bytes()), + "key {key} should have updated value after compaction" + ); + } + + Ok(()) +} + +#[test] +fn test_background_compaction_after_reopen_without_writes() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + let config = Config { + max_data_file_size: 1024, + compaction_min_threshold: 256, + ..Config::default() + }; + let write_config = Config { + compaction_throughput_bytes_per_sec: 0, + ..config + }; + + let files_before; + { + let db = CandyStore::open(dir.path(), write_config)?; + + for i in 0..200 { + db.set(format!("key{i:04}"), vec![b'a'; 64])?; + } + + for i in 0..200 { + assert_eq!(db.remove(format!("key{i:04}"))?, Some(vec![b'a'; 64])); + } + + files_before = std::fs::read_dir(dir.path()) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_str() + .is_some_and(|s| s.starts_with("data_")) + }) + .count(); + assert!( + files_before > 1, + "expected multiple data files before close" + ); + } + + let count_data_files = || -> usize { + std::fs::read_dir(dir.path()) + .unwrap() + .filter(|e| { + e.as_ref() + .ok() + .and_then(|e| e.file_name().to_str().map(|s| s.starts_with("data_"))) + .unwrap_or(false) + }) + .count() + }; + + let db = CandyStore::open(dir.path(), config)?; + + for _ in 0..100 { + std::thread::sleep(std::time::Duration::from_millis(10)); + if count_data_files() < files_before { + break; + } + } + + let files_after = count_data_files(); + assert!( + files_after < files_before, + "reopened store should compact without new writes: before={files_before}, after={files_after}" + ); + + for i in 0..200 { + assert_eq!( + db.get(format!("key{i:04}"))?, + None, + "key{i:04} should remain deleted" + ); + } + + Ok(()) +} + +#[test] +fn test_background_compaction_drains_large_backlog() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + let write_config = Config { + // Keep one value per file 
while leaving each stale file below the setup-time + // compaction threshold so backlog creation does not race the background worker. + max_data_file_size: 200, + compaction_min_threshold: 160, + ..Config::default() + }; + + let compact_config = Config { + max_data_file_size: write_config.max_data_file_size, + compaction_min_threshold: 64, + ..Config::default() + }; + + const NUM_KEYS: usize = 64; + + { + let db = CandyStore::open(dir.path(), write_config)?; + + for i in 0..NUM_KEYS { + db.set(format!("key{i:04}"), vec![b'a'; 96])?; + } + + for i in 0..NUM_KEYS { + assert_eq!(db.remove(format!("key{i:04}"))?, Some(vec![b'a'; 96])); + } + } + + let count_data_files = || -> usize { + std::fs::read_dir(dir.path()) + .unwrap() + .filter(|e| { + e.as_ref() + .ok() + .and_then(|e| e.file_name().to_str().map(|s| s.starts_with("data_"))) + .unwrap_or(false) + }) + .count() + }; + + let setup_files = count_data_files(); + assert!( + setup_files >= NUM_KEYS, + "expected backlog setup to create many stale files: {setup_files}" + ); + + let db = CandyStore::open(dir.path(), compact_config)?; + + let files_before = count_data_files(); + assert!( + files_before >= setup_files.saturating_sub(2), + "expected reopen to begin with nearly the full stale-file backlog: setup={setup_files}, before={files_before}" + ); + + let min_expected_drained = (setup_files / 2).max(8); + + for _ in 0..300 { + std::thread::sleep(std::time::Duration::from_millis(10)); + if count_data_files() + min_expected_drained <= files_before { + break; + } + } + + let files_after = count_data_files(); + assert!( + files_after + min_expected_drained <= files_before, + "compaction worker should drain a large backlog after being woken: before={files_before}, after={files_after}, expected_drain={min_expected_drained}" + ); + + for i in 0..NUM_KEYS { + assert_eq!( + db.get(format!("key{i:04}"))?, + None, + "key{i:04} should remain deleted" + ); + } + + Ok(()) +} + +#[test] +fn 
test_compaction_updates_reclaimed_bytes() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + let config = Config { + max_data_file_size: 1024, + compaction_min_threshold: 256, + ..Config::default() + }; + + let db = CandyStore::open(dir.path(), config)?; + + for i in 0..100 { + db.set(format!("key{i:04}"), format!("value{i:04}"))?; + } + + // Update all keys to generate waste + for i in 0..100 { + db.set(format!("key{i:04}"), format!("updated{i:04}"))?; + } + + // Wait for compaction to run + for _ in 0..200 { + std::thread::sleep(std::time::Duration::from_millis(10)); + if db.stats().num_compactions > 0 { + break; + } + } + + let stats = db.stats(); + assert!( + stats.num_compactions > 0, + "compaction should have run at least once" + ); + assert!( + stats.last_compaction_reclaimed_bytes > 0, + "last_compaction_reclaimed_bytes should be positive after compaction" + ); + assert!( + stats.waste_bytes < 200_000, + "waste_bytes should reflect current unreclaimed waste, not a lifetime total" + ); + + for i in 0..100 { + let key = format!("key{i:04}"); + let expected = format!("updated{i:04}"); + assert_eq!(db.get(&key)?, Some(expected.into_bytes())); + } + + Ok(()) +} + +#[test] +fn test_concurrent_updates_with_compaction() -> Result<(), Error> { + const THREADS: usize = 8; + const KEYS: usize = 200; + const ROUNDS: usize = 20; + + let dir = tempdir().unwrap(); + + let config = Config { + max_data_file_size: 2048, + compaction_min_threshold: 512, + ..Config::default() + }; + + let db = Arc::new(CandyStore::open(dir.path(), config)?); + + // Seed initial keys + for i in 0..KEYS { + db.set(format!("key{i:04}"), format!("v0_{i:04}"))?; + } + + let barrier = Arc::new(Barrier::new(THREADS)); + let handles: Vec<_> = (0..THREADS) + .map(|t| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + barrier.wait(); + for round in 0..ROUNDS { + for i in 0..KEYS { + let key = format!("key{i:04}"); + let val = format!("v{round}_{t}_{i:04}"); + 
db.set(&key, &val).unwrap(); + } + } + }) + }) + .collect(); + + for h in handles { + h.join().unwrap(); + } + + // Give compaction time to finish remaining work + for _ in 0..100 { + std::thread::sleep(std::time::Duration::from_millis(10)); + } + + // All keys should still be readable + for i in 0..KEYS { + let key = format!("key{i:04}"); + assert!(db.get(&key)?.is_some(), "key {key} should exist"); + } + + let stats = db.stats(); + assert!( + stats.num_compactions > 0, + "compaction should have run during concurrent updates" + ); + assert!( + stats.last_compaction_reclaimed_bytes > 0, + "last_compaction_reclaimed_bytes should be positive after concurrent updates + compaction" + ); + + Ok(()) +} + +#[test] +fn test_concurrent_removes_trigger_compaction() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + let config = Config { + max_data_file_size: 1024, + compaction_min_threshold: 256, + ..Config::default() + }; + + let db = Arc::new(CandyStore::open(dir.path(), config)?); + + // Create keys spread across many files + for i in 0..300 { + db.set(format!("key{i:04}"), vec![b'x'; 64])?; + } + + let files_before = std::fs::read_dir(dir.path()) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_str() + .is_some_and(|s| s.starts_with("data_")) + }) + .count(); + assert!(files_before > 2); + + // Remove all keys concurrently — tombstone waste should trigger compaction + const THREADS: usize = 8; + let barrier = Arc::new(Barrier::new(THREADS)); + let handles: Vec<_> = (0..THREADS) + .map(|t| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + barrier.wait(); + for i in (t..300).step_by(THREADS) { + let _ = db.remove(format!("key{i:04}")).unwrap(); + } + }) + }) + .collect(); + + for h in handles { + h.join().unwrap(); + } + + // Wait for compaction + for _ in 0..200 { + std::thread::sleep(std::time::Duration::from_millis(10)); + let files_now = std::fs::read_dir(dir.path()) + .unwrap() + .filter_map(|e| 
e.ok()) + .filter(|e| { + e.file_name() + .to_str() + .is_some_and(|s| s.starts_with("data_")) + }) + .count(); + if files_now < files_before { + break; + } + } + + let files_after = std::fs::read_dir(dir.path()) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.file_name() + .to_str() + .is_some_and(|s| s.starts_with("data_")) + }) + .count(); + assert!( + files_after < files_before, + "compaction should remove files after concurrent removes: before={files_before}, after={files_after}" + ); + + for i in 0..300 { + assert_eq!(db.get(format!("key{i:04}"))?, None); + } + + Ok(()) +} diff --git a/tests/concurrency.rs b/tests/concurrency.rs new file mode 100644 index 0000000..bca6a6e --- /dev/null +++ b/tests/concurrency.rs @@ -0,0 +1,332 @@ +mod common; + +use std::sync::{Arc, Barrier}; +use std::thread; + +use candystore::{CandyStore, Config, Error, SetStatus}; +use tempfile::tempdir; + +#[test] +fn test_multi_threaded_disjoint_writes() -> Result<(), Error> { + const THREADS: usize = 30; + const KEYS_PER_THREAD: usize = 10_000; + + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default())?); + let barrier = Arc::new(Barrier::new(THREADS)); + + let handles: Vec<_> = (0..THREADS) + .map(|thread_idx| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + barrier.wait(); + for key_idx in 0..KEYS_PER_THREAD { + let key = format!("mt_key_{thread_idx:02}_{key_idx:04}"); + let value = format!("mt_val_{thread_idx:02}_{key_idx:04}"); + assert!(matches!( + db.set(&key, &value).unwrap(), + SetStatus::CreatedNew + )); + assert_eq!(db.get(&key).unwrap(), Some(value.into())); + } + }) + }) + .collect(); + + for handle in handles { + handle.join().unwrap(); + } + + for thread_idx in 0..THREADS { + for key_idx in 0..KEYS_PER_THREAD { + let key = format!("mt_key_{thread_idx:02}_{key_idx:04}"); + let value = format!("mt_val_{thread_idx:02}_{key_idx:04}"); + assert_eq!(db.get(&key)?, Some(value.into())); + } 
+ } + + Ok(()) +} + +#[test] +fn test_multi_threaded_reads() -> Result<(), Error> { + const THREADS: usize = 30; + const KEYS: usize = 10_000; + + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default())?); + + for key_idx in 0..KEYS { + let key = format!("read_key_{key_idx:04}"); + let value = format!("read_val_{key_idx:04}"); + assert!(matches!(db.set(&key, &value)?, SetStatus::CreatedNew)); + } + + let barrier = Arc::new(Barrier::new(THREADS)); + let handles: Vec<_> = (0..THREADS) + .map(|_| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + barrier.wait(); + for key_idx in 0..KEYS { + let key = format!("read_key_{key_idx:04}"); + let value = format!("read_val_{key_idx:04}"); + assert_eq!(db.get(&key).unwrap(), Some(value.into())); + } + }) + }) + .collect(); + + for handle in handles { + handle.join().unwrap(); + } + + Ok(()) +} + +#[test] +fn test_multi_threaded_same_key_writes() -> Result<(), Error> { + const THREADS: usize = 30; + + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default())?); + let barrier = Arc::new(Barrier::new(THREADS)); + + let handles: Vec<_> = (0..THREADS) + .map(|thread_idx| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + let value = format!("same_value_{thread_idx:02}"); + barrier.wait(); + db.set("shared-key", &value).unwrap(); + }) + }) + .collect(); + + for handle in handles { + handle.join().unwrap(); + } + + let final_value = db.get("shared-key")?.expect("value should exist"); + assert!( + std::str::from_utf8(&final_value) + .unwrap() + .starts_with("same_value_") + ); + + Ok(()) +} + +#[test] +fn test_multi_threaded_same_key_read_write() -> Result<(), Error> { + const THREADS: usize = 30; + const WRITES_PER_THREAD: usize = 10_000; + + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default())?); + assert!(matches!( + db.set("shared-key", 
"seed")?, + SetStatus::CreatedNew + )); + + let barrier = Arc::new(Barrier::new(THREADS)); + let handles: Vec<_> = (0..THREADS) + .map(|thread_idx| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + barrier.wait(); + for write_idx in 0..WRITES_PER_THREAD { + if thread_idx % 2 == 0 { + let value = format!("rw_{thread_idx:02}_{write_idx:02}"); + db.set("shared-key", &value).unwrap(); + } else { + let value = db.get("shared-key").unwrap(); + assert!(value.is_some()); + } + } + }) + }) + .collect(); + + for handle in handles { + handle.join().unwrap(); + } + + assert!(db.get("shared-key")?.is_some()); + + Ok(()) +} + +#[test] +fn test_multi_threaded_writes_with_splits_and_rotation() -> Result<(), Error> { + const THREADS: usize = 30; + const KEYS_PER_THREAD: usize = 2_000; // to avoid too many open files in small config + + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), common::small_file_config())?); + let barrier = Arc::new(Barrier::new(THREADS)); + + let handles: Vec<_> = (0..THREADS) + .map(|thread_idx| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + barrier.wait(); + for key_idx in 0..KEYS_PER_THREAD { + let key = format!("mt_split_rotate_key_{thread_idx:02}_{key_idx:04}"); + let value = format!( + "mt_split_rotate_val_{thread_idx:02}_{key_idx:04}_{}", + "x".repeat(48) + ); + assert!(matches!( + db.set(&key, &value).unwrap(), + SetStatus::CreatedNew + )); + } + }) + }) + .collect(); + + for handle in handles { + handle.join().unwrap(); + } + + for thread_idx in 0..THREADS { + for key_idx in 0..KEYS_PER_THREAD { + let key = format!("mt_split_rotate_key_{thread_idx:02}_{key_idx:04}"); + let value = format!( + "mt_split_rotate_val_{thread_idx:02}_{key_idx:04}_{}", + "x".repeat(48) + ); + assert_eq!(db.get(&key)?, Some(value.into())); + } + } + + let data_file_count = std::fs::read_dir(dir.path()) + .map_err(Error::IOError)? 
+ .filter_map(|entry| entry.ok()) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with("data_")) + }) + .count(); + assert!( + data_file_count > 1, + "expected concurrent writes to trigger rotation with small files" + ); + + Ok(()) +} + +fn expected_key(is_shared: bool, thread_idx: usize, key_idx: usize) -> Vec { + let mut key = String::new(); + if is_shared { + key.push_str(&format!("shared_key_{key_idx}")); + } else { + key.push_str(&format!("distinct_key_{thread_idx}_{key_idx}")); + } + // Mix in some large keys + if key_idx.is_multiple_of(7) { + key.push_str(&"K".repeat(150)); + } + key.into_bytes() +} + +fn expected_value(key: &[u8]) -> Vec { + let mut val = String::from_utf8_lossy(key).into_owned(); + let length_marker = key.iter().map(|&b| b as usize).sum::(); + if length_marker % 3 == 0 { + val.push_str(&"V".repeat(5000)); + } else if length_marker % 5 == 0 { + val.push_str(&"V".repeat(100)); // Medium + } + val.into_bytes() +} + +fn pseudo_rand(seed: &mut u64) -> u64 { + *seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1); + *seed +} + +#[test] +fn test_concurrent_mixed_workload() -> Result<(), Error> { + const THREADS: usize = 30; + const OPERATIONS_PER_THREAD: usize = 10_000; + const SHARED_KEYS_TOTAL: usize = 2_000; + const DISTINCT_KEYS_PER_THREAD: usize = 300; // 30 * 300 = 9000 distinct total + + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default())?); + let barrier = Arc::new(Barrier::new(THREADS)); + + let handles: Vec<_> = (0..THREADS) + .map(|thread_idx| { + let db = db.clone(); + let barrier = barrier.clone(); + thread::spawn(move || { + let mut seed = (thread_idx as u64 + 1) * 123456789; + + // Track our own distinct keys so we strictly assert them + let mut distinct_state = vec![false; DISTINCT_KEYS_PER_THREAD]; + + barrier.wait(); + + for _ in 0..OPERATIONS_PER_THREAD { + let r = pseudo_rand(&mut seed); + let is_shared = (r % 100) < 50; // 50% 
operations on shared pool, 50% on distinct + + let key_idx = if is_shared { + (pseudo_rand(&mut seed) as usize) % SHARED_KEYS_TOTAL + } else { + (pseudo_rand(&mut seed) as usize) % DISTINCT_KEYS_PER_THREAD + }; + + let key = expected_key(is_shared, thread_idx, key_idx); + let val = expected_value(&key); + + let op = pseudo_rand(&mut seed) % 100; + if op < 40 { + // 40% Set + db.set(&key, &val).unwrap(); + if !is_shared { + distinct_state[key_idx] = true; + } + } else if op < 80 { + // 40% Get + let actual = db.get(&key).unwrap(); + if is_shared { + // Validation: Either exactly the expected value or None! + if let Some(v) = actual { + assert_eq!(v, val, "Shared key data corrupted!"); + } + } else { + if distinct_state[key_idx] { + assert_eq!(actual, Some(val), "Distinct key missing!"); + } else { + assert_eq!(actual, None, "Distinct key found but not set!"); + } + } + } else { + // 20% Remove + let _ = db.remove(&key).unwrap(); + if !is_shared { + distinct_state[key_idx] = false; + } + } + } + }) + }) + .collect(); + + for handle in handles { + handle.join().unwrap(); + } + + Ok(()) +} diff --git a/candy-crasher/src/main.rs b/tests/crasher.rs similarity index 62% rename from candy-crasher/src/main.rs rename to tests/crasher.rs index 02115bb..6b21a80 100644 --- a/candy-crasher/src/main.rs +++ b/tests/crasher.rs @@ -1,27 +1,50 @@ +#![cfg(unix)] + use std::ptr::null_mut; use std::time::Duration; use std::{ops::Range, sync::atomic::AtomicU64, sync::atomic::Ordering::SeqCst}; use candystore::{CandyStore, Config, Result}; -use rand::Rng; +use rand::RngExt; + +#[cfg(debug_assertions)] +const TARGET: u32 = 100_000; +#[cfg(debug_assertions)] +const SLEEP_RANGE: Range = 300..800; +#[cfg(not(debug_assertions))] const TARGET: u32 = 1_000_000; -const CONFIG: Config = Config { - max_shard_size: 64 * 1024 * 1024, - min_compaction_threashold: 8 * 1024 * 1024, - hash_seed: *b"kOYLu0xvq2WtzcKJ", - expected_number_of_keys: 0, - max_concurrent_list_ops: 64, - truncate_up: true, - 
clear_on_unsupported_version: true, - mlock_headers: false, - num_compaction_threads: 4, -}; - -fn child_inserts() -> Result<()> { +#[cfg(not(debug_assertions))] +const SLEEP_RANGE: Range = 50..500; + +fn get_config() -> Config { + Config { + max_data_file_size: 64 * 1024 * 1024, + compaction_min_threshold: 8 * 1024 * 1024, + hash_key: (0xb047_a3ef_b334_9804, 0x807d_3135_878e_9b27), + initial_capacity: 1024, + max_concurrency: 64, + ..Default::default() + } +} + +const DB_DIR: &str = "/tmp/dbdir_crash"; + +fn record_rebuild_stats(shared_stuff: &SharedStuff, store: &CandyStore) { + let stats = store.stats(); + shared_stuff + .total_num_rebuilt_entries + .fetch_add(stats.num_rebuilt_entries, SeqCst); + shared_stuff + .total_num_dropped_bytes_on_rebuild + .fetch_add(stats.num_rebuild_purged_bytes, SeqCst); +} + +fn child_inserts(shared_stuff: &SharedStuff) -> Result<()> { // our job is to create 1M entries while being killed by our evil parent - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; + record_rebuild_stats(shared_stuff, &store); let highest_bytes = store.get("highest")?.unwrap_or(vec![0, 0, 0, 0]); let highest = u32::from_le_bytes(highest_bytes.try_into().unwrap()); @@ -33,18 +56,19 @@ fn child_inserts() -> Result<()> { println!("child starting at {highest}"); for i in highest..TARGET { - store.set(&i.to_le_bytes(), "i am a key")?; - store.set("highest", &i.to_le_bytes())?; + store.set(i.to_le_bytes(), "i am a key")?; + store.set("highest", i.to_le_bytes())?; } println!("child finished"); Ok(()) } -fn child_removals() -> Result<()> { +fn child_removals(shared_stuff: &SharedStuff) -> Result<()> { // our job is to remove 1M entries while being killed by our evil parent - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; + record_rebuild_stats(shared_stuff, &store); let lowest_bytes = store.get("lowest")?.unwrap_or(vec![0, 0, 0, 0]); let lowest = 
u32::from_le_bytes(lowest_bytes.try_into().unwrap()); @@ -55,19 +79,22 @@ fn child_removals() -> Result<()> { println!("child starting at {lowest}"); + assert!(!store.contains("highest")?, "\"highest\" got resurrected"); + for i in lowest..TARGET { - store.remove(&i.to_le_bytes())?; - store.set("lowest", &i.to_le_bytes())?; + store.remove(i.to_le_bytes())?; + store.set("lowest", i.to_le_bytes())?; } println!("child finished"); Ok(()) } -fn child_list_inserts() -> Result<()> { +fn child_list_inserts(shared_stuff: &SharedStuff) -> Result<()> { // our job is to insert 1M entries to a list while being killed by our evil parent - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; + record_rebuild_stats(shared_stuff, &store); let highest_bytes = store.get("list_highest")?.unwrap_or(vec![0, 0, 0, 0]); let highest = u32::from_le_bytes(highest_bytes.try_into().unwrap()); @@ -81,17 +108,18 @@ fn child_list_inserts() -> Result<()> { for i in highest..TARGET { store.set_in_list("xxx", &i.to_le_bytes(), "yyy")?; - store.set("list_highest", &i.to_le_bytes())?; + store.set("list_highest", i.to_le_bytes())?; } println!("child finished"); Ok(()) } -fn child_list_removals() -> Result<()> { +fn child_list_removals(shared_stuff: &SharedStuff) -> Result<()> { // our job is to remove 1M entries to a list while being killed by our evil parent - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; + record_rebuild_stats(shared_stuff, &store); let lowest_bytes = store.get("list_lowest")?.unwrap_or(vec![0, 0, 0, 0]); let lowest = u32::from_le_bytes(lowest_bytes.try_into().unwrap()); @@ -124,7 +152,7 @@ fn child_list_removals() -> Result<()> { old.is_none() || old == Some("yyy".into()), "{i} old={old:?}" ); - store.set("list_lowest", &i.to_le_bytes())?; + store.set("list_lowest", i.to_le_bytes())?; } println!("child finished"); @@ -132,8 +160,9 @@ fn child_list_removals() -> Result<()> { 
Ok(()) } -fn child_list_iterator_removals() -> Result<()> { - let store = CandyStore::open("dbdir", CONFIG)?; +fn child_list_iterator_removals(shared_stuff: &SharedStuff) -> Result<()> { + let store = CandyStore::open(DB_DIR, get_config())?; + record_rebuild_stats(shared_stuff, &store); if rand::random() { //println!("FWD"); @@ -147,7 +176,7 @@ fn child_list_iterator_removals() -> Result<()> { } } else { //println!("BACK"); - for (i, res) in store.iter_list_backwards("xxx").enumerate() { + for (i, res) in store.iter_list("xxx").rev().enumerate() { let (k, v) = res?; let v2 = u32::from_le_bytes(v.try_into().unwrap()); if i == 0 { @@ -164,24 +193,24 @@ fn child_list_iterator_removals() -> Result<()> { fn parent_run( shared_stuff: &SharedStuff, - mut child_func: impl FnMut() -> Result<()>, - sleep: Range, + child_name: &str, + mut child_func: impl FnMut(&SharedStuff) -> Result<()>, ) -> Result<()> { + println!("======== Parent starts {child_name} ========"); for i in 0.. { let pid = unsafe { libc::fork() }; assert!(pid >= 0); if pid == 0 { - let res = child_func(); - if res.is_err() { + let res = child_func(shared_stuff); + if let Err(e) = res { + eprintln!("Child failed: {}", e); shared_stuff.failed.store(1, SeqCst); } - res.unwrap(); unsafe { libc::exit(0) }; } else { // parent - std::thread::sleep(Duration::from_millis( - rand::thread_rng().gen_range(sleep.clone()), - )); + let dur = Duration::from_millis(rand::rng().random_range(SLEEP_RANGE)); + std::thread::sleep(dur); let mut status = 0i32; let rc = unsafe { libc::waitpid(pid, &mut status, libc::WNOHANG) }; if rc == 0 { @@ -190,7 +219,7 @@ fn parent_run( panic!("child crashed at iteration {i}"); } - println!("[{i}] killing child"); + println!("[{i}] killing child after {dur:?}"); unsafe { libc::kill(pid, libc::SIGKILL); libc::wait(&mut status); @@ -216,10 +245,18 @@ fn parent_run( struct SharedStuff { failed: AtomicU64, + total_num_rebuilt_entries: AtomicU64, + total_num_dropped_bytes_on_rebuild: AtomicU64, } 
-fn main() -> Result<()> { - _ = std::fs::remove_dir_all("dbdir"); +#[test] +fn test_crash_recovery() -> Result<()> { + // Only run on Linux because of fork/mmap + if cfg!(not(target_os = "linux")) { + return Ok(()); + } + + _ = std::fs::remove_dir_all(DB_DIR); let map_addr = unsafe { libc::mmap( @@ -234,29 +271,24 @@ fn main() -> Result<()> { assert_ne!(map_addr, libc::MAP_FAILED); let shared_stuff = unsafe { &*(map_addr as *const SharedStuff) }; + shared_stuff.failed.store(0, SeqCst); + shared_stuff.total_num_rebuilt_entries.store(0, SeqCst); + shared_stuff + .total_num_dropped_bytes_on_rebuild + .store(0, SeqCst); - // let store = CandyStore::open( - // "dbdir", - // Config { - // expected_number_of_keys: 1_000_000, - // clear_on_unsupported_version: true, - // ..Default::default() - // }, - // )?; - // drop(store); - - parent_run(shared_stuff, child_inserts, 10..300)?; + parent_run(shared_stuff, "child_inserts", child_inserts)?; { println!("Parent starts validating the DB..."); - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; assert_eq!( store.remove("highest")?, Some((TARGET - 1).to_le_bytes().to_vec()) ); let mut count = 0; - for res in store.iter() { + for res in store.iter_items() { let (k, v) = res?; assert_eq!(v, b"i am a key"); let k = u32::from_le_bytes(k.try_into().unwrap()); @@ -268,32 +300,28 @@ fn main() -> Result<()> { println!("DB validated successfully"); } - parent_run(shared_stuff, child_removals, 10..30)?; + parent_run(shared_stuff, "child_removals", child_removals)?; { println!("Parent starts validating the DB..."); - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; assert_eq!( store.remove("lowest")?, Some((TARGET - 1).to_le_bytes().to_vec()) ); - assert_eq!( - store.iter().count(), - 0, - "{:?}", - store.iter().collect::>() - ); + let items = store.iter_items().collect::>(); + assert_eq!(items.len(), 0, "{items:?}"); println!("DB 
validated successfully"); } - parent_run(shared_stuff, child_list_inserts, 10..300)?; + parent_run(shared_stuff, "child_list_inserts", child_list_inserts)?; { println!("Parent starts validating the DB..."); - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; assert_eq!( store.remove("list_highest")?, Some((TARGET - 1).to_le_bytes().to_vec()) @@ -308,12 +336,12 @@ fn main() -> Result<()> { println!("DB validated successfully"); } - parent_run(shared_stuff, child_list_removals, 10..80)?; + parent_run(shared_stuff, "child_list_removals", child_list_removals)?; { println!("Parent starts validating the DB..."); - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; assert_eq!( store.remove("list_lowest")?, Some((TARGET - 1).to_le_bytes().to_vec()) @@ -321,19 +349,19 @@ fn main() -> Result<()> { assert_eq!(store.iter_list("xxx").count(), 0); - println!("leaked: {}", store.iter_raw().count()); + println!("leaked: {}", store.iter_items().count()); store.discard_list("xxx")?; println!("DB validated successfully"); } { - println!("Parent creates 1M members in a list..."); + println!("Parent creates {} members in a list...", TARGET); - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; let t0 = std::time::Instant::now(); - for i in 0u32..1_000_000 { - if i % 65536 == 0 { + for i in 0u32..TARGET { + if i % 100000 == 0 { println!("{i}"); } store.set_in_list("xxx", &i.to_le_bytes(), &i.to_le_bytes())?; @@ -344,21 +372,32 @@ fn main() -> Result<()> { ); } - parent_run(shared_stuff, child_list_iterator_removals, 10..200)?; + parent_run( + shared_stuff, + "child_list_iterator_removals", + child_list_iterator_removals, + )?; { println!("Parent starts validating the DB..."); - let store = CandyStore::open("dbdir", CONFIG)?; + let store = CandyStore::open(DB_DIR, get_config())?; assert_eq!(store.iter_list("xxx").count(), 0); // we 
will surely leak some entries that were unlinked from the list before they were removed - println!("leaked: {}", store.iter_raw().count()); + println!("leaked: {}", store.iter_items().count()); store.discard_list("xxx")?; println!("DB validated successfully"); } + println!( + "rebuilt_entries_total={} dropped_bytes_on_rebuild_total={}", + shared_stuff.total_num_rebuilt_entries.load(SeqCst), + shared_stuff.total_num_dropped_bytes_on_rebuild.load(SeqCst) + ); + + _ = std::fs::remove_dir_all(DB_DIR); Ok(()) } diff --git a/tests/data_loss.rs b/tests/data_loss.rs new file mode 100644 index 0000000..6a4ba66 --- /dev/null +++ b/tests/data_loss.rs @@ -0,0 +1,159 @@ +use std::fs::OpenOptions; +use std::io::{Seek, SeekFrom, Write}; + +use candystore::{CandyStore, Config}; + +mod common; + +fn first_data_file_path(dir: &std::path::Path) -> std::path::PathBuf { + std::fs::read_dir(dir) + .unwrap() + .filter_map(|entry| entry.ok()) + .find(|entry| entry.file_name().to_string_lossy().starts_with("data_")) + .unwrap() + .path() +} + +fn zero_range(path: &std::path::Path, start: u64, len: usize) { + let mut file = OpenOptions::new().write(true).open(path).unwrap(); + file.seek(SeekFrom::Start(start)).unwrap(); + file.write_all(&vec![0u8; len]).unwrap(); + file.sync_all().unwrap(); +} + +#[test] +fn test_zeroed_tail_data_file_lookup() { + let dir = tempfile::tempdir().unwrap(); + let config = Config::default(); + + { + let store = CandyStore::open(dir.path(), config).unwrap(); + for idx in 0..1000 { + store + .set( + format!("key:{idx:04}").as_bytes(), + format!("val:{idx:04}").as_bytes(), + ) + .unwrap(); + } + store.flush().unwrap(); + } + + let data_path = first_data_file_path(dir.path()); + let file_len = common::logical_data_len(&data_path); + let zero_len = 2400usize; + zero_range(&data_path, 4096 + file_len - zero_len as u64, zero_len); + + let store = CandyStore::open(dir.path(), config).unwrap(); + let mut missing = 0; + for idx in 0..1000 { + let key = 
format!("key:{idx:04}"); + match store.get(key.as_bytes()).unwrap() { + Some(value) => assert_eq!(value, format!("val:{idx:04}").into_bytes()), + None => missing += 1, + } + } + + assert!(missing > 0); + assert!(missing < 1000); +} + +#[test] +fn test_truncated_data_file_queues() { + let dir = tempfile::tempdir().unwrap(); + let config = Config::default(); + let num_queues = 50; + let items_per_queue = 10; + + { + let store = CandyStore::open(dir.path(), config).unwrap(); + for item_idx in 0..items_per_queue { + for queue_idx in 0..num_queues { + store + .push_to_queue_tail( + format!("queue:{queue_idx}").as_str(), + format!("val:{item_idx:04}").as_bytes(), + ) + .unwrap(); + } + } + store.flush().unwrap(); + } + + let data_path = first_data_file_path(dir.path()); + let file = OpenOptions::new().write(true).open(&data_path).unwrap(); + let file_len = common::logical_data_len(&data_path); + file.set_len(4096 + file_len - 2400).unwrap(); + + let store = CandyStore::open(dir.path(), config).unwrap(); + + let mut total_missing = 0; + for queue_idx in 0..num_queues { + let queue_key = format!("queue:{queue_idx}"); + let items: Vec<_> = store + .iter_queue(&queue_key) + .collect::>>() + .unwrap(); + + assert!(items.len() <= items_per_queue); + total_missing += items_per_queue - items.len(); + + for (idx, (_queue_idx, value)) in items.into_iter().enumerate() { + assert_eq!(value, format!("val:{idx:04}").into_bytes()); + } + } + + assert!(total_missing > 0); + assert!(total_missing < num_queues * items_per_queue); +} + +#[test] +fn test_truncated_data_file_lists() { + let dir = tempfile::tempdir().unwrap(); + let config = Config::default(); + let num_lists = 50; + let items_per_list = 10; + + { + let store = CandyStore::open(dir.path(), config).unwrap(); + for item_idx in 0..items_per_list { + for list_idx in 0..num_lists { + store + .set_in_list( + format!("list:{list_idx}").as_str(), + format!("key:{item_idx:04}").as_bytes(), + format!("val:{item_idx:04}").as_bytes(), + 
) + .unwrap(); + } + } + store.flush().unwrap(); + } + + let data_path = first_data_file_path(dir.path()); + let file = OpenOptions::new().write(true).open(&data_path).unwrap(); + let file_len = common::logical_data_len(&data_path); + file.set_len(4096 + file_len - 2400).unwrap(); + + let store = CandyStore::open(dir.path(), config).unwrap(); + + let mut total_missing = 0; + for list_idx in 0..num_lists { + let list_key = format!("list:{list_idx}"); + let items: Vec<_> = store + .iter_list(&list_key) + .collect::>>() + .unwrap(); + + assert!(items.len() <= items_per_list); + total_missing += items_per_list - items.len(); + + for (idx, (key, value)) in items.into_iter().enumerate() { + assert_eq!(key, format!("key:{idx:04}").into_bytes()); + assert_eq!(value, format!("val:{idx:04}").into_bytes()); + } + } + + assert!(total_missing > 0); + assert!(total_missing < num_lists * items_per_list); +} diff --git a/tests/double_open.rs b/tests/double_open.rs new file mode 100644 index 0000000..b722a50 --- /dev/null +++ b/tests/double_open.rs @@ -0,0 +1,20 @@ +use candystore::{CandyStore, Config, Error}; +use tempfile::tempdir; + +#[test] +fn test_double_open_fails() { + let dir = tempdir().unwrap(); + let _db1 = CandyStore::open(dir.path(), Config::default()).unwrap(); + + // from the same thread + let db2_res = CandyStore::open(dir.path(), Config::default()); + assert!(matches!(db2_res, Err(Error::LockfileTaken(_, _)))); + + // from a different thread + std::thread::spawn(move || { + let db3_res = CandyStore::open(dir.path(), Config::default()); + assert!(matches!(db3_res, Err(Error::LockfileTaken(_, _)))); + }) + .join() + .unwrap(); +} diff --git a/tests/iteration.rs b/tests/iteration.rs new file mode 100644 index 0000000..0d47e40 --- /dev/null +++ b/tests/iteration.rs @@ -0,0 +1,99 @@ +mod common; + +use std::collections::HashMap; + +use candystore::{CandyStore, Config, Error}; +use tempfile::tempdir; + +#[test] +fn test_iter_items_empty_db() -> Result<(), Error> { + let 
dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let items: Vec<_> = db.iter_items().collect::>()?; + assert!(items.is_empty()); + + Ok(()) +} + +#[test] +fn test_iter_items_basic() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let mut expected = HashMap::new(); + for i in 0..100 { + let key = format!("iter_key_{i:04}"); + let value = format!("iter_val_{i:04}"); + db.set(&key, &value)?; + expected.insert(key.into_bytes(), value.into_bytes()); + } + + let items: HashMap, Vec> = db.iter_items().collect::>()?; + assert_eq!(items.len(), expected.len()); + assert_eq!(items, expected); + + Ok(()) +} + +#[test] +fn test_iter_items_after_updates_and_removes() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + for i in 0..50 { + db.set(format!("key_{i:04}"), format!("val_{i:04}"))?; + } + + for i in 0..20 { + db.set(format!("key_{i:04}"), format!("updated_{i:04}"))?; + } + + for i in 40..50 { + db.remove(format!("key_{i:04}"))?; + } + + let items: HashMap, Vec> = db.iter_items().collect::>()?; + assert_eq!(items.len(), 40); + + for i in 0..20 { + let key = format!("key_{i:04}"); + assert_eq!( + items.get(key.as_bytes()), + Some(&format!("updated_{i:04}").into_bytes()) + ); + } + for i in 20..40 { + let key = format!("key_{i:04}"); + assert_eq!( + items.get(key.as_bytes()), + Some(&format!("val_{i:04}").into_bytes()) + ); + } + for i in 40..50 { + let key = format!("key_{i:04}"); + assert!(!items.contains_key(key.as_bytes())); + } + + Ok(()) +} + +#[test] +fn test_iter_items_with_splits_and_rotation() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), common::small_file_config())?; + + let mut expected = HashMap::new(); + for i in 0..2000 { + let key = format!("split_iter_key_{i:05}"); + let value = format!("split_iter_val_{i:05}_{}", 
"x".repeat(48)); + db.set(&key, &value)?; + expected.insert(key.into_bytes(), value.into_bytes()); + } + + let items: HashMap, Vec> = db.iter_items().collect::>()?; + assert_eq!(items.len(), expected.len()); + assert_eq!(items, expected); + + Ok(()) +} diff --git a/tests/list.rs b/tests/list.rs new file mode 100644 index 0000000..f815d33 --- /dev/null +++ b/tests/list.rs @@ -0,0 +1,383 @@ +use candystore::{ + CandyStore, Config, GetOrCreateStatus, ListCompactionParams, ReplaceStatus, SetStatus, +}; +use std::sync::{Arc, Barrier}; +use std::thread; +use tempfile::tempdir; + +#[test] +fn test_list_set_get_len() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"l1"; + + assert_eq!(db.list_len(list).unwrap(), 0); + + assert!(matches!( + db.set_in_list(list, b"k1", b"v1").unwrap(), + SetStatus::CreatedNew + )); + assert!(matches!( + db.set_in_list(list, b"k2", b"v2").unwrap(), + SetStatus::CreatedNew + )); + + assert_eq!(db.list_len(list).unwrap(), 2); + assert_eq!(db.get_from_list(list, b"k1").unwrap(), Some(b"v1".to_vec())); + assert_eq!(db.get_from_list(list, b"k2").unwrap(), Some(b"v2".to_vec())); + + assert!( + matches!(db.set_in_list(list, b"k1", b"v1b").unwrap(), SetStatus::PrevValue(ref value) if value == b"v1") + ); + assert_eq!(db.list_len(list).unwrap(), 2); + assert_eq!( + db.get_from_list(list, b"k1").unwrap(), + Some(b"v1b".to_vec()) + ); +} + +#[test] +fn test_list_remove_and_iteration() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"l2"; + + db.set_in_list(list, b"a", b"1").unwrap(); + db.set_in_list(list, b"b", b"2").unwrap(); + db.set_in_list(list, b"c", b"3").unwrap(); + + assert_eq!( + db.remove_from_list(list, b"a").unwrap(), + Some(b"1".to_vec()) + ); + let items: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!( + items, + vec![ + (b"b".to_vec(), b"2".to_vec()), + (b"c".to_vec(), 
b"3".to_vec()) + ] + ); + + assert_eq!( + db.remove_from_list(list, b"c").unwrap(), + Some(b"3".to_vec()) + ); + let items: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!(items, vec![(b"b".to_vec(), b"2".to_vec())]); + + assert_eq!( + db.remove_from_list(list, b"b").unwrap(), + Some(b"2".to_vec()) + ); + assert_eq!(db.list_len(list).unwrap(), 0); + assert_eq!(db.iter_list(list).count(), 0); +} + +#[test] +fn test_list_iteration_skips_holes_and_reverse() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"l3"; + + db.set_in_list(list, b"k1", b"v1").unwrap(); + db.set_in_list(list, b"k2", b"v2").unwrap(); + db.set_in_list(list, b"k3", b"v3").unwrap(); + db.set_in_list(list, b"k4", b"v4").unwrap(); + db.remove_from_list(list, b"k2").unwrap(); + + let forward: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!( + forward, + vec![ + (b"k1".to_vec(), b"v1".to_vec()), + (b"k3".to_vec(), b"v3".to_vec()), + (b"k4".to_vec(), b"v4".to_vec()), + ] + ); + + let reverse: Vec<_> = db + .iter_list(list) + .rev() + .map(|entry| entry.unwrap()) + .collect(); + assert_eq!( + reverse, + vec![ + (b"k4".to_vec(), b"v4".to_vec()), + (b"k3".to_vec(), b"v3".to_vec()), + (b"k1".to_vec(), b"v1".to_vec()), + ] + ); +} + +#[test] +fn test_list_persistence_and_discard() { + let dir = tempdir().unwrap(); + let path = dir.path().to_path_buf(); + + { + let db = CandyStore::open(&path, Config::default()).unwrap(); + db.set_in_list(b"persist", b"k", b"v").unwrap(); + } + + { + let db = CandyStore::open(&path, Config::default()).unwrap(); + assert_eq!(db.list_len(b"persist").unwrap(), 1); + assert_eq!( + db.get_from_list(b"persist", b"k").unwrap(), + Some(b"v".to_vec()) + ); + assert!(db.discard_list(b"persist").unwrap()); + assert_eq!(db.list_len(b"persist").unwrap(), 0); + } +} + +#[test] +fn test_list_promoting_matches_legacy_tail_semantics() { + let dir = 
tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"promo"; + + db.set_in_list(list, b"a", b"1").unwrap(); + db.set_in_list(list, b"b", b"2").unwrap(); + db.set_in_list(list, b"c", b"3").unwrap(); + + assert!( + matches!(db.set_in_list_promoting(list, b"b", b"2x").unwrap(), SetStatus::PrevValue(ref value) if value == b"2") + ); + + let items: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!(items.last().unwrap(), &(b"b".to_vec(), b"2x".to_vec())); + + assert!(matches!( + db.set_in_list_promoting(list, b"d", b"4").unwrap(), + SetStatus::CreatedNew + )); + + let items: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!(items.last().unwrap(), &(b"d".to_vec(), b"4".to_vec())); +} + +#[test] +fn test_list_compact_if_needed() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"compact"; + + db.set_in_list(list, b"a", b"1").unwrap(); + db.set_in_list(list, b"b", b"2").unwrap(); + db.set_in_list(list, b"c", b"3").unwrap(); + db.remove_from_list(list, b"b").unwrap(); + + assert!( + db.compact_list_if_needed( + list, + ListCompactionParams { + min_length: 1, + min_holes_ratio: 0.2 + } + ) + .unwrap() + ); + let items: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!( + items, + vec![ + (b"a".to_vec(), b"1".to_vec()), + (b"c".to_vec(), b"3".to_vec()) + ] + ); + assert!( + !db.compact_list_if_needed( + list, + ListCompactionParams { + min_length: 1, + min_holes_ratio: 0.5 + } + ) + .unwrap() + ); +} + +#[test] +fn test_replace_and_get_or_create_in_list() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"replace"; + + assert!( + matches!(db.get_or_create_in_list(list, b"k1", b"v1").unwrap(), GetOrCreateStatus::CreatedNew(ref value) if value == b"v1") + ); + assert!( + 
matches!(db.get_or_create_in_list(list, b"k1", b"other").unwrap(), GetOrCreateStatus::ExistingValue(ref value) if value == b"v1") + ); + assert!( + matches!(db.replace_in_list(list, b"k1", b"v2", Some(b"zz")).unwrap(), ReplaceStatus::WrongValue(ref value) if value == b"v1") + ); + assert!( + matches!(db.replace_in_list(list, b"k1", b"v2", None::<&[u8]>).unwrap(), ReplaceStatus::PrevValue(ref value) if value == b"v1") + ); + assert_eq!(db.get_from_list(list, b"k1").unwrap(), Some(b"v2".to_vec())); + assert!(matches!( + db.replace_in_list(list, b"missing", b"v", None::<&[u8]>) + .unwrap(), + ReplaceStatus::DoesNotExist + )); +} + +#[test] +fn test_list_pop_peek_and_retain() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default()).unwrap(); + let list = b"poppeek"; + + db.set_in_list(list, b"k1", b"v1").unwrap(); + db.set_in_list(list, b"k2", b"v2").unwrap(); + db.set_in_list(list, b"k3", b"v3").unwrap(); + + assert_eq!( + db.peek_list_head(list).unwrap().unwrap(), + (b"k1".to_vec(), b"v1".to_vec()) + ); + assert_eq!( + db.peek_list_tail(list).unwrap().unwrap(), + (b"k3".to_vec(), b"v3".to_vec()) + ); + assert_eq!( + db.pop_list_head(list).unwrap().unwrap(), + (b"k1".to_vec(), b"v1".to_vec()) + ); + assert_eq!( + db.pop_list_tail(list).unwrap().unwrap(), + (b"k3".to_vec(), b"v3".to_vec()) + ); + + db.set_in_list(list, b"k4", b"v4").unwrap(); + db.set_in_list(list, b"k5", b"v5").unwrap(); + db.retain_in_list(list, |key, _| Ok(key != b"k4")).unwrap(); + + let items: Vec<_> = db.iter_list(list).map(|entry| entry.unwrap()).collect(); + assert_eq!( + items, + vec![ + (b"k2".to_vec(), b"v2".to_vec()), + (b"k5".to_vec(), b"v5".to_vec()) + ] + ); + assert!( + !db.compact_list_if_needed( + list, + ListCompactionParams { + min_length: 1, + min_holes_ratio: 0.1, + }, + ) + .unwrap() + ); +} + +#[test] +fn test_list_compaction_uses_span_like_legacy_candystore() { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), 
Config::default()).unwrap(); + let list = b"span_compact"; + + for idx in 0..10u8 { + db.set_in_list(list, &[idx], &[idx]).unwrap(); + } + for idx in 1..9u8 { + db.remove_from_list(list, &[idx]).unwrap(); + } + + assert!( + db.compact_list_if_needed( + list, + ListCompactionParams { + min_length: 5, + min_holes_ratio: 0.5, + }, + ) + .unwrap() + ); +} + +#[test] +fn test_list_concurrency_basic() { + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default()).unwrap()); + let list_key = b"concurrent_list"; + let num_threads = 8; + let items_per_thread = 200; + let barrier = Arc::new(Barrier::new(num_threads)); + + let mut handles = Vec::new(); + for thread_idx in 0..num_threads { + let db = Arc::clone(&db); + let barrier = Arc::clone(&barrier); + handles.push(thread::spawn(move || { + barrier.wait(); + for item_idx in 0..items_per_thread { + let key = format!("t{thread_idx}-{item_idx}"); + let value = format!("val-{thread_idx}-{item_idx}"); + db.set_in_list(list_key, key.as_bytes(), value.as_bytes()) + .unwrap(); + } + + for item_idx in 0..items_per_thread { + let key = format!("t{thread_idx}-{item_idx}"); + db.remove_from_list(list_key, key.as_bytes()).unwrap(); + } + })); + } + + for handle in handles { + handle.join().unwrap(); + } + + assert_eq!(db.list_len(list_key).unwrap(), 0); + assert_eq!(db.iter_list(list_key).count(), 0); +} + +#[test] +fn test_list_concurrency_promoting() { + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), Config::default()).unwrap()); + let list_key = b"concurrent_list_promo"; + let num_threads = 4; + let items_per_thread = 100; + let barrier = Arc::new(Barrier::new(num_threads)); + + for idx in 0..100 { + db.set_in_list(list_key, format!("base-{idx}").as_bytes(), b"base") + .unwrap(); + } + + let mut handles = Vec::new(); + for thread_idx in 0..num_threads { + let db = Arc::clone(&db); + let barrier = Arc::clone(&barrier); + handles.push(thread::spawn(move || { + 
barrier.wait(); + for item_idx in 0..items_per_thread { + let key = format!("t{thread_idx}-{item_idx}"); + db.set_in_list_promoting(list_key, key.as_bytes(), b"val") + .unwrap(); + } + + for base_idx in 0..50 { + let key = format!("base-{base_idx}"); + db.set_in_list_promoting(list_key, key.as_bytes(), b"base-promoted") + .unwrap(); + } + })); + } + + for handle in handles { + handle.join().unwrap(); + } + + let expected_len = 100 + num_threads * items_per_thread; + assert_eq!(db.list_len(list_key).unwrap(), expected_len); + assert_eq!(db.iter_list(list_key).count(), expected_len); +} diff --git a/tests/maintenance.rs b/tests/maintenance.rs new file mode 100644 index 0000000..c30ad3c --- /dev/null +++ b/tests/maintenance.rs @@ -0,0 +1,87 @@ +mod common; + +use std::fs; + +use candystore::{CandyStore, Config, Error}; +use tempfile::tempdir; + +#[test] +fn test_clear_resets_store_files_and_contents() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + let db = CandyStore::open(dir.path(), config)?; + + for i in 0..100 { + db.set(format!("key{i:04}"), vec![b'x'; 64])?; + } + for i in 0..50 { + db.set(format!("key{i:04}"), vec![b'y'; 64])?; + } + for i in 50..75 { + db.remove(format!("key{i:04}"))?; + } + + let data_files_before = std::fs::read_dir(dir.path()) + .map_err(Error::IOError)? 
+ .filter_map(|entry| entry.ok()) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with("data_")) + }) + .count(); + assert!(data_files_before > 1); + + fs::write(dir.path().join("extra.txt"), b"junk").map_err(Error::IOError)?; + fs::create_dir(dir.path().join("extra_dir")).map_err(Error::IOError)?; + fs::write(dir.path().join("extra_dir").join("nested.txt"), b"junk").map_err(Error::IOError)?; + + db.clear()?; + + assert!(db.get("key0000")?.is_none()); + assert_eq!(db.iter_items().count(), 0); + assert!(!dir.path().join("extra.txt").exists()); + assert!(!dir.path().join("extra_dir").exists()); + + let data_files_after = std::fs::read_dir(dir.path()) + .map_err(Error::IOError)? + .filter_map(|entry| entry.ok()) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with("data_")) + }) + .count(); + assert_eq!(data_files_after, 1); + + db.set("fresh", "value")?; + assert_eq!(db.get("fresh")?, Some(b"value".to_vec())); + drop(db); + + let reopened = CandyStore::open(dir.path(), config)?; + assert!(reopened.get("key0000")?.is_none()); + assert_eq!(reopened.get("fresh")?, Some(b"value".to_vec())); + + Ok(()) +} + +#[test] +fn test_explicit_close_releases_lock_and_persists_clean_shutdown() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + let db = CandyStore::open(dir.path(), config)?; + db.set("key", "value")?; + drop(db); + + let reopened = CandyStore::open(dir.path(), config)?; + assert_eq!(reopened.get("key")?, Some(b"value".to_vec())); + + Ok(()) +} diff --git a/tests/metrics.rs b/tests/metrics.rs new file mode 100644 index 0000000..d72867f --- /dev/null +++ b/tests/metrics.rs @@ -0,0 +1,125 @@ +use candystore::{CandyStore, Config}; +use std::time::Duration; + +const ROW_WIDTH: u64 = (16 * 21) as u64; + +#[test] +fn test_metrics_updates() -> Result<(), Box> { + let dir = tempfile::tempdir()?; + let config = Config { + initial_capacity: 1000, + 
..Default::default() + }; + + let db = CandyStore::open(dir.path(), config)?; + + let stats = db.stats(); + assert_eq!(stats.num_rows, 8); + assert_eq!(stats.index_capacity(), 8 * ROW_WIDTH); + assert_eq!(stats.num_items, 0); + assert_eq!(stats.num_positive_lookups, 0); + assert_eq!(stats.num_negative_lookups, 0); + assert_eq!(stats.num_collisions, 0); + assert_eq!(stats.last_remap_dur, Duration::ZERO); + assert_eq!(stats.last_compaction_dur, Duration::ZERO); + assert_eq!(stats.last_compaction_reclaimed_bytes, 0); + assert_eq!(stats.last_compaction_moved_bytes, 0); + assert_eq!(stats.num_read_ops, 0); + assert_eq!(stats.num_read_bytes, 0); + assert_eq!(stats.num_write_ops, 0); + assert_eq!(stats.num_write_bytes, 0); + assert_eq!(stats.num_inserted, 0); + assert_eq!(stats.num_removed, 0); + assert_eq!(stats.num_updated, 0); + assert_eq!(stats.num_rebuilt_entries, 0); + assert_eq!(stats.num_rebuild_purged_bytes, 0); + assert_eq!(stats.data_bytes(), 0); + assert_eq!(stats.waste_bytes, 0); + + db.set("key1", "val1")?; + + let stats = db.stats(); + assert_eq!(stats.num_items, 1); + assert_eq!(stats.num_inserted, 1); + assert_eq!(stats.num_updated, 0); + assert_eq!(stats.num_removed, 0); + assert_eq!(stats.num_inserted, 1); + assert_eq!(stats.num_removed, 0); + assert_eq!(stats.num_updated, 0); + assert!(stats.data_bytes() > 0); + assert_eq!(stats.waste_bytes, 0); + assert_eq!(stats.num_write_ops, 1); + assert!(stats.num_write_bytes > 0); + assert!(stats.index_size_bytes > 0); + assert_eq!(stats.num_data_files, 1); + assert!(stats.data_bytes() > 0); + + db.set("key1", "val2")?; + + let stats = db.stats(); + assert_eq!(stats.num_items, 1); + assert_eq!(stats.num_updated, 1); + assert_eq!(stats.num_inserted, 1); + assert_eq!(stats.num_updated, 1); + assert_eq!(stats.num_removed, 0); + assert!(stats.data_bytes() > 0); + assert!(stats.waste_bytes > 0); + assert_eq!(stats.num_write_ops, 2); + + db.remove("key1")?; + + let stats = db.stats(); + assert_eq!(stats.num_items, 0); 
+ assert_eq!(stats.num_removed, 1); + assert!(stats.waste_bytes > 0); + assert_eq!(stats.num_write_ops, 3); + assert_eq!(stats.num_inserted, 1); + assert_eq!(stats.num_updated, 1); + assert_eq!(stats.num_removed, 1); + assert_eq!(stats.data_bytes(), 0); + + assert_eq!(db.get("missing")?, None); + assert_eq!(db.get("key1")?, None); + + let stats = db.stats(); + assert_eq!(stats.num_positive_lookups, 0); + assert_eq!(stats.num_negative_lookups, 2); + assert_eq!(stats.num_read_ops, 2); + assert!(stats.num_read_bytes > 0); + + Ok(()) +} + +#[test] +fn test_metrics_compaction() -> Result<(), Box> { + let dir = tempfile::tempdir()?; + let config = Config { + max_data_file_size: 4096, + compaction_min_threshold: 10, + ..Default::default() + }; + + let db = CandyStore::open(dir.path(), config)?; + + for i in 0..500 { + db.set( + "key", + format!("value_that_is_long_enough_to_take_up_space_{}", i), + )?; + } + + for i in 0..100 { + db.set(format!("other_key_{}", i), "val")?; + std::thread::sleep(std::time::Duration::from_millis(2)); + } + + let stats = db.stats(); + assert!(stats.num_updated > 0); + assert!(stats.data_bytes() > 0); + assert!(stats.num_items > 0); + assert!(stats.index_capacity() >= stats.num_items); + assert!(stats.num_write_ops > 0); + assert!(stats.num_write_bytes > 0); + + Ok(()) +} diff --git a/tests/proptest_state_machine.rs b/tests/proptest_state_machine.rs new file mode 100644 index 0000000..474b639 --- /dev/null +++ b/tests/proptest_state_machine.rs @@ -0,0 +1,113 @@ +use candystore::{CandyStore, Config}; +use proptest::prelude::*; +use std::collections::BTreeMap; +use tempfile::TempDir; + +#[derive(Debug, Clone)] +enum Op { + Set(String, String), + Get(String), + Remove(String), + CleanShutdown, + SimulateCrash, +} + +fn op_strategy() -> impl Strategy { + // Narrow key space to highly encourage collisions (overwrites, deletes of existing keys) + let key_strat = "[a-d]{1,2}"; + // Variable size payload to occasionally trigger rotation in small 
stores + let val_strat = "[a-zA-Z0-9]{0,50}"; + + prop_oneof![ + // Weight probabilities so we mostly mutate state, check it, and occasionally restart + 40 => (key_strat, val_strat).prop_map(|(k, v)| Op::Set(k, v)), + 40 => key_strat.prop_map(Op::Get), + 20 => key_strat.prop_map(Op::Remove), + 8 => Just(Op::CleanShutdown), + 2 => Just(Op::SimulateCrash), + ] +} + +proptest! { + // 200 randomized sequences with up to 2000 operations each for a deeper stress test + #![proptest_config(ProptestConfig::with_cases(200))] + + #[test] + fn test_candystore_state_machine(ops in proptest::collection::vec(op_strategy(), 1..2000)) { + let dir = TempDir::new().unwrap(); + + // Small file size so we generate many data files, rotations, and splits within 200 operations + let config = Config { + max_data_file_size: 1024 * 4, // 4KB boundaries + ..Default::default() + }; + + // The authoritative reference state + let mut oracle = BTreeMap::new(); + + let mut db_opt = Some(CandyStore::open(dir.path(), config).unwrap()); + + for (op_idx, op) in ops.iter().enumerate() { + match op { + Op::Set(k, v) => { + oracle.insert(k.clone(), v.clone()); + let db = db_opt.as_ref().unwrap(); + let _ = db + .set(k.as_bytes(), v.as_bytes()) + .unwrap_or_else(|err| panic!("set failed at op {op_idx}: {op:?}: {err}")); + } + Op::Get(k) => { + let db = db_opt.as_ref().unwrap(); + let expected = oracle.get(k); + let actual = db + .get(k.as_bytes()) + .unwrap_or_else(|err| panic!("get failed at op {op_idx}: {op:?}: {err}")); + + match expected { + Some(v) => assert_eq!( + Some(v.as_bytes()), + actual.as_deref(), + "get mismatch at op {op_idx}: {op:?}" + ), + None => assert_eq!(None, actual, "get mismatch at op {op_idx}: {op:?}"), + } + } + Op::Remove(k) => { + oracle.remove(k); + let db = db_opt.as_ref().unwrap(); + let _ = db + .remove(k.as_bytes()) + .unwrap_or_else(|err| panic!("remove failed at op {op_idx}: {op:?}: {err}")); + } + Op::CleanShutdown => { + // Close the current DB instance by dropping it, 
then reopen + drop(db_opt.take().unwrap()); + db_opt = Some( + CandyStore::open(dir.path(), config) + .unwrap_or_else(|err| panic!("reopen after clean shutdown failed at op {op_idx}: {op:?}: {err}")), + ); + } + Op::SimulateCrash => { + // Force a rebuild + db_opt.take().unwrap()._abort_for_testing(); + db_opt = Some( + CandyStore::open(dir.path(), config) + .unwrap_or_else(|err| panic!("reopen after simulated crash failed at op {op_idx}: {op:?}: {err}")), + ); + } + } + } + + // Final verification pass: check the oracle exact matches internal state + let db = db_opt.as_ref().unwrap(); + + // Verify every key that should exist, DOES exist + for (k, v) in oracle.iter() { + let actual = db + .get(k.as_bytes()) + .unwrap_or_else(|err| panic!("final get failed for key {k:?}: {err}")) + .unwrap_or_else(|| panic!("final verification missing key {k:?}")); + assert_eq!(v.as_bytes(), actual.as_slice(), "final verification mismatch for key {k:?}"); + } + } +} diff --git a/tests/queue.rs b/tests/queue.rs new file mode 100644 index 0000000..947250a --- /dev/null +++ b/tests/queue.rs @@ -0,0 +1,366 @@ +mod common; + +use std::sync::{ + Arc, + atomic::{AtomicBool, AtomicUsize, Ordering}, +}; + +use candystore::{CandyStore, Config, Error}; +use tempfile::tempdir; + +#[test] +fn test_queue_fifo() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.push_to_queue_tail(&b"my_queue"[..], &b"item1"[..])?; + db.push_to_queue_tail(&b"my_queue"[..], &b"item2"[..])?; + db.push_to_queue_tail(&b"my_queue"[..], &b"item3"[..])?; + + assert_eq!(db.queue_len(&b"my_queue"[..])?, 3); + assert_eq!( + db.pop_queue_head(&b"my_queue"[..])?, + Some(b"item1".to_vec()) + ); + assert_eq!( + db.pop_queue_head(&b"my_queue"[..])?, + Some(b"item2".to_vec()) + ); + assert_eq!( + db.pop_queue_head(&b"my_queue"[..])?, + Some(b"item3".to_vec()) + ); + assert_eq!(db.pop_queue_head(&b"my_queue"[..])?, None); + 
assert_eq!(db.queue_len(&b"my_queue"[..])?, 0); + + Ok(()) +} + +#[test] +fn test_queue_lifo() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.push_to_queue_tail(&b"stack"[..], &b"item1"[..])?; + db.push_to_queue_tail(&b"stack"[..], &b"item2"[..])?; + db.push_to_queue_tail(&b"stack"[..], &b"item3"[..])?; + + assert_eq!(db.pop_queue_tail(&b"stack"[..])?, Some(b"item3".to_vec())); + assert_eq!(db.pop_queue_tail(&b"stack"[..])?, Some(b"item2".to_vec())); + assert_eq!(db.pop_queue_tail(&b"stack"[..])?, Some(b"item1".to_vec())); + assert_eq!(db.pop_queue_tail(&b"stack"[..])?, None); + + Ok(()) +} + +#[test] +fn test_queue_deque() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.push_to_queue_head(&b"deque"[..], &b"1"[..])?; + db.push_to_queue_head(&b"deque"[..], &b"2"[..])?; + db.push_to_queue_tail(&b"deque"[..], &b"3"[..])?; + db.push_to_queue_tail(&b"deque"[..], &b"4"[..])?; + + assert_eq!(db.queue_len(&b"deque"[..])?, 4); + assert_eq!(db.peek_queue_head(&b"deque"[..])?, Some(b"2".to_vec())); + assert_eq!(db.peek_queue_tail(&b"deque"[..])?, Some(b"4".to_vec())); + assert_eq!(db.pop_queue_head(&b"deque"[..])?, Some(b"2".to_vec())); + assert_eq!(db.pop_queue_tail(&b"deque"[..])?, Some(b"4".to_vec())); + assert_eq!(db.pop_queue_head(&b"deque"[..])?, Some(b"1".to_vec())); + assert_eq!(db.pop_queue_tail(&b"deque"[..])?, Some(b"3".to_vec())); + assert_eq!(db.pop_queue_head(&b"deque"[..])?, None); + + Ok(()) +} + +#[test] +fn test_queue_with_idx_methods() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let first = db.push_to_queue_tail(&b"idxq"[..], &b"a"[..])?; + let second = db.push_to_queue_tail(&b"idxq"[..], &b"b"[..])?; + let third = db.push_to_queue_head(&b"idxq"[..], &b"z"[..])?; + + assert!(third < first && first < second); + assert_eq!( + 
db.peek_queue_head_with_idx(&b"idxq"[..])?, + Some((third, b"z".to_vec())) + ); + assert_eq!( + db.peek_queue_tail_with_idx(&b"idxq"[..])?, + Some((second, b"b".to_vec())) + ); + assert_eq!( + db.pop_queue_head_with_idx(&b"idxq"[..])?, + Some((third, b"z".to_vec())) + ); + assert_eq!( + db.pop_queue_tail_with_idx(&b"idxq"[..])?, + Some((second, b"b".to_vec())) + ); + assert_eq!( + db.pop_queue_head_with_idx(&b"idxq"[..])?, + Some((first, b"a".to_vec())) + ); + assert_eq!(db.pop_queue_tail_with_idx(&b"idxq"[..])?, None); + + Ok(()) +} + +#[test] +fn test_queue_empty_push_head_has_stable_value_semantics() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + assert!(db.queue_range(&b"head_first"[..])?.is_empty()); + + let idx = db.push_to_queue_head(&b"head_first"[..], &b"x"[..])?; + assert_eq!( + db.peek_queue_head_with_idx(&b"head_first"[..])?, + Some((idx, b"x".to_vec())) + ); + assert_eq!( + db.peek_queue_tail_with_idx(&b"head_first"[..])?, + Some((idx, b"x".to_vec())) + ); + assert_eq!(db.queue_len(&b"head_first"[..])?, 1); + + Ok(()) +} + +#[test] +fn test_queue_peek_skips_holes_like_legacy_candystore() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let first = db.push_to_queue_tail(&b"peek_holes_head"[..], &b"v1"[..])?; + let second = db.push_to_queue_tail(&b"peek_holes_head"[..], &b"v2"[..])?; + let third = db.push_to_queue_tail(&b"peek_holes_head"[..], &b"v3"[..])?; + + assert_eq!( + db.remove_from_queue(&b"peek_holes_head"[..], second)?, + Some(b"v2".to_vec()) + ); + assert_eq!( + db.remove_from_queue(&b"peek_holes_head"[..], first)?, + Some(b"v1".to_vec()) + ); + assert_eq!( + db.peek_queue_head_with_idx(&b"peek_holes_head"[..])?, + Some((third, b"v3".to_vec())) + ); + assert_eq!(db.queue_range(&b"peek_holes_head"[..])?, second..third + 1); + + let first = db.push_to_queue_tail(&b"peek_holes_tail"[..], &b"v1"[..])?; + 
let second = db.push_to_queue_tail(&b"peek_holes_tail"[..], &b"v2"[..])?; + let third = db.push_to_queue_tail(&b"peek_holes_tail"[..], &b"v3"[..])?; + + assert_eq!( + db.remove_from_queue(&b"peek_holes_tail"[..], second)?, + Some(b"v2".to_vec()) + ); + assert_eq!( + db.remove_from_queue(&b"peek_holes_tail"[..], third)?, + Some(b"v3".to_vec()) + ); + assert_eq!( + db.peek_queue_tail_with_idx(&b"peek_holes_tail"[..])?, + Some((first, b"v1".to_vec())) + ); + assert_eq!(db.queue_range(&b"peek_holes_tail"[..])?, first..third); + + Ok(()) +} + +#[test] +fn test_extend_queue_returns_inserted_range() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let first = db.extend_queue(&b"bulk"[..], [&b"v1"[..], &b"v2"[..], &b"v3"[..]])?; + assert_eq!(first.len(), 3); + assert_eq!(db.queue_range(&b"bulk"[..])?, first.clone()); + + let second = db.extend_queue(&b"bulk"[..], [&b"v4"[..], &b"v5"[..]])?; + assert_eq!(second.start, first.end); + assert_eq!(second.len(), 2); + assert_eq!(db.queue_range(&b"bulk"[..])?, first.start..second.end); + + let items: Vec<_> = db.iter_queue(&b"bulk"[..]).collect::>()?; + assert_eq!( + items, + vec![ + (first.start, b"v1".to_vec()), + (first.start + 1, b"v2".to_vec()), + (first.start + 2, b"v3".to_vec()), + (second.start, b"v4".to_vec()), + (second.start + 1, b"v5".to_vec()), + ] + ); + + Ok(()) +} + +#[test] +fn test_queue_persistence() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + { + let db = CandyStore::open(dir.path(), Config::default())?; + db.push_to_queue_tail(&b"q1"[..], &b"val1"[..])?; + db.push_to_queue_tail(&b"q1"[..], &b"val2"[..])?; + } + + { + let db = CandyStore::open(dir.path(), Config::default())?; + assert_eq!(db.queue_len(&b"q1"[..])?, 2); + assert_eq!(db.pop_queue_head(&b"q1"[..])?, Some(b"val1".to_vec())); + } + + { + let db = CandyStore::open(dir.path(), Config::default())?; + assert_eq!(db.queue_len(&b"q1"[..])?, 1); + 
assert_eq!(db.pop_queue_head(&b"q1"[..])?, Some(b"val2".to_vec())); + assert_eq!(db.pop_queue_head(&b"q1"[..])?, None); + } + + Ok(()) +} + +#[test] +fn test_queue_reverse_iteration_skips_holes() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.push_to_queue_tail(&b"q_rev_iter"[..], &b"v1"[..])?; + db.push_to_queue_tail(&b"q_rev_iter"[..], &b"v2"[..])?; + db.push_to_queue_tail(&b"q_rev_iter"[..], &b"v3"[..])?; + db.push_to_queue_tail(&b"q_rev_iter"[..], &b"v4"[..])?; + + assert_eq!(db.pop_queue_head(&b"q_rev_iter"[..])?, Some(b"v1".to_vec())); + assert_eq!(db.pop_queue_head(&b"q_rev_iter"[..])?, Some(b"v2".to_vec())); + + let rev_items: Vec<_> = db + .iter_queue(&b"q_rev_iter"[..]) + .rev() + .map(|res| res.unwrap().1) + .collect(); + assert_eq!(rev_items, vec![b"v4".to_vec(), b"v3".to_vec()]); + + let fwd_items: Vec<_> = db + .iter_queue(&b"q_rev_iter"[..]) + .map(|res| res.unwrap().1) + .collect(); + assert_eq!(fwd_items, vec![b"v3".to_vec(), b"v4".to_vec()]); + + Ok(()) +} + +#[test] +fn test_multiple_queues() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + db.push_to_queue_tail(&b"q1"[..], &b"v1"[..])?; + db.push_to_queue_tail(&b"q2"[..], &b"v2"[..])?; + + assert_eq!(db.pop_queue_head(&b"q1"[..])?, Some(b"v1".to_vec())); + assert_eq!(db.pop_queue_head(&b"q2"[..])?, Some(b"v2".to_vec())); + + Ok(()) +} + +#[test] +fn test_queue_remove_hole_is_skipped() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + let idx1 = db.push_to_queue_tail(&b"holey"[..], &b"v1"[..])?; + let idx2 = db.push_to_queue_tail(&b"holey"[..], &b"v2"[..])?; + let idx3 = db.push_to_queue_tail(&b"holey"[..], &b"v3"[..])?; + + assert!(idx1 < idx2 && idx2 < idx3); + assert_eq!( + db.remove_from_queue(&b"holey"[..], idx2)?, + Some(b"v2".to_vec()) + ); + + let items: Vec<_> = 
db.iter_queue(&b"holey"[..]).collect::>()?; + assert_eq!(items.len(), 2); + assert_eq!(items[0].1, b"v1".to_vec()); + assert_eq!(items[1].1, b"v3".to_vec()); + + Ok(()) +} + +#[test] +fn test_queue_concurrency() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = Arc::new(CandyStore::open(dir.path(), common::small_file_config())?); + let queue = b"concurrent_queue"; + + let producers = 4; + let items_per_producer = 1000; + let consumers = 4; + let finished = Arc::new(AtomicBool::new(false)); + let consumed = Arc::new(AtomicUsize::new(0)); + + let mut consumer_handles = Vec::new(); + for _ in 0..consumers { + let db = db.clone(); + let finished = finished.clone(); + let consumed = consumed.clone(); + consumer_handles.push(std::thread::spawn(move || { + loop { + match db.pop_queue_head(&queue[..]).unwrap() { + Some(_) => { + consumed.fetch_add(1, Ordering::Relaxed); + } + None => { + if finished.load(Ordering::Relaxed) { + match db.pop_queue_head(&queue[..]).unwrap() { + Some(_) => { + consumed.fetch_add(1, Ordering::Relaxed); + } + None => break, + } + } else { + std::thread::yield_now(); + } + } + } + } + })); + } + + let mut producer_handles = Vec::new(); + for producer in 0..producers { + let db = db.clone(); + producer_handles.push(std::thread::spawn(move || { + for item in 0..items_per_producer { + let value = format!("p{producer}-{item}"); + db.push_to_queue_tail(&queue[..], value.as_bytes()).unwrap(); + } + })); + } + + for handle in producer_handles { + handle.join().unwrap(); + } + finished.store(true, Ordering::Relaxed); + + for handle in consumer_handles { + handle.join().unwrap(); + } + + assert_eq!( + consumed.load(Ordering::Relaxed), + producers * items_per_producer + ); + assert_eq!(db.queue_len(&queue[..])?, 0); + + Ok(()) +} diff --git a/tests/recovery.rs b/tests/recovery.rs new file mode 100644 index 0000000..e907232 --- /dev/null +++ b/tests/recovery.rs @@ -0,0 +1,1712 @@ +mod common; + +use std::collections::HashSet; +use std::fs; 
+use std::io::{Read, Seek, SeekFrom, Write}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use candystore::{CandyStore, CandyTypedDeque, CandyTypedList, CandyTypedStore, Config, Error}; +use tempfile::tempdir; + +use crate::common::checkpoint_slot_checksum; + +const CHECKPOINT_SLOT_0_OFFSET: u64 = 128; +const CHECKPOINT_SLOT_STRIDE: u64 = 32; +const CHECKPOINT_SLOT_CHECKSUM_OFFSET: u64 = 24; + +fn patterned_bytes_with_seed(len: usize, seed: usize) -> Vec { + (0..len) + .map(|idx| (((idx * 31) + (seed * 17)) % 251) as u8) + .collect() +} + +fn rewrite_first_data_entry_header( + dir: &std::path::Path, + rewrite: impl FnOnce(u32) -> u32, +) -> Result<(), Error> { + let mut file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(dir.join("data_0000")) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(4096)).map_err(Error::IOError)?; + let mut entry_header = [0u8; 8]; + file.read_exact(&mut entry_header).map_err(Error::IOError)?; + + let header = u32::from_le_bytes(entry_header[0..4].try_into().unwrap()); + let klen = u16::from_le_bytes(entry_header[4..6].try_into().unwrap()) as usize; + let vlen = u16::from_le_bytes(entry_header[6..8].try_into().unwrap()) as usize; + let entry_len = 4 + 4 + klen + vlen + 2; + + file.seek(SeekFrom::Start(4096)).map_err(Error::IOError)?; + let mut entry = vec![0u8; entry_len]; + file.read_exact(&mut entry).map_err(Error::IOError)?; + entry[0..4].copy_from_slice(&rewrite(header).to_le_bytes()); + + let checksum = crc16_ibm3740_fast::hash(&entry[..entry_len - 2]) as u16; + entry[entry_len - 2..entry_len].copy_from_slice(&checksum.to_le_bytes()); + + file.seek(SeekFrom::Start(4096)).map_err(Error::IOError)?; + file.write_all(&entry).map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +fn rewrite_data_file_ordinal( + dir: &std::path::Path, + file_idx: u16, + ordinal: u64, +) -> Result<(), Error> { + let mut file = std::fs::OpenOptions::new() + .read(true) + .write(true) 
+ .open(dir.join(format!("data_{file_idx:04}"))) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(16)).map_err(Error::IOError)?; + file.write_all(&ordinal.to_le_bytes()) + .map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +fn read_index_version(dir: &std::path::Path) -> Result { + let mut file = std::fs::OpenOptions::new() + .read(true) + .open(dir.join("index")) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(8)).map_err(Error::IOError)?; + let mut buf = [0u8; 4]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + Ok(u32::from_le_bytes(buf)) +} + +fn rewrite_index_version(dir: &std::path::Path, version: u32) -> Result<(), Error> { + let mut file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(dir.join("index")) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(8)).map_err(Error::IOError)?; + file.write_all(&version.to_le_bytes()) + .map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +fn active_file_ordinal(dir: &std::path::Path) -> Result { + let mut max_ordinal: Option = None; + + for entry in std::fs::read_dir(dir).map_err(Error::IOError)? 
{ + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + let Some(name) = path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + if !name.starts_with("data_") { + continue; + } + + let mut file = std::fs::OpenOptions::new() + .read(true) + .open(&path) + .map_err(Error::IOError)?; + file.seek(SeekFrom::Start(16)).map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + let ordinal = u64::from_le_bytes(buf); + max_ordinal = Some(max_ordinal.map_or(ordinal, |current| current.max(ordinal))); + } + + max_ordinal.ok_or_else(|| { + Error::IOError(std::io::Error::new( + std::io::ErrorKind::NotFound, + "no data files found", + )) + }) +} + +fn data_files_by_ordinal(dir: &std::path::Path) -> Result, Error> { + let mut files = Vec::new(); + + for entry in std::fs::read_dir(dir).map_err(Error::IOError)? { + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + let Some(name) = path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + if !name.starts_with("data_") { + continue; + } + + let mut file = std::fs::OpenOptions::new() + .read(true) + .open(&path) + .map_err(Error::IOError)?; + file.seek(SeekFrom::Start(16)).map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + let ordinal = u64::from_le_bytes(buf); + let used_bytes = common::logical_data_len(&path); + files.push((ordinal, used_bytes)); + } + + files.sort_by_key(|(ordinal, _)| *ordinal); + Ok(files) +} + +fn data_file_records_by_ordinal( + dir: &std::path::Path, +) -> Result, Error> { + let mut files = Vec::new(); + + for entry in std::fs::read_dir(dir).map_err(Error::IOError)? 
{ + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + let Some(name) = path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + let Some(file_idx) = name + .strip_prefix("data_") + .and_then(|suffix| suffix.parse::().ok()) + else { + continue; + }; + + let mut file = std::fs::OpenOptions::new() + .read(true) + .open(&path) + .map_err(Error::IOError)?; + file.seek(SeekFrom::Start(16)).map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + let ordinal = u64::from_le_bytes(buf); + let used_bytes = common::logical_data_len(&path); + files.push((file_idx, ordinal, used_bytes, path)); + } + + files.sort_by_key(|(_, ordinal, _, _)| *ordinal); + Ok(files) +} + +fn write_commit_cursor(dir: &std::path::Path, offset: u64) -> Result<(), Error> { + let ordinal = active_file_ordinal(dir)?; + write_commit_cursor_for_ordinal(dir, ordinal, offset) +} + +fn write_commit_cursor_for_ordinal( + dir: &std::path::Path, + ordinal: u64, + offset: u64, +) -> Result<(), Error> { + let mut file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(dir.join("index")) + .map_err(Error::IOError)?; + + let generation = next_checkpoint_generation(&mut file)?; + let checksum = checkpoint_slot_checksum(generation, ordinal, offset); + let slot_offset = 128 + (generation as u64 % 2) * 32; + + file.seek(SeekFrom::Start(slot_offset)) + .map_err(Error::IOError)?; + file.write_all(&generation.to_le_bytes()) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(slot_offset + 8)) + .map_err(Error::IOError)?; + file.write_all(&ordinal.to_le_bytes()) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(slot_offset + 16)) + .map_err(Error::IOError)?; + file.write_all(&offset.to_le_bytes()) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start(slot_offset + 24)) + .map_err(Error::IOError)?; + file.write_all(&checksum.to_le_bytes()) + .map_err(Error::IOError)?; + 
file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +fn next_checkpoint_generation(file: &mut std::fs::File) -> Result { + use std::io::Read; + + let mut max_generation = 0u64; + for slot_offset in [128u64, 160u64] { + file.seek(SeekFrom::Start(slot_offset)) + .map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + max_generation = max_generation.max(u64::from_le_bytes(buf)); + } + + Ok(max_generation + 1) +} + +fn corrupt_latest_checkpoint_slot_checksum(dir: &std::path::Path) -> Result<(), Error> { + let mut file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .open(dir.join("index")) + .map_err(Error::IOError)?; + + let mut latest_generation = 0u64; + let mut latest_slot_offset = CHECKPOINT_SLOT_0_OFFSET; + for slot_offset in [ + CHECKPOINT_SLOT_0_OFFSET, + CHECKPOINT_SLOT_0_OFFSET + CHECKPOINT_SLOT_STRIDE, + ] { + file.seek(SeekFrom::Start(slot_offset)) + .map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + let generation = u64::from_le_bytes(buf); + if generation >= latest_generation { + latest_generation = generation; + latest_slot_offset = slot_offset; + } + } + + file.seek(SeekFrom::Start( + latest_slot_offset + CHECKPOINT_SLOT_CHECKSUM_OFFSET, + )) + .map_err(Error::IOError)?; + file.write_all(&0u32.to_le_bytes()) + .map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +fn active_data_file_path(dir: &std::path::Path) -> Result { + let active_ordinal = active_file_ordinal(dir)?; + + for entry in std::fs::read_dir(dir).map_err(Error::IOError)? 
{ + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + let Some(name) = path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + if !name.starts_with("data_") { + continue; + } + + let mut file = std::fs::OpenOptions::new() + .read(true) + .open(&path) + .map_err(Error::IOError)?; + file.seek(SeekFrom::Start(16)).map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + if u64::from_le_bytes(buf) == active_ordinal { + return Ok(path); + } + } + + Err(Error::IOError(std::io::Error::new( + std::io::ErrorKind::NotFound, + "active data file not found", + ))) +} + +fn append_aligned_tail_garbage(dir: &std::path::Path, len: usize) -> Result<(), Error> { + debug_assert_eq!(len % 16, 0); + + let path = active_data_file_path(dir)?; + let mut file = std::fs::OpenOptions::new() + .append(true) + .open(path) + .map_err(Error::IOError)?; + file.write_all(&vec![0xA5; len]).map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +fn assert_rebuild_stats_non_zero(db: &CandyStore) { + let stats = db.stats(); + assert!( + stats.num_rebuilt_entries > 0 || stats.checkpoint_generation > 0, + "expected either replayed entries or a checkpoint that already covered the data" + ); + assert!( + stats.num_rebuild_purged_bytes > 0, + "expected rebuild to trim a dirty file tail" + ); +} + +fn wait_for_background_checkpoint(db: &CandyStore, previous_generation: u64) { + let started_at = Instant::now(); + loop { + let stats = db.stats(); + if stats.checkpoint_generation > previous_generation { + return; + } + assert!( + started_at.elapsed() < Duration::from_secs(3), + "background checkpoint did not complete in time: prev_gen={previous_generation}, current_gen={}, uncheckpointed_bytes={}", + stats.checkpoint_generation, + stats.uncheckpointed_bytes, + ); + std::thread::sleep(Duration::from_millis(10)); + } +} + +fn wait_for_checkpoint_generation_advance(db: &CandyStore, 
previous_generation: u64) { + let started_at = Instant::now(); + loop { + let stats = db.stats(); + if stats.checkpoint_generation > previous_generation { + return; + } + assert!( + started_at.elapsed() < Duration::from_secs(3), + "background checkpoint generation did not advance in time: prev_gen={previous_generation}, current_gen={}", + stats.checkpoint_generation, + ); + std::thread::sleep(Duration::from_millis(10)); + } +} + +#[test] +fn test_recovery_after_dirty_shutdown() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + { + let db = CandyStore::open(dir.path(), Config::default())?; + db.set("key1", "val1")?; + db.set("key2", "val2")?; + db.set("key3", "val3")?; + db.set("key2", "val2_updated")?; + db.remove("key3")?; + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), Config::default())?; + assert_eq!(db.get("key1")?, Some(b"val1".to_vec())); + assert_eq!(db.get("key2")?, Some(b"val2_updated".to_vec())); + assert!(db.get("key3")?.is_none()); + } + + Ok(()) +} + +#[test] +fn test_recovery_uses_persisted_hash_key_on_reopen() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let original_config = Config { + hash_key: (1, 2), + ..Config::default() + }; + let different_config = Config { + hash_key: (3, 4), + ..original_config + }; + + { + let db = CandyStore::open(dir.path(), original_config)?; + db.set("key1", "val1")?; + db.set("key2", "val2")?; + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), different_config)?; + assert_eq!(db.get("key1")?, Some(b"val1".to_vec())); + assert_eq!(db.get("key2")?, Some(b"val2".to_vec())); + db.set("key3", "val3")?; + } + + { + let db = CandyStore::open(dir.path(), original_config)?; + assert_eq!(db.get("key1")?, Some(b"val1".to_vec())); + assert_eq!(db.get("key2")?, Some(b"val2".to_vec())); + assert_eq!(db.get("key3")?, Some(b"val3".to_vec())); + } + + Ok(()) +} + +#[test] +fn test_recovery_with_many_keys_and_splits() -> Result<(), Error> { + let dir = 
tempdir().unwrap(); + let config = common::small_file_config(); + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..500 { + db.set(format!("k{i:04}"), format!("v{i:04}"))?; + } + for i in (0..500).step_by(3) { + db.set(format!("k{i:04}"), format!("updated_{i}"))?; + } + for i in (0..500).step_by(7) { + db.remove(format!("k{i:04}"))?; + } + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), config)?; + + for i in 0..500 { + let key = format!("k{i:04}"); + if i % 7 == 0 { + assert!(db.get(&key)?.is_none(), "key {key} should be removed"); + } else if i % 3 == 0 { + assert_eq!( + db.get(&key)?, + Some(format!("updated_{i}").into_bytes()), + "key {key} should be updated" + ); + } else { + assert_eq!( + db.get(&key)?, + Some(format!("v{i:04}").into_bytes()), + "key {key} should have original value" + ); + } + } + } + + Ok(()) +} + +#[test] +fn test_rebuild_if_dirty_recovers_large_dataset_across_multiple_data_files() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 64 * 1024 * 1024, + compaction_throughput_bytes_per_sec: 1024, + ..Config::default() + }; + const TARGET_NUM_DATA_FILES: u64 = 5; + const VALUE_SIZE: usize = 60 * 1024; + const NUM_REMOVALS: usize = 128; + + let total_keys; + let removed_keys; + + { + let db = CandyStore::open(dir.path(), config)?; + let mut next_idx = 0usize; + while db.stats().num_data_files < TARGET_NUM_DATA_FILES { + let key = format!("large-rebuild-{next_idx:06}"); + let value = patterned_bytes_with_seed(VALUE_SIZE, next_idx); + db.set(&key, &value)?; + next_idx += 1; + } + + total_keys = next_idx; + removed_keys = ((total_keys - NUM_REMOVALS)..total_keys).collect::>(); + for idx in &removed_keys { + let key = format!("large-rebuild-{idx:06}"); + assert!( + db.remove(&key)?.is_some(), + "expected {key} to exist before removal" + ); + } + + let stats = db.stats(); + assert!( + stats.num_data_files >= TARGET_NUM_DATA_FILES, + "expected at least 
{TARGET_NUM_DATA_FILES} data files, got {}", + stats.num_data_files + ); + + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), config)?; + assert!( + db.stats().num_data_files >= TARGET_NUM_DATA_FILES, + "rebuild should preserve the multi-file dataset" + ); + + for idx in 0..total_keys { + let key = format!("large-rebuild-{idx:06}"); + if removed_keys.contains(&idx) { + assert!( + db.get(&key)?.is_none(), + "removed key {key} reappeared after rebuild" + ); + } else { + let expected = patterned_bytes_with_seed(VALUE_SIZE, idx); + assert_eq!( + db.get(&key)?, + Some(expected), + "key {key} did not survive large rebuild correctly" + ); + } + } + } + + Ok(()) +} + +#[test] +fn test_rebuild_if_dirty_recovers_with_corrupted_rows_checksum() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key1", "val1")?; + db.set("key2", "val2")?; + db.set("key2", "val2_updated")?; + db.remove("key1")?; + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), config)?; + assert!(db.get("key1")?.is_none()); + assert_eq!(db.get("key2")?, Some(b"val2_updated".to_vec())); + } + + { + let db = CandyStore::open(dir.path(), config)?; + assert!(db.get("key1")?.is_none()); + assert_eq!(db.get("key2")?, Some(b"val2_updated".to_vec())); + } + + Ok(()) +} + +#[test] +fn test_rebuild_if_dirty_rejects_unknown_data_entry_type() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key1", "val1")?; + db._abort_for_testing(); + } + + rewrite_first_data_entry_header(dir.path(), |header| (header & !(0b11 << 30)) | (0b10 << 30))?; + + match CandyStore::open(dir.path(), config) { + Err(Error::IOError(io_err)) if io_err.kind() == std::io::ErrorKind::InvalidData => Ok(()), + Err(err) => panic!("expected invalid-data error for unknown entry type, got {err}"), + 
Ok(_) => panic!("expected open to fail for unknown entry type"), + } +} + +#[test] +fn test_rebuild_if_dirty_rejects_unknown_data_namespace() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key1", "val1")?; + db._abort_for_testing(); + } + + rewrite_first_data_entry_header(dir.path(), |header| { + let cleared = header & !(0x3f << 24); + cleared | (63 << 24) + })?; + + match CandyStore::open(dir.path(), config) { + Err(Error::IOError(io_err)) if io_err.kind() == std::io::ErrorKind::InvalidData => Ok(()), + Err(err) => panic!("expected invalid-data error for unknown namespace, got {err}"), + Ok(_) => panic!("expected open to fail for unknown namespace"), + } +} + +#[test] +fn test_open_rejects_duplicate_data_file_ordinals() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = common::small_file_config(); + + { + let db = CandyStore::open(dir.path(), config)?; + for idx in 0..64 { + db.set(format!("dup-ordinal-{idx:03}"), vec![b'x'; 512])?; + if db.stats().num_data_files >= 2 { + break; + } + } + assert!( + db.stats().num_data_files >= 2, + "expected multiple data files" + ); + } + + rewrite_data_file_ordinal(dir.path(), 1, 0x00bd_38a0_2a35_1cdf)?; + + match CandyStore::open(dir.path(), config) { + Err(Error::IOError(io_err)) if io_err.kind() == std::io::ErrorKind::InvalidData => Ok(()), + Err(err) => panic!("expected invalid-data error for duplicate ordinal, got {err}"), + Ok(_) => panic!("expected open to fail for duplicate data file ordinals"), + } +} + +#[test] +fn test_rebuild_if_dirty_recovers_lists() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + let list = b"rebuild-list"; + + { + let db = CandyStore::open(dir.path(), config)?; + db.set_in_list(list, b"a", b"1")?; + db.set_in_list(list, b"b", b"2")?; + db.set_in_list(list, b"c", b"3")?; + db.set_in_list(list, b"b", b"2b")?; + 
db.remove_from_list(list, b"a")?; + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.list_len(list)?, 2); + assert_eq!(db.get_from_list(list, b"a")?, None); + assert_eq!(db.get_from_list(list, b"b")?, Some(b"2b".to_vec())); + assert_eq!(db.get_from_list(list, b"c")?, Some(b"3".to_vec())); + + let items: Vec<_> = db.iter_list(list).collect::<Result<_, _>>()?; + assert_eq!( + items, + vec![ + (b"b".to_vec(), b"2b".to_vec()), + (b"c".to_vec(), b"3".to_vec()), + ] + ); + } + + Ok(()) +} + +#[test] +fn test_rebuild_if_dirty_recovers_queues() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + let queue = b"rebuild-queue"; + + let first_idx; + let keep_idx; + let removed_idx; + + { + let db = CandyStore::open(dir.path(), config)?; + first_idx = db.push_to_queue_tail(queue, b"tail-1")?; + keep_idx = db.push_to_queue_tail(queue, b"tail-2")?; + removed_idx = db.push_to_queue_tail(queue, b"tail-3")?; + db.push_to_queue_head(queue, b"head-0")?; + + assert_eq!(db.pop_queue_head(queue)?, Some(b"head-0".to_vec())); + assert_eq!( + db.remove_from_queue(queue, removed_idx)?, + Some(b"tail-3".to_vec()) + ); + db._abort_for_testing(); + } + + { + let db = CandyStore::open(dir.path(), config)?; + // head-0 was popped and tail-3 removed before the abort, so only tail-1 and tail-2 should remain, in that order. + assert_eq!(db.queue_len(queue)?, 2); + assert_eq!(db.peek_queue_head(queue)?, Some(b"tail-1".to_vec())); + assert_eq!(db.peek_queue_tail(queue)?, Some(b"tail-2".to_vec())); + assert_eq!(db.remove_from_queue(queue, removed_idx)?, None); + + let items: Vec<_> = db.iter_queue(queue).collect::<Result<_, _>>()?; + assert_eq!( + items, + vec![ + (first_idx, b"tail-1".to_vec()), + (keep_idx, b"tail-2".to_vec()), + ] + ); + } + + Ok(()) +} + +#[test] +fn test_rebuild_if_dirty_recovers_typed_data() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + let list_key = 7u32; + let queue_key = 9u32; + + { + let store = Arc::new(CandyStore::open(dir.path(), config)?); + let typed_kv = 
CandyTypedStore::<u32, String>::new(Arc::clone(&store)); + let typed_list = CandyTypedList::<u32, u32, String>::new(Arc::clone(&store)); + let typed_queue = CandyTypedDeque::<u32, u32>::new(Arc::clone(&store)); + + typed_kv.set(&1u32, &"one".to_string())?; + typed_kv.set(&1u32, &"uno".to_string())?; + typed_kv.set(&2u32, &"two".to_string())?; + assert_eq!(typed_kv.remove(&2u32)?, Some("two".to_string())); + + typed_list.set(&list_key, &1u32, &"a".to_string())?; + typed_list.set(&list_key, &2u32, &"b".to_string())?; + typed_list.set(&list_key, &3u32, &"c".to_string())?; + assert_eq!(typed_list.remove(&list_key, &2u32)?, Some("b".to_string())); + + typed_queue.push_tail(&queue_key, &10u32)?; + typed_queue.push_tail(&queue_key, &20u32)?; + typed_queue.push_head(&queue_key, &5u32)?; + assert_eq!(typed_queue.pop_tail(&queue_key)?, Some(20u32)); + + drop(typed_queue); + drop(typed_list); + drop(typed_kv); + Arc::into_inner(store).unwrap()._abort_for_testing(); + } + + { + let store = Arc::new(CandyStore::open(dir.path(), config)?); + let typed_kv = CandyTypedStore::<u32, String>::new(Arc::clone(&store)); + let typed_list = CandyTypedList::<u32, u32, String>::new(Arc::clone(&store)); + let typed_queue = CandyTypedDeque::<u32, u32>::new(Arc::clone(&store)); + + // Expect the state as of the abort: kv key 2, list item 2 and queue tail 20 were removed beforehand. + assert_eq!(typed_kv.get(&1u32)?, Some("uno".to_string())); + assert_eq!(typed_kv.get(&2u32)?, None); + + let typed_list_items: Vec<_> = typed_list.iter(&list_key).collect::<Result<_, _>>()?; + assert_eq!( + typed_list_items, + vec![(1u32, "a".to_string()), (3u32, "c".to_string())] + ); + + let typed_queue_items: Vec<_> = typed_queue.iter(&queue_key).collect::<Result<_, _>>()?; + assert_eq!(typed_queue_items.len(), 2); + assert_eq!(typed_queue_items[0].1, 5u32); + assert_eq!(typed_queue_items[1].1, 10u32); + assert_eq!(typed_queue.peek_head(&queue_key)?, Some(5u32)); + assert_eq!(typed_queue.peek_tail(&queue_key)?, Some(10u32)); + assert_eq!( + typed_queue.peek_head_with_idx(&queue_key)?, + Some(typed_queue_items[0]) + ); + assert_eq!( + typed_queue.peek_tail_with_idx(&queue_key)?, + Some(typed_queue_items[1]) + ); + } + + 
Ok(()) +} + +#[test] +fn test_reset_on_invalid_data_clears_corrupt_store() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + reset_on_invalid_data: true, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key", "value")?; + } + + fs::write(dir.path().join("index"), b"bad").map_err(Error::IOError)?; + fs::write(dir.path().join("rows"), b"bad").map_err(Error::IOError)?; + fs::write(dir.path().join("extra.txt"), b"junk").map_err(Error::IOError)?; + fs::create_dir(dir.path().join("extra_dir")).map_err(Error::IOError)?; + fs::write(dir.path().join("extra_dir").join("nested.txt"), b"junk").map_err(Error::IOError)?; + + let db = CandyStore::open(dir.path(), config)?; + assert!(db.get("key")?.is_none()); + assert!(!dir.path().join("extra.txt").exists()); + assert!(!dir.path().join("extra_dir").exists()); + + db.set("fresh", "value")?; + assert_eq!(db.get("fresh")?, Some(b"value".to_vec())); + + Ok(()) +} + +#[test] +fn test_reset_on_invalid_data_clears_recovery_time_corruption() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + reset_on_invalid_data: true, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key", "value")?; + db._abort_for_testing(); + } + + rewrite_first_data_entry_header(dir.path(), |header| (header & !(0b11 << 30)) | (0b10 << 30))?; + fs::write(dir.path().join("extra.txt"), b"junk").map_err(Error::IOError)?; + fs::create_dir(dir.path().join("extra_dir")).map_err(Error::IOError)?; + fs::write(dir.path().join("extra_dir").join("nested.txt"), b"junk").map_err(Error::IOError)?; + + let db = CandyStore::open(dir.path(), config)?; + assert!(db.get("key")?.is_none()); + assert!(!dir.path().join("extra.txt").exists()); + assert!(!dir.path().join("extra_dir").exists()); + + db.set("fresh", "value")?; + assert_eq!(db.get("fresh")?, Some(b"value".to_vec())); + + Ok(()) +} + +#[test] +fn 
test_recover_from_truncated_data_file() -> Result<(), Box> { + let dir = tempfile::tempdir()?; + { + let db = candystore::CandyStore::open(dir.path(), candystore::Config::default())?; + db.set("key1", "value1")?; + db.set("key2", "value2")?; + } + + // Corrupt the data file by truncating the last 5 bytes + let data_file = std::fs::read_dir(dir.path())? + .filter_map(|res| res.ok()) + .find(|entry| entry.file_name().to_string_lossy().starts_with("data_")) + .unwrap(); + let file = std::fs::OpenOptions::new() + .write(true) + .open(data_file.path())?; + let logical_len = common::logical_data_len(&data_file.path()); + file.set_len(4096 + logical_len - 16)?; + + // We expect clear recovery (key2 was truncated, thus doesn't exist, but key1 is readable) + let db = candystore::CandyStore::open(dir.path(), candystore::Config::default())?; + assert_eq!(db.get("key1")?.as_deref(), Some("value1".as_bytes())); + assert_eq!(db.get("key2")?, None); + assert_eq!(db.num_items(), 1); + Ok(()) +} + +#[test] +fn test_rebuild_if_dirty_recovers_from_invalid_commit_offset_without_double_counting() +-> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key1", "value1")?; + db.set("key2", "value2")?; + db.set("key2", "value2_updated")?; + db.set("key3", "value3")?; + } + + write_commit_cursor(dir.path(), 5)?; + + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.get("key1")?, Some(b"value1".to_vec())); + assert_eq!(db.get("key2")?, Some(b"value2_updated".to_vec())); + assert_eq!(db.get("key3")?, Some(b"value3".to_vec())); + assert_eq!(db.num_items(), 3); + Ok(()) +} + +#[test] +fn test_clean_reopen_preserves_overwrite_source_entries() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024 * 4, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("dd", "S2pVu3sy437r2s22")?; + db.set("d", 
"1utyjA7IagDJy7eyp9")?; + db.set("b", "2YEAci7LZeShVxXcS3c2M41XFp9YPJACS5SUw4")?; + db.set("cd", "Sgbm1x3CQmy7HahPWXllPnt68UO9SdaTleWZ9")?; + } + + { + let db = CandyStore::open(dir.path(), config)?; + db.set("ad", "Uj1734MhmFqkdmmFGi03F8N")?; + } + + let db = CandyStore::open(dir.path(), config)?; + assert_eq!( + db.get("cd")?, + Some(b"Sgbm1x3CQmy7HahPWXllPnt68UO9SdaTleWZ9".to_vec()) + ); + assert_eq!( + db.set("cd", "SGVF56VUXC8SpU4ERrUj0Z3Z80oqvXvKR2oOU3ij4yoo0Yuqt")?, + candystore::SetStatus::PrevValue(b"Sgbm1x3CQmy7HahPWXllPnt68UO9SdaTleWZ9".to_vec()) + ); + + Ok(()) +} + +#[test] +fn test_reopen_with_different_max_data_file_size() -> Result<(), Error> { + let dir = tempdir().unwrap(); + + // Phase 1: create with a small max_data_file_size. + let config_small = Config { + max_data_file_size: 1024 * 4, + ..Config::default() + }; + { + let db = CandyStore::open(dir.path(), config_small)?; + for i in 0..20 { + db.set(format!("k{i}"), format!("v{i}"))?; + } + } + + // Phase 2: reopen with a larger max_data_file_size. + // The old data file's physical size won't match the new config, so it + // should be detected as non-preallocated and fall back to sync_all. + let config_large = Config { + max_data_file_size: 1024 * 32, + ..Config::default() + }; + { + let db = CandyStore::open(dir.path(), config_large)?; + for i in 0..20 { + assert_eq!(db.get(format!("k{i}"))?, Some(format!("v{i}").into_bytes())); + } + // Writes under the new config should also work. + db.set("new_key", "new_value")?; + } + + // Phase 3: reopen again with the larger config and verify everything. 
+ let db = CandyStore::open(dir.path(), config_large)?; + for i in 0..20 { + assert_eq!(db.get(format!("k{i}"))?, Some(format!("v{i}").into_bytes())); + } + assert_eq!(db.get("new_key")?, Some(b"new_value".to_vec())); + + Ok(()) +} + +#[test] +fn test_partial_entry_at_tail_of_preallocated_file() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024 * 4, + ..Config::default() + }; + + // Write two valid entries, then close cleanly. + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key1", "value1")?; + db.set("key2", "value2")?; + } + + // Inject a partial entry: write bytes that look like a valid header + // (correct magic for the offset) but with a truncated/invalid checksum. + // This simulates a crash mid-append on a preallocated file, where the + // kernel flushed part of a write but the entry is incomplete. + let data_path = dir.path().join("data_0000"); + let logical_len = common::logical_data_len(&data_path); + { + const ALIGNMENT: u64 = 16; + const MAGIC: u32 = 0x91c8_d7cd; + const MASK: u32 = (1 << 24) - 1; + + let entry_offset = logical_len; + let magic_offset = (((entry_offset / ALIGNMENT) as u32) ^ MAGIC) & MASK; + // EntryType::Insert = 0b00, ns = 0 + let header: u32 = magic_offset; + let klen: u16 = 4; // "abcd" + let vlen: u16 = 8; // "12345678" + + let mut partial = Vec::new(); + partial.extend_from_slice(&header.to_le_bytes()); + partial.extend_from_slice(&klen.to_le_bytes()); + partial.extend_from_slice(&vlen.to_le_bytes()); + // Write the value and key but NOT the checksum — the entry is incomplete. + partial.extend_from_slice(b"12345678"); // value + partial.extend_from_slice(b"abcd"); // key + // No checksum appended, and overwrite with bad trailing bytes. 
+ partial.extend_from_slice(&[0xFF, 0xFF]); + + let mut file = std::fs::OpenOptions::new() + .write(true) + .open(&data_path) + .map_err(Error::IOError)?; + file.seek(SeekFrom::Start(4096 + entry_offset)) + .map_err(Error::IOError)?; + file.write_all(&partial).map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + } + + // Reopen: the partial entry should be ignored by detect_used_bytes. + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.get("key1")?, Some(b"value1".to_vec())); + assert_eq!(db.get("key2")?, Some(b"value2".to_vec())); + assert_eq!(db.get("abcd")?, None); // partial entry must not appear + assert_eq!(db.num_items(), 2); + + // New writes should work (they overwrite the garbage region). + db.set("key3", "value3")?; + drop(db); + + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.get("key3")?, Some(b"value3".to_vec())); + + Ok(()) +} + +#[test] +fn test_incomplete_entry_with_valid_header_and_bad_checksum() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024 * 4, + ..Config::default() + }; + + // Write one valid entry, abort (simulating crash), then inject a + // fully-sized entry whose checksum is deliberately wrong. 
+ { + let db = CandyStore::open(dir.path(), config)?; + db.set("good", "data")?; + db._abort_for_testing(); + } + + let data_path = dir.path().join("data_0000"); + let logical_len = common::logical_data_len(&data_path); + { + const ALIGNMENT: u64 = 16; + const MAGIC: u32 = 0x91c8_d7cd; + const MASK: u32 = (1 << 24) - 1; + + let entry_offset = logical_len; + let magic_offset = (((entry_offset / ALIGNMENT) as u32) ^ MAGIC) & MASK; + let header: u32 = magic_offset; + let key = b"bad"; + let val = b"entry"; + let klen = key.len() as u16; + let vlen = val.len() as u16; + let entry_len = 4 + 4 + klen as usize + vlen as usize + 2; + let aligned_len = entry_len.div_ceil(16) * 16; + + let mut buf = vec![0u8; aligned_len]; + buf[0..4].copy_from_slice(&header.to_le_bytes()); + buf[4..6].copy_from_slice(&klen.to_le_bytes()); + buf[6..8].copy_from_slice(&vlen.to_le_bytes()); + buf[8..8 + val.len()].copy_from_slice(val); + buf[8 + val.len()..8 + val.len() + key.len()].copy_from_slice(key); + // Write a deliberately wrong checksum. + buf[entry_len - 2..entry_len].copy_from_slice(&[0xDE, 0xAD]); + + let mut file = std::fs::OpenOptions::new() + .write(true) + .open(&data_path) + .map_err(Error::IOError)?; + file.seek(SeekFrom::Start(4096 + entry_offset)) + .map_err(Error::IOError)?; + file.write_all(&buf).map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + } + + // Reopen via recovery (dirty shutdown + garbage entry). + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.get("good")?, Some(b"data".to_vec())); + assert_eq!(db.get("bad")?, None); + + // The corrupted entry's bytes should have been purged. 
+ let stats = db.stats(); + assert!( + stats.num_rebuild_purged_bytes > 0, + "expected rebuild to purge the corrupted tail" + ); + + Ok(()) +} + +#[test] +fn test_progressive_rebuild_resumes_from_checkpoint() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + // Phase 1: write data across multiple files, then crash. + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + append_aligned_tail_garbage(dir.path(), 64)?; + + // Phase 2: reopen triggers rebuild. Verify all data survived. + { + let db = CandyStore::open(dir.path(), config)?; + assert_rebuild_stats_non_zero(&db); + for i in 0..100 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after full rebuild" + ); + } + // Clean shutdown after rebuild should preserve all recovered data. + } + + Ok(()) +} + +#[test] +fn test_progressive_rebuild_survives_interrupted_rebuild() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + // Phase 1: write data, crash. + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + // Phase 2: reopen triggers rebuild which completes. Then crash again. + { + let db = CandyStore::open(dir.path(), config)?; + // Write more data on top of the rebuilt index. + for i in 100..150 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + append_aligned_tail_garbage(dir.path(), 64)?; + + // Phase 3: another rebuild should start from the persisted replay cursor + // and recover everything written before the second crash. 
+ { + let db = CandyStore::open(dir.path(), config)?; + assert_rebuild_stats_non_zero(&db); + for i in 0..150 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after second rebuild" + ); + } + } + + Ok(()) +} + +#[test] +fn test_progressive_rebuild_with_trust_strategy_resumes_pending() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + // Phase 1: write data, crash. + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + // Phase 2: reopen and write more, then crash. + { + let db = CandyStore::open(dir.path(), config)?; + for i in 100..200 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + append_aligned_tail_garbage(dir.path(), 64)?; + + // Phase 3: reopen — recovery replays from the commit cursor, so all + // data from phases 1+2 should be accessible. 
+ { + let db = CandyStore::open(dir.path(), config)?; + assert_rebuild_stats_non_zero(&db); + for i in 0..200 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after trust-or-rebuild" + ); + } + } + + Ok(()) +} + +#[test] +fn test_checkpoint_advances_recovery_cursor() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db.checkpoint()?; + db._abort_for_testing(); + } + + let db = CandyStore::open(dir.path(), config)?; + let stats = db.stats(); + assert_eq!(stats.num_rebuilt_entries, 0); + assert_eq!(stats.num_rebuild_purged_bytes, 0); + for i in 0..100 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after checkpointed reopen" + ); + } + + Ok(()) +} + +#[test] +fn test_checkpoint_delta_bytes_advances_recovery_cursor_in_background() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 8 * 1024, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + checkpoint_interval: None, + checkpoint_delta_bytes: Some(512), + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + let initial_generation = db.stats().checkpoint_generation; + for i in 0..24 { + db.set( + format!("delta-bg-key{i:04}"), + format!("delta-bg-val{i:04}-{}", "x".repeat(96)), + )?; + } + wait_for_background_checkpoint(&db, initial_generation); + db._abort_for_testing(); + } + + let db = CandyStore::open(dir.path(), config)?; + let stats = db.stats(); + assert!( + stats.num_rebuilt_entries < 24, + "threshold-triggered background checkpoint should avoid replaying the entire store" + ); + assert_eq!(stats.num_rebuild_purged_bytes, 0); + for i in 0..24 { + assert_eq!( 
+ db.get(format!("delta-bg-key{i:04}"))?, + Some(format!("delta-bg-val{i:04}-{}", "x".repeat(96)).into_bytes()), + "delta-bg-key{i:04} missing after threshold-triggered background checkpoint" + ); + } + + Ok(()) +} + +#[test] +fn test_checkpoint_interval_advances_recovery_cursor_in_background() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 8 * 1024, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + checkpoint_interval: Some(Duration::from_millis(50)), + checkpoint_delta_bytes: None, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + let initial_generation = db.stats().checkpoint_generation; + for i in 0..24 { + db.set( + format!("interval-bg-key{i:04}"), + format!("interval-bg-val{i:04}-{}", "y".repeat(96)), + )?; + } + wait_for_background_checkpoint(&db, initial_generation); + db._abort_for_testing(); + } + + let db = CandyStore::open(dir.path(), config)?; + let stats = db.stats(); + assert_eq!(stats.num_rebuilt_entries, 0); + assert_eq!(stats.num_rebuild_purged_bytes, 0); + for i in 0..24 { + assert_eq!( + db.get(format!("interval-bg-key{i:04}"))?, + Some(format!("interval-bg-val{i:04}-{}", "y".repeat(96)).into_bytes()), + "interval-bg-key{i:04} missing after interval-triggered background checkpoint" + ); + } + + Ok(()) +} + +#[test] +fn test_rotation_advances_recovery_cursor_in_background() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 2048, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + checkpoint_interval: None, + checkpoint_delta_bytes: None, + ..Config::default() + }; + + let total_keys; + { + let db = CandyStore::open(dir.path(), config)?; + let initial_generation = db.stats().checkpoint_generation; + let mut next_idx = 0usize; + while db.stats().num_data_files < 2 { + db.set( + format!("rotate-bg-key{next_idx:04}"), + 
format!("rotate-bg-val{next_idx:04}-{}", "z".repeat(96)), + )?; + next_idx += 1; + } + total_keys = next_idx; + wait_for_checkpoint_generation_advance(&db, initial_generation); + db._abort_for_testing(); + } + + let db = CandyStore::open(dir.path(), config)?; + let stats = db.stats(); + assert!( + stats.num_rebuilt_entries < total_keys as u64, + "rotation-triggered checkpoint should avoid replaying the entire store" + ); + assert_eq!(stats.num_rebuild_purged_bytes, 0); + for i in 0..total_keys { + assert_eq!( + db.get(format!("rotate-bg-key{i:04}"))?, + Some(format!("rotate-bg-val{i:04}-{}", "z".repeat(96)).into_bytes()), + "rotate-bg-key{i:04} missing after rotation-triggered background checkpoint" + ); + } + + Ok(()) +} + +#[test] +fn test_progressive_rebuild_ignores_bogus_checkpoint_offset() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + // Write a bogus commit cursor offset beyond any data — rebuild should + // fall back to replaying from offset 0. 
+ write_commit_cursor(dir.path(), 0xFFFF_FFFF)?; + + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after restart-from-scratch rebuild" + ); + } + + Ok(()) +} + +#[test] +fn test_progressive_rebuild_falls_back_to_older_valid_checkpoint_slot() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 1024, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + write_commit_cursor(dir.path(), 0)?; + write_commit_cursor(dir.path(), 0xFFFF_FFFF)?; + corrupt_latest_checkpoint_slot_checksum(dir.path())?; + + let db = CandyStore::open(dir.path(), config)?; + for i in 0..100 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after fallback to older valid checkpoint slot" + ); + } + + Ok(()) +} + +#[test] +fn test_open_ports_outdated_index_format_when_data_format_is_recognized() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + checkpoint_interval: None, + checkpoint_delta_bytes: None, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }; + + let original_index_version; + { + let db = CandyStore::open(dir.path(), config)?; + db.set("port-key-1", "port-val-1")?; + db.set("port-key-2", "port-val-2")?; + original_index_version = read_index_version(dir.path())?; + } + + rewrite_index_version(dir.path(), original_index_version ^ 1)?; + + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.get("port-key-1")?, Some(b"port-val-1".to_vec())); + assert_eq!(db.get("port-key-2")?, Some(b"port-val-2".to_vec())); + assert_eq!(read_index_version(dir.path())?, original_index_version); + assert!( + 
db.stats().num_rebuilt_entries >= 2, + "expected index recreation to replay the data files" + ); + + Ok(()) +} + +#[test] +fn test_clean_reopen_rebuilds_invalid_active_checkpoint_across_multiple_data_files() +-> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = common::small_file_config(); + + let total_base_keys; + + { + let db = CandyStore::open(dir.path(), config)?; + let mut next_idx = 0usize; + while db.stats().num_data_files < 3 { + let key = format!("multifile-base-{next_idx:04}"); + let value = patterned_bytes_with_seed(512, next_idx); + db.set(&key, &value)?; + next_idx += 1; + } + total_base_keys = next_idx; + + assert!( + db.stats().num_data_files >= 3, + "expected multiple data files before corrupting checkpoint" + ); + assert_eq!(db.num_items(), total_base_keys); + } + + write_commit_cursor(dir.path(), 0xFFFF_FFFF)?; + + let db = CandyStore::open(dir.path(), config)?; + assert!( + db.stats().num_data_files >= 3, + "expected the multi-file layout to survive recovery" + ); + + for idx in 0..total_base_keys { + let key = format!("multifile-base-{idx:04}"); + assert_eq!(db.get(&key)?, Some(patterned_bytes_with_seed(512, idx))); + } + + assert_eq!(db.num_items(), total_base_keys); + + Ok(()) +} + +#[test] +fn test_recovery_replays_later_files_after_checkpointing_an_older_file() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = common::small_file_config(); + + let total_keys; + + { + let db = CandyStore::open(dir.path(), config)?; + let mut next_idx = 0usize; + while db.stats().num_data_files < 3 { + let key = format!("older-cursor-{next_idx:04}"); + let value = patterned_bytes_with_seed(512, next_idx); + db.set(&key, &value)?; + next_idx += 1; + } + total_keys = next_idx; + } + + let files = data_files_by_ordinal(dir.path())?; + assert!( + files.len() >= 3, + "expected multiple data files for replay test" + ); + let (older_ordinal, older_used_bytes) = files[0]; + write_commit_cursor_for_ordinal(dir.path(), older_ordinal, 
older_used_bytes)?; + + let db = CandyStore::open(dir.path(), config)?; + assert!( + db.stats().num_rebuilt_entries > 0, + "expected recovery to replay later files after rewinding commit cursor" + ); + for idx in 0..total_keys { + let key = format!("older-cursor-{idx:04}"); + assert_eq!(db.get(&key)?, Some(patterned_bytes_with_seed(512, idx))); + } + assert_eq!(db.num_items(), total_keys); + + Ok(()) +} + +#[test] +fn test_recovery_replays_later_files_when_checkpoint_file_is_missing() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 16 * 1024, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }; + + let mut expected = Vec::new(); + { + let db = CandyStore::open(dir.path(), config)?; + let mut update_idx = 0usize; + let final_hot = loop { + let value = patterned_bytes_with_seed(6 * 1024, update_idx); + db.set("hot", &value)?; + update_idx += 1; + if db.stats().num_data_files >= 5 { + break value; + } + }; + expected.push(("hot".to_owned(), final_hot)); + + for idx in 0..4usize { + let key = format!("tail-live-{idx:02}"); + let value = patterned_bytes_with_seed(2048, 10_000 + idx); + db.set(&key, &value)?; + expected.push((key, value)); + } + } + + let files = data_file_records_by_ordinal(dir.path())?; + assert!( + files.len() >= 4, + "expected enough rotated files to simulate a missing checkpoint file" + ); + + let (_, missing_ordinal, missing_used_bytes, missing_path) = &files[1]; + write_commit_cursor_for_ordinal(dir.path(), *missing_ordinal, *missing_used_bytes)?; + fs::remove_file(missing_path).map_err(Error::IOError)?; + + let db = CandyStore::open(dir.path(), config)?; + assert!( + db.stats().num_rebuilt_entries > 0, + "expected recovery to replay entries after a missing checkpoint file" + ); + for (key, value) in expected { + assert_eq!( + db.get(&key)?, + Some(value), + "{key} missing after replaying past a missing checkpoint file" + ); + } + + Ok(()) +} 
+ +#[test] +fn test_recovery_ignores_missing_compacted_files_before_checkpoint_cursor() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 16 * 1024, + compaction_min_threshold: u32::MAX, + compaction_throughput_bytes_per_sec: 0, + ..Config::default() + }; + + let mut expected = Vec::new(); + { + let db = CandyStore::open(dir.path(), config)?; + let mut update_idx = 0usize; + let final_hot = loop { + let value = patterned_bytes_with_seed(6 * 1024, 20_000 + update_idx); + db.set("hot", &value)?; + update_idx += 1; + if db.stats().num_data_files >= 5 { + break value; + } + }; + expected.push(("hot".to_owned(), final_hot)); + + for idx in 0..6usize { + let key = format!("post-cursor-live-{idx:02}"); + let value = patterned_bytes_with_seed(1536, 30_000 + idx); + db.set(&key, &value)?; + expected.push((key, value)); + } + } + + let files = data_file_records_by_ordinal(dir.path())?; + assert!( + files.len() >= 5, + "expected enough files to simulate compacted files before the checkpoint cursor" + ); + + let (_, checkpoint_ordinal, checkpoint_used_bytes, _) = &files[2]; + let (_, _, _, missing_path) = &files[0]; + write_commit_cursor_for_ordinal(dir.path(), *checkpoint_ordinal, *checkpoint_used_bytes)?; + fs::remove_file(missing_path).map_err(Error::IOError)?; + + let db = CandyStore::open(dir.path(), config)?; + assert!( + db.stats().num_rebuilt_entries > 0, + "expected recovery to replay entries after skipping compacted files before the cursor" + ); + for (key, value) in expected { + assert_eq!( + db.get(&key)?, + Some(value), + "{key} missing after skipping compacted files before the checkpoint cursor" + ); + } + + Ok(()) +} diff --git a/tests/rotation.rs b/tests/rotation.rs new file mode 100644 index 0000000..6240c7b --- /dev/null +++ b/tests/rotation.rs @@ -0,0 +1,152 @@ +mod common; + +use candystore::{CandyStore, Config, Error, SetStatus}; +use tempfile::tempdir; + +#[test] +fn test_many_inserts_trigger_splits() -> 
Result<(), Error> { + const KEYS: usize = 5000; + + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), Config::default())?; + + for key_idx in 0..KEYS { + let key = format!("split_key_{key_idx:05}"); + let value = format!("split_val_{key_idx:05}"); + assert!(matches!(db.set(&key, &value)?, SetStatus::CreatedNew)); + } + + for key_idx in 0..KEYS { + let key = format!("split_key_{key_idx:05}"); + let value = format!("split_val_{key_idx:05}"); + assert_eq!(db.get(&key)?, Some(value.into())); + } + + Ok(()) +} + +#[test] +fn test_rotation_preserves_reads() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), common::small_file_config())?; + + for key_idx in 0..512 { + let key = format!("rotate_key_{key_idx:04}"); + let value = format!("rotate_val_{key_idx:04}_{}", "x".repeat(64)); + assert!(matches!(db.set(&key, &value)?, SetStatus::CreatedNew)); + } + + for key_idx in 0..512 { + let key = format!("rotate_key_{key_idx:04}"); + assert!(db.get(&key)?.is_some(), "missing key after rotation: {key}"); + } + + Ok(()) +} + +#[test] +fn test_new_data_files_are_preallocated() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = common::small_file_config(); + let db = CandyStore::open(dir.path(), config)?; + + let file_len = std::fs::metadata(dir.path().join("data_0000")) + .map_err(Error::IOError)? + .len(); + assert_eq!(file_len, 4096 + config.max_data_file_size as u64); + + db.set("prealloc", [7u8; 128])?; + drop(db); + + let reopened_len = std::fs::metadata(dir.path().join("data_0000")) + .map_err(Error::IOError)? 
+ .len(); + assert_eq!(reopened_len, 4096 + config.max_data_file_size as u64); + + Ok(()) +} + +#[test] +fn test_splits_and_rotation_with_small_files() -> Result<(), Error> { + const KEYS: usize = 5000; + + let dir = tempdir().unwrap(); + let db = CandyStore::open(dir.path(), common::small_file_config())?; + + for key_idx in 0..KEYS { + let key = format!("split_rotate_key_{key_idx:05}"); + let value = format!("split_rotate_val_{key_idx:05}_{}", "x".repeat(48)); + assert!(matches!(db.set(&key, &value)?, SetStatus::CreatedNew)); + } + + for key_idx in 0..KEYS { + let key = format!("split_rotate_key_{key_idx:05}"); + let value = format!("split_rotate_val_{key_idx:05}_{}", "x".repeat(48)); + assert_eq!(db.get(&key)?, Some(value.into())); + } + + let data_file_count = std::fs::read_dir(dir.path()) + .map_err(Error::IOError)? + .filter_map(|entry| entry.ok()) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with("data_")) + }) + .count(); + assert!( + data_file_count > 1, + "expected rotation to create multiple data files" + ); + + let index_rows_size = std::fs::metadata(dir.path().join("rows")) + .map_err(Error::IOError)? 
+ .len(); + assert!( + index_rows_size > 4 * 4096, + "expected index rows growth after row splitting, got rows size {index_rows_size}" + ); + + Ok(()) +} + +#[test] +fn test_splits_rotation_and_reopen_with_small_files() -> Result<(), Error> { + const KEYS: usize = 5000; + + let dir = tempdir().unwrap(); + + { + let db = CandyStore::open(dir.path(), common::small_file_config())?; + for key_idx in 0..KEYS { + let key = format!("reopen_split_rotate_key_{key_idx:05}"); + let value = format!("reopen_split_rotate_val_{key_idx:05}_{}", "x".repeat(48)); + assert!(matches!(db.set(&key, &value)?, SetStatus::CreatedNew)); + } + } + + let db = CandyStore::open(dir.path(), common::small_file_config())?; + for key_idx in 0..KEYS { + let key = format!("reopen_split_rotate_key_{key_idx:05}"); + let value = format!("reopen_split_rotate_val_{key_idx:05}_{}", "x".repeat(48)); + assert_eq!(db.get(&key)?, Some(value.into())); + } + + let data_file_count = std::fs::read_dir(dir.path()) + .map_err(Error::IOError)? 
+ .filter_map(|entry| entry.ok()) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with("data_")) + }) + .count(); + assert!( + data_file_count > 1, + "expected rotation to persist multiple data files after reopen" + ); + + Ok(()) +} diff --git a/tests/shrink.rs b/tests/shrink.rs new file mode 100644 index 0000000..540ece8 --- /dev/null +++ b/tests/shrink.rs @@ -0,0 +1,44 @@ +use candystore::{CandyStore, Config, Result, SetStatus}; + +const ROW_WIDTH: usize = 16 * 21; + +#[test] +fn test_shrink_to_fit_preserves_remaining_keys() -> Result<()> { + let dir = tempfile::tempdir().unwrap(); + let config = Config { + initial_capacity: 4 * ROW_WIDTH, + ..Config::default() + }; + let store = CandyStore::open(dir.path(), config)?; + + for i in 0..10_000 { + let key = format!("key_{i}"); + assert!(matches!( + store.set(key.as_bytes(), b"value")?, + SetStatus::CreatedNew + )); + } + + let before = store.capacity(); + + for i in 0..9_000 { + let key = format!("key_{i}"); + store.remove(key.as_bytes())?; + } + + let shrunk_rows = store.shrink_to_fit_blocking(0.2)?; + assert!(shrunk_rows > 0); + assert!(store.capacity() <= before); + + for i in 9_000..10_000 { + let key = format!("key_{i}"); + assert_eq!(store.get(key.as_bytes())?, Some(b"value".to_vec())); + } + + for i in 0..9_000 { + let key = format!("key_{i}"); + assert_eq!(store.get(key.as_bytes())?, None); + } + + Ok(()) +} diff --git a/tests/test_atomics.rs b/tests/test_atomics.rs deleted file mode 100644 index 3c96404..0000000 --- a/tests/test_atomics.rs +++ /dev/null @@ -1,40 +0,0 @@ -mod common; - -use candystore::{CandyStore, Config, GetOrCreateStatus, ReplaceStatus, Result, SetStatus}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_atomics() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open(dir, Config::default())?; - - assert!(db.get_or_create("aaa", "1111")?.was_created()); - - assert!(db.replace("aaa", "2222", None)?.was_replaced()); - - 
assert_eq!(db.get("aaa")?, Some("2222".into())); - - assert!(db.get_or_create("aaa", "1111")?.already_exists()); - - assert!(!db.replace("bbb", "3333", None)?.was_replaced()); - - assert!(db.set("bbb", "4444")?.was_created()); - assert_eq!(db.set("bbb", "5555")?, SetStatus::PrevValue("4444".into())); - - assert_eq!( - db.get_or_create("bbb", "6666")?, - GetOrCreateStatus::ExistingValue("5555".into()) - ); - - assert_eq!(db.get_or_create("cccc", "6666")?.value(), b"6666"); - assert_eq!(db.get_or_create("aaa", "6666")?.value(), b"2222"); - - assert_eq!( - db.replace("aaa", "6666", Some("2222"))?, - ReplaceStatus::PrevValue("2222".into()) - ); - - Ok(()) - }) -} diff --git a/tests/test_bigval.rs b/tests/test_bigval.rs deleted file mode 100644 index 850444e..0000000 --- a/tests/test_bigval.rs +++ /dev/null @@ -1,32 +0,0 @@ -mod common; - -use std::sync::Arc; - -use candystore::{CandyStore, CandyTypedStore, Config, Result}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_bigval() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - - assert_eq!(db.set_big(b"mykey", &vec![0x99; 1_000_000])?, false); - assert_eq!(db.get_big(b"yourkey")?, None); - assert_eq!(db.get_big(b"mykey")?, Some(vec![0x99; 1_000_000])); - assert_eq!(db.remove_big(b"mykey")?, true); - assert_eq!(db.get_big(b"mykey")?, None); - assert_eq!(db.set_big(b"mykey", &vec![0x88; 100_000])?, false); - assert_eq!(db.set_big(b"mykey", &vec![0x77; 100_000])?, true); - assert_eq!(db.get_big(b"mykey")?, Some(vec![0x77; 100_000])); - - let typed = CandyTypedStore::>::new(db); - assert_eq!(typed.set_big("hello", &vec![123456789; 100_000])?, false); - assert_eq!(typed.get_big("world")?, None); - assert_eq!(typed.get_big("hello")?, Some(vec![123456789; 100_000])); - assert_eq!(typed.remove_big("hello")?, true); - assert_eq!(typed.remove_big("hello")?, false); - - Ok(()) - }) -} diff --git a/tests/test_flush_agg.rs b/tests/test_flush_agg.rs deleted file 
mode 100644 index a1d4488..0000000 --- a/tests/test_flush_agg.rs +++ /dev/null @@ -1,51 +0,0 @@ -#![cfg(feature = "flush_aggregation")] - -mod common; - -use std::{ - sync::{Arc, Barrier}, - time::{Duration, Instant}, -}; - -use candystore::{CandyStore, Config, Result}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_flush_aggregation() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open( - dir, - Config { - flush_aggregation_delay: Some(Duration::from_millis(1)), - ..Default::default() - }, - )?); - - let num_threads = 10; - let barrier = Arc::new(Barrier::new(num_threads)); - let mut handles = vec![]; - - for i in 0..num_threads { - let db = db.clone(); - let barrier = barrier.clone(); - let h = std::thread::spawn(move || { - barrier.wait(); - let t0 = Instant::now(); - for j in 0..10 { - db.set(&format!("key{i}-{j}"), "val")?; - } - let dur = Instant::now().duration_since(t0); - Result::::Ok(dur) - }); - handles.push(h); - } - - for (i, h) in handles.into_iter().enumerate() { - let dur = h.join().unwrap()?; - println!("{i}: {dur:?}"); - } - - Ok(()) - }) -} diff --git a/tests/test_list_collisions.rs b/tests/test_list_collisions.rs deleted file mode 100644 index cbd39c3..0000000 --- a/tests/test_list_collisions.rs +++ /dev/null @@ -1,76 +0,0 @@ -#![cfg(feature = "whitebox_testing")] - -mod common; - -use candystore::{CandyStore, Config, Result, HASH_BITS_TO_KEEP}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_list_collisions() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open(dir, Config::default())?; - - db.clear()?; - - // force many elements to end up with the same PartedHash - unsafe { HASH_BITS_TO_KEEP = 0xff00_000f_0000_00ff }; - - for i in 0u32..100_000 { - if i % 10_000 == 0 { - println!("push {i}"); - } - db.set_in_list("xxx", &i.to_le_bytes(), &i.to_le_bytes())?; - } - - for i in 0u32..100_000 { - if i % 10_000 == 0 { - println!("pop {i}"); - } - 
assert_eq!(db.pop_list_head("xxx")?.unwrap().1, &i.to_le_bytes()); - } - - assert!(db.pop_list_head("xxx")?.is_none()); - assert!(db.pop_list_tail("xxx")?.is_none()); - assert_eq!(db.iter_list("xxx").count(), 0); - - unsafe { HASH_BITS_TO_KEEP = 0x0000_000f_0000_00ff }; - - for i in 0u32..1000 { - db.set_in_list("xxx", &i.to_le_bytes(), &i.to_le_bytes())?; - } - for i in 400u32..600 { - assert_eq!( - db.remove_from_list("xxx", &i.to_le_bytes())?, - Some(i.to_le_bytes().to_vec()) - ); - } - - for i in 0u32..100 { - assert_eq!( - db.remove_from_list("xxx", &i.to_le_bytes())?, - Some(i.to_le_bytes().to_vec()) - ); - } - - for i in (900u32..1000).rev() { - assert_eq!( - db.remove_from_list("xxx", &i.to_le_bytes())?, - Some(i.to_le_bytes().to_vec()) - ); - } - - let remaining = db - .iter_list("xxx") - .map(|res| u32::from_le_bytes(res.unwrap().1.try_into().unwrap())) - .collect::>(); - - let expectd = (100..400).chain(600..900).collect::>(); - assert_eq!(remaining, expectd); - - db.discard_list("xxx")?; - assert!(db.pop_list_head("xxx")?.is_none()); - - Ok(()) - }) -} diff --git a/tests/test_lists.rs b/tests/test_lists.rs deleted file mode 100644 index 9ad5f4a..0000000 --- a/tests/test_lists.rs +++ /dev/null @@ -1,516 +0,0 @@ -mod common; - -use std::sync::{atomic::AtomicUsize, Arc}; - -use candystore::{ - CandyStore, CandyTypedDeque, CandyTypedList, Config, GetOrCreateStatus, ListCompactionParams, - ReplaceStatus, Result, SetStatus, -}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_lists() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - max_shard_size: 20 * 1024, // use small files to force lots of splits and compactions - min_compaction_threashold: 10 * 1024, - ..Default::default() - }, - )?; - - db.set_in_list("texas", "dallas", "500,000")?; - db.set_in_list("texas", "austin", "300,000")?; - db.set_in_list("texas", "houston", "700,000")?; - db.set_in_list("texas", "dallas", "450,000")?; - - 
assert_eq!(db.get_from_list("texas", "dallas")?, Some("450,000".into())); - assert_eq!(db.get_from_list("texas", "austin")?, Some("300,000".into())); - assert_eq!( - db.get_from_list("texas", "houston")?, - Some("700,000".into()) - ); - - assert_eq!(db.iter_list("texas").count(), 3); - assert_eq!(db.list_len("texas")?, 3); - assert_eq!(db.iter_list("arkansas").count(), 0); - assert_eq!(db.list_len("arkansas")?, 0); - - let items = db - .iter_list("texas") - .map(|res| res.unwrap()) - .collect::>(); - assert_eq!(items[0].0, "dallas".as_bytes()); - assert_eq!(items[2].0, "houston".as_bytes()); - - db.discard_list("texas")?; - assert_eq!(db.get_from_list("texas", "houston")?, None); - assert_eq!(db.get_from_list("texas", "dallas")?, None); - assert_eq!(db.iter_list("texas").count(), 0); - - db.set_in_list("xxx", "k1", "v1")?; - db.set_in_list("xxx", "k2", "v2")?; - db.set_in_list("xxx", "k3", "v3")?; - db.set_in_list("xxx", "k4", "v4")?; - - // remove from the middle - assert_eq!(db.remove_from_list("xxx", "k3")?, Some("v3".into())); - assert_eq!(db.iter_list("xxx").count(), 3); - assert_eq!(db.list_len("xxx")?, 3); - // remove first - assert_eq!(db.remove_from_list("xxx", "k1")?, Some("v1".into())); - assert_eq!(db.iter_list("xxx").count(), 2); - assert_eq!(db.list_len("xxx")?, 2); - // remove last - assert_eq!(db.remove_from_list("xxx", "k4")?, Some("v4".into())); - assert_eq!(db.iter_list("xxx").count(), 1); - assert_eq!(db.list_len("xxx")?, 1); - // remove single - assert_eq!(db.remove_from_list("xxx", "k2")?, Some("v2".into())); - assert_eq!(db.iter_list("xxx").count(), 0); - assert_eq!(db.list_len("xxx")?, 0); - - for i in 0..10_000 { - db.set_in_list("xxx", &format!("my key {i}"), - "very long key aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")?; - assert_eq!(db.list_len("xxx")?, i + 1); - } - - // make sure we survive splits - assert!(db.stats().num_splits > 1); - - for (i, res) in db.iter_list("xxx").enumerate() { - let (k, _) = 
res?; - assert_eq!(k, format!("my key {i}").as_bytes()); - db.remove_from_list("xxx", &k)?; - assert_eq!(db.list_len("xxx")?, 10_000 - i - 1); - } - - assert_eq!(db.iter_list("xxx").count(), 0); - - Ok(()) - }) -} - -#[test] -fn test_typed_lists() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - - let typed = CandyTypedList::::new(db); - typed.set("texas", &108, &2005)?; - typed.set("texas", &555, &2006)?; - typed.set("texas", &827, &2007)?; - typed.set("texas", &123, &2008)?; - typed.set("texas", &555, &2009)?; - - assert_eq!(typed.get("texas", &555)?, Some(2009)); - assert_eq!(typed.get("texas", &66666666)?, None); - - assert!(typed.remove("texas", &827)?.is_some()); - assert!(typed.remove("texas", &827)?.is_none()); - assert!(typed.remove("texas", &66666666)?.is_none()); - - let items = typed - .iter("texas") - .map(|res| res.unwrap().1) - .collect::>(); - assert_eq!(items, vec![2005, 2009, 2008]); - - Ok(()) - }) -} - -#[test] -fn test_lists_multithreading() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - - let removed = Arc::new(AtomicUsize::new(0)); - let created = Arc::new(AtomicUsize::new(0)); - let gotten = Arc::new(AtomicUsize::new(0)); - let replaced = Arc::new(AtomicUsize::new(0)); - - let num_thds = 10; - let num_iters = 1000; - - let mut handles = vec![]; - for thd in 0..num_thds { - let db = db.clone(); - let removed = removed.clone(); - let created = created.clone(); - let replaced = replaced.clone(); - let gotten = gotten.clone(); - let h = std::thread::spawn(move || { - for _ in 0..num_iters { - let idx1: u8 = rand::random(); - if db - .set_in_list("xxx", &format!("key{idx1}"), &format!("val-{thd}"))? 
- .was_created() - { - created.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } else { - replaced.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } - - std::thread::yield_now(); - - let idx2: u8 = rand::random(); - if let Some(v) = db.get_from_list("xxx", &format!("key{idx2}"))? { - assert!(v.starts_with(b"val-")); - gotten.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } - - std::thread::yield_now(); - let idx3: u8 = rand::random(); - if db.remove_from_list("xxx", &format!("key{idx3}"))?.is_some() { - removed.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } - std::thread::yield_now(); - } - Result::<()>::Ok(()) - }); - handles.push(h); - } - - for h in handles { - h.join().unwrap()?; - } - - let reamining = db.iter_list("xxx").count(); - let created = created.load(std::sync::atomic::Ordering::SeqCst); - let replaced = replaced.load(std::sync::atomic::Ordering::SeqCst); - let removed = removed.load(std::sync::atomic::Ordering::SeqCst); - let gotten = gotten.load(std::sync::atomic::Ordering::SeqCst); - - assert_eq!(created - removed, reamining); - assert_eq!(created + replaced, num_iters * num_thds); - - println!("created={created} replaced={replaced} removed={removed} gotten={gotten} reamining={reamining}"); - - Ok(()) - }) -} - -#[test] -fn test_list_atomics() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open(dir, Config::default())?; - - assert_eq!( - db.get_or_create_in_list("xxx", "yyy", "1")?, - GetOrCreateStatus::CreatedNew("1".into()) - ); - - assert_eq!( - db.get_or_create_in_list("xxx", "yyy", "2")?, - GetOrCreateStatus::ExistingValue("1".into()) - ); - - assert_eq!( - db.replace_in_list("xxx", "yyy", "3", None)?, - ReplaceStatus::PrevValue("1".into()) - ); - - assert_eq!( - db.replace_in_list("xxx", "zzz", "3", None)?, - ReplaceStatus::DoesNotExist - ); - - assert_eq!( - db.get_or_create_in_list("xxx", "yyy", "7")?, - GetOrCreateStatus::ExistingValue("3".into()) - ); - - assert_eq!( - db.set_in_list("xxx", "yyy", 
"4")?, - SetStatus::PrevValue("3".into()) - ); - - assert_eq!(db.get_from_list("xxx", "yyy")?, Some("4".into())); - - Ok(()) - }) -} - -#[test] -fn test_typed_queue() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - - let queue = CandyTypedDeque::::new(db); - assert_eq!(queue.pop_head("orders")?, None); - - for i in 10..30 { - queue.push_tail("orders", &i)?; - } - for i in 10..20 { - assert_eq!(queue.pop_head("orders")?, Some(i)); - } - for i in (20..30).rev() { - assert_eq!(queue.pop_tail("orders")?, Some(i)); - } - - assert_eq!(queue.pop_head("orders")?, None); - - queue.push_tail("orders", &100)?; - queue.push_tail("orders", &101)?; - queue.push_tail("orders", &102)?; - queue.push_head("orders", &103)?; - queue.push_head("orders", &104)?; - queue.push_head("orders", &105)?; - - let items = queue - .iter("orders") - .map(|res| res.unwrap().1) - .collect::>(); - - assert_eq!(items, vec![105, 104, 103, 100, 101, 102]); - - let items = queue - .iter_backwards("orders") - .map(|res| res.unwrap().1) - .collect::>(); - - assert_eq!(items, vec![102, 101, 100, 103, 104, 105]); - - Ok(()) - }) -} - -#[test] -fn test_rev_iter() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open(dir, Config::default())?; - - db.set_in_list("mylist", "item1", "xxx")?; - db.set_in_list("mylist", "item2", "xxx")?; - db.set_in_list("mylist", "item3", "xxx")?; - db.set_in_list("mylist", "item4", "xxx")?; - - let items = db - .iter_list("mylist") - .map(|res| res.unwrap().0) - .collect::>(); - - assert_eq!(items, vec![b"item1", b"item2", b"item3", b"item4"]); - - let items = db - .iter_list_backwards("mylist") - .map(|res| res.unwrap().0) - .collect::>(); - - assert_eq!(items, vec![b"item4", b"item3", b"item2", b"item1"]); - - assert_eq!( - db.peek_list_head("mylist")?, - Some(("item1".into(), "xxx".into())) - ); - - assert_eq!( - db.peek_list_tail("mylist")?, - Some(("item4".into(), "xxx".into())) - ); - - Ok(()) - }) -} 
- -#[test] -fn test_promote() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open(dir, Config::default())?; - - let items = || { - db.iter_list("mylist") - .map(|res| res.unwrap().0) - .collect::>() - }; - - db.set_in_list("mylist", "item1", "xxx")?; - db.set_in_list("mylist", "item2", "xxx")?; - db.set_in_list("mylist", "item3", "xxx")?; - db.set_in_list("mylist", "item4", "xxx")?; - - assert_eq!(items(), vec![b"item1", b"item2", b"item3", b"item4"]); - - // no promotion happens - db.set_in_list("mylist", "item2", "yyy")?; - assert_eq!(items(), vec![b"item1", b"item2", b"item3", b"item4"]); - - // promote a middle element - db.set_in_list_promoting("mylist", "item2", "zzz")?; - assert_eq!(items(), vec![b"item1", b"item3", b"item4", b"item2"]); - - // promote head element - db.set_in_list_promoting("mylist", "item1", "zzz")?; - assert_eq!(items(), vec![b"item3", b"item4", b"item2", b"item1"]); - - // promote tail element - db.set_in_list_promoting("mylist", "item1", "zzz")?; - assert_eq!(items(), vec![b"item3", b"item4", b"item2", b"item1"]); - - Ok(()) - }) -} - -#[test] -fn test_typed_promote() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - let typed = CandyTypedList::::new(db); - - let items = || { - typed - .iter("mylist") - .map(|res| res.unwrap().0) - .collect::>() - }; - - typed.set("mylist", &1, "xxx")?; - typed.set("mylist", &2, "xxx")?; - typed.set("mylist", &3, "xxx")?; - typed.set("mylist", &4, "xxx")?; - assert_eq!(items(), &[1, 2, 3, 4]); - - typed.set("mylist", &2, "yyy")?; - assert_eq!(items(), &[1, 2, 3, 4]); - - typed.set_promoting("mylist", &2, "zzz")?; - assert_eq!(items(), &[1, 3, 4, 2]); - - typed.set_promoting("mylist", &1, "zzz")?; - assert_eq!(items(), &[3, 4, 2, 1]); - - typed.set_promoting("mylist", &1, "zzz")?; - assert_eq!(items(), &[3, 4, 2, 1]); - - Ok(()) - }) -} - -#[test] -fn test_list_compaction() -> Result<()> { - run_in_tempdir(|dir| { - let db = 
CandyStore::open(dir, Config::default())?; - - for i in 0u32..1000 { - db.set_in_list("xxx", &i.to_le_bytes(), "yyy")?; - } - assert!(!db.compact_list_if_needed("xxx", ListCompactionParams::default())?); - - for i in 0u32..1000 { - if i % 3 == 1 { - assert!(db.remove_from_list("xxx", &i.to_le_bytes())?.is_some()); - } - } - - let keys1 = db - .iter_list("xxx") - .map(|res| u32::from_le_bytes(res.unwrap().0.try_into().unwrap())) - .collect::>(); - for k in keys1.iter() { - assert!(k % 3 != 1, "{k}"); - } - - assert!(db.compact_list_if_needed("xxx", ListCompactionParams::default())?); - - let keys2 = db - .iter_list("xxx") - .map(|res| u32::from_le_bytes(res.unwrap().0.try_into().unwrap())) - .collect::>(); - - assert_eq!(keys1, keys2); - - Ok(()) - }) -} - -#[test] -fn test_list_retain() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - - { - let mut dropped = 0; - - for i in 0u32..1000 { - db.set_in_list("xxx", &i.to_le_bytes(), "yyy")?; - } - for i in 0u32..1000 { - if i % 7 == 0 { - db.remove_from_list("xxx", &i.to_le_bytes())?; - dropped += 1; - } - } - - db.retain_in_list("xxx", |k, _v| { - let k2 = u32::from_le_bytes(k.try_into().unwrap()); - if k2 % 5 == 0 { - dropped += 1; - Ok(false) - } else { - Ok(true) // keep - } - })?; - - assert_eq!(db.list_len("xxx")?, 1000 - dropped); - - let mut found = vec![]; - for item in db.iter_list("xxx") { - let (k, v) = item?; - let k2 = u32::from_le_bytes(k.try_into().unwrap()); - assert_ne!(k2 % 7, 0); - assert_ne!(k2 % 5, 0); - assert_eq!(v, b"yyy"); - found.push(k2); - } - assert_eq!(found.len(), 1000 - dropped); - - db.retain_in_list("xxx", |_k, _v| Ok(false))?; - - assert_eq!(db.list_len("xxx")?, 0); - } - - { - let typed = CandyTypedList::::new(db); - - let mut dropped = 0; - - for i in 0u32..1000 { - typed.set("xxx", &i, &(i * 2))?; - } - for i in 0u32..1000 { - if i % 7 == 0 { - typed.remove("xxx", &i)?; - dropped += 1 - } - } - - typed.retain("xxx", |k, _v| 
{ - if k % 5 == 0 { - dropped += 1; - Ok(false) - } else { - Ok(true) // keep - } - })?; - - assert_eq!(typed.len("xxx")?, 1000 - dropped); - - let mut found = vec![]; - for item in typed.iter("xxx") { - let (k, v) = item?; - assert_ne!(k % 7, 0); - assert_ne!(k % 5, 0); - assert_eq!(v, k * 2); - found.push(k); - } - assert_eq!(found.len(), 1000 - dropped); - } - - Ok(()) - }) -} diff --git a/tests/test_loading.rs b/tests/test_loading.rs deleted file mode 100644 index 5470601..0000000 --- a/tests/test_loading.rs +++ /dev/null @@ -1,72 +0,0 @@ -mod common; - -use candystore::{CandyStore, Config, Result}; - -use crate::common::{run_in_tempdir, LONG_VAL}; - -#[test] -fn test_loading() -> Result<()> { - run_in_tempdir(|dir| { - let config = Config { - max_shard_size: 20 * 1024, // use small files to force lots of splits and compactions - min_compaction_threashold: 10 * 1024, - ..Default::default() - }; - - { - let db = CandyStore::open(dir, config.clone())?; - - for i in 0..1000 { - db.set(&format!("unique key {i}"), LONG_VAL)?; - } - - assert!(db.stats().num_splits > 1); - assert_eq!(db.iter().count(), 1000); - } - - { - let db = CandyStore::open(dir, config.clone())?; - - assert_eq!(db.iter().count(), 1000); - - for res in db.iter() { - let (key, val) = res?; - assert_eq!(val, LONG_VAL.as_bytes()); - assert!(key.starts_with(b"unique key ")); - } - } - - { - let existing = std::fs::read_dir(dir)? 
- .map(|res| res.unwrap().file_name().to_str().unwrap().to_string()) - .filter(|name| name.starts_with("shard_")) - .collect::>(); - - std::fs::write(format!("{dir}/top_1234-5678"), "xxxx")?; - std::fs::write(format!("{dir}/bottom_1234-5678"), "xxxx")?; - - let (_, span) = existing[0].split_once("_").unwrap(); - let (start, end) = span.split_once("-").unwrap(); - let start = u32::from_str_radix(start, 16).unwrap(); - let end = u32::from_str_radix(end, 16).unwrap(); - let mid = (start + end) / 2; - std::fs::write(format!("{dir}/shard_{start:04x}-{mid:04x}"), "xxxx")?; - std::fs::write(format!("{dir}/shard_{mid:04x}-{end:04x}"), "xxxx")?; - - let db = CandyStore::open(dir, config)?; - - assert!(!std::fs::exists(format!("{dir}/top_1234-5678"))?); - assert!(!std::fs::exists(format!("{dir}/bottom_1234-5678"))?); - assert!(!std::fs::exists(format!( - "{dir}/shard_{start:04x}-{mid:04x}" - ))?); - assert!(!std::fs::exists(format!( - "{dir}/shard_{mid:04x}-{end:04x}" - ))?); - - assert_eq!(db.iter().count(), 1000); - } - - Ok(()) - }) -} diff --git a/tests/test_logic.rs b/tests/test_logic.rs deleted file mode 100644 index e2a7c19..0000000 --- a/tests/test_logic.rs +++ /dev/null @@ -1,149 +0,0 @@ -mod common; - -use std::collections::HashSet; - -use candystore::{CandyStore, Config, Result, MAX_VALUE_SIZE}; - -use crate::common::{run_in_tempdir, LONG_VAL}; - -#[test] -fn test_logic() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - max_shard_size: 20 * 1024, // use small files to force lots of splits and compactions - min_compaction_threashold: 10 * 1024, - ..Default::default() - }, - )?; - - assert!(db.get("my name")?.is_none()); - db.set("my_name", "inigo montoya")?; - db.set("your_name", "dread pirate robert")?; - - assert!(db.contains("my_name")?); - assert!(!db.contains("My NaMe")?); - - assert_eq!(db.get("my_name")?, Some("inigo montoya".into())); - assert_eq!(db.get("your_name")?, Some("dread pirate robert".into())); - 
db.set("your_name", "vizzini")?; - assert_eq!(db.get("your_name")?, Some("vizzini".into())); - assert_eq!(db.remove("my_name")?, Some("inigo montoya".into())); - assert!(db.remove("my_name")?.is_none()); - assert!(db.get("my name")?.is_none()); - - let stats = db.stats(); - assert_eq!(stats.num_entries(), 1); - assert_eq!(stats.num_compactions, 0); - assert_eq!(stats.num_splits, 0); - println!("{stats}"); - - for _ in 0..1000 { - db.set( - "a very long keyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", - LONG_VAL, - )?; - assert!(db - .remove("a very long keyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy")? - .is_some()); - } - - let stats1 = db.stats(); - println!("{stats1}"); - assert_eq!(stats1.num_entries(), 1); - assert!(stats1.num_compactions >= 2); - assert_eq!(stats1.num_splits, 0); - - for i in 0..1000 { - db.set(&format!("unique key {i}"), LONG_VAL)?; - } - - let stats2 = db.stats(); - assert_eq!(stats2.num_entries(), 1001); - assert!(stats2.num_splits > stats1.num_splits); - - assert_eq!(db.get("your_name")?, Some("vizzini".into())); - db.clear()?; - assert_eq!(db.get("your_name")?, None); - - let stats3 = db.stats(); - assert_eq!(stats3.num_entries(), 0); - assert_eq!(stats3.num_compactions, 0); - assert_eq!(stats3.num_splits, 0); - - for i in 0..1000 { - db.set(&format!("unique key {i}"), LONG_VAL)?; - } - - let mut all_keys = HashSet::new(); - - for res in db.iter() { - let (key, val) = res?; - assert_eq!(val, LONG_VAL.as_bytes()); - assert!(key.starts_with(b"unique key ")); - all_keys.insert(key); - } - - assert_eq!(all_keys.len(), 1000); - - all_keys.clear(); - - let cookie = { - let mut iter1 = db.iter(); - for _ in 0..100 { - let res = iter1.next().unwrap(); - let (key, _) = res?; - all_keys.insert(key); - } - iter1.cookie() - }; - - for res in db.iter_from_cookie(cookie) { - let (key, _) = res?; - all_keys.insert(key); - } - - assert_eq!(all_keys.len(), 1000); - - let mut all_keys2 = HashSet::new(); - - for res in db.iter_keys() { - let key = res?; 
- all_keys2.insert(key); - } - - assert_eq!(all_keys, all_keys2); - - Ok(()) - }) -} - -#[test] -fn test_histogram() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - expected_number_of_keys: 100_000, // pre-split - ..Default::default() - }, - )?; - - db.set("k1", "bbb")?; - db.set("k2", &vec![b'b'; 100])?; - db.set("k3", &vec![b'b'; 500])?; - db.set("k4", &vec![b'b'; 5000])?; - db.set("k4", &vec![b'b'; 4500])?; - db.set("k5", &vec![b'b'; 50000])?; - db.set("kkkkkkkkkkkkkkk", &vec![b'b'; MAX_VALUE_SIZE])?; - - let stats = db.stats(); - assert_eq!(stats.entries_under_128, 2); - assert_eq!(stats.entries_under_1k, 1); - assert_eq!(stats.entries_under_8k, 2); - assert_eq!(stats.entries_over_32k, 2); - - Ok(()) - }) -} diff --git a/tests/test_merge.rs b/tests/test_merge.rs deleted file mode 100644 index 0b3ca11..0000000 --- a/tests/test_merge.rs +++ /dev/null @@ -1,87 +0,0 @@ -mod common; - -use candystore::{CandyStore, Config, Result}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_merge() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open(dir, Config::default())?; - - for i in 0u32..100_000 { - db.set(&i.to_le_bytes(), "val")?; - } - assert_eq!(db.stats().num_entries(), 100_000); - assert_eq!(db.stats().num_shards, 4); - - for i in 0u32..100_000 { - if i % 16 != 0 { - db.remove(&i.to_le_bytes())?.unwrap(); - } - } - assert_eq!(db.stats().num_entries(), 6250); - db.merge_small_shards(0.25)?; - assert_eq!(db.stats().num_shards, 1); - - for i in 0u32..100_000 { - db.set(&i.to_le_bytes(), "val")?; - } - assert_eq!(db.stats().num_entries(), 100_000); - assert_eq!(db.stats().num_shards, 4); - for i in 0u32..100_000 { - if i % 4 != 0 { - db.remove(&i.to_le_bytes())?.unwrap(); - } - } - assert_eq!(db.stats().num_entries(), 25_000); - db.merge_small_shards(0.25)?; - assert_eq!(db.stats().num_shards, 2); - - for i in 0u32..100_000 { - if (i % 4 == 0) && (i % 16 != 0) { - db.remove(&i.to_le_bytes())?.unwrap(); 
- } - } - assert_eq!(db.stats().num_entries(), 6250); - db.merge_small_shards(0.25)?; - assert_eq!(db.stats().num_shards, 1); - - Ok(()) - }) -} - -#[test] -fn test_merge_with_expected_num_keys() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - expected_number_of_keys: 200_000, - ..Default::default() - }, - )?; - - assert_eq!(db.stats().num_entries(), 0); - assert_eq!(db.stats().num_shards, 8); - db.merge_small_shards(0.25)?; - assert_eq!(db.stats().num_shards, 8); - - for i in 0u32..900_000 { - db.set(&i.to_le_bytes(), "val")?; - } - assert_eq!(db.stats().num_entries(), 900_000); - assert_eq!(db.stats().num_shards, 32); - - for i in 0u32..900_000 { - if i % 16 != 0 { - db.remove(&i.to_le_bytes())?.unwrap(); - } - } - assert_eq!(db.stats().num_entries(), 56250); - db.merge_small_shards(0.25)?; - assert_eq!(db.stats().num_shards, 8); - - Ok(()) - }) -} diff --git a/tests/test_multithreading.rs b/tests/test_multithreading.rs deleted file mode 100644 index 9e8bd2d..0000000 --- a/tests/test_multithreading.rs +++ /dev/null @@ -1,75 +0,0 @@ -mod common; - -use std::sync::{atomic::AtomicUsize, Arc}; - -use candystore::{CandyStore, Config, Result}; -use rand::random; - -use crate::common::run_in_tempdir; - -#[test] -fn test_multithreaded() -> Result<()> { - run_in_tempdir(|dir| { - for attempt in 0..10 { - let db = Arc::new(CandyStore::open( - dir, - Config { - max_shard_size: 20 * 1024, - min_compaction_threashold: 10 * 1024, - ..Default::default() - }, - )?); - - const NUM_ITEMS: usize = 10_000; - let succ_gets = Arc::new(AtomicUsize::new(0)); - let succ_removals = Arc::new(AtomicUsize::new(0)); - - let mut thds = Vec::new(); - for thid in 0..50 { - let db = db.clone(); - let succ_gets = succ_gets.clone(); - let succ_removals = succ_removals.clone(); - let handle = std::thread::spawn(move || -> Result<()> { - let value = format!("data{thid}"); - for i in 0..NUM_ITEMS { - let key = format!("key{i}"); - db.set(&key, &value)?; - - if 
random::() > 0.8 { - if db.remove(&key)?.is_some() { - succ_removals.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } - } else { - let val2 = db.get(&key)?; - if let Some(val2) = val2 { - assert!(val2.starts_with(b"data")); - succ_gets.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - } - } - } - Ok(()) - }); - //handle.join().unwrap().unwrap(); - thds.push(handle); - } - - for thd in thds { - thd.join().unwrap()?; - } - - let gets = succ_gets.load(std::sync::atomic::Ordering::SeqCst); - let removals = succ_removals.load(std::sync::atomic::Ordering::SeqCst); - - let stats = db.stats(); - println!("[{attempt}] gets={gets} removals={removals} stats={stats}",); - - assert_eq!(db.iter().count(), db.stats().num_entries()); - assert!( - stats.num_entries() >= (NUM_ITEMS * 7) / 10 - && stats.num_entries() <= (NUM_ITEMS * 9) / 10 - ); - db.clear()?; - } - Ok(()) - }) -} diff --git a/tests/test_pre_split.rs b/tests/test_pre_split.rs deleted file mode 100644 index 38afe81..0000000 --- a/tests/test_pre_split.rs +++ /dev/null @@ -1,181 +0,0 @@ -mod common; - -use candystore::{CandyError, CandyStore, Config, Result}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_pre_split() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - max_shard_size: 20 * 1024, // use small files to force lots of splits and compactions - min_compaction_threashold: 10 * 1024, - expected_number_of_keys: 1_000_000, - ..Default::default() - }, - )?; - - db.set("aaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")?; - - let files = std::fs::read_dir(&dir)? 
- .map(|res| res.unwrap().file_name().to_string_lossy().to_string()) - .filter(|filename| filename.starts_with("shard_")) - .collect::>(); - - assert_eq!(files.len(), 64); - - let stats = db.stats(); - assert_eq!(stats.num_shards, 64); - assert_eq!(stats.num_inserts, 1); - assert_eq!(stats.wasted_bytes, 0); - - db.set("bbb", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")?; - - let stats = db.stats(); - assert_eq!(stats.num_inserts, 2); - assert_eq!(stats.wasted_bytes, 0); - - db.set("aaa", "xxx")?; - - let stats = db.stats(); - assert_eq!(stats.num_inserts, 2); - - // test accounting, it's a bit of an implementation detail, but we have to account for the - // namespace byte as well - assert_eq!( - stats.wasted_bytes, - "aaa?".len() + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".len() - ); - - db.remove("aaa")?; - let stats = db.stats(); - assert_eq!(stats.num_inserts, 2); - assert_eq!(stats.num_removals, 1); - assert_eq!( - stats.wasted_bytes, - "aaa?".len() - + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa".len() - + "aaa?".len() - + "xxx".len() - ); - - Ok(()) - }) -} - -#[test] -fn test_compaction() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - max_shard_size: 1000, - min_compaction_threashold: 900, - ..Default::default() - }, - )?; - - // fill the shard to the brim, creating waste - for i in 0..10 { - db.set("aaa", &format!("11112222333344445555666677778888999900001111222233334444555566667777888899990000111122223333444{:x}", i))?; - - let stats = db.stats(); - assert_eq!(stats.num_inserts, 1, "i={i}"); - assert_eq!(stats.occupied_bytes, 100 * (i + 1), "i={i}"); - assert_eq!(stats.wasted_bytes, 100 * i, "i={i}"); - } - - assert_eq!(db.stats().num_compactions, 0); - - // insert a new entry, which will cause a compaction - db.set("bbb", "x")?; - assert_eq!(db.stats().num_compactions, 1); - - let stats = db.stats(); - assert_eq!(stats.occupied_bytes, 100 + "bbb?".len() + "x".len()); - assert_eq!(stats.wasted_bytes, 0); 
- - Ok(()) - }) -} - -#[test] -fn test_too_large() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - max_shard_size: 1000, - min_compaction_threashold: 1000, - ..Default::default() - }, - )?; - - assert!(matches!( - db.set("yyy", &vec![7u8; 1000]) - .unwrap_err() - .downcast::() - .unwrap(), - CandyError::EntryCannotFitInShard(_, _) - )); - - db.set("yyy", &vec![7u8; 700])?; - let stats = db.stats(); - assert_eq!(stats.num_splits, 0); - assert_eq!(stats.num_compactions, 0); - - db.set("zzz", &vec![7u8; 700])?; - let stats = db.stats(); - assert_eq!(stats.num_compactions, 0); - assert_eq!(stats.num_splits, 1); - - Ok(()) - }) -} - -#[test] -fn test_compaction_stats() -> Result<()> { - run_in_tempdir(|dir| { - let db = CandyStore::open( - dir, - Config { - max_shard_size: 20_000, - min_compaction_threashold: 10_000, - ..Default::default() - }, - )?; - - let stats1 = db.stats(); - assert!(stats1.last_compaction_stats.is_empty()); - assert!(stats1.last_split_stats.is_empty()); - - for i in 1..500 { - db.set(&format!("key{i}"), &format!("val{i:0200}"))?; - } - - let stats2 = db.stats(); - println!("stats2={stats2:?}"); - assert!(stats2.last_compaction_stats.is_empty()); - assert!(stats2.last_split_stats.len() > 0); - - for i in 500..10000 { - db.set("key", &format!("val{i:0200}"))?; - } - - let stats3 = db.stats(); - println!("{stats3:?}"); - assert!(stats3.last_compaction_stats.len() > 1); - - for _ in 0..1000 { - assert!(db.get("key")?.is_some()); - } - - let stats4 = db.stats(); - assert!(stats4.last_compaction_stats.is_empty()); - assert!(stats4.last_split_stats.is_empty()); - - Ok(()) - }) -} diff --git a/tests/test_queues.rs b/tests/test_queues.rs deleted file mode 100644 index 40fbac0..0000000 --- a/tests/test_queues.rs +++ /dev/null @@ -1,109 +0,0 @@ -mod common; - -use candystore::{CandyStore, Config, Result}; - -use crate::common::run_in_tempdir; - -#[test] -fn test_queues() -> Result<()> { - run_in_tempdir(|dir| { - let 
db = CandyStore::open(dir, Config::default())?; - - db.push_to_queue_tail("work", "item1")?; - db.push_to_queue_tail("work", "item2")?; - db.push_to_queue_tail("work", "item3")?; - - assert_eq!(db.peek_queue_head("work")?, Some("item1".into())); - assert_eq!(db.peek_queue_tail("work")?, Some("item3".into())); - - assert_eq!(db.pop_queue_head("work")?, Some("item1".into())); - assert_eq!(db.pop_queue_head("work")?, Some("item2".into())); - assert_eq!(db.pop_queue_head("work")?, Some("item3".into())); - assert_eq!(db.pop_queue_head("work")?, None); - - db.push_to_queue_head("rev", "item1")?; - db.push_to_queue_head("rev", "item2")?; - db.push_to_queue_head("rev", "item3")?; - assert_eq!(db.pop_queue_tail("rev")?, Some("item1".into())); - assert_eq!(db.pop_queue_tail("rev")?, Some("item2".into())); - assert_eq!(db.pop_queue_tail("rev")?, Some("item3".into())); - assert_eq!(db.pop_queue_tail("rev")?, None); - - assert_eq!(db.queue_len("work")?, 0); - - for i in 1000u32..2000 { - db.push_to_queue_tail("work", &i.to_le_bytes())?; - } - assert_eq!(db.queue_len("work")?, 1000); - assert_eq!(db.queue_len("joke")?, 0); - - for (i, res) in db.iter_queue("work").enumerate() { - let (idx, val) = res?; - let v = u32::from_le_bytes(val.try_into().unwrap()); - assert_eq!(v, 1000 + i as u32); - - // create some holes - if v % 5 == 0 { - assert!(db.remove_from_queue("work", idx)?.is_some()); - } - } - - let mut count = 0; - for res in db.iter_queue("work") { - let (_, val) = res?; - let v = u32::from_le_bytes(val.try_into().unwrap()); - assert_ne!(v % 5, 0); - count += 1; - } - assert!(count == 800); - - let mut count2 = 0; - while let Some(val) = db.pop_queue_head("work")? { - let v = u32::from_le_bytes(val.try_into().unwrap()); - assert_ne!(v % 5, 0); - count2 += 1; - if count2 > 400 { - break; - } - } - while let Some(val) = db.pop_queue_tail("work")? 
{ - let v = u32::from_le_bytes(val.try_into().unwrap()); - assert_ne!(v % 5, 0); - count2 += 1; - } - - assert_eq!(count, count2); - assert_eq!(db.queue_len("work")?, 0); - - db.push_to_queue_tail("work", "item1")?; - db.push_to_queue_tail("work", "item2")?; - db.push_to_queue_tail("work", "item3")?; - assert_eq!(db.queue_len("work")?, 3); - db.extend_queue("work", ["item4", "item5"].iter())?; - assert_eq!(db.queue_len("work")?, 5); - - let items = db - .iter_queue("work") - .map(|res| std::str::from_utf8(&res.unwrap().1).unwrap().to_owned()) - .collect::>(); - assert_eq!(items, ["item1", "item2", "item3", "item4", "item5"]); - - db.discard_queue("work")?; - assert_eq!(db.queue_len("work")?, 0); - - db.extend_queue("work", (1u32..10).map(|i| i.to_le_bytes()))?; - let items = db - .iter_queue("work") - .map(|res| u32::from_le_bytes(res.unwrap().1.try_into().unwrap())) - .collect::>(); - assert_eq!(items, (1u32..10).collect::>()); - - let items = db - .iter_queue_backwards("work") - .map(|res| u32::from_le_bytes(res.unwrap().1.try_into().unwrap())) - .collect::>(); - assert_eq!(items, (1u32..10).rev().collect::>()); - - Ok(()) - }) -} diff --git a/tests/test_typed.rs b/tests/test_typed.rs deleted file mode 100644 index b9ffa1a..0000000 --- a/tests/test_typed.rs +++ /dev/null @@ -1,113 +0,0 @@ -mod common; - -use std::sync::Arc; - -use candystore::{CandyStore, CandyTypedKey, CandyTypedStore, Config, Result}; - -use crate::common::run_in_tempdir; - -use databuf::{Decode, Encode}; - -#[derive(Debug, Encode, Decode)] -struct MyKey { - x: u32, - y: u64, - z: String, -} - -impl CandyTypedKey for MyKey { - const TYPE_ID: u32 = 0x3476a551; -} - -#[derive(Debug, PartialEq, Eq, Encode, Decode)] -struct MyVal { - a: [u8; 7], - b: i16, - c: String, -} - -#[test] -fn test_typed() -> Result<()> { - run_in_tempdir(|dir| { - let db = Arc::new(CandyStore::open(dir, Config::default())?); - - let typed = CandyTypedStore::::new(db.clone()); - typed.set( - &MyKey { - x: 12, - y: 34, - z: 
"hello".into(), - }, - &MyVal { - a: [7, 7, 7, 7, 7, 7, 7], - b: 31415, - c: "world".into(), - }, - )?; - - assert_eq!( - typed - .get(&MyKey { - x: 12, - y: 34, - z: "hello".into(), - }) - .unwrap(), - Some(MyVal { - a: [7, 7, 7, 7, 7, 7, 7], - b: 31415, - c: "world".into() - }) - ); - - assert_eq!( - typed - .get(&MyKey { - x: 12, - y: 34, - z: "ola".into(), - }) - .unwrap(), - None - ); - - assert_eq!( - typed - .remove(&MyKey { - x: 12, - y: 34, - z: "hello".into(), - }) - .unwrap(), - Some(MyVal { - a: [7, 7, 7, 7, 7, 7, 7], - b: 31415, - c: "world".into() - }) - ); - - assert_eq!( - typed - .get(&MyKey { - x: 12, - y: 34, - z: "hello".into(), - }) - .unwrap(), - None - ); - - // two typed-stores can co-exist on the same underlying store - let typed2 = CandyTypedStore::>::new(db); - typed2.set("hello", &vec![1, 2, 3])?; - typed2.set("world", &vec![4, 5, 6, 7])?; - - assert_eq!(typed2.get("hello").unwrap(), Some(vec![1, 2, 3])); - assert_eq!(typed2.get("world").unwrap(), Some(vec![4, 5, 6, 7])); - - assert_eq!(typed2.remove("hello").unwrap(), Some(vec![1, 2, 3])); - assert_eq!(typed2.remove("hello").unwrap(), None); - - Ok(()) - }) -} diff --git a/tests/typed_list.rs b/tests/typed_list.rs new file mode 100644 index 0000000..74f73dc --- /dev/null +++ b/tests/typed_list.rs @@ -0,0 +1,167 @@ +use std::sync::Arc; + +use candystore::{CandyStore, CandyTypedList, Config, ListCompactionParams, Result}; +use tempfile::TempDir; + +#[test] +fn test_typed_list_iter_rev() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let list = CandyTypedList::::new(Arc::clone(&store)); + + let lkey = 42u32; + list.set(&lkey, &1u32, &"a".to_string())?; + list.set(&lkey, &2u32, &"b".to_string())?; + list.set(&lkey, &3u32, &"c".to_string())?; + + assert_eq!(list.remove(&lkey, &2u32)?, Some("b".to_string())); + + let fwd: Vec<_> = list.iter(&lkey).map(|r| r.unwrap()).collect(); + assert_eq!(fwd, 
vec![(1u32, "a".to_string()), (3u32, "c".to_string())]); + + let rev: Vec<_> = list.iter(&lkey).rev().map(|r| r.unwrap()).collect(); + assert_eq!(rev, vec![(3u32, "c".to_string()), (1u32, "a".to_string())]); + + Ok(()) +} + +#[test] +fn test_typed_list_admin_ops() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let list = CandyTypedList::::new(Arc::clone(&store)); + + let lkey = 9u32; + list.set(&lkey, &1u32, &"a".to_string())?; + list.set(&lkey, &2u32, &"b".to_string())?; + list.set(&lkey, &3u32, &"c".to_string())?; + assert_eq!(list.len(&lkey)?, 3); + + assert_eq!( + list.set_promoting(&lkey, &2u32, &"b".to_string())?, + Some("b".to_string()) + ); + let order: Vec<_> = list.iter(&lkey).map(|r| r.unwrap()).collect(); + assert_eq!( + order, + vec![ + (1u32, "a".to_string()), + (3u32, "c".to_string()), + (2u32, "b".to_string()) + ] + ); + + assert_eq!( + list.set_promoting(&lkey, &3u32, &"cc".to_string())?, + Some("c".to_string()) + ); + let order: Vec<_> = list.iter(&lkey).map(|r| r.unwrap()).collect(); + assert_eq!( + order, + vec![ + (1u32, "a".to_string()), + (2u32, "b".to_string()), + (3u32, "cc".to_string()) + ] + ); + + assert_eq!(list.set_promoting(&lkey, &4u32, &"d".to_string())?, None); + let order: Vec<_> = list.iter(&lkey).map(|r| r.unwrap()).collect(); + assert_eq!( + order, + vec![ + (1u32, "a".to_string()), + (2u32, "b".to_string()), + (3u32, "cc".to_string()), + (4u32, "d".to_string()) + ] + ); + + let existing = list.remove(&lkey, &1u32)?.unwrap(); + list.set(&lkey, &1u32, &existing)?; + let order: Vec<_> = list.iter(&lkey).map(|r| r.unwrap()).collect(); + assert_eq!( + order, + vec![ + (2u32, "b".to_string()), + (3u32, "cc".to_string()), + (4u32, "d".to_string()), + (1u32, "a".to_string()) + ] + ); + + assert_eq!(list.remove(&lkey, &3u32)?, Some("cc".to_string())); + let range_before = list.range(&lkey)?; + assert!(!range_before.is_empty()); + 
assert_eq!(list.len(&lkey)?, 3); + + let _ = list.compact_if_needed( + &lkey, + ListCompactionParams { + min_length: 0, + min_holes_ratio: 0.0, + }, + )?; + + let order: Vec<_> = list.iter(&lkey).map(|r| r.unwrap()).collect(); + assert_eq!( + order, + vec![ + (2u32, "b".to_string()), + (4u32, "d".to_string()), + (1u32, "a".to_string()) + ] + ); + + assert!(list.discard(&lkey)?); + assert!(list.is_empty(&lkey)?); + assert!(list.range(&lkey)?.is_empty()); + Ok(()) +} + +#[test] +fn test_typed_list_retain() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let list = CandyTypedList::::new(Arc::clone(&store)); + + let lkey = 1u32; + list.set(&lkey, &10u32, &"ten".to_string())?; + list.set(&lkey, &20u32, &"twenty".to_string())?; + list.set(&lkey, &30u32, &"thirty".to_string())?; + list.set(&lkey, &40u32, &"forty".to_string())?; + + list.retain(&lkey, |k, _| Ok(*k % 20 == 0))?; + let items: Vec<_> = list.iter(&lkey).map(|r| r.unwrap().0).collect(); + assert_eq!(items, vec![20u32, 40u32]); + + list.retain(&lkey, |_, v| Ok(v.starts_with('f')))?; + let items: Vec<_> = list.iter(&lkey).map(|r| r.unwrap().0).collect(); + assert_eq!(items, vec![40u32]); + assert_eq!(list.range(&lkey)?.len(), list.len(&lkey)?); + + Ok(()) +} + +#[test] +fn test_typed_list_pop_peek() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let list = CandyTypedList::::new(Arc::clone(&store)); + + let lkey = 100u32; + list.set(&lkey, &1u32, &"one".to_string())?; + list.set(&lkey, &2u32, &"two".to_string())?; + + assert_eq!(list.peek_head(&lkey)?, Some((1u32, "one".to_string()))); + assert_eq!(list.peek_tail(&lkey)?, Some((2u32, "two".to_string()))); + + assert_eq!(list.pop_head(&lkey)?, Some((1u32, "one".to_string()))); + assert_eq!(list.len(&lkey)?, 1); + + assert_eq!(list.pop_tail(&lkey)?, Some((2u32, "two".to_string()))); + 
assert_eq!(list.len(&lkey)?, 0); + assert!(list.pop_head(&lkey)?.is_none()); + + Ok(()) +} diff --git a/tests/typed_queue.rs b/tests/typed_queue.rs new file mode 100644 index 0000000..6fa5ef1 --- /dev/null +++ b/tests/typed_queue.rs @@ -0,0 +1,63 @@ +use std::sync::Arc; + +use candystore::{CandyStore, CandyTypedDeque, Config, Result}; +use tempfile::TempDir; + +#[test] +fn test_typed_queue_iter_rev() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let queue = CandyTypedDeque::::new(Arc::clone(&store)); + + let qkey = 7u32; + queue.push_tail(&qkey, &10u32)?; + queue.push_tail(&qkey, &20u32)?; + queue.push_tail(&qkey, &30u32)?; + + let fwd: Vec<_> = queue.iter(&qkey).map(|r| r.unwrap().1).collect(); + assert_eq!(fwd, vec![10, 20, 30]); + + let rev: Vec<_> = queue.iter(&qkey).rev().map(|r| r.unwrap().1).collect(); + assert_eq!(rev, vec![30, 20, 10]); + + Ok(()) +} + +#[test] +fn test_typed_queue_discard() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let queue = CandyTypedDeque::::new(Arc::clone(&store)); + + let qkey = 5u32; + queue.push_tail(&qkey, &1u32)?; + queue.push_tail(&qkey, &2u32)?; + assert_eq!(queue.len(&qkey)?, 2); + assert_eq!( + queue.range(&qkey)?, + 9223372036854775808usize..9223372036854775810usize + ); + + assert!(queue.discard(&qkey)?); + assert_eq!(queue.len(&qkey)?, 0); + assert!(queue.pop_head(&qkey)?.is_none()); + assert!(queue.is_empty(&qkey)?); + Ok(()) +} + +#[test] +fn test_typed_queue_empty_push_head_has_simple_range_semantics() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let queue = CandyTypedDeque::::new(Arc::clone(&store)); + + let qkey = 11u32; + assert!(queue.range(&qkey)?.is_empty()); + + queue.push_head(&qkey, &99u32)?; + assert_eq!(queue.peek_head(&qkey)?, 
Some(99u32)); + assert_eq!(queue.peek_tail(&qkey)?, Some(99u32)); + assert_eq!(queue.len(&qkey)?, 1); + + Ok(()) +} diff --git a/tests/typed_store.rs b/tests/typed_store.rs new file mode 100644 index 0000000..ac0bba5 --- /dev/null +++ b/tests/typed_store.rs @@ -0,0 +1,76 @@ +use std::sync::Arc; + +use candystore::{CandyStore, CandyTypedStore, Config, MAX_USER_VALUE_SIZE, Result}; +use tempfile::TempDir; + +#[test] +fn test_typed_kv_store() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let kv = CandyTypedStore::::new(Arc::clone(&store)); + + assert!(!kv.contains(&1u32)?); + assert!(kv.set(&1u32, &"one".to_string())?.is_none()); + assert!(kv.contains(&1u32)?); + assert_eq!(kv.get(&1u32)?, Some("one".to_string())); + assert_eq!(kv.set(&1u32, &"uno".to_string())?, Some("one".to_string())); + assert_eq!(kv.remove(&1u32)?, Some("uno".to_string())); + assert!(kv.get(&1u32)?.is_none()); + assert!(!kv.contains(&1u32)?); + Ok(()) +} + +#[test] +fn test_typed_atomic_ops() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let kv = CandyTypedStore::::new(Arc::clone(&store)); + + assert_eq!( + kv.get_or_create(&7u32, &"seven".to_string())?, + "seven".to_string() + ); + assert_eq!( + kv.get_or_create(&7u32, &"changed".to_string())?, + "seven".to_string() + ); + assert_eq!(kv.get(&7u32)?, Some("seven".to_string())); + assert!( + kv.replace(&8u32, &"nope".to_string(), None::<&String>)? + .is_none() + ); + assert!(kv.get(&8u32)?.is_none()); + assert!( + kv.replace(&7u32, &"nope".to_string(), Some(&"wrong".to_string()))? 
+ .is_none() + ); + assert_eq!(kv.get(&7u32)?, Some("seven".to_string())); + assert_eq!( + kv.replace(&7u32, &"new".to_string(), None::<&String>)?, + Some("seven".to_string()) + ); + assert_eq!(kv.get(&7u32)?, Some("new".to_string())); + + Ok(()) +} + +#[test] +fn test_typed_big_value_round_trip() -> Result<()> { + let temp_dir = TempDir::new().unwrap(); + let store = Arc::new(CandyStore::open(temp_dir.path(), Config::default())?); + let kv = CandyTypedStore::>::new(Arc::clone(&store)); + + let key = 3u32; + let big1 = vec![1u8; MAX_USER_VALUE_SIZE + 123]; + let big2 = vec![2u8; MAX_USER_VALUE_SIZE * 2 + 17]; + + assert!(!kv.set_big(&key, &big1)?); + assert_eq!(kv.get_big(&key)?, Some(big1.clone())); + assert!(kv.set_big(&key, &big2)?); + assert_eq!(kv.get_big(&key)?, Some(big2.clone())); + assert!(kv.remove_big(&key)?); + assert!(kv.get_big(&key)?.is_none()); + assert!(!kv.remove_big(&key)?); + + Ok(()) +} diff --git a/tests/whitebox.rs b/tests/whitebox.rs new file mode 100644 index 0000000..f4181cd --- /dev/null +++ b/tests/whitebox.rs @@ -0,0 +1,389 @@ +//! Whitebox tests requiring the `whitebox-testing` feature. +//! +//! Run via: `cargo test --features whitebox-testing --test whitebox` + +#![cfg(feature = "whitebox-testing")] + +mod common; +use crate::common::checkpoint_slot_checksum; + +use std::io::{Seek, SeekFrom, Write}; +use std::path::Path; + +use candystore::{CandyStore, Config, Error}; +use tempfile::tempdir; + +/// PAGE_SIZE for one RowLayout. +const PAGE_SIZE: usize = 4096; +/// Number of slots per row. +const ROW_WIDTH: usize = 336; +/// Offset of `signatures` array within a RowLayout (after split_level + padding). +const SIGS_OFFSET: usize = 64; +/// Offset of `pointers` array within a RowLayout (after signatures). +const PTRS_OFFSET: usize = SIGS_OFFSET + ROW_WIDTH * 4; +/// FILE_OFFSET_ALIGNMENT used in EntryPointer encoding. +const FILE_OFFSET_ALIGNMENT: u64 = 16; + +/// Offset of checkpoint slot 0 within the index header. 
+const CHECKPOINT_SLOT_0_OFFSET: u64 = 128; +const CHECKPOINT_SLOT_GENERATION_OFFSET: u64 = 0; +const CHECKPOINT_SLOT_ORDINAL_OFFSET: u64 = 8; +const CHECKPOINT_SLOT_FILE_OFFSET: u64 = 16; +const CHECKPOINT_SLOT_CHECKSUM_OFFSET: u64 = 24; + +// ----------------------------------------------------------------------- +// Helpers +// ----------------------------------------------------------------------- + +fn encode_entry_pointer(file_idx: u16, file_offset: u64, size: usize) -> u64 { + let fi = (file_idx as u64) & ((1 << 12) - 1); + let fo = ((file_offset / FILE_OFFSET_ALIGNMENT) & ((1 << 26) - 1)) << 12; + let sh = (size.div_ceil(512) as u64) << (12 + 26); + fi | fo | sh +} + +/// Write a phantom entry into the rows file at the given row and column. +fn inject_phantom_entry( + dir: &Path, + row_idx: usize, + col: usize, + file_idx: u16, + file_offset: u64, +) -> Result<(), Error> { + let sig: u32 = 0xDEAD_BEEF; + let ptr = encode_entry_pointer(file_idx, file_offset, 512); + + let mut file = std::fs::OpenOptions::new() + .write(true) + .open(dir.join("rows")) + .map_err(Error::IOError)?; + + let row_base = row_idx * PAGE_SIZE; + + // Write signature + let sig_off = (row_base + SIGS_OFFSET + col * 4) as u64; + file.seek(SeekFrom::Start(sig_off)) + .map_err(Error::IOError)?; + file.write_all(&sig.to_le_bytes()).map_err(Error::IOError)?; + + // Write pointer + let ptr_off = (row_base + PTRS_OFFSET + col * 8) as u64; + file.seek(SeekFrom::Start(ptr_off)) + .map_err(Error::IOError)?; + file.write_all(&ptr.to_le_bytes()).map_err(Error::IOError)?; + + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +/// Read a signature from the rows file. 
+fn read_signature(dir: &Path, row_idx: usize, col: usize) -> Result { + use std::io::Read; + let mut file = std::fs::File::open(dir.join("rows")).map_err(Error::IOError)?; + let off = (row_idx * PAGE_SIZE + SIGS_OFFSET + col * 4) as u64; + file.seek(SeekFrom::Start(off)).map_err(Error::IOError)?; + let mut buf = [0u8; 4]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + Ok(u32::from_le_bytes(buf)) +} + +fn active_file_ordinal(dir: &Path) -> Result { + use std::io::Read; + + let mut max_ordinal: Option = None; + for entry in std::fs::read_dir(dir).map_err(Error::IOError)? { + let entry = entry.map_err(Error::IOError)?; + let path = entry.path(); + let Some(name) = path.file_name().and_then(|name| name.to_str()) else { + continue; + }; + if !name.starts_with("data_") { + continue; + } + + let mut file = std::fs::File::open(path).map_err(Error::IOError)?; + file.seek(SeekFrom::Start(16)).map_err(Error::IOError)?; + let mut buf = [0u8; 8]; + file.read_exact(&mut buf).map_err(Error::IOError)?; + let ordinal = u64::from_le_bytes(buf); + max_ordinal = Some(max_ordinal.map_or(ordinal, |current| current.max(ordinal))); + } + + max_ordinal.ok_or_else(|| { + Error::IOError(std::io::Error::new( + std::io::ErrorKind::NotFound, + "no data files found", + )) + }) +} + +fn write_commit_cursor(dir: &Path, offset: u64) -> Result<(), Error> { + let mut file = std::fs::OpenOptions::new() + .write(true) + .open(dir.join("index")) + .map_err(Error::IOError)?; + + let ordinal = active_file_ordinal(dir)?; + let generation = 1u64; + let checksum = checkpoint_slot_checksum(generation, ordinal, offset); + + file.seek(SeekFrom::Start( + CHECKPOINT_SLOT_0_OFFSET + CHECKPOINT_SLOT_GENERATION_OFFSET, + )) + .map_err(Error::IOError)?; + file.write_all(&generation.to_le_bytes()) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start( + CHECKPOINT_SLOT_0_OFFSET + CHECKPOINT_SLOT_ORDINAL_OFFSET, + )) + .map_err(Error::IOError)?; + file.write_all(&ordinal.to_le_bytes()) + 
.map_err(Error::IOError)?; + + file.seek(SeekFrom::Start( + CHECKPOINT_SLOT_0_OFFSET + CHECKPOINT_SLOT_FILE_OFFSET, + )) + .map_err(Error::IOError)?; + file.write_all(&offset.to_le_bytes()) + .map_err(Error::IOError)?; + + file.seek(SeekFrom::Start( + CHECKPOINT_SLOT_0_OFFSET + CHECKPOINT_SLOT_CHECKSUM_OFFSET, + )) + .map_err(Error::IOError)?; + file.write_all(&checksum.to_le_bytes()) + .map_err(Error::IOError)?; + file.sync_all().map_err(Error::IOError)?; + Ok(()) +} + +/// Fork, run `child_fn` in the child (which should abort), wait and assert +/// the child was killed by SIGABRT. +#[cfg(unix)] +fn fork_expect_abort(child_fn: impl FnOnce()) { + let pid = unsafe { libc::fork() }; + assert!(pid >= 0, "fork failed"); + if pid == 0 { + child_fn(); + // Should not reach here — child_fn should abort. + unsafe { libc::_exit(0) }; + } + let mut status = 0i32; + let wait_rc = unsafe { libc::waitpid(pid, &mut status, 0) }; + assert_eq!(wait_rc, pid); + assert!( + libc::WIFSIGNALED(status), + "child exited normally, expected signal" + ); + assert_eq!( + libc::WTERMSIG(status), + libc::SIGABRT, + "child killed by unexpected signal" + ); +} + +// ----------------------------------------------------------------------- +// Tests +// ----------------------------------------------------------------------- + +/// Inject phantom index entries pointing past the durable extent of the +/// active data file. Rebuild should purge them. +#[test] +fn test_rebuild_purges_phantom_entries() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + // Phase 1: write real data, then close cleanly. + { + let db = CandyStore::open(dir.path(), config)?; + db.set("key1", "val1")?; + db.set("key2", "val2")?; + } + + // Phase 2: inject phantom entries. + // Use a last row slot (col 335) to avoid colliding with real entries. 
+ inject_phantom_entry(dir.path(), 0, 335, 0, 0x100_000)?; + inject_phantom_entry(dir.path(), 1, 335, 0, 0x200_000)?; + + // Verify we actually wrote non-zero signatures. + assert_ne!(read_signature(dir.path(), 0, 335)?, 0); + assert_ne!(read_signature(dir.path(), 1, 335)?, 0); + + // Phase 3: reopen — recovery replays active file and purges phantoms. + { + let db = CandyStore::open(dir.path(), config)?; + assert_eq!(db.get("key1")?, Some(b"val1".to_vec())); + assert_eq!(db.get("key2")?, Some(b"val2".to_vec())); + } + + // Verify phantom signatures are cleared. + assert_eq!(read_signature(dir.path(), 0, 335)?, 0); + assert_eq!(read_signature(dir.path(), 1, 335)?, 0); + + Ok(()) +} + +/// A bogus commit cursor offset beyond the data extent is ignored and rebuild +/// restarts from offset 0 for the active file. +#[test] +fn test_bogus_checkpoint_offset_causes_full_replay() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..50 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + write_commit_cursor(dir.path(), 0xFFFF_FFFF)?; + + let db = CandyStore::open(dir.path(), config)?; + for i in 0..50 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after bogus commit-cursor rebuild" + ); + } + + Ok(()) +} + +/// A commit cursor offset that points into the middle of an entry or padding +/// must be rejected rather than treated as a valid resume point. 
+#[test] +fn test_mid_entry_checkpoint_offset_restarts_from_zero() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..50 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + write_commit_cursor(dir.path(), 1)?; + + let db = CandyStore::open(dir.path(), config)?; + for i in 0..50 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after mid-entry commit-cursor fallback" + ); + } + + Ok(()) +} + +/// Crash mid-rebuild via the `rebuild_entry` crash point, then resume. +#[cfg(unix)] +#[test] +fn test_mid_rebuild_crash_resumes_from_checkpoint() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config { + max_data_file_size: 256 * 1024, + ..Config::default() + }; + + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..3000u32 { + db.set(format!("key{i:04}"), format!("val{i:04}"))?; + } + db._abort_for_testing(); + } + + fork_expect_abort(|| { + unsafe { + libc::setenv( + c"CANDYSTORE_CRASH_POINT".as_ptr(), + c"rebuild_entry".as_ptr(), + 1, + ); + libc::setenv(c"CANDYSTORE_CRASH_AFTER".as_ptr(), c"1200".as_ptr(), 1); + } + let _ = CandyStore::open(dir.path(), config); + }); + + // Reopen — should resume from the persisted replay cursor. + let db = CandyStore::open(dir.path(), config)?; + for i in 0..3000u32 { + assert_eq!( + db.get(format!("key{i:04}"))?, + Some(format!("val{i:04}").into_bytes()), + "key{i:04} missing after resume-from-cursor rebuild" + ); + } + + Ok(()) +} + +/// Crash after data write but before index insert, then rebuild. +/// The data file has the entry but the index doesn't — replay should recover it. +#[cfg(unix)] +#[test] +fn test_crash_after_write_before_insert_recovers_on_rebuild() -> Result<(), Error> { + let dir = tempdir().unwrap(); + let config = Config::default(); + + // Write some baseline data. 
+ { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..10 { + db.set(format!("base{i}"), format!("val{i}"))?; + } + db._abort_for_testing(); + } + + // Reopen cleanly to get a stable state, then crash mid-insert. + { + let db = CandyStore::open(dir.path(), config)?; + for i in 0..10 { + assert_eq!( + db.get(format!("base{i}"))?, + Some(format!("val{i}").into_bytes()) + ); + } + } + + // Now write one more key and crash after the data file write but before + // the index is updated. + fork_expect_abort(|| { + unsafe { + libc::setenv( + c"CANDYSTORE_CRASH_POINT".as_ptr(), + c"set_after_write_before_insert".as_ptr(), + 1, + ); + libc::setenv(c"CANDYSTORE_CRASH_AFTER".as_ptr(), c"0".as_ptr(), 1); + } + let db = CandyStore::open(dir.path(), config).unwrap(); + let _ = db.set("crash_key", "crash_val"); + }); + + // Rebuild should recover everything including the crash_key (data is + // durable in the active file even though the index insert never happened). + let db = CandyStore::open(dir.path(), config)?; + for i in 0..10 { + assert_eq!( + db.get(format!("base{i}"))?, + Some(format!("val{i}").into_bytes()), + "base{i} missing after crash recovery" + ); + } + // The crash_key's data was written to the file before the crash, so it + // should be recovered by replay. However, this depends on whether the + // OS flushed the data page to disk before abort — in this test the child + // is doing an in-process abort so the file write may or may not be + // durable. We verify the baseline keys survived; crash_key recovery is + // best-effort. + + Ok(()) +}