From 931fad82cc7bb8204f15f5541d32c10169c77073 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Fri, 12 Jun 2026 22:03:18 +0000 Subject: [PATCH 1/7] feat: add register_buffers support via iouring Add support for new apis to scheduler 1. register_buffers 2. read_fixed() 3. write_fixed() This lets us register the pre-allocated buffers that iouring can use during IO operations rather than allocating them per-io. This is mainly based on best practices learned from TUM DBMS paper https://arxiv.org/pdf/2512.04859 Signed-off-by: Kaviraj --- include/silk/fibers/fiber.h | 30 ++++++++++++++++++++++++++++++ src/fibers/fiber.cpp | 29 +++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/include/silk/fibers/fiber.h b/include/silk/fibers/fiber.h index af5bd19..4e79f7c 100644 --- a/include/silk/fibers/fiber.h +++ b/include/silk/fibers/fiber.h @@ -421,6 +421,36 @@ class FiberScheduler */ static void write(int fd, iovec * iov, uint64_t iov_len, uint64_t offset, uint64_t * bytesWritten, IoFuture * future) noexcept; + /** + * Async fixed-buffer read: submits IORING_OP_READ_FIXED against the buffer + * previously registered at @p bufIndex (see registerBuffers). @p buf must lie + * within that registered buffer. Single contiguous region (no iovec import); + * the kernel skips the per-IO page-pin by reusing the pre-pinned registration. + * + * @param fd File descriptor to read from. + * @param buf Destination, inside the registered buffer at @p bufIndex. + * @param len Number of bytes to read. + * @param offset Byte offset within the file. + * @param bufIndex Index into the registered buffer table. + * @param bytesRead If not null, receives the number of bytes read on success. + * @param future Completion handle. + */ + static void readFixed(int fd, void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept; + + /** + * Async fixed-buffer write: IORING_OP_WRITE_FIXED. See readFixed.
+ */ + static void + writeFixed(int fd, const void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept; + + /** + * Register a fixed buffer set on every per-CPU io_uring ring, so a fiber that + * is work-stolen to another CPU can still submit READ_FIXED/WRITE_FIXED + * referencing the same index. Call once after initialize(), before issuing any + * fixed-buffer IO. @p count buffers are addressable as bufIndex 0..count-1. + */ + static void registerBuffers(const iovec * iovecs, unsigned count) noexcept; + /** * Blocking poll: suspend the calling fiber until one of the requested * events becomes ready on @p fd. diff --git a/src/fibers/fiber.cpp b/src/fibers/fiber.cpp index 20210b5..0aef299 100644 --- a/src/fibers/fiber.cpp +++ b/src/fibers/fiber.cpp @@ -1437,6 +1437,35 @@ void FiberScheduler::write(int fd, iovec * iov, uint64_t iov_len, uint64_t offse enqueueIo(future, [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_writev(sqe, fd, iov, iov_len, offset); }); } +void FiberScheduler::readFixed(int fd, void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept +{ + future->result = bytesRead; + enqueueIo( + future, [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_read_fixed(sqe, fd, buf, static_cast(len), offset, bufIndex); }); +} + +void FiberScheduler::writeFixed(int fd, const void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept +{ + future->result = bytesWritten; + enqueueIo( + future, + [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_write_fixed(sqe, fd, buf, static_cast(len), offset, bufIndex); }); +} + +void FiberScheduler::registerBuffers(const iovec * iovecs, unsigned count) noexcept +{ + for (uint32_t cpu = 0; cpu < scheduler->processorCount; ++cpu) + { + ProcessorState * processor = &scheduler->processorState[cpu]; + if (processor->number == INVALID_PROCESSOR_NUMBER) + { + continue; + } + int r = 
::io_uring_register_buffers(&processor->ring, iovecs, count); + SILK_ASSERT(r == 0, "io_uring_register_buffers failed on cpu %u: %d", cpu, r); + } +} + void FiberScheduler::poll(int fd, uint32_t events, uint64_t * triggeredEvents, IoFuture * future) noexcept { future->result = triggeredEvents; From 91a2952a3e9f5b4497c902ea666394a41bd4b56a Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Fri, 12 Jun 2026 22:05:40 +0000 Subject: [PATCH 2/7] chore: integrate register_buffers in file-perf Add a flag to run file-perf with the io_uring registered-buffer api ``` ./bb -b release perf --duration 60s --warmup 10s file --fixed-buffers ``` The numbers look super interesting, so it is worth adding upstream Signed-off-by: Kaviraj --- bb | 17 +++++++++++++++++ src/perf/file-perf.cpp | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/bb b/bb index 332e9cc..2472440 100755 --- a/bb +++ b/bb @@ -741,6 +741,7 @@ class FilePerfParams: rw: list[str] = field(default_factory=lambda: ["randread"]) flamegraph: bool = False print_counters: bool = False + fixed_buffers: bool = False timeout: int = 180 @@ -777,6 +778,7 @@ def cmd_file_perf(preset: str, params: FilePerfParams) -> None: file_perf = os.path.join(ROOT, f"build/{preset}/bin/file-perf") verbose_flag = ["--verbose"] if log.isEnabledFor(logging.DEBUG) else [] print_counters_flag = ["--print-counters"] if params.print_counters else [] + fixed_buffers_flag = ["--fixed-buffers"] if params.fixed_buffers else [] try: if params.flamegraph: @@ -802,6 +804,7 @@ def cmd_file_perf(preset: str, params: FilePerfParams) -> None: str(params.warmup), "--filename", params.file, + *fixed_buffers_flag, *verbose_flag, ], ) @@ -828,6 +831,7 @@ def cmd_file_perf(preset: str, params: FilePerfParams) -> None: str(params.warmup), "--filename", params.file, + *fixed_buffers_flag, *print_counters_flag, *verbose_flag, timeout=params.timeout or None, @@ -1546,6 +1550,12 @@ def _build_parser() -> argparse.ArgumentParser:
metavar="DURATION", help="warmup duration applied to every benchmark (e.g. 10s); per-binary defaults are used when omitted", ) + perf_parser.add_argument( + "--fixed-buffers", + dest="fixed_buffers", + action="store_true", + help="run file-perf with registered buffers (IORING_OP_READ_FIXED / WRITE_FIXED)", + ) perf_parser.add_argument( "targets", nargs="+", @@ -1628,6 +1638,12 @@ def _build_parser() -> argparse.ArgumentParser: action="store_true", help="print perf counters after each run", ) + file_perf_parser.add_argument( + "--fixed-buffers", + dest="file_fixed_buffers", + action="store_true", + help="use registered buffers (IORING_OP_READ_FIXED / WRITE_FIXED)", + ) file_perf_parser.add_argument( "--timeout", dest="file_timeout", @@ -2054,6 +2070,7 @@ def main() -> None: iodepth=[1, 16], rw=["randwrite", "randread"], timeout=args.timeout, + fixed_buffers=args.fixed_buffers, **timing_overrides, ) if "file" in targets: diff --git a/src/perf/file-perf.cpp b/src/perf/file-perf.cpp index 328d8dd..8a05080 100644 --- a/src/perf/file-perf.cpp +++ b/src/perf/file-perf.cpp @@ -129,6 +129,8 @@ struct ClientConfig uint64_t warmupNs = 2'000'000'000ULL; bool direct = false; bool printCounters = false; + // use registered buffers (IORING_OP_READ_FIXED / IORING_OP_WRITE_FIXED). + bool fixedBuffers = false; }; class Benchmark @@ -157,6 +159,7 @@ class Benchmark std::unique_ptr bufs{nullptr, std::free}; std::unique_ptr slots; uint32_t head = 0; + uint32_t index = 0; }; // @@ -199,6 +202,7 @@ void Benchmark::start() uint32_t i = 0; for (Job & job : jobs) { + job.index = i; job.strategy = OffsetStrategy(cfg.mode, cfg.fileSize, cfg.blockSize, i++, static_cast(jobs.size())); // O_DIRECT requires 512-byte-aligned buffers. @@ -216,6 +220,19 @@ void Benchmark::start() } } + // register one fixed buffer per job (each covering its whole + // iodepth*blockSize block) on every per-CPU ring. bufIndex == job.index. 
+ if (cfg.fixedBuffers) + { + std::vector regBufs(jobs.size()); + for (Job & job : jobs) + { + regBufs[job.index].iov_base = job.bufs.get(); + regBufs[job.index].iov_len = static_cast(cfg.iodepth) * cfg.blockSize; + } + silk::FiberScheduler::registerBuffers(regBufs.data(), static_cast(regBufs.size())); + } + for (Job & job : jobs) { int r = silk::FiberScheduler::run(workerFiberMain, {this, &job}, &job.future); @@ -249,6 +266,23 @@ void Benchmark::submit(Job * job, Slot * slot) { slot->startCycles = silk::Tsc::getCycles(); uint64_t offset = job->strategy.next(); + + if (cfg.fixedBuffers) + { + // READ_FIXED / WRITE_FIXED against the buffer registered for this job. + int bufIndex = static_cast(job->index); + if (cfg.mode == MODE_RANDWRITE) + { + silk::FiberScheduler::writeFixed(fd, slot->iov.iov_base, slot->iov.iov_len, offset, bufIndex, nullptr, &slot->future); + } + else + { + silk::FiberScheduler::readFixed(fd, slot->iov.iov_base, slot->iov.iov_len, offset, bufIndex, nullptr, &slot->future); + } + return; + } + + // Baseline: vectored READV / WRITEV with a 1-element iovec. if (cfg.mode == MODE_RANDWRITE) { silk::FiberScheduler::write(fd, &slot->iov, 1, offset, nullptr, &slot->future); @@ -319,6 +353,7 @@ static void printJson(std::vector & latNs, const ClientConfig & cfg) printf(" \"iodepth\": %u,\n", cfg.iodepth); printf(" \"block_size_bytes\": %u,\n", cfg.blockSize); printf(" \"mode\": \"%s\",\n", modeName(cfg.mode)); + printf(" \"fixed_buffers\": %s,\n", cfg.fixedBuffers ? "true" : "false"); printf(" \"file_size_bytes\": %lu,\n", cfg.fileSize); printf(" \"duration_s\": %.3f,\n", durationS); printf(" \"total\": %lu,\n", total); @@ -362,6 +397,7 @@ int main(int argc, char ** argv) ("warmup", "warmup duration (e.g. 
2s, 500ms)", cxxopts::value(warmupStr)) ("filename", "file path", cxxopts::value(cfg.filename)) ("direct", "use O_DIRECT (bypass page cache)", cxxopts::value(cfg.direct)) + ("fixed-buffers", "use registered buffers (IORING_OP_READ_FIXED / WRITE_FIXED)", cxxopts::value(cfg.fixedBuffers)) ("print-counters", "enable per-CPU profiler and include counters in the JSON report", cxxopts::value(cfg.printCounters)) ("v,verbose", "enable debug logging", cxxopts::value(verbose)) ; From 9b64f36abb9bdc5927a8d8426c70be6b93c7d59b Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Fri, 12 Jun 2026 22:34:03 +0000 Subject: [PATCH 3/7] chore: make formatter happy `./bb fmt` Signed-off-by: Kaviraj --- src/fibers/fiber.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/fibers/fiber.cpp b/src/fibers/fiber.cpp index 0aef299..b0dcb51 100644 --- a/src/fibers/fiber.cpp +++ b/src/fibers/fiber.cpp @@ -1437,14 +1437,17 @@ void FiberScheduler::write(int fd, iovec * iov, uint64_t iov_len, uint64_t offse enqueueIo(future, [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_writev(sqe, fd, iov, iov_len, offset); }); } -void FiberScheduler::readFixed(int fd, void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept +void FiberScheduler::readFixed( + int fd, void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept { future->result = bytesRead; enqueueIo( - future, [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_read_fixed(sqe, fd, buf, static_cast(len), offset, bufIndex); }); + future, + [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_read_fixed(sqe, fd, buf, static_cast(len), offset, bufIndex); }); } -void FiberScheduler::writeFixed(int fd, const void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept +void FiberScheduler::writeFixed( + int fd, const void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * 
bytesWritten, IoFuture * future) noexcept { future->result = bytesWritten; enqueueIo( From c332ab22127df9317a71219aa21680f2d77368a0 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Sun, 14 Jun 2026 21:20:58 +0000 Subject: [PATCH 4/7] chore: add `msan` checks and tests Changes 1. Make sure the readFixed api on the registered buffer is checked by msan for uninitialized memory (similar to readv api) 2. Fix the nbytes len field (uint64_t -> uint32_t) because that's what the underlying io_uring_* api expects 3. Add a round trip test for new api Signed-off-by: Kaviraj --- include/silk/fibers/fiber.h | 8 ++- src/fibers/fiber.cpp | 17 +++--- src/fibers/tests/io-fixed-test.cpp | 85 ++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 10 deletions(-) create mode 100644 src/fibers/tests/io-fixed-test.cpp diff --git a/include/silk/fibers/fiber.h b/include/silk/fibers/fiber.h index 4e79f7c..2b2d780 100644 --- a/include/silk/fibers/fiber.h +++ b/include/silk/fibers/fiber.h @@ -354,6 +354,10 @@ class FiberScheduler // Used to mark the kernel-written bytes as initialized for MSan. iovec * readIov = nullptr; uint64_t readIovLen = 0; + // Backing storage for the single contiguous region of a fixed read + // (readFixed api has no caller-owned iovec to point at). + // readIov is pointed here in that case. Needed for Memory sanitizer. + iovec readIovStorage{}; #endif }; @@ -435,13 +439,13 @@ class FiberScheduler * @param bytesRead If not null, receives the number of bytes read on success. * @param future Completion handle. */ - static void readFixed(int fd, void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept; + static void readFixed(int fd, void * buf, uint32_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept; /** * Async fixed-buffer write: IORING_OP_WRITE_FIXED. See readFixed.
*/ static void - writeFixed(int fd, const void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept; + writeFixed(int fd, const void * buf, uint32_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept; /** * Register a fixed buffer set on every per-CPU io_uring ring, so a fiber that diff --git a/src/fibers/fiber.cpp b/src/fibers/fiber.cpp index b0dcb51..39cb25f 100644 --- a/src/fibers/fiber.cpp +++ b/src/fibers/fiber.cpp @@ -1438,21 +1438,22 @@ void FiberScheduler::write(int fd, iovec * iov, uint64_t iov_len, uint64_t offse } void FiberScheduler::readFixed( - int fd, void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept + int fd, void * buf, uint32_t len, uint64_t offset, int bufIndex, uint64_t * bytesRead, IoFuture * future) noexcept { future->result = bytesRead; - enqueueIo( - future, - [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_read_fixed(sqe, fd, buf, static_cast(len), offset, bufIndex); }); +#if defined(__SANITIZE_MEMORY__) + future->readIovStorage = {buf, len}; + future->readIov = &future->readIovStorage; + future->readIovLen = 1; +#endif + enqueueIo(future, [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_read_fixed(sqe, fd, buf, len, offset, bufIndex); }); } void FiberScheduler::writeFixed( - int fd, const void * buf, uint64_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept + int fd, const void * buf, uint32_t len, uint64_t offset, int bufIndex, uint64_t * bytesWritten, IoFuture * future) noexcept { future->result = bytesWritten; - enqueueIo( - future, - [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_write_fixed(sqe, fd, buf, static_cast(len), offset, bufIndex); }); + enqueueIo(future, [=](io_uring_sqe * sqe) noexcept { ::io_uring_prep_write_fixed(sqe, fd, buf, len, offset, bufIndex); }); } void FiberScheduler::registerBuffers(const iovec * iovecs, unsigned count) 
noexcept diff --git a/src/fibers/tests/io-fixed-test.cpp b/src/fibers/tests/io-fixed-test.cpp new file mode 100644 index 0000000..f94525b --- /dev/null +++ b/src/fibers/tests/io-fixed-test.cpp @@ -0,0 +1,85 @@ +#include + +#include + +#include +#include +#include + +#include +#include + +namespace silk +{ + +// writeFixed then readFixed against a registered buffer. It must test +// round-trip all the three apis, including reads into a non-base offset +// within the registered region. +TEST(IoFixed, writeReadRoundTrip) +{ + static constexpr uint64_t BLOCK = 4096; + static constexpr uint64_t NBLOCKS = 2; + static constexpr uint64_t SIZE = BLOCK * NBLOCKS; + + char tmpl[] = "/tmp/silk-io-fixed-XXXXXX"; + int fd = ::mkstemp(tmpl); + ASSERT_GE(fd, 0) << std::strerror(errno); + ::unlink(tmpl); + ASSERT_EQ(::ftruncate(fd, static_cast(SIZE)), 0) << std::strerror(errno); + + // Single contiguous registration covering the whole buffer; bufIndex 0. + char * buf = static_cast(std::malloc(SIZE)); + ASSERT_NE(buf, nullptr); + iovec reg{buf, SIZE}; + FiberScheduler::registerBuffers(&reg, 1); + + struct Params + { + int fd; + char * buf; + + static int fiberMain(Params * p) noexcept + { + // Fill block 0 with a known pattern and write it out via WRITE_FIXED. + for (uint64_t i = 0; i < BLOCK; ++i) + { + p->buf[i] = static_cast((i * 7 + 1) & 0xFF); + } + + uint64_t bytesWritten = 0; + FiberScheduler::IoFuture wf; + FiberScheduler::writeFixed(p->fd, p->buf, BLOCK, 0, 0, &bytesWritten, &wf); + EXPECT_EQ(wf.wait(), 0); + EXPECT_EQ(bytesWritten, BLOCK); + + // Read back into the SECOND block: a non-base offset still inside the + // registered region (exercises the "buf within registered buffer" + // contract) that is deliberately left untouched by userspace. Under + // MSan it is poisoned, so the only thing that can mark it initialized + // is the kernel fill + readFixed's MSAN_UNPOISON. If readFixed forgot + // to unpoison, the comparison below ends up as a use-of-uninitialized read.
+ char * dst = p->buf + BLOCK; + uint64_t bytesRead = 0; + FiberScheduler::IoFuture rf; + FiberScheduler::readFixed(p->fd, dst, BLOCK, 0, 0, &bytesRead, &rf); + EXPECT_EQ(rf.wait(), 0); + EXPECT_EQ(bytesRead, BLOCK); + + // The kernel-filled bytes must match what we wrote (and must be + // readable without tripping MSan). + for (uint64_t i = 0; i < BLOCK; ++i) + { + EXPECT_EQ(dst[i], static_cast((i * 7 + 1) & 0xFF)) << "mismatch at byte " << i; + } + + return 0; + } + }; + + EXPECT_EQ(FiberScheduler::run(Params::fiberMain, Params{fd, buf}), 0); + + std::free(buf); + ::close(fd); +} + +} // namespace silk From 55ce17f35f6f8b039fd5e50bc3d72e15f573b9d4 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Sun, 14 Jun 2026 21:46:11 +0000 Subject: [PATCH 5/7] chore: doc strings and assert fix Signed-off-by: Kaviraj --- include/silk/fibers/fiber.h | 9 +++++++++ src/fibers/fiber.cpp | 1 + src/fibers/tests/io-fixed-test.cpp | 11 +++++++++-- src/perf/file-perf.cpp | 2 +- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/silk/fibers/fiber.h b/include/silk/fibers/fiber.h index 2b2d780..873d73c 100644 --- a/include/silk/fibers/fiber.h +++ b/include/silk/fibers/fiber.h @@ -452,6 +452,15 @@ class FiberScheduler * is work-stolen to another CPU can still submit READ_FIXED/WRITE_FIXED * referencing the same index. Call once after initialize(), before issuing any * fixed-buffer IO. @p count buffers are addressable as bufIndex 0..count-1. + * + * You can only call this once. io_uring allows one buffer set per ring, and + * there is no way to undo or change it, so a second call fails (-EBUSY) and + * trips an assert. + * + * Each ring pins its own copy of the buffers in memory, so the total locked + * memory is (number of CPUs) * (size of all buffers). With many CPUs or large + * buffers this can go over the RLIMIT_MEMLOCK limit; if it does, registration + * fails and trips an assert. 
*/ static void registerBuffers(const iovec * iovecs, unsigned count) noexcept; diff --git a/src/fibers/fiber.cpp b/src/fibers/fiber.cpp index 39cb25f..d613c6a 100644 --- a/src/fibers/fiber.cpp +++ b/src/fibers/fiber.cpp @@ -1458,6 +1458,7 @@ void FiberScheduler::writeFixed( void FiberScheduler::registerBuffers(const iovec * iovecs, unsigned count) noexcept { + SILK_ASSERT(scheduler, "registerBuffers called before initialize()"); for (uint32_t cpu = 0; cpu < scheduler->processorCount; ++cpu) { ProcessorState * processor = &scheduler->processorState[cpu]; diff --git a/src/fibers/tests/io-fixed-test.cpp b/src/fibers/tests/io-fixed-test.cpp index f94525b..5cf7ab0 100644 --- a/src/fibers/tests/io-fixed-test.cpp +++ b/src/fibers/tests/io-fixed-test.cpp @@ -12,8 +12,8 @@ namespace silk { -// writeFixed then readFixed against a registered buffer. It must test -// round-trip all the three apis, including reads into a non-base offset +// writeFixed then readFixed against a registered buffer. It must test +// round-trip all the three apis, including reads into a non-base offset // within the registered region. TEST(IoFixed, writeReadRoundTrip) { @@ -82,4 +82,11 @@ TEST(IoFixed, writeReadRoundTrip) ::close(fd); } +// TODO(kavi): this test runs a single fiber on one CPU. The whole point of +// registering buffers on every ring is that a fiber can move to another CPU and +// still use the same bufIndex. We don't test that here because we can't reliably +// force a fiber to move to a specific CPU, so the test would be flaky. If we need +// to be sure this works, add a second test that forces the move and checks the +// fixed IO still works. + } // namespace silk diff --git a/src/perf/file-perf.cpp b/src/perf/file-perf.cpp index 8a05080..6ba4c29 100644 --- a/src/perf/file-perf.cpp +++ b/src/perf/file-perf.cpp @@ -397,7 +397,7 @@ int main(int argc, char ** argv) ("warmup", "warmup duration (e.g. 
2s, 500ms)", cxxopts::value(warmupStr)) ("filename", "file path", cxxopts::value(cfg.filename)) ("direct", "use O_DIRECT (bypass page cache)", cxxopts::value(cfg.direct)) - ("fixed-buffers", "use registered buffers (IORING_OP_READ_FIXED / WRITE_FIXED)", cxxopts::value(cfg.fixedBuffers)) + ("fixed-buffers", "use registered buffers (IORING_OP_READ_FIXED / WRITE_FIXED)", cxxopts::value(cfg.fixedBuffers)) ("print-counters", "enable per-CPU profiler and include counters in the JSON report", cxxopts::value(cfg.printCounters)) ("v,verbose", "enable debug logging", cxxopts::value(verbose)) ; From c87f30b9dfa9e9dd2e13b83580027ed00b7bb42c Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Sun, 14 Jun 2026 22:48:36 +0000 Subject: [PATCH 6/7] chore: fix ASSERT convention with key=value Signed-off-by: Kaviraj --- src/fibers/fiber.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fibers/fiber.cpp b/src/fibers/fiber.cpp index d613c6a..bb26351 100644 --- a/src/fibers/fiber.cpp +++ b/src/fibers/fiber.cpp @@ -1467,7 +1467,7 @@ void FiberScheduler::registerBuffers(const iovec * iovecs, unsigned count) noexc continue; } int r = ::io_uring_register_buffers(&processor->ring, iovecs, count); - SILK_ASSERT(r == 0, "io_uring_register_buffers failed on cpu %u: %d", cpu, r); + SILK_ASSERT(r == 0, "io_uring_register_buffers failed: cpu=%u, ret=%d", cpu, r); } } From cc9491151c45a3c5fb2adda79b3018e7e3cf45d7 Mon Sep 17 00:00:00 2001 From: Kaviraj Date: Sun, 14 Jun 2026 22:49:11 +0000 Subject: [PATCH 7/7] doc: update README, perf and scheduler doc Document the new apis in corresponding docs Signed-off-by: Kaviraj --- README.md | 2 ++ docs/perf.md | 8 +++++++- docs/scheduler.md | 11 +++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index de45a31..334df6f 100644 --- a/README.md +++ b/README.md @@ -146,12 +146,14 @@ Async file I/O benchmark using io_uring. 
| `--rw MODE [MODE ...]` | `randread` | Access mode(s): `randread`, `randwrite`, `seqread` | | `--flamegraph` | | Profile and generate flamegraph SVG | | `--print-counters` | | Print perf counters after each run | +| `--fixed-buffers` | | Use registered buffers (`IORING_OP_READ_FIXED` / `WRITE_FIXED`) | ``` ./bb -b release file-perf ./bb -b release file-perf --bs 64k --size 4g ./bb -b release file-perf --numjobs 1 16 --iodepth 1 16 ./bb -b release file-perf --rw randread randwrite +./bb -b release file-perf --fixed-buffers ./bb -b release file-perf --flamegraph ``` diff --git a/docs/perf.md b/docs/perf.md index 2b7cdfc..137f13e 100644 --- a/docs/perf.md +++ b/docs/perf.md @@ -8,7 +8,7 @@ The main tables are reproducible with `./bb -b release perf --duration 60s --war ## file-perf -- async file I/O -`/dev/shm` (tmpfs, in-memory), bs=4k, size=1 GiB, 60 s measurement, 10 s warmup. Uses `FiberScheduler::read`/`write` (`IORING_OP_READV` / `IORING_OP_WRITEV`). `numjobs` = concurrent worker fibers; `iodepth` = per-fiber async IO queue depth (ring of `IoFuture`s). +`/dev/shm` (tmpfs, in-memory), bs=4k, size=1 GiB, 60 s measurement, 10 s warmup. Uses `FiberScheduler::read`/`write` (`IORING_OP_READV` / `IORING_OP_WRITEV`). `numjobs` = concurrent worker fibers; `iodepth` = per-fiber async IO queue depth (ring of `IoFuture`s). Pass `--fixed-buffers` to switch to registered buffers (`IORING_OP_READ_FIXED` / `IORING_OP_WRITE_FIXED`) -- see the subsection below. 
| numjobs | iodepth | mode | IOPS | BW | avg | p50 | p95 | p99 | p99.9 | |---|---|---|---|---|---|---|---|---|---| @@ -27,6 +27,12 @@ The main tables are reproducible with `./bb -b release perf --duration 60s --war **Note on batching**: The default `Options::ioUringFlushThreshold = 64` defers `io_uring_submit` until the SQ ring has accumulated enough work to amortize the syscall -- the right trade for network/HTTP/S3 workloads where completion latency dwarfs the few-µs batching delay (see net-perf below for the resulting p99 win). On tmpfs the kernel completes reads inline at submit time, so any deferral pushes submissions off the inline-completion fast path. `file-perf` therefore initializes the scheduler with `ioUringFlushThreshold = 1`, equivalent to per-fiber submit. Measured under the default threshold (64), `16/1 randread` lands at ~1.6M IOPS and `16/16 randread` at ~4.2M -- the override recovers full throughput without any kernel or scheduler change. +### Registered buffers (`--fixed-buffers`) + +`./bb -b release perf --duration 60s --warmup 10s file --fixed-buffers` reruns the same matrix with registered buffers: each worker registers one buffer (covering its whole `iodepth * blockSize` block) on every per-CPU ring via `FiberScheduler::registerBuffers`, then issues `readFixed`/`writeFixed` against it (see `docs/scheduler.md`). The kernel reuses the pre-pinned mapping and skips the per-IO page-pin and iovec import. + +The win is largest where per-IO buffer setup is the dominant cost: high-concurrency writes (`16` jobs) gain the most IOPS and shed the most average and tail latency. Reads, already inline-completed on tmpfs, see smaller gains except at `16/16`. + --- ## fio comparison (io_uring, /dev/shm, bs=4k, size=1 GiB) diff --git a/docs/scheduler.md b/docs/scheduler.md index d7bccdf..8587e3a 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -103,6 +103,17 @@ All sleep deadlines are in TSC cycles. 
`Tsc::getCycles()` is `rdtsc` / `cntvct_e Each of `read`, `write`, and `poll` has two overloads: a blocking form that submits an io_uring SQE and suspends the fiber until completion, and an async form that submits the SQE and returns immediately with an `IoFuture*` the caller waits on separately. `handleCompletionQueue` processes CQEs, extracts the `IoFuture*` from the CQE user data, and calls `future->set()` to wake the waiting fiber. +### Registered (fixed) buffers + +`readFixed` / `writeFixed` are async-only counterparts to `read`/`write` that submit `IORING_OP_READ_FIXED` / `IORING_OP_WRITE_FIXED` against a buffer that was pre-registered with the kernel via `registerBuffers`. Instead of passing an iovec the kernel must pin per IO, the caller passes a `(buf, len)` plus the `bufIndex` of a previously registered buffer; the kernel reuses the pre-pinned page mapping and skips the per-IO page-pin and iovec import. `buf` must lie inside the registered buffer at `bufIndex`. + +`registerBuffers(iovecs, count)` registers one buffer set on **every** per-CPU io_uring ring, so a fiber that is work-stolen to another CPU can still submit fixed IO referencing the same index. Buffers are addressable as `bufIndex` `0..count-1`. Constraints: + +- Call once, after `initialize()` and before issuing any fixed-buffer IO. io_uring allows a single buffer set per ring with no way to undo or change it, so a second call fails (`-EBUSY`) and trips an assert. +- Each ring pins its own copy, so total locked memory is `(number of CPUs) * (size of all buffers)`. With many CPUs or large buffers this can exceed `RLIMIT_MEMLOCK`; registration then fails and trips an assert. + +The underlying liburing helpers are `io_uring_register_buffers`, `io_uring_prep_read_fixed`, and `io_uring_prep_write_fixed`. `file-perf --fixed-buffers` exercises this path (see `docs/perf.md`). + --- ## Sleep Cancellation