From bfacac4d6f71dad373752e16dc1a7d965bb426ec Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Wed, 14 Jan 2026 00:09:33 +0530 Subject: [PATCH 01/26] Add async runtime for io uring --- src/common/src/io_mode.rs | 6 + src/parquet/Cargo.toml | 1 + src/parquet/src/io/io_backend.rs | 20 ++ src/parquet/src/io/io_uring/mod.rs | 2 + src/parquet/src/io/io_uring/runtime.rs | 283 +++++++++++++++++++++++++ src/parquet/src/io/io_uring/tasks.rs | 55 ++++- src/parquet/src/io/io_uring/tests.rs | 36 ++++ 7 files changed, 392 insertions(+), 11 deletions(-) create mode 100644 src/parquet/src/io/io_uring/runtime.rs diff --git a/src/common/src/io_mode.rs b/src/common/src/io_mode.rs index 747f0d5c..75b11295 100644 --- a/src/common/src/io_mode.rs +++ b/src/common/src/io_mode.rs @@ -26,6 +26,10 @@ pub enum IoMode { #[serde(rename = "uring-blocking")] UringBlocking, + /// Uses an io_uring runtime + #[serde(rename = "uring-non-blocking")] + UringNonBlocking, + /// Uses rust's std::fs::File, this is blocking IO. /// On Linux, this is essentially `pread/pwrite` /// This is the default on non-Linux platforms. @@ -57,6 +61,7 @@ impl Display for IoMode { IoMode::StdBlocking => "std-blocking", IoMode::TokioIO => "tokio", IoMode::StdSpawnBlocking => "std-spawn-blocking", + IoMode::UringNonBlocking => "uring-non-blocking", } ) } @@ -75,6 +80,7 @@ impl FromStr for IoMode { "std-blocking" => IoMode::StdBlocking, "tokio" => IoMode::TokioIO, "std-spawn-blocking" => IoMode::StdSpawnBlocking, + "uring-non-blocking" => IoMode::UringNonBlocking, _ => return Err(format!("Invalid IO mode: {s}")), }) } diff --git a/src/parquet/Cargo.toml b/src/parquet/Cargo.toml index 25eb39c2..bdc854f6 100644 --- a/src/parquet/Cargo.toml +++ b/src/parquet/Cargo.toml @@ -27,6 +27,7 @@ parquet-variant-compute = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } itertools = "0.14.0" +async-executor = "1.13.3" [target.'cfg(target_os = "linux")'.dependencies] io-uring = "0.7.10" diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index f915ded3..ab166583 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -55,6 +55,16 @@ pub(super) async fn read( panic!("io_uring modes are only supported on Linux"); } } + IoMode::UringNonBlocking => { + #[cfg(target_os = "linux")] + { + super::io_uring::runtime::read(path, range, false).await + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } IoMode::UringMultiAsync => { #[cfg(target_os = "linux")] { @@ -109,6 +119,16 @@ pub(super) async fn write( panic!("io_uring modes are only supported on Linux"); } } + IoMode::UringNonBlocking => { + #[cfg(target_os = "linux")] + { + super::io_uring::runtime::write(path, &data).await + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } IoMode::UringMultiAsync => { #[cfg(target_os = "linux")] { diff --git a/src/parquet/src/io/io_uring/mod.rs b/src/parquet/src/io/io_uring/mod.rs index 692fc8d1..f9827703 100644 --- a/src/parquet/src/io/io_uring/mod.rs +++ b/src/parquet/src/io/io_uring/mod.rs @@ -7,5 +7,7 @@ pub(crate) use thread_pool_uring::initialize_uring_pool; pub(crate) mod single_uring; +pub(crate) mod runtime; + #[cfg(test)] mod tests; diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs new file mode 100644 index 00000000..59886f96 --- /dev/null +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -0,0 +1,283 @@ +use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::fd::AsRawFd as _, path::PathBuf, pin::Pin, rc::Rc, sync::atomic::{AtomicBool, Ordering}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}}; + +use async_executor::LocalExecutor; +use bytes::Bytes; +use futures::Future; +use io_uring::{IoUring, squeue, cqueue}; +use tokio::sync::oneshot; + +use crate::io::io_uring::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; + +const URING_NUM_ENTRIES: u32 = 128; + +const MAX_CONCURRENT_TASKS: u32 = 128; + +type ExecutorTask = Pin + Send>>; + +pub struct UringExecutor { + workers: Vec>, + sender: crossbeam_channel::Sender, +} + +impl UringExecutor { + /// Spawn worker threads and initialize channel to receive tasks + pub fn new(num_threads: usize) -> UringExecutor { + let mut workers = Vec::new(); + let (sender, receiver) = crossbeam_channel::unbounded::(); + for i in 0..num_threads { + let receiver_clone = receiver.clone(); + let worker = thread::Builder::new() + .name(std::format!("lc-io-worker-{}", i)) + .spawn(move || { + worker_main_loop(receiver_clone); + }) + .expect("Failed to spawn IO runtime worker"); + workers.push(worker); + } + UringExecutor { + workers, + sender, + } + } + + /// Spawns a task in the uring runtime by sending it through a crossbeam channel. + /// The result is received through a oneshot channel + pub fn spawn(self: &mut Self, future: F) -> oneshot::Receiver + where + F::Output: Send + 'static, + { + let (sender, receiver) = oneshot::channel(); + let f = async move { + let output = future.await; + let _res = sender.send(output); + if !_res.is_ok() { + panic!("Failed to send task result back"); + } + }; + let task = Box::pin(f); + self.sender.send(task).expect("UringExecutor failed to send task"); + receiver + } + + pub fn run_to_completion(self: &mut Self, future: F) -> F::Output + where + F::Output: Send + 'static, + { + let receiver = self.spawn(future); + receiver.blocking_recv().expect("Failed to receive result") + } +} + +thread_local! { + static LOCAL_WORKER: RefCell = RefCell::new(RuntimeWorker::new()); +} + +struct RuntimeWorker { + ring: io_uring::IoUring, + inflight_tasks: Vec>, + tokens: VecDeque, + need_submit: bool, + io_performed: u64, +} + +impl RuntimeWorker { + pub fn new() -> RuntimeWorker { + let builder = IoUring::::builder(); + let ring = builder + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + let mut tokens = VecDeque::::with_capacity(MAX_CONCURRENT_TASKS as usize); + let mut inflight_tasks = Vec::>::with_capacity(MAX_CONCURRENT_TASKS as usize); + for i in 0..MAX_CONCURRENT_TASKS { + tokens.push_back(i as u16); + inflight_tasks.push(None); + } + + RuntimeWorker { + ring, + inflight_tasks, + tokens, + need_submit: false, + io_performed: 0, + } + } + + fn poll_completions(self: &mut Self) { + let cq = &mut self.ring.completion(); + loop { + cq.sync(); + match cq.next() { + Some(cqe) => { + println!("Received completion"); + let token = cqe.user_data() as usize; + let task = self.inflight_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + task.inner.borrow_mut().complete(&cqe); + unsafe { (*task.completed).store(true, Ordering::Relaxed); } + task.waker.wake(); + self.tokens.push_back(token as u16); + self.io_performed += 1; + } + None => break, + } + } + } + + fn submit_task(self: &mut Self, task: AsyncTask) { + println!("Submitting task"); + let token = self.tokens.pop_front().expect("No more tokens"); + let sq = &mut self.ring.submission(); + let sqe = task.inner.borrow_mut().prepare_sqe().user_data(token as u64); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.inflight_tasks[token as usize] = Some(task); + self.need_submit = true; + } + + pub fn add_task(task: AsyncTask) { + LOCAL_WORKER.with(|worker| { + let mut worker = worker.borrow_mut(); + worker.submit_task(task); + }); + } +} + +fn worker_main_loop(receiver: crossbeam_channel::Receiver) { + let executor = LocalExecutor::new(); + loop { + while !receiver.is_empty() { + let task = receiver.recv() + .expect("Failed to receive task"); + println!("Spawning task"); + executor.spawn(task).detach(); + } + let task_found = executor.try_tick(); + LOCAL_WORKER.with(|worker| { + let mut worker = worker.borrow_mut(); + let mut num_completions = 0; + if worker.need_submit { + num_completions = worker.ring.submit().expect("Failed to submit"); + worker.need_submit = false; + } else if !task_found && worker.tokens.len() < MAX_CONCURRENT_TASKS as usize { + num_completions = worker.ring.submit_and_wait(1).expect("Failed to submit"); + } + if num_completions > 0 { + worker.poll_completions(); + } + }); + } +} + +struct AsyncTask { + // Note: Should change this to Arc in case of a work-stealing scheduler + pub inner: Rc>, + pub waker: Waker, + pub completed: *mut AtomicBool, +} + +enum UringState +{ + Undecided, + Created, + Submitted, +} + +pub(crate) struct UringFuture +where + T: IoTask + 'static, +{ + state: UringState, + task: Rc>, + completed: AtomicBool, +} + +unsafe impl Send for UringFuture +where T: IoTask + 'static, {} + +impl UringFuture +where + T: IoTask + 'static, +{ + fn new(task: Rc>) -> UringFuture { + UringFuture { + state: UringState::Created, + task: task, + completed: AtomicBool::new(false), + } + } +} + +impl Future for UringFuture +where + T: IoTask + 'static, +{ + type Output = Rc>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + loop { + let state = std::mem::replace(&mut self.state, UringState::Undecided); + match state { + UringState::Created => { + let async_task = AsyncTask { + inner: self.task.clone(), + waker: cx.waker().clone(), + completed: &mut self.completed, + }; + RuntimeWorker::add_task(async_task); + self.state = UringState::Submitted; + } + UringState::Submitted => match self.completed.load(Ordering::Relaxed) { + true => { + return Poll::Ready(self.task.clone()); + } + false => { + self.state = UringState::Submitted; + return Poll::Pending; + } + } + UringState::Undecided => unreachable!("state cannot be undecided during poll"), + } + } + } +} + +fn submit_async_task(task: T) -> UringFuture +where + T: IoTask + 'static, +{ + UringFuture::new(Rc::new(RefCell::new(task))) +} + +pub(crate) async fn read( + path: PathBuf, + range: Option>, + direct_io: bool, +) -> Result { + let open_task = FileOpenTask::build(path, direct_io)?; + let file = submit_async_task(open_task).await.borrow_mut().get_result()?; + + let effective_range = if let Some(range) = range { + range + } else { + let len = file.metadata()?.len(); + 0..len + }; + + let read_task = FileReadTask::build(effective_range, file, direct_io); + submit_async_task(read_task).await.borrow_mut().get_result() +} + +pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(path) + .expect("failed to create file"); + + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + submit_async_task(write_task).await.borrow_mut().get_result() +} \ No newline at end of file diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 9f79bc5a..346da682 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -1,13 +1,8 @@ use std::{ - any::Any, - ffi::CString, - fs, mem, - ops::Range, - os::{ + any::Any, cell::RefCell, ffi::CString, fs, mem, ops::Range, os::{ fd::{AsRawFd, FromRawFd, RawFd}, unix::ffi::OsStringExt, - }, - path::PathBuf, + }, path::PathBuf, rc::Rc }; use bytes::Bytes; @@ -16,7 +11,7 @@ use io_uring::{cqueue, opcode, squeue}; pub(crate) const BLOCK_ALIGN: usize = 4096; /// Represents an IO request to the uring worker thread. -pub(crate) trait IoTask: Send + Any + std::fmt::Debug { +pub trait IoTask: Send + Any + std::fmt::Debug { /// Convert the request to an io-uring submission queue entry. fn prepare_sqe(&mut self) -> squeue::Entry; @@ -28,7 +23,7 @@ pub(crate) trait IoTask: Send + Any + std::fmt::Debug { } #[derive(Debug)] -pub(crate) struct FileOpenTask { +pub struct FileOpenTask { path: CString, direct_io: bool, fd: Option, @@ -63,6 +58,18 @@ impl FileOpenTask { let file = unsafe { fs::File::from_raw_fd(fd) }; Ok(file) } + + pub(crate) fn get_result(self: &mut Self) -> Result { + if let Some(err) = self.error.take() { + return Err(err); + } + let fd = self.fd.take().ok_or_else(|| { + std::io::Error::other("open operation completed without returning file descriptor") + })?; + // SAFETY: `fd` has been received from the kernel for this task and is uniquely owned here. + let file = unsafe { fs::File::from_raw_fd(fd) }; + Ok(file) + } } impl IoTask for FileOpenTask { @@ -104,7 +111,7 @@ impl Drop for FileOpenTask { } #[derive(Debug)] -pub(crate) struct FileReadTask { +pub struct FileReadTask { buffer: Vec, aligned_offset: usize, file: fs::File, @@ -180,6 +187,24 @@ impl FileReadTask { Ok(bytes.slice(data_start..data_end)) } + + /// Return a bytes object holding the result of the read operation. + #[inline] + pub(crate) fn get_result(self: &mut Self) -> Result { + if let Some(err) = self.error.take() { + return Err(err); + } + + let (start_padding, _) = self.padding(); + let range_len = (self.range.end - self.range.start) as usize; + let data_start = self.aligned_offset + start_padding; + let data_end = data_start + range_len; + + let buffer = mem::take(&mut self.buffer); + let bytes = Bytes::from(buffer); + + Ok(bytes.slice(data_start..data_end)) + } } impl IoTask for FileReadTask { @@ -216,7 +241,7 @@ impl IoTask for FileReadTask { } #[derive(Debug)] -pub(crate) struct FileWriteTask { +pub struct FileWriteTask { data: Bytes, fd: RawFd, error: Option, @@ -238,6 +263,14 @@ impl FileWriteTask { } Ok(()) } + + #[inline] + pub(crate) fn get_result(self: &mut Self) -> Result<(), std::io::Error> { + if let Some(err) = self.error.take() { + return Err(err); + } + Ok(()) + } } impl IoTask for FileWriteTask { diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index 94cb808d..7f4bc14c 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -1,5 +1,7 @@ #![cfg(target_os = "linux")] +use crate::io::io_uring::runtime::{self, UringExecutor}; + use super::{ initialize_uring_pool, multi_async_uring, multi_blocking_uring, single_uring, thread_pool_uring, }; @@ -150,3 +152,37 @@ fn read_write_roundtrip_all_backends() { drop(tmpdir); } } + +/// Non-blocking uring requires a dedicated runtime +#[test] +fn read_write_roundtrip_non_blocking_uring() { + let original: Vec = (0..128).map(|i| (i as u8).wrapping_mul(3)).collect(); + let mut executor = UringExecutor::new(1); + + let (tmpdir, path) = seed_file(&original); + let path_clone = path.clone(); + let read_bytes = executor.run_to_completion(async move { + runtime::read(path_clone, None, false).await + }).unwrap_or_else(|err| panic!("read failed: {err}")); + assert_eq!( + read_bytes.as_ref(), + original.as_slice(), + "read returned unexpected payload", + ); + + let new_payload: Vec = (0..64).map(|i| (i as u8).wrapping_add(1)).collect(); + let bytes = Bytes::from(new_payload.clone()); + let path_clone = path.clone(); + executor.run_to_completion(async move { + runtime::write(path_clone, &bytes.clone()).await + }).unwrap_or_else(|err| panic!("write failed: {err}")); + + let on_disk = fs::read(&path).expect("failed to read updated file"); + assert_eq!( + on_disk, + new_payload, + "wrote unexpected data", + ); + + drop(tmpdir); +} From 533869ea3bcac2ef13148716869d918002645010 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Wed, 14 Jan 2026 00:15:41 +0530 Subject: [PATCH 02/26] Fix --- src/parquet/src/io/io_uring/runtime.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index 59886f96..ea5da015 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -108,7 +108,6 @@ impl RuntimeWorker { cq.sync(); match cq.next() { Some(cqe) => { - println!("Received completion"); let token = cqe.user_data() as usize; let task = self.inflight_tasks[token] .take() @@ -125,7 +124,6 @@ impl RuntimeWorker { } fn submit_task(self: &mut Self, task: AsyncTask) { - println!("Submitting task"); let token = self.tokens.pop_front().expect("No more tokens"); let sq = &mut self.ring.submission(); let sqe = task.inner.borrow_mut().prepare_sqe().user_data(token as u64); @@ -151,22 +149,18 @@ fn worker_main_loop(receiver: crossbeam_channel::Receiver) { while !receiver.is_empty() { let task = receiver.recv() .expect("Failed to receive task"); - println!("Spawning task"); executor.spawn(task).detach(); } let task_found = executor.try_tick(); LOCAL_WORKER.with(|worker| { let mut worker = worker.borrow_mut(); - let mut num_completions = 0; if worker.need_submit { - num_completions = worker.ring.submit().expect("Failed to submit"); + worker.ring.submit().expect("Failed to submit"); worker.need_submit = false; } else if !task_found && worker.tokens.len() < MAX_CONCURRENT_TASKS as usize { - num_completions = worker.ring.submit_and_wait(1).expect("Failed to submit"); - } - if num_completions > 0 { - worker.poll_completions(); + worker.ring.submit_and_wait(1).expect("Failed to submit"); } + worker.poll_completions(); }); } } From 1456d6f781085dce017c6ee6fb214e79b8b70ad1 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 22 Feb 2026 12:52:42 -0600 Subject: [PATCH 03/26] Add a fixed buffer allocator for io_uring This commit implements a buffer pool for fixed buffers in order to remove memory allocation and pinning overheads during io submission. The buffer pool is integrated into the uring threadpool mechanism. Also implements minor optimizations in the uring threadpool, such as syscall batching. --- benchmark/bench_server.rs | 4 + benchmark/in_process.rs | 9 +- benchmark/src/inprocess_runner.rs | 9 + dev/README.md | 18 +- examples/example_server.rs | 1 + src/common/Cargo.toml | 6 + src/common/src/lib.rs | 1 + src/common/src/memory/arena.rs | 125 +++++ src/common/src/memory/global_pool.rs | 0 src/common/src/memory/mod.rs | 6 + src/common/src/memory/page.rs | 216 +++++++++ src/common/src/memory/pool.rs | 454 ++++++++++++++++++ src/common/src/memory/segment.rs | 156 ++++++ src/common/src/memory/tcache.rs | 454 ++++++++++++++++++ src/local/src/lib.rs | 10 + src/parquet/Cargo.toml | 3 +- src/parquet/bench/filter_pushdown.rs | 1 + src/parquet/src/cache/mod.rs | 4 +- src/parquet/src/cache/stats.rs | 1 + src/parquet/src/io/io_backend.rs | 24 +- .../src/io/io_uring/multi_async_uring.rs | 8 +- .../src/io/io_uring/multi_blocking_uring.rs | 8 +- src/parquet/src/io/io_uring/runtime.rs | 9 +- src/parquet/src/io/io_uring/single_uring.rs | 8 +- src/parquet/src/io/io_uring/tasks.rs | 173 ++++++- src/parquet/src/io/io_uring/tests.rs | 12 +- .../src/io/io_uring/thread_pool_uring.rs | 245 ++++++++-- src/parquet/src/io/mod.rs | 14 +- src/parquet/src/optimizers/mod.rs | 1 + .../src/reader/runtime/liquid_cache_reader.rs | 1 + .../src/reader/runtime/liquid_stream.rs | 1 + src/server/src/lib.rs | 3 + src/server/src/service.rs | 3 + src/server/src/tests/mod.rs | 1 + 34 files changed, 1880 insertions(+), 109 deletions(-) create mode 100644 src/common/src/memory/arena.rs create mode 100644 src/common/src/memory/global_pool.rs create mode 100644 src/common/src/memory/mod.rs create mode 100644 src/common/src/memory/page.rs create mode 100644 src/common/src/memory/pool.rs create mode 100644 src/common/src/memory/segment.rs create mode 100644 src/common/src/memory/tcache.rs diff --git a/benchmark/bench_server.rs b/benchmark/bench_server.rs index 17563ede..753f1939 100644 --- a/benchmark/bench_server.rs +++ b/benchmark/bench_server.rs @@ -51,6 +51,9 @@ struct CliArgs { /// IO mode, available options: uring, uring-direct, std-blocking, tokio, std-spawn-blocking #[arg(long = "io-mode", default_value = "uring-multi-async")] io_mode: IoMode, + + #[arg(long = "fixed-buffer-pool-size-mb", default_value = "0")] + fixed_buffer_pool_size_mb: usize, } #[tokio::main] @@ -81,6 +84,7 @@ async fn main() -> Result<(), Box> { squeeze_policy, Box::new(NoHydration::new()), Some(args.io_mode), + args.fixed_buffer_pool_size_mb, )?; let liquid_cache_server = Arc::new(liquid_cache_server); diff --git a/benchmark/in_process.rs b/benchmark/in_process.rs index 8ff6259a..444b963f 100644 --- a/benchmark/in_process.rs +++ b/benchmark/in_process.rs @@ -4,7 +4,7 @@ use fastrace::prelude::*; use liquid_cache_benchmarks::{ BenchmarkManifest, InProcessBenchmarkMode, InProcessBenchmarkRunner, setup_observability, }; -use liquid_cache_common::IoMode; +use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use mimalloc::MiMalloc; use serde::Serialize; use std::path::PathBuf; @@ -70,6 +70,9 @@ struct InProcessBenchmark { /// IO mode, available options: uring, uring-direct, std-blocking, tokio, std-spawn-blocking #[arg(long = "io-mode", default_value = "uring-multi-async")] io_mode: IoMode, + + #[arg(long = "fixed-buffer-pool-size-mb", default_value = "0")] + fixed_buffer_pool_size_mb: usize, } impl InProcessBenchmark { @@ -88,7 +91,8 @@ impl InProcessBenchmark { .with_cache_dir(self.cache_dir.clone()) .with_query_filter(self.query_index) .with_io_mode(self.io_mode) - .with_output_dir(self.output_dir.clone()); + .with_output_dir(self.output_dir.clone()) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb); runner.run(manifest, self, output).await?; Ok(()) } @@ -102,6 +106,7 @@ async fn main() -> Result<()> { let _guard = root.set_local_parent(); benchmark.run().await?; + FixedBufferPool::print_stats(); fastrace::flush(); Ok(()) } diff --git a/benchmark/src/inprocess_runner.rs b/benchmark/src/inprocess_runner.rs index 5e535933..c571ba56 100644 --- a/benchmark/src/inprocess_runner.rs +++ b/benchmark/src/inprocess_runner.rs @@ -202,6 +202,7 @@ pub struct InProcessBenchmarkRunner { pub io_mode: IoMode, pub output_dir: Option, pub collect_perf_events: bool, + pub fixed_buffer_pool_size_mb: usize, } impl Default for InProcessBenchmarkRunner { @@ -224,6 +225,7 @@ impl InProcessBenchmarkRunner { io_mode: IoMode::default(), output_dir: None, collect_perf_events: false, + fixed_buffer_pool_size_mb: 0, } } @@ -282,6 +284,11 @@ impl InProcessBenchmarkRunner { self } + pub fn with_fixed_buffer_pool_size_mb(mut self, fixed_buffer_pool_size_mb: usize) -> Self { + self.fixed_buffer_pool_size_mb = fixed_buffer_pool_size_mb; + self + } + #[fastrace::trace] async fn setup_context( &self, @@ -348,6 +355,7 @@ impl InProcessBenchmarkRunner { .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .with_io_mode(self.io_mode) .with_eager_shredding(true) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb) .build(session_config)?; (v.0, Some(v.1)) } @@ -359,6 +367,7 @@ impl InProcessBenchmarkRunner { .with_hydration_policy(Box::new(NoHydration::new())) .with_squeeze_policy(Box::new(TranscodeEvict)) .with_io_mode(self.io_mode) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb) .build(session_config)?; (v.0, Some(v.1)) } diff --git a/dev/README.md b/dev/README.md index 1991c206..bfecdf63 100644 --- a/dev/README.md +++ b/dev/README.md @@ -22,7 +22,6 @@ LiquidCache exports OpenTelemetry traces. Spin up a Jaeger v2 ```bash docker run \ --name jaeger \ - --replace \ -e COLLECTOR_OTLP_ENABLED=true \ -p 16686:16686 \ -p 4317:4317 \ @@ -30,6 +29,8 @@ docker run \ cr.jaegertracing.io/jaegertracing/jaeger:2.11.0 ``` +If a container named `jaeger` already exists, remove it first: `docker rm -f jaeger` (or `podman rm -f jaeger`). + This image contains the Jaeger v2 distribution. Port 16686 exposes the frontend UI at http://localhost:16686. 4317 and 4318 expose OTLP over gRPC and HTTP respectively. @@ -76,6 +77,21 @@ This will trace the execution of `iteration = 2` (`arg1 == 2`) and print the `io [512, 1K) 194 |@@@ | ``` +```bash +sudo bpftrace -e ' + usdt:./target/release/in_process:liquid_benchmark:iteration_start /arg1 == 2/ {@enable = 1;} + usdt:./target/release/in_process:liquid_benchmark:iteration_start /arg1 > 2/ {@enable = 0;} + usdt:./target/release/in_process:io_submitted /@enable/ { + @t[arg0] = nsecs; + } + usdt:./target/release/in_process:io_completed /@enable && @t[arg0]/ { + $us = (nsecs - @t[arg0]) / 1000; + @lat = hist($us); + delete(@t[arg0]); + } + ' +``` + If you're using blocking io mode, try this: ```bash sudo bpftrace -e ' diff --git a/examples/example_server.rs b/examples/example_server.rs index 7fcc4c1e..d16d9563 100644 --- a/examples/example_server.rs +++ b/examples/example_server.rs @@ -17,6 +17,7 @@ async fn main() -> Result<(), Box> { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), Some(IoMode::default()), + 0, )?; let flight = FlightServiceServer::new(liquid_cache); diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index 5c059ccd..18dfbe26 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -12,10 +12,16 @@ arrow-flight = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } chrono = "0.4.42" +crossbeam = "0.8.4" futures = { workspace = true } +io-uring = "0.7.11" +libc = "0.2.177" +log.workspace = true object_store = { workspace = true } prost = { workspace = true } +rand = "0.9.2" serde = { workspace = true } +tempfile.workspace = true thiserror = "2.0.17" tokio = { workspace = true } url = { workspace = true } diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs index 05c5d68e..9d9682df 100644 --- a/src/common/src/lib.rs +++ b/src/common/src/lib.rs @@ -5,3 +5,4 @@ pub mod mock_store; pub mod rpc; pub mod utils; pub use io_mode::IoMode; +pub mod memory; diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs new file mode 100644 index 00000000..8258ba63 --- /dev/null +++ b/src/common/src/memory/arena.rs @@ -0,0 +1,125 @@ +use std::{io, os::raw::c_void, ptr::null_mut}; + +use io_uring::IoUring; + +use crate::memory::{ + page::Slice, pool::{FIXED_BUFFER_BITS, FIXED_BUFFER_SIZE_BYTES}, segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, Segment} +}; + +pub struct Arena { + size: usize, + slices: Vec, + used_bitmap: Vec, + /** + * Segments need to be aligned to 32MB boundaries. Hence the first segment's starting address + * could be different from the starting address of the allocated memory + */ + aligned_start_ptr: *mut u8, + actual_start_ptr: *mut u8, + buffers_registered: bool, +} + +unsafe impl Send for Arena {} +unsafe impl Sync for Arena {} + +impl Arena { + pub fn new(capacity: usize) -> Arena { + let mem_start = Self::allocate_memory_from_os(capacity); + assert_ne!(mem_start, null_mut()); + let mem_end = mem_start.wrapping_add(capacity); + let ptr_aligned = (mem_start as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; + let mut slice_start = ptr_aligned; + if ptr_aligned != (mem_start as usize) { + slice_start = ptr_aligned + SEGMENT_SIZE; + } + let mut slices = Vec::new(); + while slice_start + SEGMENT_SIZE <= mem_end as usize { + slices.push(Slice { + ptr: slice_start as *mut u8, + size: SEGMENT_SIZE, + }); + slice_start += SEGMENT_SIZE; + } + let mut used_bitmap = Vec::new(); + used_bitmap.resize(slices.len(), 0); + + Arena { + size: capacity, + slices: slices, + used_bitmap: used_bitmap, + aligned_start_ptr: ptr_aligned as *mut u8, + actual_start_ptr: mem_start, + buffers_registered: false, + } + } + + fn allocate_memory_from_os(capacity: usize) -> *mut u8 { + let prot = libc::PROT_READ | libc::PROT_WRITE; + let flags = libc::MAP_ANONYMOUS | libc::MAP_PRIVATE; + unsafe { libc::mmap64(null_mut(), capacity, prot, flags, -1, 0) as *mut u8 } + } + + pub fn allocate_segment(self: &mut Self, size: usize) -> Option<*mut Segment> { + let num_slices = (size + SEGMENT_SIZE - 1) / SEGMENT_SIZE; + let mut contiguous = 0; + let mut result: i32 = -1; + + for index in 0..self.used_bitmap.len() { + let bit = self.used_bitmap[index]; + if bit == 0 { + contiguous += 1; + if contiguous == num_slices { + result = (index + 1 - contiguous) as i32; + break; + } + } else { + contiguous = 0; + } + } + if result == -1 { + return None; + } + for i in 0..contiguous { + self.used_bitmap[result as usize + i] = 1; + } + let combined_slice = Slice { + ptr: self.slices[result as usize].ptr, + size: num_slices * SEGMENT_SIZE, + }; + Some(Segment::new_from_slice(combined_slice)) + } + + pub(crate) fn start_ptr(self: &Self) -> *mut u8 { + self.aligned_start_ptr + } + + pub(crate) fn retire_segment(self: &mut Self, segment: *mut Segment) { + debug_assert!((self.slices[0].ptr as usize) <= segment as usize); + let segment_idx = (segment as usize - self.slices[0].ptr as usize) / SEGMENT_SIZE; + self.used_bitmap[segment_idx] = 0; + } + + pub(crate) fn register_buffers_with_ring(self: &mut Self, ring: &IoUring) -> io::Result<()> { + let num_buffers = self.size >> FIXED_BUFFER_BITS; + let mut buffers = Vec::::new(); + buffers.reserve(num_buffers); + let mut base_ptr = self.aligned_start_ptr; + for _i in 0..num_buffers { + buffers.push(libc::iovec {iov_base: base_ptr as *mut std::ffi::c_void, iov_len: FIXED_BUFFER_SIZE_BYTES}); + base_ptr = base_ptr.wrapping_add(FIXED_BUFFER_SIZE_BYTES); + } + let res = unsafe { + ring.submitter().register_buffers(&buffers) + }; + self.buffers_registered = res.is_ok(); + res + } +} + +impl Drop for Arena { + fn drop(self: &mut Self) { + unsafe { + libc::munmap(self.actual_start_ptr as *mut c_void, self.size); + } + } +} \ No newline at end of file diff --git a/src/common/src/memory/global_pool.rs b/src/common/src/memory/global_pool.rs new file mode 100644 index 00000000..e69de29b diff --git a/src/common/src/memory/mod.rs b/src/common/src/memory/mod.rs new file mode 100644 index 00000000..75406a8e --- /dev/null +++ b/src/common/src/memory/mod.rs @@ -0,0 +1,6 @@ +pub mod page; +pub mod pool; +mod segment; +mod arena; +mod global_pool; +mod tcache; \ No newline at end of file diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs new file mode 100644 index 00000000..41162bff --- /dev/null +++ b/src/common/src/memory/page.rs @@ -0,0 +1,216 @@ +use std::{ptr::null_mut, sync::atomic::{AtomicU8, Ordering}, u8}; + +use crossbeam::utils::CachePadded; + +use crate::memory::tcache::MIN_SIZE_FROM_PAGES; + +pub const PAGE_SIZE: usize = 64<<10; // 64KB +const MAX_BLOCKS_PER_PAGE: usize = PAGE_SIZE/MIN_SIZE_FROM_PAGES; + +struct LocalFreeList { + head: u8, + tail: u8, + num_blocks: u8, + /** + * Stores the block indices within the page for a compact representation, rather than storing pointers. + * That is, if block index=i, it represents ith block from the start of the page. + */ + blocks: [u8; MAX_BLOCKS_PER_PAGE], +} + +impl LocalFreeList { + fn empty() -> LocalFreeList { + LocalFreeList { + head: 0, + tail: 0, + num_blocks: 0, + blocks: [0; MAX_BLOCKS_PER_PAGE], + } + } + + fn new(num_blocks: usize) -> LocalFreeList { + debug_assert!(num_blocks <= MAX_BLOCKS_PER_PAGE); + let mut blocks = [0u8; MAX_BLOCKS_PER_PAGE]; + for i in 0..num_blocks { + blocks[i] = i as u8; + } + LocalFreeList { head: 0, tail: num_blocks as u8, num_blocks: num_blocks as u8, blocks: blocks } + } + + fn push(&mut self, block: u8) { + debug_assert!(self.tail.wrapping_sub(self.head) < self.num_blocks); + self.blocks[self.tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] = block; + self.tail = self.tail.wrapping_add(1); + } + + fn is_empty(&self) -> bool { + self.head == self.tail + } + + fn pop(&mut self) -> Option { + if self.head == self.tail { + return None + } + let ret = self.blocks[self.head as usize & (MAX_BLOCKS_PER_PAGE - 1)]; + self.head = self.head.wrapping_add(1); + Some(ret) + } +} + +struct MPSCQueue { + head: u8, + tail: CachePadded, + num_blocks: u8, + blocks: [u8; MAX_BLOCKS_PER_PAGE], +} + +impl MPSCQueue { + const HAZARD: u8 = u8::MAX; + + fn new(num_blocks: usize) -> MPSCQueue { + debug_assert!(num_blocks <= MAX_BLOCKS_PER_PAGE); + MPSCQueue { + head: 0, + num_blocks: num_blocks as u8, + tail: CachePadded::new(AtomicU8::new(0)), + blocks: [Self::HAZARD; MAX_BLOCKS_PER_PAGE], + } + } + + fn push(&mut self, block: u8) { + loop { + let cur_tail = self.tail.load(Ordering::Relaxed); + assert!(cur_tail.wrapping_sub(self.head) < self.num_blocks); + let new_tail = cur_tail.wrapping_add(1); + if self.tail.compare_exchange(cur_tail, new_tail, Ordering::Relaxed, Ordering::Relaxed).is_ok() { + unsafe { + std::ptr::write_volatile(&mut self.blocks[cur_tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] as *mut u8, block); + } + return + } + } + } + + fn pop(&mut self) -> Option { + if self.head == self.tail.load(Ordering::Relaxed) { + return None + } + let idx = self.head as usize & (MAX_BLOCKS_PER_PAGE - 1); + loop { + let ret = unsafe { std::ptr::read_volatile(&self.blocks[idx] as *const u8) }; + /* + * The hazard value prevents the following race condition: + * The producer has reserved a slot, but before it can write to the slot, the consumer calls pop. + */ + if ret != Self::HAZARD { + unsafe { + std::ptr::write_volatile(&mut self.blocks[idx] as *mut u8, Self::HAZARD); + } + self.head = self.head.wrapping_add(1); + return Some(ret); + } + } + } +} + +pub struct Page { + pub(crate) block_size: usize, // Size of objects that are being allocated to this page + free_list: LocalFreeList, + pub(crate) used: usize, + thread_free_list: MPSCQueue, + pub(crate) capacity: usize, + pub(crate) slice_count: usize, // No. of pages in the slice containing this page + pub(crate) slice_offset: usize, // Offset of this page from the start of this slice + pub(crate) page_start: *mut u8, + // Next and previous pages in the span which is a doubly-linked list + pub(crate) next_page: *mut Page, + pub(crate) previous_page: *mut Page, +} + +impl Page { + pub fn from_slice(slice: Slice) -> Page { + Page { + block_size: 0usize, + free_list: LocalFreeList::empty(), + used: 0, + thread_free_list: MPSCQueue::new(PAGE_SIZE/MIN_SIZE_FROM_PAGES), + capacity: slice.size, + slice_count: 1, + slice_offset: 0, + page_start: slice.ptr, + next_page: null_mut(), + previous_page: null_mut(), + } + } + + pub fn set_block_size(self: &mut Self, block_size: usize) { + self.block_size = block_size; + let num_blocks = (self.slice_count * PAGE_SIZE) / block_size; + self.free_list = LocalFreeList::new(num_blocks); + } + + #[inline] + pub fn get_free_block(self: &mut Self) -> *mut u8 { + let block_idx = self.free_list.pop(); + let block_idx = match block_idx { + Some(i) => i, + None => return null_mut(), + }; + self.used += 1; + unsafe { self.page_start.add(block_idx as usize * self.block_size) } + } + + #[inline(always)] + pub fn is_full(self: &Self) -> bool { + self.free_list.is_empty() + } + + #[inline(always)] + pub fn is_unused(self: &Self) -> bool { + self.used == 0 + } + + /// Pointer freed on the same core + #[inline(always)] + pub fn free(self: &mut Self, ptr: *mut u8) { + let block_idx = (ptr as usize - self.page_start as usize) / self.block_size; + self.free_list.push(block_idx as u8); + self.used -= 1; + } + + /// Pointer freed on a different core + #[inline(always)] + pub(crate) fn foreign_free(self: &mut Self, ptr: *mut u8) { + let blk_idx = unsafe {ptr.offset_from(self.page_start) as usize / self.block_size}; + self.thread_free_list.push(blk_idx as u8); + } + + /// Collect pointers freed by other threads + #[inline] + pub(crate) fn collect_foreign_frees(self: &mut Self) { + while let Some(blk) = self.thread_free_list.pop() { + self.free_list.push(blk as u8); + self.used -= 1; + } + } +} + +pub struct Slice { + pub ptr: *mut u8, + pub size: usize, +} + +impl Slice { + pub fn split(self: Self) -> (Slice, Slice) { + let new_size = self.size >> 1; + let slice1 = Slice { + ptr: self.ptr, + size: new_size, + }; + let slice2 = Slice { + ptr: self.ptr.wrapping_add(new_size), + size: new_size, + }; + (slice1, slice2) + } +} \ No newline at end of file diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs new file mode 100644 index 00000000..4f193b2f --- /dev/null +++ b/src/common/src/memory/pool.rs @@ -0,0 +1,454 @@ +extern crate io_uring; + +use core::slice; +use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, AtomicU64, Ordering}}}; + +use futures::io; +use io_uring::IoUring; + +use crate::memory::{arena::Arena, segment::Segment, tcache::{TCache, TCacheStats}}; + +static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); + +pub const FIXED_BUFFER_SIZE_BYTES: usize = 1 << 20; +pub const FIXED_BUFFER_BITS: u32 = FIXED_BUFFER_SIZE_BYTES.trailing_zeros(); + +#[derive(Debug)] +pub struct FixedBuffer { + pub ptr: *mut u8, + pub buf_id: usize, + pub bytes: usize, +} + +#[derive(Debug)] +pub struct FixedBufferAllocation { + pub ptr: *mut u8, + pub size: usize, +} + +unsafe impl Send for FixedBufferAllocation {} + +impl AsRef<[u8]> for FixedBufferAllocation { + fn as_ref(&self) -> &[u8] { + unsafe { slice::from_raw_parts(self.ptr, self.size) } + } +} + +impl Drop for FixedBufferAllocation { + fn drop(&mut self) { + FixedBufferPool::free(self.ptr); + } +} + +pub struct FixedBufferPool { + local_caches: Vec>, + arena: Arc>, + start_ptr: *mut u8, + capacity: usize, + registered: AtomicBool, // Whether buffers have been registered + foreign_free: AtomicU64, +} + +unsafe impl Send for FixedBufferPool {} + +unsafe impl Sync for FixedBufferPool {} + +impl FixedBufferPool { + fn new(capacity_mb: usize) -> FixedBufferPool { + log::info!("Initializing fixed buffer pool with capacity: {} MB", capacity_mb); + let num_cpus = std::thread::available_parallelism().unwrap(); + let capacity = capacity_mb << 20; + let arena = Self::allocate_arena(capacity.clone()); + let start_ptr = { + let guard = arena.try_lock().unwrap(); + guard.start_ptr() + }; + let mut local_caches = Vec::>::new(); + for i in 0..num_cpus.get() { + local_caches.push(Mutex::new(TCache::new(arena.clone(), i))); + } + FixedBufferPool { + local_caches, + arena, + start_ptr, + capacity, + registered: AtomicBool::new(false), + foreign_free: AtomicU64::new(0), + } + } + + pub fn allocate_arena(capacity: usize) -> Arc> { + Arc::new(Mutex::new(Arena::new(capacity))) + } + + pub fn init(capacity_mb: usize) { + FIXED_BUFFER_POOL.get_or_init(|| FixedBufferPool::new(capacity_mb)); + } + + fn get_thread_local_cache() -> &'static Mutex { + let cpu = unsafe { libc::sched_getcpu() }; + &FIXED_BUFFER_POOL.get().unwrap().local_caches[cpu as usize] + } + + pub fn malloc(size: usize) -> *mut u8 { + let cpu = unsafe { libc::sched_getcpu() }; + let local_cache = Self::get_thread_local_cache(); + let ptr = local_cache.lock().unwrap().allocate(size); + log::debug!("Allocated pointer: {:?}, size: {}, cpu: {}", ptr, size, cpu); + if ptr.is_null() { + log::info!("Unsuccessful allocation of {} bytes", size); + } + ptr + } + + pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + let mut arena_guard = pool.arena.lock().unwrap(); + let res = arena_guard.register_buffers_with_ring(ring); + if res.is_ok() { + log::info!("Registered buffers with io-uring ring"); + pool.registered.store(true, Ordering::Relaxed); + } + res + } + + pub(crate) fn get_stats(cpu: usize) -> TCacheStats { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + let tcache = pool.local_caches[cpu].lock().unwrap(); + tcache.get_stats() + } + + pub fn get_fixed_buffers(alloc: &FixedBufferAllocation) -> Vec { + let ptr = alloc.ptr; + let size = alloc.size; + let pool = FIXED_BUFFER_POOL.get().unwrap(); + debug_assert!(ptr >= pool.start_ptr && ptr < pool.start_ptr.wrapping_add(pool.capacity), + "Pointer doesn't lie within the arena"); + let mut remaining = size; + let mut vec = Vec::::new(); + let mut current = ptr.clone(); + let mut buffer_id = (current.wrapping_sub(pool.start_ptr as usize) as usize) >> FIXED_BUFFER_BITS; + while remaining > 0 { + let next_buffer_start = pool.start_ptr.wrapping_add((buffer_id + 1) << FIXED_BUFFER_BITS); + let bytes = min(remaining, next_buffer_start as usize - current as usize); + let fb = FixedBuffer { + ptr: current, + buf_id: buffer_id, + bytes: bytes, + }; + current = next_buffer_start; + vec.push(fb); + remaining -= bytes; + buffer_id += 1; + } + vec + } + + #[inline] + pub fn buffers_registered() -> bool { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + pool.registered.load(Ordering::Relaxed) + } + + fn free(ptr: *mut u8) { + let segment_ptr = Segment::get_segment_from_ptr(ptr); + let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; + let thread_id = unsafe { (*segment_ptr).thread_id }; + log::debug!("Freed pointer: {:?}, size: {}, owner thread id: {}", ptr, unsafe { (*page_ptr).block_size }, thread_id); + + // If page is local and unused after free, return it to segment + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + if cur_cpu == thread_id { + unsafe { + (*page_ptr).free(ptr); + } + let should_free_page = unsafe { (*page_ptr).is_unused() }; + if should_free_page { + let local_cache = Self::get_thread_local_cache(); + let mut guard = local_cache.lock().unwrap(); + guard.retire_page(page_ptr); + } + } else { + unsafe { (*page_ptr).foreign_free(ptr); } + let pool = FIXED_BUFFER_POOL.get().unwrap(); + pool.foreign_free.fetch_add(1, Ordering::Relaxed); + } + } + + pub fn print_stats() { + if FIXED_BUFFER_POOL.get().is_none() { + return + } + let num_cpus = std::thread::available_parallelism().unwrap(); + let mut agg_stats = TCacheStats::new(); + for i in 0..num_cpus.get() { + let stats = Self::get_stats(i); + agg_stats.allocations_from_arena += stats.allocations_from_arena; + agg_stats.allocations_from_pages += stats.allocations_from_pages; + agg_stats.allocations_from_segment += stats.allocations_from_segment; + agg_stats.fast_allocations += stats.fast_allocations; + agg_stats.pages_retired += stats.pages_retired; + agg_stats.segments_retired += stats.segments_retired; + agg_stats.total_segments_allocated += stats.total_segments_allocated; + agg_stats.unsuccessful_allocations += stats.unsuccessful_allocations; + agg_stats.total_allocations += stats.total_allocations; + } + agg_stats.print(); + } +} + +impl Drop for FixedBufferPool { + fn drop(self: &mut Self) { + let arena = self.arena.lock().unwrap(); + drop(arena); + } +} + +mod tests { + #[allow(unused_imports)] + use std::{io::Write, os::fd::AsRawFd, ptr::{null, null_mut}}; + + use bytes::Bytes; + use io_uring::{IoUring, cqueue, opcode, squeue}; + use libc::rlimit; + use rand::RngCore as _; + + use crate::memory::pool::{FIXED_BUFFER_SIZE_BYTES, FixedBufferAllocation, FixedBufferPool}; + + #[test] + fn test_basic_alloc_and_free() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + let mut ptrs = Vec::<*mut u8>::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + ptrs.push(ptr); + } + + for ptr in ptrs { + FixedBufferPool::free(ptr); + } + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.fast_allocations, 1); + assert_eq!(stats.pages_retired, 2); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_basic_alloc_and_free_bytes() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + // let mut ptrs = Vec::<*mut u8>::new(); + let mut bytes_vec = Vec::::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + let alloc = FixedBufferAllocation {ptr: ptr, size: len}; + let bytes = Bytes::from_owner(alloc); + bytes_vec.push(bytes); + } + + drop(bytes_vec); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.fast_allocations, 1); + assert_eq!(stats.pages_retired, 2); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_free_from_different_thread() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096 * 4]; + let mut buffers = Vec::<&mut [u8]>::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + buffers.push(buffer); + } + + std::thread::spawn(move || { + for buffer in buffers { + let ptr = buffer.as_mut_ptr(); + FixedBufferPool::free(ptr); + } + }); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.allocations_from_segment, 1); + assert_eq!(stats.fast_allocations, 0); + assert_eq!(stats.pages_retired, 0); + assert_eq!(stats.segments_retired, 0); + } + + #[test] + fn test_large_alloc_and_free() { + FixedBufferPool::init(128); + let len = 1024 * 1024; // 1 MB + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + FixedBufferPool::free(ptr); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 1); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_large_alloc_and_free2() { + FixedBufferPool::init(128); + let len = 3 * 1024 * 1024; // 1 MB + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len-1] = 1; + FixedBufferPool::free(ptr); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 1); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_very_large_alloc_fails() { + FixedBufferPool::init(128); + let len = 32 * 1024 * 1024; // 32 MB + let ptr = FixedBufferPool::malloc(len); + + assert_eq!(ptr, null_mut()); + } + + #[test] + fn test_with_uring_basic() { + let mut rlimit = libc::rlimit{ + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { libc::getrlimit(libc::RLIMIT_MEMLOCK, &mut rlimit); } + assert!(64 * 1024 <= rlimit.rlim_max, "rlimit.MEMLOCK should be at least 64 MB to test the fixed-buffer pool. Current rlimit is: {} KB", rlimit.rlim_max); + FixedBufferPool::init(64); + + let mut ring = IoUring::::builder().build(32).unwrap(); + let res = FixedBufferPool::register_buffers_with_ring(&ring); + assert!(res.is_ok()); + + const LEN: usize = 1 << 20; // 1 MB + let mut file = tempfile::tempfile().unwrap(); + let ptr = FixedBufferPool::malloc(LEN); + assert_ne!(ptr, null_mut()); + let alloc = FixedBufferAllocation {ptr: ptr, size: LEN}; + let buffers = FixedBufferPool::get_fixed_buffers(&alloc); + assert!(buffers.len() <= (LEN / FIXED_BUFFER_SIZE_BYTES) + 1); + + let mut total = 0; + for fixed_buffer in buffers.iter().as_ref() { + total += fixed_buffer.bytes; + } + assert_eq!(total, LEN); + + let mut random_bytes = [0u8; LEN]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut random_bytes); + let mut res = file.write(&random_bytes); + assert!(res.is_ok(), "Failed to write to temp file"); + assert_eq!(res.unwrap(), LEN, "Failed to write to temp file"); + + let mut file_offset = 0; + for fixed_buffer in buffers.iter().as_ref() { + let sqe = opcode::ReadFixed::new( + io_uring::types::Fd(file.as_raw_fd()), + fixed_buffer.ptr, + fixed_buffer.bytes as u32, + fixed_buffer.buf_id as u16) + .offset(file_offset).build(); + file_offset += fixed_buffer.bytes as u64; + let mut sq = ring.submission(); + let res = unsafe { sq.push(&sqe) }; + assert!(res.is_ok(), "Failed to submit to io uring"); + sq.sync(); + } + + res = ring.submit_and_wait(buffers.len()); + assert!(res.is_ok(), "Failed to submit"); + let mut total_bytes_read = 0; + + for _i in 0..buffers.len() { + let mut cq = ring.completion(); + let cqe = cq.next(); + assert!(cqe.is_some()); + let res = cqe.as_ref().unwrap().result(); + assert!( res > 0, "Read failed: {}", std::io::Error::from_raw_os_error(-cqe.unwrap().result())); + total_bytes_read += res as usize; + } + assert_eq!(total_bytes_read, LEN, "Expected to read {} bytes, but read {}", LEN, total_bytes_read); + let buffer = Bytes::from_owner(alloc); + assert_eq!(buffer, &random_bytes[..]); + } + + #[test] + fn test_edge_case() { + FixedBufferPool::init(128); + let len = 4 * 1024; + let ptr1 = FixedBufferPool::malloc(len); + let ptr2 = FixedBufferPool::malloc(len << 1); + let ptr3 = FixedBufferPool::malloc(len << 2); + let ptr4 = FixedBufferPool::malloc(len << 4); + + FixedBufferPool::free(ptr1); + FixedBufferPool::free(ptr3); + FixedBufferPool::free(ptr2); + FixedBufferPool::free(ptr4); + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 4); + assert_eq!(stats.segments_retired, 1); + // assert_eq + } +} \ No newline at end of file diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs new file mode 100644 index 00000000..90b552f0 --- /dev/null +++ b/src/common/src/memory/segment.rs @@ -0,0 +1,156 @@ +use std::ptr::{null_mut, write}; + +use crate::memory::{page::{PAGE_SIZE, Page, Slice}}; + +pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; // 32 MB +pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; + +// The metadata is stored at the beginning of the slice. So we don't get the entirety of it for pages +pub const PAGES_PER_SEGMENT: usize = (SEGMENT_SIZE - 3 * size_of::()) / (PAGE_SIZE + size_of::()); + +pub struct Segment { + pub(crate) allocated: usize, + pub(crate) num_slices: usize, + pub(crate) pages: [Page; PAGES_PER_SEGMENT], + pub(crate) thread_id: usize, +} + +impl Segment { + pub fn new_from_slice(slice: Slice) -> *mut Segment { + let segment_ptr = slice.ptr as *mut Segment; + let segment_end_ptr = slice.ptr.wrapping_add(SEGMENT_SIZE); + let mut start_ptr = unsafe { segment_end_ptr.sub(PAGES_PER_SEGMENT * PAGE_SIZE) }; + unsafe { + let pages_ptr = (*segment_ptr).pages.as_mut_ptr(); + (*segment_ptr).allocated = 0; + (*segment_ptr).num_slices = PAGES_PER_SEGMENT; + for i in 0..PAGES_PER_SEGMENT { + // Use ptr::write after dropping to initialize new Pages + write( + pages_ptr.add(i), + Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}) + ); + start_ptr = start_ptr.wrapping_add(PAGE_SIZE); + } + } + segment_ptr + } + + #[inline] + pub fn full(self: &mut Self) -> bool { + self.allocated == self.num_slices + } + + pub fn reset(self: &mut Self) -> () { + for page in self.pages.iter_mut() { + page.slice_count = 1; + page.slice_offset = 0; + } + } + + pub fn get_segment_from_ptr(ptr: *mut u8) -> *mut Segment { + let aligned_ptr = (ptr as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; + aligned_ptr as *mut Segment + } + + pub fn get_page_from_ptr(self: &mut Self, ptr: *mut u8) -> *mut Page { + let base_page_ptr = self.pages[0].page_start; + debug_assert!(ptr >= base_page_ptr); + let index = unsafe { + ptr.sub(base_page_ptr as usize) as usize / PAGE_SIZE + }; + debug_assert!(index < PAGES_PER_SEGMENT); + &mut self.pages[index] as *mut Page + } + + /** + * Split `page` into 2, with the first partition having `num_slices` pages. + * Returns a pointer to the first page of the second slice. + */ + pub fn split_page(self: &mut Self, page: *mut Page, num_slices: usize) -> *mut Page { + debug_assert_ne!(page, null_mut()); + let base_page_ptr = unsafe { (*page).page_start }; + let base_segment_page_ptr = self.pages[0].page_start; + debug_assert!(base_page_ptr >= base_segment_page_ptr); + let index = unsafe { + base_page_ptr.sub(base_segment_page_ptr as usize) as usize / PAGE_SIZE + }; + + // Read original slice_count before modifying anything + let original_slice_count = unsafe { (*page).slice_count }; + debug_assert!(num_slices > 0 && num_slices < original_slice_count, + "num_slices: {}, slice_count: {}", num_slices, original_slice_count); + debug_assert!(index + original_slice_count <= PAGES_PER_SEGMENT); + // log::info!("[thread_id: {}, segment_id: {}] Splitting page with {} slices", self.thread_id, self.segment_id, original_slice_count); + + /* + * ASSUMPTION: Pointer to the beginning of the slice is passed in. + * We don't need to modify all the intermediate pages while splitting. Only update the following: + * - slice_offset for the first page of each slice (should be 0). + * - slice_offset for the last page of each slice. + * - slice_count for the first page of each slice. + */ + // Use raw pointers to avoid borrow checker issues with multiple mutable references + unsafe { + // Update slice1: the original slice becomes the first part + (*page).slice_offset = 0; + (*page).slice_count = num_slices; + + let pages_ptr = self.pages.as_mut_ptr(); + let last_page_in_slice1 = pages_ptr.add(index + num_slices - 1); + (*last_page_in_slice1).slice_offset = num_slices - 1; + + // Update slice2: the remaining pages become the second slice + let slice2_count = original_slice_count - num_slices; + let slice2 = pages_ptr.add(index + num_slices); + (*slice2).slice_offset = 0; + (*slice2).slice_count = slice2_count; + assert!((*slice2).block_size == 0, "block size: {}", (*slice2).block_size); + + let last_page_in_slice2 = pages_ptr.add(index + original_slice_count - 1); + (*last_page_in_slice2).slice_offset = slice2_count - 1; + + slice2 + } + } + + pub fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { + debug_assert!(left_slice.page_start >= self.pages[0].page_start && + left_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); + debug_assert!(right_slice.page_start >= self.pages[0].page_start && + right_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); + + let left_slice_idx = (left_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + let right_slice_idx = (right_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + debug_assert!(left_slice_idx + left_slice.slice_count == right_slice_idx, + "left slice count: {}, left slice idx: {}, right slice idx: {}, thread_id: {}", + left_slice.slice_count, left_slice_idx, right_slice_idx, self.thread_id); + debug_assert!(right_slice_idx + right_slice.slice_count <= PAGES_PER_SEGMENT); + + /* + * ASSUMPTION: Pointer to the beginning of the slice is passed in free(). + * We don't need to modify all the intermediate pages while coalescing. Only update the following: + * - slice_count for the first page of the combined slice (left_slice). + * - slice_offset for the last page in the combined slice. + * Note: right_slice becomes an intermediate page after merging, so we don't update its metadata. + */ + left_slice.slice_offset = 0; + left_slice.slice_count += right_slice.slice_count; + + let last_page = &mut self.pages[left_slice_idx + left_slice.slice_count - 1]; + last_page.slice_offset = left_slice.slice_count - 1; + } + + pub fn check_valid_segment(self: &mut Self) { + let mut idx = 0; + while idx < PAGES_PER_SEGMENT { + let page = &mut self.pages[idx]; + debug_assert!(page.slice_offset == 0 && idx + page.slice_count <= PAGES_PER_SEGMENT); + let slice_count = page.slice_count; + let last_page_in_slice = &mut self.pages[idx + slice_count - 1]; + debug_assert!(last_page_in_slice.slice_offset == slice_count - 1, + "slice count: {}, last page slice offset: {}, thread_id: {}", slice_count, last_page_in_slice.slice_offset, self.thread_id); + idx += slice_count; + } + } +} \ No newline at end of file diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs new file mode 100644 index 00000000..21e9020a --- /dev/null +++ b/src/common/src/memory/tcache.rs @@ -0,0 +1,454 @@ +use std::{ + ptr::null_mut, + sync::{Arc, Mutex}, +}; + +use crate::memory::{ + arena::Arena, + page::{PAGE_SIZE, Page}, + segment::{PAGES_PER_SEGMENT, SEGMENT_SIZE, Segment}, +}; + +const SIZE_CLASSES: &'static [usize] = &[ + 4 << 10, + 8 << 10, + 16 << 10, + 32 << 10, + 64 << 10, +]; + +const NUM_SIZE_CLASSES: usize = SIZE_CLASSES.len(); + +pub(crate) const MIN_SIZE_FROM_PAGES: usize = SIZE_CLASSES[0]; + +const SEGMENT_BINS: usize = (SEGMENT_SIZE/PAGE_SIZE).ilog2() as usize + 1; + +#[derive(Default, Clone)] +pub(crate) struct TCacheStats { + // Allocation stats + pub(crate) total_allocations: usize, + pub(crate) unsuccessful_allocations: usize, + pub(crate) total_segments_allocated: usize, + pub(crate) fast_allocations: usize, // Allocations from self.free_pages + pub(crate) allocations_from_pages: usize, // Allocations from self.used_pages + pub(crate) allocations_from_segment: usize, + pub(crate) allocations_from_arena: usize, + + // Deallocation stats + pub(crate) pages_retired: usize, + pub(crate) segments_retired: usize, + // TODO(): Add more stats such as number of local frees and frees from another thread +} + +impl TCacheStats { + pub(crate) fn new() -> TCacheStats { + TCacheStats::default() + } + + #[allow(unused)] + pub(crate) fn print(self: &Self) { + println!("Total allocations: {}", self.total_allocations); + println!("Unsuccessful allocations: {}", self.unsuccessful_allocations); + println!("Fast allocations: {}", self.fast_allocations); + println!("Allocations from pages: {}", self.allocations_from_pages); + println!("Allocations from segment: {}", self.allocations_from_segment); + println!("Allocations from arena: {}", self.allocations_from_arena); + println!("Pages retired: {}", self.pages_retired); + println!("Segments retired: {}", self.segments_retired); + } +} + +#[derive(Copy, Clone)] +struct Span { + pub(crate) first: *mut Page, + pub(crate) last: *mut Page, +} + +pub(crate) struct TCache { + free_pages: [*mut Page; NUM_SIZE_CLASSES], + // TODO(): Make this a linked list + // Last size class holds slices that serve large allocations (>64KB) + used_pages: [Vec<*mut Page>; NUM_SIZE_CLASSES + 1], + // TODO: Use a linked list for O(1) deletion + spans: [Span; SEGMENT_BINS], + arena: Arc>, + thread_id: usize, + stats: TCacheStats, +} + +unsafe impl Send for TCache {} +unsafe impl Sync for TCache {} + +impl TCache { + pub(crate) fn new(arena: Arc>, thread_id: usize) -> TCache { + TCache { + free_pages: [const { null_mut() }; NUM_SIZE_CLASSES], + used_pages: [const { Vec::<*mut Page>::new() }; NUM_SIZE_CLASSES + 1], + spans: [Span { first: null_mut(), last: null_mut() }; SEGMENT_BINS], + arena: arena.clone(), + thread_id, + stats: TCacheStats::new(), + } + } + + #[inline] + fn get_size_class(size: usize) -> usize { + if size <= MIN_SIZE_FROM_PAGES { + return 0; + } + (size.next_power_of_two() / MIN_SIZE_FROM_PAGES).trailing_zeros() as usize + } + + /** + * Get the smallest bin which can hold contiguous runs of `slice_count` pages + */ + #[inline] + fn get_span_idx_from_slice_count(slice_count: usize) -> usize { + (slice_count + 1).next_power_of_two().trailing_zeros() as usize - 1usize + } + + fn add_slice_to_span(span: &mut Span, slice: &mut Page) { + if span.first == null_mut() { + debug_assert!(span.last == null_mut()); + span.first = slice as *mut Page; + span.last = slice as *mut Page; + return + } + debug_assert!(span.last != null_mut()); + unsafe { (*span.last).next_page = slice; } + slice.previous_page = span.last; + span.last = slice as *mut Page; + } + + fn remove_slice_from_span(self: &mut Self, slice: &mut Page) { + let span_idx = Self::get_span_idx_from_slice_count(slice.slice_count); + let span = &mut self.spans[span_idx]; + if span.first == slice as *mut Page { + span.first = slice.next_page; + if slice.next_page != null_mut() { + unsafe { (*slice.next_page).previous_page = null_mut(); } + } else { + span.last = null_mut(); + } + } else if span.last == slice as *mut Page { + span.last = slice.previous_page; + debug_assert!(slice.previous_page != null_mut()); + unsafe { (*span.last).next_page = null_mut(); } + } else { + debug_assert!(slice.previous_page != null_mut()); + debug_assert!(slice.next_page != null_mut()); + unsafe { (*slice.previous_page).next_page = slice.next_page; } + unsafe { (*slice.next_page).previous_page = slice.previous_page; } + } + + slice.next_page = null_mut(); + slice.previous_page = null_mut(); + } + + fn retire_segment(self: &mut Self, segment: *mut Segment) { + // log::info!("Retiring segment from thread with id: {}", self.thread_id); + unsafe { (*segment).check_valid_segment(); } + self.stats.segments_retired += 1; + let pages = unsafe { &mut (*segment).pages }; + let mut slice_idx: usize = 0; + while slice_idx < PAGES_PER_SEGMENT { + if pages[slice_idx].block_size == 0 { + self.remove_slice_from_span(&mut pages[slice_idx]); + } + slice_idx += pages[slice_idx].slice_count; + } + let mut guard = self.arena.lock().unwrap(); + guard.retire_segment(segment); + } + + fn remove_page_from_used_queue(self: &mut Self, page_ptr: *mut Page) { + let mut size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + if size_class >= NUM_SIZE_CLASSES { + size_class = NUM_SIZE_CLASSES; + } + for i in 0..self.used_pages[size_class].len() { + if self.used_pages[size_class][i] == page_ptr { + self.used_pages[size_class].remove(i); + return + } + } + } + + fn remove_page_from_free_queue(self: &mut Self, page_ptr: *mut Page) { + let size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + if size_class < NUM_SIZE_CLASSES && self.free_pages[size_class] == page_ptr { + self.free_pages[size_class] = null_mut(); + } + } + + pub(crate) fn retire_page(self: &mut Self, page: *mut Page) { + assert!(unsafe { (*page).is_unused() }); + self.stats.pages_retired += 1; + self.remove_page_from_used_queue(page); + self.remove_page_from_free_queue(page); + let page_ref = unsafe { &mut (*page) }; + + let segment_ptr = Segment::get_segment_from_ptr(page as *mut u8); + let segment = unsafe { &mut *segment_ptr }; + segment.allocated -= page_ref.slice_count; + if segment.allocated == 0 { + // Return segment to arena + self.retire_segment(segment_ptr); + return; + } + page_ref.block_size = 0; + + let next_slice = page.wrapping_add(page_ref.slice_count); + if next_slice <= (&mut segment.pages[PAGES_PER_SEGMENT - 1]) as *mut Page { + let next_slice_ref = unsafe { &mut (*next_slice) }; + if next_slice_ref.block_size == 0 { + log::debug!("[thread_id: {}] Merging released slice with next slice. Slice count of next slice: {}", self.thread_id, next_slice_ref.slice_count); + // Page is not in use, remove it + self.remove_slice_from_span(next_slice_ref); + segment.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); + } + } + + let mut merged_with_prev = false; + + if unsafe { page.offset_from(&mut segment.pages[0] as *mut Page) > 0 } { + let mut prev_slice = page.wrapping_sub(1); + prev_slice = prev_slice.wrapping_sub(unsafe { (*prev_slice).slice_offset }); + let prev_slice_ref = unsafe { &mut (*prev_slice) }; + if prev_slice_ref.block_size == 0 { + // Merge with the previous slice + log::debug!("[thread_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, prev_slice_ref.slice_count); + self.remove_slice_from_span(prev_slice_ref); + segment.coalesce_slices(prev_slice_ref, page_ref); + let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); + Self::add_slice_to_span(&mut self.spans[span_idx], prev_slice_ref); + log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, prev_slice_ref.slice_count, span_idx); + merged_with_prev = true; + } + } + if !merged_with_prev { + let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); + Self::add_slice_to_span(&mut self.spans[span_idx], page_ref); + log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, page_ref.slice_count, span_idx); + } + segment.check_valid_segment(); + } + + fn cleanup_pages(self: &mut Self) { + for i in 0..self.free_pages.len() { + let page = self.free_pages[i]; + if page != null_mut() { + unsafe { + (*page).collect_foreign_frees(); + if (*page).is_unused() { + self.retire_page(page); + self.free_pages[i] = null_mut(); + } + } + } + } + for i in 0..self.used_pages.len() { + let mut page_idx = 0; + while page_idx < self.used_pages[i].len() { + let page = self.used_pages[i][page_idx]; + unsafe { + (*page).collect_foreign_frees(); + if (*page).is_unused() { + self.retire_page(page); + } else { + page_idx += 1; + } + } + } + } + } + + fn find_page_from_used(self: &mut Self, bin: usize) -> *mut u8 { + for i in 0..self.used_pages[bin].len() { + unsafe { + (*self.used_pages[bin][i]).collect_foreign_frees(); + if (*self.used_pages[bin][i]).is_full() { + continue; + } + let page = self.used_pages[bin].remove(i); + let block = (*page).get_free_block(); + self.free_pages[bin] = page; + return block + } + } + null_mut() + } + + fn find_page_from_spans(self: &mut Self, num_slices_required: usize, block_size: usize) -> *mut Page { + debug_assert!(block_size >= MIN_SIZE_FROM_PAGES); + let min_bin = Self::get_span_idx_from_slice_count(num_slices_required); + for i in min_bin..SEGMENT_BINS { + // let span = &mut self.spans[i]; + let mut slice = self.spans[i].first; + while slice != null_mut() { + let num_slices_original = unsafe { (*slice).slice_count }; + debug_assert!(num_slices_original >= 1 << i); + if num_slices_original < num_slices_required { + unsafe { slice = (*slice).next_page; } + continue; + } + self.remove_slice_from_span(unsafe { &mut *slice }); + + let segment = Segment::get_segment_from_ptr(slice as *mut u8); + unsafe { + (*segment).allocated += num_slices_required; + } + if num_slices_original > num_slices_required { + // split slice + let next_slice = unsafe { (*segment).split_page(slice, num_slices_required) }; + debug_assert!(unsafe { (*slice).slice_count == num_slices_required}); + #[cfg(debug_assertions)] + unsafe { (*segment).check_valid_segment() } ; + let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); + Self::add_slice_to_span(&mut self.spans[bin], unsafe { &mut (*next_slice) } ); + log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, num_slices_original - num_slices_required, bin); + } + unsafe { + (*slice).set_block_size(block_size); + } + return slice; + } + } + null_mut() + } + + fn add_segment_to_spans(self: &mut Self, segment: *mut Segment) { + let segment_ref = unsafe { &mut (*segment) }; + let slice_count = segment_ref.num_slices; + let span_idx = Self::get_span_idx_from_slice_count(slice_count); + let page = &mut segment_ref.pages[0]; + page.slice_count = slice_count; + page.slice_offset = 0; + Self::add_slice_to_span(&mut self.spans[span_idx], page); + + let last_page = &mut segment_ref.pages[PAGES_PER_SEGMENT - 1]; + last_page.slice_offset = PAGES_PER_SEGMENT - 1; + } + + fn allocate_segment_from_arena(self: &mut Self, thread_id: usize) -> bool { + self.stats.total_segments_allocated += 1; + let segment_opt = { + let mut guard = self.arena.lock().unwrap(); + guard.allocate_segment(SEGMENT_SIZE) + }; + if segment_opt.is_none() { + return false; + } + // log::info!("Allocating segment to thread with id: {}", thread_id); + unsafe { + (*segment_opt.unwrap()).thread_id = thread_id; + } + + self.add_segment_to_spans(segment_opt.unwrap()); + true + } + + fn allocate_large(self: &mut Self, size: usize) -> *mut u8 { + // Directly get page from segment + let num_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + let block_size = num_pages * PAGE_SIZE; + let mut free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + self.used_pages[NUM_SIZE_CLASSES].push(free_page); + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block + } + self.cleanup_pages(); + // Retry after cleanup + free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + self.used_pages[NUM_SIZE_CLASSES].push(free_page); + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block + } + + let res = self.allocate_segment_from_arena(self.thread_id); + if !res { + self.stats.unsuccessful_allocations += 1; + return null_mut() + } + self.stats.allocations_from_arena += 1; + free_page = self.find_page_from_spans(num_pages, block_size); + if free_page == null_mut() { + self.stats.unsuccessful_allocations += 1; + return null_mut() + } + self.used_pages[NUM_SIZE_CLASSES].push(free_page); + debug_assert_ne!(free_page, null_mut()); + let free_block = unsafe { (*free_page).get_free_block() }; + debug_assert_ne!(free_block, null_mut()); + return free_block + } + + pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 { + self.stats.total_allocations = self.stats.total_allocations.wrapping_add(1); + if self.stats.total_allocations & 0x7f == 0 { + // Periodically cleanup pages + self.cleanup_pages(); + } + if size > PAGE_SIZE { + return self.allocate_large(size) + } + let size_class = Self::get_size_class(size); + debug_assert!(size_class < NUM_SIZE_CLASSES); + + let block_size = SIZE_CLASSES[size_class]; + let mut free_page = self.free_pages[size_class]; + if !free_page.is_null() { + debug_assert_eq!(unsafe {(*free_page).block_size}, block_size); + // allocate from free page + let page = free_page.clone(); + unsafe { + if !(*page).is_full() { + self.stats.fast_allocations += 1; + return (*page).get_free_block() + } else { + // Try collecting frees from other threads and retrying + (*page).collect_foreign_frees(); + if !(*page).is_full() { + return (*page).get_free_block() + } + self.used_pages[size_class].push(page); + self.free_pages[size_class] = null_mut(); + } + } + } + let block = self.find_page_from_used(size_class); + if !block.is_null() { + self.stats.allocations_from_pages += 1; + return block + } + free_page = self.find_page_from_spans(1, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + let free_block = unsafe { (*free_page).get_free_block() }; + debug_assert_ne!(free_block, null_mut()); + self.free_pages[size_class] = free_page; + return free_block; + } + // No space available in segments, allocate a new one + let res = self.allocate_segment_from_arena(self.thread_id); + if !res { + self.stats.unsuccessful_allocations += 1; + return null_mut() + } + self.stats.allocations_from_arena += 1; + free_page = self.find_page_from_spans(1, block_size); + assert_ne!(free_page, null_mut()); + let free_block = unsafe { (*free_page).get_free_block() }; + self.free_pages[size_class] = free_page; + return free_block + } + + #[allow(unused)] + pub(crate) fn get_stats(self: &Self) -> TCacheStats { + self.stats.clone() + } +} \ No newline at end of file diff --git a/src/local/src/lib.rs b/src/local/src/lib.rs index de494508..36bd5054 100644 --- a/src/local/src/lib.rs +++ b/src/local/src/lib.rs @@ -73,6 +73,8 @@ pub struct LiquidCacheLocalBuilder { io_mode: IoMode, eager_shredding: bool, + + fixed_buffer_pool_size_mb: usize, } impl Default for LiquidCacheLocalBuilder { @@ -87,6 +89,7 @@ impl Default for LiquidCacheLocalBuilder { span: fastrace::Span::enter_with_local_parent("liquid_cache_local_builder"), io_mode: IoMode::StdBlocking, eager_shredding: true, + fixed_buffer_pool_size_mb: 0, } } } @@ -151,6 +154,12 @@ impl LiquidCacheLocalBuilder { self } + /// Set size of fixed buffer pool + pub fn with_fixed_buffer_pool_size_mb(mut self, fixed_buffer_pool_size_mb: usize) -> Self { + self.fixed_buffer_pool_size_mb = fixed_buffer_pool_size_mb; + self + } + /// Build a SessionContext with liquid cache configured /// Returns the SessionContext and the liquid cache reference pub fn build( @@ -175,6 +184,7 @@ impl LiquidCacheLocalBuilder { self.squeeze_policy, self.hydration_policy, self.io_mode, + self.fixed_buffer_pool_size_mb, ); let cache_ref = Arc::new(cache); diff --git a/src/parquet/Cargo.toml b/src/parquet/Cargo.toml index bdc854f6..a13e3c4e 100644 --- a/src/parquet/Cargo.toml +++ b/src/parquet/Cargo.toml @@ -28,6 +28,8 @@ serde = { workspace = true } serde_json = { workspace = true } itertools = "0.14.0" async-executor = "1.13.3" +usdt = "0.6" +rand = "0.9.2" [target.'cfg(target_os = "linux")'.dependencies] io-uring = "0.7.10" @@ -38,7 +40,6 @@ crossbeam-queue = "0.3.11" [dev-dependencies] tempfile = "3.23.0" divan = "0.1" -rand = "0.9.2" shuttle = "0.8.1" tokio-test = "0.4" serde_json = { workspace = true } diff --git a/src/parquet/bench/filter_pushdown.rs b/src/parquet/bench/filter_pushdown.rs index 5504de22..49d0172b 100644 --- a/src/parquet/bench/filter_pushdown.rs +++ b/src/parquet/bench/filter_pushdown.rs @@ -49,6 +49,7 @@ fn setup_cache(tmp_dir: &TempDir) -> Arc { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let field = Arc::new(Field::new("test_column", DataType::Int32, false)); let schema = Arc::new(Schema::new(vec![field.clone()])); diff --git a/src/parquet/src/cache/mod.rs b/src/parquet/src/cache/mod.rs index 4abc790b..fc37a960 100644 --- a/src/parquet/src/cache/mod.rs +++ b/src/parquet/src/cache/mod.rs @@ -248,9 +248,10 @@ impl LiquidCacheParquet { squeeze_policy: Box, hydration_policy: Box, io_mode: IoMode, + fixed_buffer_pool_size_mb: usize, ) -> Self { assert!(batch_size.is_power_of_two()); - let io_context = Arc::new(ParquetIoContext::new(cache_dir.clone(), io_mode)); + let io_context = Arc::new(ParquetIoContext::new(cache_dir.clone(), io_mode, fixed_buffer_pool_size_mb)); let cache_storage_builder = LiquidCacheBuilder::new() .with_batch_size(batch_size) .with_max_cache_bytes(max_cache_bytes) @@ -387,6 +388,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let file = cache.register_or_get_file("test".to_string(), schema); file.create_row_group(0, vec![]) diff --git a/src/parquet/src/cache/stats.rs b/src/parquet/src/cache/stats.rs index b82257eb..a1355cd8 100644 --- a/src/parquet/src/cache/stats.rs +++ b/src/parquet/src/cache/stats.rs @@ -193,6 +193,7 @@ mod tests { Box::new(Evict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let fields: Vec = (0..8) .map(|i| Field::new(format!("test_{i}"), DataType::Int32, false)) diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index ab166583..7a80db4f 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -16,7 +16,7 @@ pub(super) async fn read( IoMode::Uring => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, false).await + super::io_uring::thread_pool_uring::read(path, range, false, false).await } #[cfg(not(target_os = "linux"))] { @@ -38,7 +38,7 @@ pub(super) async fn read( IoMode::UringDirect => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, true).await + super::io_uring::thread_pool_uring::read(path, range, true, true).await } #[cfg(not(target_os = "linux"))] { @@ -89,10 +89,20 @@ pub(super) async fn write( data: Bytes, ) -> Result<(), std::io::Error> { match io_mode { - IoMode::Uring | IoMode::UringDirect => { + IoMode::Uring => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::write(path, &data).await + super::io_uring::thread_pool_uring::write(path, &data, false, false).await + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } + IoMode::UringDirect => { + #[cfg(target_os = "linux")] + { + super::io_uring::thread_pool_uring::write(path, &data, true, false).await } #[cfg(not(target_os = "linux"))] { @@ -102,7 +112,7 @@ pub(super) async fn write( IoMode::UringShared => { #[cfg(target_os = "linux")] { - super::io_uring::single_uring::write(path, &data).await + super::io_uring::single_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { @@ -112,7 +122,7 @@ pub(super) async fn write( IoMode::UringBlocking => { #[cfg(target_os = "linux")] { - super::io_uring::multi_blocking_uring::write(path, &data) + super::io_uring::multi_blocking_uring::write(path, &data, false) } #[cfg(not(target_os = "linux"))] { @@ -132,7 +142,7 @@ pub(super) async fn write( IoMode::UringMultiAsync => { #[cfg(target_os = "linux")] { - super::io_uring::multi_async_uring::write(path, &data).await + super::io_uring::multi_async_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { diff --git a/src/parquet/src/io/io_uring/multi_async_uring.rs b/src/parquet/src/io/io_uring/multi_async_uring.rs index e2c26ec4..89809a1f 100644 --- a/src/parquet/src/io/io_uring/multi_async_uring.rs +++ b/src/parquet/src/io/io_uring/multi_async_uring.rs @@ -39,7 +39,7 @@ impl AsyncRing { fn submit_task(&mut self, task: &mut dyn IoTask) { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(0); + let entry = task.prepare_sqe()[0].clone().user_data(0); unsafe { sq.push(&entry) .expect("failed to push entry to io-uring submission queue"); @@ -207,7 +207,7 @@ where } State::Pending { mut ring, mut task } => { if let Some(cqe) = ring.as_mut().take_completion() { - task.complete(&cqe); + task.complete(vec![&cqe]); return Poll::Ready(task); } this.state = State::Pending { ring, task }; @@ -256,7 +256,7 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -264,6 +264,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/multi_blocking_uring.rs b/src/parquet/src/io/io_uring/multi_blocking_uring.rs index 86d2f6a5..1a677e3a 100644 --- a/src/parquet/src/io/io_uring/multi_blocking_uring.rs +++ b/src/parquet/src/io/io_uring/multi_blocking_uring.rs @@ -29,7 +29,7 @@ impl BlockingRing { { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(0); + let entry = task.prepare_sqe()[0].clone().user_data(0); unsafe { sq.push(&entry).expect("Failed to push to submission queue"); } @@ -44,7 +44,7 @@ impl BlockingRing { let cqe = cq .next() .ok_or_else(|| io::Error::other("io-uring completion queue empty"))?; - task.complete(&cqe); + task.complete(vec![&cqe]); } Ok(task) @@ -168,7 +168,7 @@ pub(crate) fn read( run_blocking_task(Box::new(read_task))?.into_result() } -pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { use std::fs::OpenOptions; let file = OpenOptions::new() @@ -176,6 +176,6 @@ pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { .truncate(true) .write(true) .open(path)?; - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); run_blocking_task(Box::new(write_task))?.into_result() } diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index ea5da015..e7128c1b 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -8,7 +8,7 @@ use tokio::sync::oneshot; use crate::io::io_uring::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; -const URING_NUM_ENTRIES: u32 = 128; +const URING_NUM_ENTRIES: u32 = 256; const MAX_CONCURRENT_TASKS: u32 = 128; @@ -112,7 +112,7 @@ impl RuntimeWorker { let task = self.inflight_tasks[token] .take() .expect("Task not found in submitted tasks"); - task.inner.borrow_mut().complete(&cqe); + task.inner.borrow_mut().complete(vec![&cqe]); unsafe { (*task.completed).store(true, Ordering::Relaxed); } task.waker.wake(); self.tokens.push_back(token as u16); @@ -126,7 +126,8 @@ impl RuntimeWorker { fn submit_task(self: &mut Self, task: AsyncTask) { let token = self.tokens.pop_front().expect("No more tokens"); let sq = &mut self.ring.submission(); - let sqe = task.inner.borrow_mut().prepare_sqe().user_data(token as u64); + let entries = task.inner.borrow_mut().prepare_sqe(); + let sqe = entries[0].clone().user_data(token as u64); unsafe { sq.push(&sqe).expect("Failed to push to submission queue"); } @@ -272,6 +273,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), true, false); submit_async_task(write_task).await.borrow_mut().get_result() } \ No newline at end of file diff --git a/src/parquet/src/io/io_uring/single_uring.rs b/src/parquet/src/io/io_uring/single_uring.rs index 6f5cd55d..2f6c24f3 100644 --- a/src/parquet/src/io/io_uring/single_uring.rs +++ b/src/parquet/src/io/io_uring/single_uring.rs @@ -161,7 +161,7 @@ impl SharedRingInner { fn submit_task(&mut self, task: &mut dyn IoTask, token: u16) { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(token as u64); + let entry = task.prepare_sqe()[0].clone().user_data(token as u64); unsafe { sq.push(&entry) .expect("Failed to push entry to io-uring submission queue"); @@ -264,7 +264,7 @@ where match state { State::Pending { token, mut task } => { if let Some(cqe) = ring.take_completion(token) { - task.complete(&cqe); + task.complete(vec![&cqe]); return Poll::Ready(task); } // Not ready yet, restore state @@ -331,7 +331,7 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -339,6 +339,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 346da682..f9fddeef 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -1,5 +1,5 @@ use std::{ - any::Any, cell::RefCell, ffi::CString, fs, mem, ops::Range, os::{ + alloc::{Layout, alloc}, any::Any, cell::RefCell, error::Error, ffi::CString, fs, mem, ops::Range, os::{ fd::{AsRawFd, FromRawFd, RawFd}, unix::ffi::OsStringExt, }, path::PathBuf, rc::Rc @@ -7,16 +7,18 @@ use std::{ use bytes::Bytes; use io_uring::{cqueue, opcode, squeue}; +use liquid_cache_common::memory::pool::{FixedBufferAllocation, FixedBufferPool}; pub(crate) const BLOCK_ALIGN: usize = 4096; /// Represents an IO request to the uring worker thread. pub trait IoTask: Send + Any + std::fmt::Debug { /// Convert the request to an io-uring submission queue entry. - fn prepare_sqe(&mut self) -> squeue::Entry; + fn prepare_sqe(&mut self) -> Vec; + // TODO(): Can we pass completion queue entries on the stack? /// Record the outcome of the completion queue entry. - fn complete(&mut self, cqe: &cqueue::Entry); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>); /// Convert the boxed task to a boxed `Any` so callers can recover the original type. fn into_any(self: Box) -> Box; @@ -74,7 +76,7 @@ impl FileOpenTask { impl IoTask for FileOpenTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let mut flags = libc::O_RDONLY | libc::O_CLOEXEC; if self.direct_io { flags |= libc::O_DIRECT; @@ -82,12 +84,13 @@ impl IoTask for FileOpenTask { let open_op = opcode::OpenAt::new(io_uring::types::Fd(libc::AT_FDCWD), self.path.as_ptr()) .flags(flags); - open_op.build() + vec![open_op.build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - let result = cqe.result(); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { + debug_assert_eq!(cqe.len(), 1, "Should receive a single completion for a file open task"); + let result = cqe[0].result(); if result < 0 { self.error = Some(std::io::Error::from_raw_os_error(-result)); } else { @@ -209,7 +212,7 @@ impl FileReadTask { impl IoTask for FileReadTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let num_bytes = (self.range.end - self.range.start) as usize; let (start_padding, end_padding) = self.padding(); let num_bytes_aligned = num_bytes + start_padding + end_padding; @@ -223,15 +226,120 @@ impl IoTask for FileReadTask { num_bytes_aligned as u32, ); - read_op + vec![read_op .offset(self.range.start - start_padding as u64) - .build() + .build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - if cqe.result() < 0 { - self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { + debug_assert_eq!(cqe.len(), 1, "Should receive a single completion for a FileRead task"); + let result = cqe[0].result(); + if result < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-result)); + } + } + + fn into_any(self: Box) -> Box { + self + } +} + +#[derive(Debug)] +pub(crate) struct FixedFileReadTask { + fixed_buffer: FixedBufferAllocation, + file: RawFd, + range: Range, + direct_io: bool, + error: Option, +} + +impl FixedFileReadTask { + #[inline] + fn compute_padding(range: &Range, direct_io: bool) -> (usize, usize) { + if direct_io { + let start_padding = range.start as usize & (BLOCK_ALIGN - 1); + let end_mod = range.end as usize & (BLOCK_ALIGN - 1); + let end_padding = if end_mod == 0 { + 0 + } else { + BLOCK_ALIGN - end_mod + }; + (start_padding, end_padding) + } else { + (0, 0) + } + } + + #[inline] + fn padding(&self) -> (usize, usize) { + Self::compute_padding(&self.range, self.direct_io) + } + + pub(crate) fn build(range: Range, file: &fs::File, direct_io: bool) -> Result { + let (start_padding, end_padding) = Self::compute_padding(&range, direct_io); + let requested_bytes = (range.end - range.start) as usize; + let num_bytes_aligned = requested_bytes + start_padding + end_padding; + + // Fixed buffers are aligned to the block size. Don't worry about alignment here + let ptr = FixedBufferPool::malloc(num_bytes_aligned); + if ptr.is_null() { + return Err(std::io::Error::from(std::io::ErrorKind::OutOfMemory)); + } + let alloc = FixedBufferAllocation {ptr, size: num_bytes_aligned}; + + Ok(FixedFileReadTask { + fixed_buffer: alloc, + file: file.as_raw_fd(), + range, + direct_io, + error: None, + }) + } + + /// Return a bytes object holding the result of the read operation. + #[inline] + pub(crate) fn into_result(self: Box) -> Result { + let mut this = self; + if let Some(err) = this.error.take() { + return Err(err); + } + + let (start_padding, _) = this.padding(); + let range_len = (this.range.end - this.range.start) as usize; + let data_end = start_padding + range_len; + let bytes = Bytes::from_owner(this.fixed_buffer); + + Ok(bytes.slice(start_padding..data_end)) + } +} + +impl IoTask for FixedFileReadTask { + #[inline] + fn prepare_sqe(&mut self) -> Vec { + let buffers = FixedBufferPool::get_fixed_buffers(&self.fixed_buffer); + let mut sqes = Vec::::new(); + let (start_padding, _) = self.padding(); + let mut file_offset = self.range.start - start_padding as u64; + for buffer in buffers { + let sqe = opcode::ReadFixed::new( + io_uring::types::Fd(self.file), + buffer.ptr, + buffer.bytes as u32, + buffer.buf_id as u16) + .offset(file_offset).build(); + file_offset += buffer.bytes as u64; + sqes.push(sqe); + } + sqes + } + + #[inline] + fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { + for cqe in cqes.iter().as_ref() { + if cqe.result() < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + } } } @@ -242,16 +350,33 @@ impl IoTask for FileReadTask { #[derive(Debug)] pub struct FileWriteTask { - data: Bytes, + data: *const u8, fd: RawFd, + size: usize, error: Option, } +unsafe impl Send for FileWriteTask {} + impl FileWriteTask { - pub(crate) fn build(data: Bytes, fd: RawFd) -> FileWriteTask { + pub(crate) fn build(data: Bytes, fd: RawFd, direct_io: bool, use_fixed_buffers: bool) -> FileWriteTask { + let mut ptr = data.as_ptr(); + let bytes = data.len(); + let mut padding = 0; + if direct_io { + padding = (4096 - (data.len() & 4095)) & 4095; + let layout = Layout::from_size_align(data.len() + padding, 4096).expect("Failed to create layout"); + assert!((data.len() + padding) % 4096 == 0); + unsafe { + let new_ptr = alloc(layout); + std::ptr::copy_nonoverlapping(ptr, new_ptr, data.len()); + ptr = new_ptr; + } + } FileWriteTask { - data, + data: ptr, fd, + size: bytes + padding, error: None, } } @@ -275,20 +400,22 @@ impl FileWriteTask { impl IoTask for FileWriteTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let write_op = opcode::Write::new( io_uring::types::Fd(self.fd), - self.data.as_ptr(), - self.data.len() as u32, + self.data, + self.size as u32, ); - write_op.offset(0u64).build() + vec![write_op.offset(0u64).build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - if cqe.result() < 0 { - self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { + debug_assert_eq!(cqes.len(), 1, "Should receive a single completion for a FileWrite task"); + let result = cqes[0].result(); + if result != self.size as i32 { + self.error = Some(std::io::Error::from_raw_os_error(-result)); } } diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index 7f4bc14c..733cd11e 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -57,7 +57,7 @@ impl BackendKind { } else { IoMode::Uring }; - initialize_uring_pool(mode); + initialize_uring_pool(mode, false); }); } BackendKind::MultiBlocking => { @@ -81,21 +81,21 @@ impl BackendKind { BackendKind::MultiBlocking => { async move { multi_blocking_uring::read(path, range, direct_io) }.boxed() } - BackendKind::ThreadPool => thread_pool_uring::read(path, range, direct_io).boxed(), + BackendKind::ThreadPool => thread_pool_uring::read(path, range, direct_io, true).boxed(), } } fn write_future(self, path: PathBuf, data: Bytes) -> IoFuture<()> { match self { - BackendKind::Shared => async move { single_uring::write(path, &data).await }.boxed(), + BackendKind::Shared => async move { single_uring::write(path, &data, false).await }.boxed(), BackendKind::MultiAsync => { - async move { multi_async_uring::write(path, &data).await }.boxed() + async move { multi_async_uring::write(path, &data, false).await }.boxed() } BackendKind::MultiBlocking => { - async move { multi_blocking_uring::write(path, &data) }.boxed() + async move { multi_blocking_uring::write(path, &data, false) }.boxed() } BackendKind::ThreadPool => { - async move { thread_pool_uring::write(path, &data).await }.boxed() + async move { thread_pool_uring::write(path, &data, false, false).await }.boxed() } } } diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index b63cd2e1..2319dace 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -1,33 +1,49 @@ use std::{ - collections::VecDeque, - fs::OpenOptions, - future::Future, - ops::Range, - os::fd::AsRawFd, - path::PathBuf, - pin::Pin, - sync::{ + collections::VecDeque, fs::OpenOptions, future::Future, io, ops::Range, os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, sync::{ OnceLock, atomic::{AtomicBool, AtomicUsize, Ordering}, - }, - task::{Context, Poll}, - thread, + }, task::{Context, Poll}, thread, time::{Duration, Instant} }; use bytes::Bytes; -use io_uring::{IoUring, cqueue, squeue}; -use liquid_cache_common::IoMode; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; +use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use tokio::sync::oneshot; -use super::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; +use crate::io::io_uring::tasks::FixedFileReadTask; +use rand::Rng; + +#[usdt::provider] +mod liquid_parquet { + fn io_submitted(id: u64) {} + fn io_completed(id: u64) {} +} + +static REGISTRATION_SUCCEEDED: OnceLock = OnceLock::new(); + +fn ensure_registered() -> bool { + *REGISTRATION_SUCCEEDED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register USDT probes: {err}"); + false + } + }) +} + +use super::tasks::{FileReadTask, FileWriteTask, IoTask}; pub(crate) const URING_NUM_ENTRIES: u32 = 256; +const URING_BATCH_SIZE: u32 = 32; + static ENABLED: AtomicBool = AtomicBool::new(true); struct Submission { task: Box, completion_tx: oneshot::Sender>, + pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker + completions: Vec, } impl Submission { @@ -35,15 +51,32 @@ impl Submission { Submission { task, completion_tx, + pending_completions: 0, + completions: Vec::new(), } } - fn send_back(mut self, cqe: &cqueue::Entry) { - self.task.complete(cqe); + fn send_back(mut self) { + self.task.complete(self.completions.iter().collect()); self.completion_tx .send(self.task) .expect("Failed to send task back to caller"); } + + #[inline] + fn set_completions(&mut self, count: usize) { + self.pending_completions = count; + } + + #[inline] + fn reduce_completions(&mut self) { + self.pending_completions -= 1; + } + + #[inline] + fn push_completion(&mut self, cqe: cqueue::Entry) { + self.completions.push(cqe); + } } struct JoinOnDropHandle(Option>); @@ -74,9 +107,9 @@ unsafe impl Sync for IoUringThreadpool {} static IO_URING_THREAD_POOL_INST: OnceLock = OnceLock::new(); -pub(crate) fn initialize_uring_pool(io_mode: IoMode) { +pub(crate) fn initialize_uring_pool(io_mode: IoMode, register_buffers: bool) { if matches!(io_mode, IoMode::Uring | IoMode::UringDirect) { - IO_URING_THREAD_POOL_INST.get_or_init(|| IoUringThreadpool::new(io_mode)); + IO_URING_THREAD_POOL_INST.get_or_init(|| IoUringThreadpool::new(io_mode, register_buffers)); } if matches!(io_mode, IoMode::UringBlocking) { super::multi_blocking_uring::initialize_blocking_rings(); @@ -84,18 +117,13 @@ pub(crate) fn initialize_uring_pool(io_mode: IoMode) { } impl IoUringThreadpool { - fn new(io_type: IoMode) -> IoUringThreadpool { + fn new(io_type: IoMode, register_buffers: bool) -> IoUringThreadpool { let (sender, receiver) = crossbeam_channel::unbounded::(); - let builder = IoUring::::builder(); - let ring = builder - .build(URING_NUM_ENTRIES) - .expect("Failed to build IoUring instance"); - let worker = thread::Builder::new() .name("lc-io-worker".to_string()) .spawn(move || { - let mut uring_worker = UringWorker::new(receiver, ring); + let mut uring_worker = UringWorker::new(receiver, register_buffers); uring_worker.thread_loop(); }) .expect("Failed to spawn io-uring worker thread"); @@ -134,12 +162,37 @@ struct UringWorker { ring: IoUring, tokens: VecDeque, submitted_tasks: Vec>, + /** + * When using fixed buffers, a single task can produce multiple submission queue entries. + * It is possible that we aren't able to submit all of them at one go. Hold them in an + * intermediate queue in that case + */ + queued_entries: VecDeque, io_performed: AtomicUsize, + last_syscall: Instant, + // Number of entries that will be submitted upon calling io_uring_enter + queued_submissions: u32, } impl UringWorker { #[allow(clippy::new_ret_no_self)] - fn new(channel: crossbeam_channel::Receiver, ring: IoUring) -> UringWorker { + fn new(channel: crossbeam_channel::Receiver, register_buffers: bool) -> UringWorker { + let mut builder = IoUring::::builder(); + let ring = builder + .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_defer_taskrun() + // .setup_iopoll() + // .setup_sqpoll(50000) + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + + if register_buffers { + let res = FixedBufferPool::register_buffers_with_ring(&ring); + if res.is_err() { + log::error!("Failed to register buffers with io-uring ring: {:?}", res); + } + } + let tokens = (0..URING_NUM_ENTRIES as u16).collect(); let mut tasks = Vec::with_capacity(URING_NUM_ENTRIES as usize); tasks.resize_with(URING_NUM_ENTRIES as usize, || None); @@ -149,6 +202,9 @@ impl UringWorker { tokens, submitted_tasks: tasks, io_performed: AtomicUsize::new(0), + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), + last_syscall: Instant::now(), + queued_submissions: 0, } } @@ -158,31 +214,84 @@ impl UringWorker { break; } + self.drain_intermediate_queue(); self.drain_submissions(); self.poll_completions(); } } + fn drain_intermediate_queue(&mut self) { + { + let sq = &mut self.ring.submission(); + while !sq.is_full() && !self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.queued_submissions += 1; + } + } + } + #[inline(never)] fn drain_submissions(&mut self) { - let mut need_submit = false; while !self.receiver.is_empty() && !self.tokens.is_empty() { - let mut submission = self.receiver.recv().unwrap(); + let sq = &mut self.ring.submission(); + sq.sync(); + if sq.is_full() { + // A single token might have multiple associated sqes. Free token doesn't always imply that we have free submission slots + break; + } + let token = self.tokens.pop_front().unwrap(); - { - let sq = &mut self.ring.submission(); - let task = submission.task.as_mut(); - let sqe = task.prepare_sqe().user_data(token as u64); - unsafe { - sq.push(&sqe).expect("Failed to push to submission queue"); + let mut submission = self.receiver.recv().unwrap(); + let task = submission.task.as_mut(); + let mut sqes = task.prepare_sqe(); + self.queued_submissions += sqes.len() as u32; + submission.set_completions(sqes.len()); + let mut tasks_submitted = 0; + + for sqe in sqes.iter_mut() { + let res = unsafe { + sq.push(&sqe.clone().user_data(token as u64)) + }; + if res.is_err() { + break; } + tasks_submitted += 1; sq.sync(); } + for i in tasks_submitted..sqes.len() { + self.queued_entries.push_back(sqes[i].clone().user_data(token as u64)); + } self.submitted_tasks[token as usize] = Some(submission); - need_submit = true; } - if need_submit { - self.ring.submit().expect("Failed to submit"); + // let need_poll = self.tokens.len() < URING_NUM_ENTRIES as usize; + let time_from_last_submit = self.last_syscall.elapsed(); + let is_batch_full = self.queued_submissions >= URING_BATCH_SIZE; + let need_syscall = is_batch_full || time_from_last_submit > Duration::from_micros(20); + if need_syscall { + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); + loop { + let res = unsafe { + self.ring.submitter().enter::(self.queued_submissions, 0, flags.bits(), None) + }; + match res { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {}", e.to_string()); + } + } + } + self.last_syscall = Instant::now(); + self.queued_submissions = 0; } } @@ -194,12 +303,26 @@ impl UringWorker { match cq.next() { Some(cqe) => { let token = cqe.user_data() as usize; - let submission = self.submitted_tasks[token] - .take() - .expect("Task not found in submitted tasks"); - submission.send_back(&cqe); - self.tokens.push_back(token as u16); - self.io_performed.fetch_add(1, Ordering::Relaxed); + let pending_completions = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + + if pending_completions == 1 { + let mut submission = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); + submission.send_back(); + self.tokens.push_back(token as u16); + self.io_performed.fetch_add(1, Ordering::Relaxed); + } else { + let submission = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + submission.reduce_completions(); + submission.push_completion(cqe); + } } None => break, } @@ -221,6 +344,7 @@ where T: IoTask + 'static, { state: UringState, + id: u64, } impl UringFuture @@ -230,6 +354,7 @@ where fn new(task: Box) -> UringFuture { UringFuture { state: UringState::Created(task), + id: rand::rng().random(), } } } @@ -248,6 +373,9 @@ where let pool = IO_URING_THREAD_POOL_INST .get() .expect("Uring threadpool not initialized"); + if ensure_registered() { + liquid_parquet::io_submitted!(|| self.id); + } let (tx, rx) = oneshot::channel::>(); let boxed_task: Box = task; pool.submit_task(boxed_task, tx); @@ -255,6 +383,9 @@ where } UringState::Submitted(mut receiver) => match Pin::new(&mut receiver).poll(cx) { Poll::Ready(Ok(task)) => { + if ensure_registered() { + liquid_parquet::io_completed!(|| self.id); + } let typed_task = task .into_any() .downcast::() @@ -286,9 +417,14 @@ pub(crate) async fn read( path: PathBuf, range: Option>, direct_io: bool, + use_fixed_buffers: bool, ) -> Result { - let open_task = FileOpenTask::build(path, direct_io)?; - let file = submit_async_task(open_task).await.into_result()?; + // Perform open operations in a blocking manner as they are not compatible with a io_uring instance that uses polled mode IO + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .expect("failed to create file"); let effective_range = if let Some(range) = range { range @@ -297,18 +433,31 @@ pub(crate) async fn read( 0..len }; + if use_fixed_buffers { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, direct_io); + // Fall back to normal read if fixed buffers are not available + if read_task.is_ok() { + return submit_async_task(read_task.unwrap()).await.into_result() + } + } let read_task = FileReadTask::build(effective_range, file, direct_io); - submit_async_task(read_task).await.into_result() + return submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: bool, + use_fixed_buffers: bool +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) .write(true) + .custom_flags(libc::O_DIRECT) .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, use_fixed_buffers); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/mod.rs b/src/parquet/src/io/mod.rs index 01ce70b6..32af777d 100644 --- a/src/parquet/src/io/mod.rs +++ b/src/parquet/src/io/mod.rs @@ -26,19 +26,25 @@ pub(crate) struct ParquetIoContext { } impl ParquetIoContext { - pub fn new(base_dir: PathBuf, io_mode: IoMode) -> Self { + pub fn new(base_dir: PathBuf, io_mode: IoMode, fixed_buffer_pool_size_mb: usize) -> Self { if matches!( io_mode, IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking ) { #[cfg(target_os = "linux")] { - crate::io::io_uring::initialize_uring_pool(io_mode); + use liquid_cache_common::memory::pool::FixedBufferPool; + if fixed_buffer_pool_size_mb > 0 { + FixedBufferPool::init(fixed_buffer_pool_size_mb); + } + crate::io::io_uring::initialize_uring_pool(io_mode, fixed_buffer_pool_size_mb > 0); } #[cfg(not(target_os = "linux"))] { panic!("io_mode {:?} is only supported on Linux", io_mode); } + } else if fixed_buffer_pool_size_mb > 0 { + panic!("Fixed buffers are only supported for UringDirect, Uring and UringBlocking"); } Self { @@ -147,7 +153,7 @@ mod tests { #[test] fn squeeze_hint_tracks_majority() { let tmp = tempdir().unwrap(); - let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking); + let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking, 0); let e = entry(1, 2, 3); let month = Arc::new(CacheExpression::extract_date32(Date32Field::Month)); let year = Arc::new(CacheExpression::extract_date32(Date32Field::Year)); @@ -163,7 +169,7 @@ mod tests { #[test] fn squeeze_hint_prefers_recent_on_tie() { let tmp = tempdir().unwrap(); - let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking); + let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking, 0); let e = entry(9, 9, 9); let year = Arc::new(CacheExpression::extract_date32(Date32Field::Year)); let day = Arc::new(CacheExpression::extract_date32(Date32Field::Day)); diff --git a/src/parquet/src/optimizers/mod.rs b/src/parquet/src/optimizers/mod.rs index 2fdfecd9..2cd26e38 100644 --- a/src/parquet/src/optimizers/mod.rs +++ b/src/parquet/src/optimizers/mod.rs @@ -339,6 +339,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, )); let rewritten = rewrite_data_source_plan(plan, &liquid_cache, true); diff --git a/src/parquet/src/reader/runtime/liquid_cache_reader.rs b/src/parquet/src/reader/runtime/liquid_cache_reader.rs index 725cf8ba..8a92f547 100644 --- a/src/parquet/src/reader/runtime/liquid_cache_reader.rs +++ b/src/parquet/src/reader/runtime/liquid_cache_reader.rs @@ -301,6 +301,7 @@ mod tests { Box::new(Evict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let field = Arc::new(Field::new("col0", DataType::Int32, false)); let schema = Arc::new(Schema::new(vec![field.clone()])); diff --git a/src/parquet/src/reader/runtime/liquid_stream.rs b/src/parquet/src/reader/runtime/liquid_stream.rs index 359068de..ee2ff130 100644 --- a/src/parquet/src/reader/runtime/liquid_stream.rs +++ b/src/parquet/src/reader/runtime/liquid_stream.rs @@ -716,6 +716,7 @@ mod tests { Box::new(Evict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let file = cache.register_or_get_file("test.parquet".to_string(), schema); file.create_row_group(0, vec![]) diff --git a/src/server/src/lib.rs b/src/server/src/lib.rs index a4af6733..ac0f2221 100644 --- a/src/server/src/lib.rs +++ b/src/server/src/lib.rs @@ -122,6 +122,7 @@ impl LiquidCacheService { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), None, + 0, ) } @@ -141,6 +142,7 @@ impl LiquidCacheService { squeeze_policy: Box, hydration_policy: Box, io_mode: Option, + fixed_buffer_pool_size_mb: usize, ) -> anyhow::Result { let disk_cache_dir = match disk_cache_dir { Some(dir) => dir, @@ -163,6 +165,7 @@ impl LiquidCacheService { squeeze_policy, hydration_policy, io_mode, + fixed_buffer_pool_size_mb, ), }) } diff --git a/src/server/src/service.rs b/src/server/src/service.rs index ec41586e..cc240fad 100644 --- a/src/server/src/service.rs +++ b/src/server/src/service.rs @@ -52,6 +52,7 @@ impl LiquidCacheServiceInner { squeeze_policy: Box, hydration_policy: Box, io_mode: IoMode, + fixed_buffer_pool_size_mb: usize, ) -> Self { let batch_size = default_ctx.state().config().batch_size(); @@ -66,6 +67,7 @@ impl LiquidCacheServiceInner { squeeze_policy, hydration_policy, io_mode, + fixed_buffer_pool_size_mb, )); Self { @@ -224,6 +226,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let url = Url::parse("file:///").unwrap(); server diff --git a/src/server/src/tests/mod.rs b/src/server/src/tests/mod.rs index 1e9db2b8..4bbae70d 100644 --- a/src/server/src/tests/mod.rs +++ b/src/server/src/tests/mod.rs @@ -46,6 +46,7 @@ async fn run_sql( squeeze_policy, Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); async fn get_result(service: &LiquidCacheServiceInner, sql: &str) -> String { let handle = Uuid::new_v4(); From dcb0fa5e4a98da38992014155b7c5f221593f079 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 1 Mar 2026 23:04:58 -0600 Subject: [PATCH 04/26] Integrate uring runtime with fixed buffers - Syscall batching - Handle multiple sqes per IO task - Spawn a batch of tasks on the runtime --- src/parquet/src/io/io_uring/runtime.rs | 209 ++++++++++++++++++---- src/parquet/src/io/io_uring/tasks.rs | 17 +- src/parquet/src/optimizers/lineage_opt.rs | 1 + 3 files changed, 187 insertions(+), 40 deletions(-) diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index e7128c1b..1e8dc779 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -1,12 +1,13 @@ -use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::fd::AsRawFd as _, path::PathBuf, pin::Pin, rc::Rc, sync::atomic::{AtomicBool, Ordering}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}}; +use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::fd::AsRawFd as _, path::PathBuf, pin::Pin, rc::Rc, sync::atomic::{AtomicBool, Ordering}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, time::{Duration, Instant}}; use async_executor::LocalExecutor; use bytes::Bytes; use futures::Future; use io_uring::{IoUring, squeue, cqueue}; +use rand::Rng; use tokio::sync::oneshot; -use crate::io::io_uring::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; +use crate::io::io_uring::tasks::{FileOpenTask, FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; const URING_NUM_ENTRIES: u32 = 256; @@ -16,32 +17,34 @@ type ExecutorTask = Pin + Send>>; pub struct UringExecutor { workers: Vec>, - sender: crossbeam_channel::Sender, + /// One sender per worker; tasks are submitted to a worker's dedicated channel. + senders: Vec>, } impl UringExecutor { - /// Spawn worker threads and initialize channel to receive tasks + /// Spawn worker threads; each worker has its own channel to receive tasks. pub fn new(num_threads: usize) -> UringExecutor { let mut workers = Vec::new(); - let (sender, receiver) = crossbeam_channel::unbounded::(); + let mut senders = Vec::with_capacity(num_threads); for i in 0..num_threads { - let receiver_clone = receiver.clone(); + let (sender, receiver) = crossbeam_channel::unbounded::(); + senders.push(sender); let worker = thread::Builder::new() .name(std::format!("lc-io-worker-{}", i)) .spawn(move || { - worker_main_loop(receiver_clone); + worker_main_loop(receiver); }) .expect("Failed to spawn IO runtime worker"); workers.push(worker); } UringExecutor { workers, - sender, + senders, } } - /// Spawns a task in the uring runtime by sending it through a crossbeam channel. - /// The result is received through a oneshot channel + /// Spawns a task in the uring runtime by sending it to a randomly chosen worker's channel. + /// The result is received through a oneshot channel. pub fn spawn(self: &mut Self, future: F) -> oneshot::Receiver where F::Output: Send + 'static, @@ -55,10 +58,36 @@ impl UringExecutor { } }; let task = Box::pin(f); - self.sender.send(task).expect("UringExecutor failed to send task"); + let idx = rand::rng().random_range(0..self.senders.len()); + self.senders[idx] + .send(task) + .expect("UringExecutor failed to send task"); receiver } + /// Spawn a batch of tasks on the io_uring runtime, balancing across workers (round-robin). + pub fn spawn_many(self: &mut Self, futures: &mut Vec) -> crossbeam_channel::Receiver + where + F::Output: Send + 'static, + { + let (sender, receiver) = crossbeam_channel::bounded::(futures.len()); + let num_workers = self.senders.len(); + for (i, f) in futures.drain(..).enumerate() { + let sender_clone = sender.clone(); + let f = Box::pin(f); + let task = async move { + let output = f.await; + sender_clone.send(output).expect("Failed to send back result"); + }; + let idx = i % num_workers; + self.senders[idx] + .send(Box::pin(task)) + .expect("UringExecutor failed to send task"); + } + receiver + } + + /// Spawns a task on the io_uring runtime and blocks on it pub fn run_to_completion(self: &mut Self, future: F) -> F::Output where F::Output: Send + 'static, @@ -72,18 +101,33 @@ thread_local! { static LOCAL_WORKER: RefCell = RefCell::new(RuntimeWorker::new()); } +const URING_BATCH_SIZE: u32 = 8; + +const URING_SYSCALL_INTERVAL_US: u64 = 5; + +const RUNTIME_TASK_BATCH_SIZE: u32 = 4; + struct RuntimeWorker { ring: io_uring::IoUring, - inflight_tasks: Vec>, + submitted_tasks: Vec>, + /** + * When using fixed buffers, a single task can produce multiple submission queue entries. + * It is possible that we aren't able to submit all of them at one go. Hold them in an + * intermediate queue in that case + */ + queued_entries: VecDeque, + last_syscall: Instant, tokens: VecDeque, - need_submit: bool, io_performed: u64, + queued_submissions: u64, } impl RuntimeWorker { pub fn new() -> RuntimeWorker { - let builder = IoUring::::builder(); + let mut builder = IoUring::::builder(); let ring = builder + .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_defer_taskrun() .build(URING_NUM_ENTRIES) .expect("Failed to build IoUring instance"); let mut tokens = VecDeque::::with_capacity(MAX_CONCURRENT_TASKS as usize); @@ -95,13 +139,22 @@ impl RuntimeWorker { RuntimeWorker { ring, - inflight_tasks, + submitted_tasks: inflight_tasks, tokens, - need_submit: false, + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), + last_syscall: Instant::now(), io_performed: 0, + queued_submissions: 0, } } + #[inline] + fn need_syscall(self: &Self) -> bool { + let time_from_last_submit = self.last_syscall.elapsed(); + let is_batch_full = self.queued_entries.len() >= URING_BATCH_SIZE as usize; + is_batch_full || time_from_last_submit > Duration::from_micros(URING_SYSCALL_INTERVAL_US) + } + fn poll_completions(self: &mut Self) { let cq = &mut self.ring.completion(); loop { @@ -109,31 +162,68 @@ impl RuntimeWorker { match cq.next() { Some(cqe) => { let token = cqe.user_data() as usize; - let task = self.inflight_tasks[token] + let pending_completions = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + + let mut submission = self.submitted_tasks[token] .take() .expect("Task not found in submitted tasks"); - task.inner.borrow_mut().complete(vec![&cqe]); - unsafe { (*task.completed).store(true, Ordering::Relaxed); } - task.waker.wake(); - self.tokens.push_back(token as u16); - self.io_performed += 1; + submission.push_completion(cqe); + if pending_completions == 1 { + unsafe { + (*submission.completed).store(true, Ordering::Relaxed); + } + submission.waker.wake(); + self.tokens.push_back(token as u16); + self.io_performed += 1; + } else { + submission.reduce_completions(); + } } None => break, } } } - fn submit_task(self: &mut Self, task: AsyncTask) { + fn drain_intermediate_queue(&mut self) { + { + let sq = &mut self.ring.submission(); + while !sq.is_full() && !self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.queued_submissions += 1; + } + } + } + + fn submit_task(self: &mut Self, mut task: AsyncTask) { let token = self.tokens.pop_front().expect("No more tokens"); let sq = &mut self.ring.submission(); - let entries = task.inner.borrow_mut().prepare_sqe(); - let sqe = entries[0].clone().user_data(token as u64); - unsafe { - sq.push(&sqe).expect("Failed to push to submission queue"); + let sqes = task.inner.borrow_mut().prepare_sqe(); + task.set_completions(sqes.len()); + self.submitted_tasks[token as usize] = Some(task); + let mut sqes_submitted = 0; + + for sqe in sqes.iter() { + let res = unsafe { + sq.push(&sqe.clone().user_data(token as u64)) + }; + if res.is_err() { + // submission queue is full + break; + } + sqes_submitted += 1; + self.queued_submissions += 1; + sq.sync(); + } + for i in sqes_submitted..sqes.len() { + self.queued_entries.push_back(sqes[i].clone().user_data(token as u64)); } - sq.sync(); - self.inflight_tasks[token as usize] = Some(task); - self.need_submit = true; } pub fn add_task(task: AsyncTask) { @@ -147,20 +237,29 @@ impl RuntimeWorker { fn worker_main_loop(receiver: crossbeam_channel::Receiver) { let executor = LocalExecutor::new(); loop { - while !receiver.is_empty() { - let task = receiver.recv() - .expect("Failed to receive task"); - executor.spawn(task).detach(); + let mut tasks_submitted = 0; + // Need some form of admission control here + while tasks_submitted < RUNTIME_TASK_BATCH_SIZE && !receiver.is_empty() { + let task = receiver.try_recv(); + if task.is_err() { + continue; + } + executor.spawn(task.unwrap()).detach(); + tasks_submitted += 1; } - let task_found = executor.try_tick(); + // Can we batch the ticks? + let _task_found = executor.try_tick(); LOCAL_WORKER.with(|worker| { let mut worker = worker.borrow_mut(); - if worker.need_submit { + worker.drain_intermediate_queue(); + if worker.need_syscall() { worker.ring.submit().expect("Failed to submit"); - worker.need_submit = false; - } else if !task_found && worker.tokens.len() < MAX_CONCURRENT_TASKS as usize { - worker.ring.submit_and_wait(1).expect("Failed to submit"); + worker.queued_submissions = 0; + worker.last_syscall = Instant::now(); } + // else if !task_found && worker.tokens.len() < MAX_CONCURRENT_TASKS as usize { + // worker.ring.submit_and_wait(1).expect("Failed to submit"); + // } worker.poll_completions(); }); } @@ -171,6 +270,25 @@ struct AsyncTask { pub inner: Rc>, pub waker: Waker, pub completed: *mut AtomicBool, + pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker + completions: Vec, +} + +impl AsyncTask { + #[inline] + fn set_completions(&mut self, count: usize) { + self.pending_completions = count; + } + + #[inline] + fn reduce_completions(&mut self) { + self.pending_completions -= 1; + } + + #[inline] + fn push_completion(&mut self, cqe: cqueue::Entry) { + self.completions.push(cqe); + } } enum UringState @@ -220,6 +338,8 @@ where inner: self.task.clone(), waker: cx.waker().clone(), completed: &mut self.completed, + pending_completions: 0, + completions: Vec::new(), }; RuntimeWorker::add_task(async_task); self.state = UringState::Submitted; @@ -261,6 +381,17 @@ pub(crate) async fn read( 0..len }; + { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, direct_io); + if read_task.is_ok() { + let rc = submit_async_task(read_task.unwrap()).await; + return match Rc::try_unwrap(rc) { + Ok(cell) => FixedFileReadTask::into_result(Box::new(cell.into_inner())), + Err(rc) => rc.borrow_mut().get_result(), + }; + } + } + // Fall back to normal read if fixed buffers are not available let read_task = FileReadTask::build(effective_range, file, direct_io); submit_async_task(read_task).await.borrow_mut().get_result() } diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index f9fddeef..3a10f6d4 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -297,7 +297,7 @@ impl FixedFileReadTask { }) } - /// Return a bytes object holding the result of the read operation. + /// Return a bytes object holding the result of the read operation (consumes the task). #[inline] pub(crate) fn into_result(self: Box) -> Result { let mut this = self; @@ -312,6 +312,21 @@ impl FixedFileReadTask { Ok(bytes.slice(start_padding..data_end)) } + + /// Return a bytes object holding the result of the read operation (by copy, for use with RefCell). + #[inline] + pub(crate) fn get_result(&mut self) -> Result { + if let Some(err) = self.error.take() { + return Err(err); + } + + let (start_padding, _) = self.padding(); + let range_len = (self.range.end - self.range.start) as usize; + let data_end = start_padding + range_len; + let slice = &self.fixed_buffer.as_ref()[start_padding..data_end]; + + Ok(Bytes::copy_from_slice(slice)) + } } impl IoTask for FixedFileReadTask { diff --git a/src/parquet/src/optimizers/lineage_opt.rs b/src/parquet/src/optimizers/lineage_opt.rs index 58f25cb3..caa8240a 100644 --- a/src/parquet/src/optimizers/lineage_opt.rs +++ b/src/parquet/src/optimizers/lineage_opt.rs @@ -1102,6 +1102,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0 ))) } From 31df6c0cc974657fb2da63464398d6d3a20a7bf3 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 1 Mar 2026 23:09:21 -0600 Subject: [PATCH 05/26] Add benchmark for io_uring runtime --- benchmark/Cargo.toml | 5 + benchmark/src/storage_runner.rs | 426 ++++++++++++++++++++++++++++++++ src/parquet/src/io/mod.rs | 4 +- src/parquet/src/lib.rs | 5 + 4 files changed, 438 insertions(+), 2 deletions(-) create mode 100644 benchmark/src/storage_runner.rs diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml index 4e15517c..71404b89 100644 --- a/benchmark/Cargo.toml +++ b/benchmark/Cargo.toml @@ -12,6 +12,7 @@ liquid-cache-storage = { workspace = true } liquid-cache-common = { workspace = true } liquid-cache-local = { workspace = true } datafusion = { workspace = true } +futures = { workspace = true } tokio = { workspace = true } log = { workspace = true } arrow-flight = { workspace = true } @@ -67,3 +68,7 @@ path = "bench_server.rs" [[bin]] name = "in_process" path = "in_process.rs" + +[[bin]] +name = "storage_runner" +path = "src/storage_runner.rs" diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs new file mode 100644 index 00000000..91c9b62b --- /dev/null +++ b/benchmark/src/storage_runner.rs @@ -0,0 +1,426 @@ +#![cfg(target_os = "linux")] + +/** + * Benchmark to test the performance of io_uring runtime for clickbench queries. The queries are executed directly + * on a LiquidCache instance to bypass datafusion, which is strongly coupled with tokio. The benchmark is based on + * the arrow benchmark (https://github.com/apache/arrow-rs/blob/main/parquet/benches/arrow_reader_clickbench.rs#L729) + */ + +use arrow::array::BooleanArray; +use arrow::buffer::BooleanBuffer; +use clap::Parser; +use datafusion::logical_expr::Operator; +use datafusion::physical_plan::expressions::{BinaryExpr, Column}; +use datafusion::physical_plan::PhysicalExpr; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion::scalar::ScalarValue; +use futures::StreamExt; +use liquid_cache_common::IoMode; +use liquid_cache_storage::cache::{EntryID, LiquidCacheBuilder, LiquidCache}; +use liquid_cache_parquet::{ParquetIoContext, UringExecutor}; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::Instant; + +#[derive(Parser)] +#[command(name = "storage_runner")] +struct Args { + /// ClickBench query index (0-based). Only queries with filters are supported (e.g. 1, 10, 19, 20). + #[arg(long)] + query_index: usize, + + /// Number of partitions (tasks to spawn on UringExecutor). + #[arg(long)] + partitions: usize, + + #[arg(long)] + worker_threads: usize, + + #[arg(long)] + iterations: usize, + + /// Path to hits.parquet. Default: benchmark/clickbench/data/hits.parquet + #[arg(long, default_value = "benchmark/clickbench/data/hits.parquet")] + parquet: PathBuf, + + /// Directory for the liquid-cache storage. Default: $TMPDIR/liquid_cache_storage_runner + #[arg(long)] + cache_dir: Option, +} + +/// ClickBench query descriptor: filter column(s) and predicate expression(s). +/// Each expression is evaluated on a single column (column index 0 in the cached array). +/// TODO(): Add support for columns that are projected. +struct FilterQuery { + /// Column names to load and cache (in schema order). + filter_columns: Vec<&'static str>, + /// One predicate per filter column; each expects Column(0) op Literal. + predicates: Vec>, + /// Number of expected rows in result + expected_row_count: usize, +} + +fn all_filter_queries() -> Vec> { + use datafusion::physical_plan::expressions::Literal as Lit; + let col = || Arc::new(Column::new("col", 0)) as Arc; + + let mut q: Vec> = (0..43).map(|_| None).collect(); + + // Q1: AdvEngineID <> 0 + q[1] = Some(FilterQuery { + filter_columns: vec!["AdvEngineID"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::UInt64(Some(0)))), + ))], + expected_row_count: 3312, + }); + + // Q10: MobilePhoneModel <> '' + q[10] = Some(FilterQuery { + filter_columns: vec!["MobilePhoneModel"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 34276, + }); + + // Q12: SearchPhrase <> '' + q[12] = Some(FilterQuery { + filter_columns: vec!["SearchPhrase"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 131559, + }); + + // Q19: UserID = 3233473875476175636 (value that exists in hits_1) + q[19] = Some(FilterQuery { + filter_columns: vec!["UserID"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt64(Some(3233473875476175636)))), + ))], + expected_row_count: 4, + }); + + q[20] = Some(FilterQuery { + filter_columns: vec!["URL"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::LikeMatch, + Arc::new(Lit::new(ScalarValue::Utf8(Some("%google%".to_string())))), + ))], + expected_row_count: 137, + }); + + // Q27: URL <> '' + q[27] = Some(FilterQuery { + filter_columns: vec!["URL"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 999978, + }); + + // Q28: Referer <> '' + q[28] = Some(FilterQuery { + filter_columns: vec!["Referer"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 925813, + }); + + // Q30: SearchPhrase <> '' + q[30] = Some(FilterQuery { + filter_columns: vec!["SearchPhrase"], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 131559, + }); + + // Q36: CounterID = 62, DontCountHits = 0, IsRefresh = 0, URL <> '' + q[36] = Some(FilterQuery { + filter_columns: vec!["CounterID", "EventDate", "DontCountHits", "IsRefresh", "URL"], + predicates: vec![ + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt32(Some(62)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::Int16(Some(0)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt8(Some(0)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt8(Some(0)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + )), + ], + expected_row_count: 181198, + }); + + q +} + +fn run_single_iter( + num_batches: usize, + num_partitions: usize, + query: &FilterQuery, + storage: Arc, + entry_ids: &Vec, + batch_lengths: &Vec, + executor: &mut UringExecutor +) { + // 2) Partition batch indices evenly across workers. + let batches_per_partition = (num_batches + num_partitions - 1) / num_partitions; + let num_cols = query.filter_columns.len(); + + // 3) Create futures for every partition + let mut futures = Vec::new(); + for p in 0..num_partitions { + let start = p * batches_per_partition; + let end = (start + batches_per_partition).min(num_batches); + if start >= end { + continue; + } + let storage_clone = Arc::clone(&storage); + let batch_range = start..end; + let predicates = query.predicates.iter().map(Arc::clone).collect::>(); + let entry_ids_clone = entry_ids.clone(); + let batch_lengths_clone = batch_lengths.clone(); + futures.push(run_partition( + storage_clone, + batch_range, + num_cols, + predicates, + entry_ids_clone, + batch_lengths_clone, + )); + } + + let start = Instant::now(); + let receiver = executor.spawn_many(&mut futures); + + let mut tasks_completed = 0; + let mut total_rows = 0; + while tasks_completed < num_partitions { + total_rows += receiver.recv().expect("Failed to receive result"); + tasks_completed += 1; + } + let elapsed = start.elapsed(); + if total_rows != query.expected_row_count { + log::warn!("Expected row count doesn't match. Actual: {}, expected: {}", total_rows, query.expected_row_count); + } + log::info!("Partitions: {}, Time: {:.3}s, Total rows: {}", num_partitions, elapsed.as_secs_f64(), total_rows); +} + +fn run_bench( + cache_dir: PathBuf, + parquet_path: PathBuf, + query: &FilterQuery, + num_partitions: usize, + num_iter: usize, + num_workers: usize, +) { + let _ = std::fs::create_dir_all(&cache_dir); + let io_context = Arc::new(ParquetIoContext::new( + cache_dir.clone(), + IoMode::UringNonBlocking, + 4096, + )); + let storage = LiquidCacheBuilder::new() + .with_io_context(io_context) + .with_cache_dir(cache_dir) + .build(); + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("tokio runtime"); + let storage_clone = storage.clone(); + let (num_batches, entry_ids, batch_lengths) = rt.block_on( + async move { + // 1) Load parquet into record batches (filter columns only) and insert into cache. + let (entry_ids, batch_lengths) = load_and_insert(storage_clone.clone(), parquet_path, query).await; + let num_batches = entry_ids.len() / query.filter_columns.len(); + log::info!( + "Populated cache: {} batches, {} filter columns, {} entries", + num_batches, + query.filter_columns.len(), + entry_ids.len() + ); + + storage_clone.flush_all_to_disk().await; + (num_batches, entry_ids, batch_lengths) + }); + let mut executor = UringExecutor::new(num_workers); + + for _i in 0..num_iter { + run_single_iter(num_batches, + num_partitions, + &query, + storage.clone(), + &entry_ids, + &batch_lengths, + &mut executor + ); + } +} + +async fn run_partition( + storage: Arc, + batch_range: std::ops::Range, + num_cols: usize, + predicates: Vec::>, + entry_ids: Vec::, + batch_lengths: Vec::, +) -> usize { + let mut total_matched = 0usize; + for batch_idx in batch_range { + let mut combined_mask: Option = None; + for (col_idx, pred) in predicates.iter().enumerate() { + let entry_idx = batch_idx * num_cols + col_idx; + let entry_id = &entry_ids[entry_idx]; + let len = batch_lengths[entry_idx]; + let selection = BooleanBuffer::new_set(len); + let result = storage + .eval_predicate(entry_id, pred) + .with_selection(&selection) + .await; + match result { + Some(Ok(mask)) => { + combined_mask = Some(match combined_mask.take() { + Some(prev) => arrow::compute::and(&prev, &mask).unwrap(), + None => mask, + }); + } + Some(Err(_)) | None => { + // Predicate could not be evaluated in cache; treat as no match for this batch. + combined_mask = Some(BooleanArray::from(vec![false; len])); + } + } + } + if let Some(m) = combined_mask { + total_matched += m.true_count(); + } + } + total_matched +} + +/// Load parquet with projection = query.filter_columns, insert each (batch, column) into cache. +/// Returns (entry_ids in order batch0_col0, batch0_col1, ..., batch1_col0, ...), (length per entry). +async fn load_and_insert( + storage: Arc, + parquet_path: PathBuf, + query: &FilterQuery, +) -> (Vec, Vec) { + let config = SessionConfig::default().with_batch_size(8192); + let ctx = SessionContext::new_with_config(config); + ctx.register_parquet("hits", parquet_path.to_string_lossy().as_ref(), Default::default()) + .await + .expect("register parquet"); + + let cols: String = query + .filter_columns + .iter() + .map(|c| format!("\"{}\"", c)) + .collect::>() + .join(", "); + let sql = format!("SELECT {} FROM \"hits\"", cols); + let df = ctx.sql(&sql).await.expect("sql"); + let mut stream = df.execute_stream().await.expect("execute"); + + let num_cols = query.filter_columns.len(); + let mut entry_ids = Vec::new(); + let mut batch_lengths = Vec::new(); + let mut batch_idx = 0usize; + + while let Some(batch_res) = stream.next().await { + let batch = batch_res.expect("batch"); + let nrows = batch.num_rows(); + for col_idx in 0..num_cols { + let entry_id = EntryID::from(batch_idx * num_cols + col_idx); + let array = batch.column(col_idx).clone(); + storage.insert(entry_id, array).await; + entry_ids.push(entry_id); + batch_lengths.push(nrows); + } + batch_idx += 1; + } + + (entry_ids, batch_lengths) +} + +fn main() { + let args = Args::parse(); + + let queries = all_filter_queries(); + let query = match args.query_index { + i if i < queries.len() => match &queries[i] { + Some(q) => q, + None => { + eprintln!( + "Query index {} has no filters. Only filter queries are supported. \ + Try e.g. 1, 10, 12, 19, 27, 28, 30, 36.", + args.query_index + ); + std::process::exit(1); + } + }, + _ => { + eprintln!("Query index {} out of range (0..{}).", args.query_index, queries.len()); + std::process::exit(1); + } + }; + + if args.partitions == 0 { + eprintln!("partitions must be >= 1."); + std::process::exit(1); + } + + if !args.parquet.exists() { + eprintln!( + "Parquet file not found: {}. Download e.g. wget https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet -O {}", + args.parquet.display(), + args.parquet.display() + ); + std::process::exit(1); + } + let cache_dir = args.cache_dir.unwrap_or_else(|| { + std::env::temp_dir().join("lc_cache_dir") + }); + run_bench(cache_dir, + args.parquet, + query, + args.partitions, + args.iterations, + args.worker_threads + ); +} \ No newline at end of file diff --git a/src/parquet/src/io/mod.rs b/src/parquet/src/io/mod.rs index 32af777d..c72a4f91 100644 --- a/src/parquet/src/io/mod.rs +++ b/src/parquet/src/io/mod.rs @@ -13,12 +13,12 @@ use liquid_cache_storage::cache::{CacheExpression, EntryID, IoContext, LiquidCom use crate::cache::{ColumnAccessPath, ParquetArrayID}; #[cfg(target_os = "linux")] -mod io_uring; +pub mod io_uring; mod io_backend; #[derive(Debug)] -pub(crate) struct ParquetIoContext { +pub struct ParquetIoContext { compressor_states: RwLock>>, expression_hints: RwLock>, base_dir: PathBuf, diff --git a/src/parquet/src/lib.rs b/src/parquet/src/lib.rs index 69f8bbcd..edd3961f 100644 --- a/src/parquet/src/lib.rs +++ b/src/parquet/src/lib.rs @@ -15,3 +15,8 @@ pub use liquid_cache_storage as storage; pub use reader::variant_udf::{VariantGetUdf, VariantPretty, VariantToJsonUdf}; pub use reader::{FilterCandidateBuilder, LiquidParquetSource, LiquidPredicate, LiquidRowFilter}; pub use utils::{boolean_buffer_and_then, extract_execution_metrics}; + +#[cfg(target_os = "linux")] +pub use crate::io::io_uring::runtime::UringExecutor; + +pub use crate::io::ParquetIoContext; \ No newline at end of file From 3399f401be1b4a7139bd9711c7f3305f7c2aa52c Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 00:39:56 -0600 Subject: [PATCH 06/26] Add a SimpleIoContext to support different IO modes --- benchmark/src/storage_runner.rs | 23 +++++---- src/parquet/src/io/mod.rs | 85 ++++++++++++++++++++++++++++++++- src/parquet/src/lib.rs | 2 +- 3 files changed, 95 insertions(+), 15 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 91c9b62b..82fa1577 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -16,8 +16,8 @@ use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion::scalar::ScalarValue; use futures::StreamExt; use liquid_cache_common::IoMode; -use liquid_cache_storage::cache::{EntryID, LiquidCacheBuilder, LiquidCache}; -use liquid_cache_parquet::{ParquetIoContext, UringExecutor}; +use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder}; +use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; @@ -51,6 +51,7 @@ struct Args { /// ClickBench query descriptor: filter column(s) and predicate expression(s). /// Each expression is evaluated on a single column (column index 0 in the cached array). /// TODO(): Add support for columns that are projected. +#[derive(Clone)] struct FilterQuery { /// Column names to load and cache (in schema order). filter_columns: Vec<&'static str>, @@ -250,7 +251,7 @@ fn run_bench( num_workers: usize, ) { let _ = std::fs::create_dir_all(&cache_dir); - let io_context = Arc::new(ParquetIoContext::new( + let io_context = Arc::new(SimpleIoContext::new( cache_dir.clone(), IoMode::UringNonBlocking, 4096, @@ -260,27 +261,25 @@ fn run_bench( .with_cache_dir(cache_dir) .build(); - let rt = tokio::runtime::Builder::new_multi_thread() - .enable_all() - .build() - .expect("tokio runtime"); + let mut executor = UringExecutor::new(num_workers); let storage_clone = storage.clone(); - let (num_batches, entry_ids, batch_lengths) = rt.block_on( + let query_owned = query.clone(); + let (num_batches, entry_ids, batch_lengths) = executor.run_to_completion( async move { // 1) Load parquet into record batches (filter columns only) and insert into cache. - let (entry_ids, batch_lengths) = load_and_insert(storage_clone.clone(), parquet_path, query).await; - let num_batches = entry_ids.len() / query.filter_columns.len(); + let (entry_ids, batch_lengths) = load_and_insert(storage_clone.clone(), parquet_path, &query_owned).await; + let num_batches = entry_ids.len() / query_owned.filter_columns.len(); log::info!( "Populated cache: {} batches, {} filter columns, {} entries", num_batches, - query.filter_columns.len(), + query_owned.filter_columns.len(), entry_ids.len() ); storage_clone.flush_all_to_disk().await; (num_batches, entry_ids, batch_lengths) }); - let mut executor = UringExecutor::new(num_workers); + for _i in 0..num_iter { run_single_iter(num_batches, diff --git a/src/parquet/src/io/mod.rs b/src/parquet/src/io/mod.rs index c72a4f91..dd41e5c5 100644 --- a/src/parquet/src/io/mod.rs +++ b/src/parquet/src/io/mod.rs @@ -15,7 +15,7 @@ use crate::cache::{ColumnAccessPath, ParquetArrayID}; #[cfg(target_os = "linux")] pub mod io_uring; -mod io_backend; +pub mod io_backend; #[derive(Debug)] pub struct ParquetIoContext { @@ -29,7 +29,7 @@ impl ParquetIoContext { pub fn new(base_dir: PathBuf, io_mode: IoMode, fixed_buffer_pool_size_mb: usize) -> Self { if matches!( io_mode, - IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking + IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking | IoMode::UringNonBlocking ) { #[cfg(target_os = "linux")] { @@ -139,6 +139,87 @@ impl IoContext for ParquetIoContext { } } +/// Simple [IoContext] with IO mode selection (tokio, blocking, io_uring, etc.). +/// Uses simple EntryID-based paths and a single compressor, like storage's [liquid_cache_storage::cache::DefaultIoContext], +/// but delegates read/write to [io_backend] so all [IoMode]s are supported. +#[derive(Debug)] +pub struct SimpleIoContext { + compressor_state: Arc, + squeeze_hints: RwLock>>, + base_dir: PathBuf, + io_mode: IoMode, +} + +impl SimpleIoContext { + /// Create a new [SimpleIoContext] with the given base directory and IO mode. + pub fn new(base_dir: PathBuf, io_mode: IoMode, fixed_buffer_pool_size_mb: usize) -> Self { + if matches!( + io_mode, + IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking | IoMode::UringNonBlocking + ) { + #[cfg(target_os = "linux")] + { + use liquid_cache_common::memory::pool::FixedBufferPool; + if fixed_buffer_pool_size_mb > 0 { + FixedBufferPool::init(fixed_buffer_pool_size_mb); + } + crate::io::io_uring::initialize_uring_pool(io_mode, fixed_buffer_pool_size_mb > 0); + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_mode {:?} is only supported on Linux", io_mode); + } + } else if fixed_buffer_pool_size_mb > 0 { + panic!("Fixed buffers are only supported for UringDirect, Uring and UringBlocking"); + } + + Self { + compressor_state: Arc::new(LiquidCompressorStates::new()), + squeeze_hints: RwLock::new(AHashMap::new()), + base_dir, + io_mode, + } + } +} + +#[async_trait::async_trait] +impl IoContext for SimpleIoContext { + fn add_squeeze_hint(&self, entry_id: &EntryID, expression: Arc) { + let mut guard = self.squeeze_hints.write().unwrap(); + guard.insert(*entry_id, expression); + } + + fn squeeze_hint(&self, entry_id: &EntryID) -> Option> { + let guard = self.squeeze_hints.read().unwrap(); + guard.get(entry_id).cloned() + } + + fn get_compressor(&self, _entry_id: &EntryID) -> Arc { + self.compressor_state.clone() + } + + fn disk_path(&self, entry_id: &EntryID) -> PathBuf { + self.base_dir + .join(format!("{:016x}.liquid", usize::from(*entry_id))) + } + + #[inline(never)] + #[fastrace::trace] + async fn read( + &self, + path: PathBuf, + range: Option>, + ) -> Result { + io_backend::read(self.io_mode, path, range).await + } + + #[inline(never)] + #[fastrace::trace] + async fn write_file(&self, path: PathBuf, data: Bytes) -> Result<(), std::io::Error> { + io_backend::write(self.io_mode, path, data).await + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/parquet/src/lib.rs b/src/parquet/src/lib.rs index edd3961f..82d5055b 100644 --- a/src/parquet/src/lib.rs +++ b/src/parquet/src/lib.rs @@ -19,4 +19,4 @@ pub use utils::{boolean_buffer_and_then, extract_execution_metrics}; #[cfg(target_os = "linux")] pub use crate::io::io_uring::runtime::UringExecutor; -pub use crate::io::ParquetIoContext; \ No newline at end of file +pub use crate::io::{ParquetIoContext, SimpleIoContext}; \ No newline at end of file From 89eed170467c9029cc1cabc2faaf95ab092b210c Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 00:58:10 -0600 Subject: [PATCH 07/26] Remove datafusion dependency in benchmark --- benchmark/src/storage_runner.rs | 40 +++++++++++++++----------- src/parquet/src/io/io_uring/runtime.rs | 2 ++ 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 82fa1577..3f1c747f 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -12,12 +12,11 @@ use clap::Parser; use datafusion::logical_expr::Operator; use datafusion::physical_plan::expressions::{BinaryExpr, Column}; use datafusion::physical_plan::PhysicalExpr; -use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion::scalar::ScalarValue; -use futures::StreamExt; use liquid_cache_common::IoMode; use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder}; use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; +use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; @@ -340,29 +339,38 @@ async fn load_and_insert( parquet_path: PathBuf, query: &FilterQuery, ) -> (Vec, Vec) { - let config = SessionConfig::default().with_batch_size(8192); - let ctx = SessionContext::new_with_config(config); - ctx.register_parquet("hits", parquet_path.to_string_lossy().as_ref(), Default::default()) - .await - .expect("register parquet"); + let Ok(parquet_file) = std::fs::File::open(parquet_path.clone()) else { + panic!("Failed to open {:?}", parquet_path.to_str()); + }; - let cols: String = query + let builder = ParquetRecordBatchReaderBuilder::try_new(parquet_file).unwrap(); + let schema = builder.parquet_schema(); + let root_fields = schema.root_schema().get_fields(); + let projection_root_indices: Vec = query .filter_columns .iter() - .map(|c| format!("\"{}\"", c)) - .collect::>() - .join(", "); - let sql = format!("SELECT {} FROM \"hits\"", cols); - let df = ctx.sql(&sql).await.expect("sql"); - let mut stream = df.execute_stream().await.expect("execute"); + .map(|name| { + root_fields + .iter() + .position(|f| f.name() == *name) + .unwrap_or_else(|| panic!("parquet schema has no column '{name}'")) + }) + .collect(); + let projection_mask = ProjectionMask::roots(schema, projection_root_indices); + + let mut reader = builder + .with_batch_size(8192) + .with_projection(projection_mask) + .build() + .unwrap(); let num_cols = query.filter_columns.len(); let mut entry_ids = Vec::new(); let mut batch_lengths = Vec::new(); let mut batch_idx = 0usize; - while let Some(batch_res) = stream.next().await { - let batch = batch_res.expect("batch"); + while let Some(batch_res) = reader.next() { + let batch = batch_res.expect("parquet read batch"); let nrows = batch.num_rows(); for col_idx in 0..num_cols { let entry_id = EntryID::from(batch_idx * num_cols + col_idx); diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index 1e8dc779..d716e8b8 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -15,6 +15,8 @@ const MAX_CONCURRENT_TASKS: u32 = 128; type ExecutorTask = Pin + Send>>; +/// A dedicated runtime for io_uring, in which the worker threads are responsible for submitting IO and polling for completions. +/// Each worker thread has its own ring, and an executor which is responsible for scheduling. pub struct UringExecutor { workers: Vec>, /// One sender per worker; tasks are submitted to a worker's dedicated channel. From eb3f31b301cc1ae84cd36df30634869ee320ab2b Mon Sep 17 00:00:00 2001 From: proteet Date: Mon, 2 Mar 2026 10:23:45 -0700 Subject: [PATCH 08/26] Fix bug related to io_uring_enter --- src/parquet/src/io/io_uring/runtime.rs | 37 +++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index d716e8b8..a6c75aec 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -1,13 +1,13 @@ -use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::fd::AsRawFd as _, path::PathBuf, pin::Pin, rc::Rc, sync::atomic::{AtomicBool, Ordering}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, time::{Duration, Instant}}; +use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, rc::Rc, sync::atomic::{AtomicBool, Ordering}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, time::{Duration, Instant}}; use async_executor::LocalExecutor; use bytes::Bytes; use futures::Future; -use io_uring::{IoUring, squeue, cqueue}; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; use rand::Rng; use tokio::sync::oneshot; -use crate::io::io_uring::tasks::{FileOpenTask, FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; +use crate::io::io_uring::tasks::{FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; const URING_NUM_ENTRIES: u32 = 256; @@ -255,7 +255,29 @@ fn worker_main_loop(receiver: crossbeam_channel::Receiver) { let mut worker = worker.borrow_mut(); worker.drain_intermediate_queue(); if worker.need_syscall() { - worker.ring.submit().expect("Failed to submit"); + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); + loop { + let res = unsafe { + worker.ring.submitter().enter::( + worker.queued_submissions as u32, + 0, + flags.bits(), + None + ) + }; + match res { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == std::io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {}", e.to_string()); + } + } + } worker.queued_submissions = 0; worker.last_syscall = Instant::now(); } @@ -373,8 +395,11 @@ pub(crate) async fn read( range: Option>, direct_io: bool, ) -> Result { - let open_task = FileOpenTask::build(path, direct_io)?; - let file = submit_async_task(open_task).await.borrow_mut().get_result()?; + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .expect("failed to open file"); let effective_range = if let Some(range) = range { range From 44c6e0f04a158e296517f0cbe808b19289f05d29 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 12:33:44 -0600 Subject: [PATCH 09/26] Setup logger --- benchmark/src/storage_runner.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 3f1c747f..011e59ff 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -16,6 +16,7 @@ use datafusion::scalar::ScalarValue; use liquid_cache_common::IoMode; use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder}; use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; +use logforth::filter::EnvFilter; use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; use std::path::PathBuf; use std::sync::Arc; @@ -385,8 +386,18 @@ async fn load_and_insert( (entry_ids, batch_lengths) } +fn setup_logging() { + let mut builder = logforth::builder(); + builder = builder.dispatch(|d| { + d.filter(EnvFilter::from_default_env()) + .append(logforth::append::Stdout::default()) + }); + builder.apply(); +} + fn main() { let args = Args::parse(); + setup_logging(); let queries = all_filter_queries(); let query = match args.query_index { From 9180bf8656a439fa99693ee24cc2f99090faa1d9 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 13:06:41 -0600 Subject: [PATCH 10/26] Bug fix: Process task completion --- src/parquet/src/io/io_uring/runtime.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index a6c75aec..02dfeb46 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -174,10 +174,7 @@ impl RuntimeWorker { .expect("Task not found in submitted tasks"); submission.push_completion(cqe); if pending_completions == 1 { - unsafe { - (*submission.completed).store(true, Ordering::Relaxed); - } - submission.waker.wake(); + submission.complete(); self.tokens.push_back(token as u16); self.io_performed += 1; } else { @@ -292,13 +289,22 @@ fn worker_main_loop(receiver: crossbeam_channel::Receiver) { struct AsyncTask { // Note: Should change this to Arc in case of a work-stealing scheduler pub inner: Rc>, - pub waker: Waker, - pub completed: *mut AtomicBool, + waker: Waker, + completed: *mut AtomicBool, pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker completions: Vec, } impl AsyncTask { + #[inline] + fn complete(self) { + self.inner.borrow_mut().complete(self.completions.iter().collect()); + unsafe { + (*self.completed).store(true, Ordering::Relaxed); + } + self.waker.wake(); + } + #[inline] fn set_completions(&mut self, count: usize) { self.pending_completions = count; From cddf6273b83a273704e0fca9b21c827f9105e751 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 14:40:04 -0600 Subject: [PATCH 11/26] Bug fix: Register buffers with runtime worker rings --- src/parquet/src/io/io_backend.rs | 2 +- src/parquet/src/io/io_uring/runtime.rs | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index 7a80db4f..54d3f28c 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -58,7 +58,7 @@ pub(super) async fn read( IoMode::UringNonBlocking => { #[cfg(target_os = "linux")] { - super::io_uring::runtime::read(path, range, false).await + super::io_uring::runtime::read(path, range).await } #[cfg(not(target_os = "linux"))] { diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index 02dfeb46..8467894b 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -4,6 +4,7 @@ use async_executor::LocalExecutor; use bytes::Bytes; use futures::Future; use io_uring::{EnterFlags, IoUring, cqueue, squeue}; +use liquid_cache_common::memory::pool::FixedBufferPool; use rand::Rng; use tokio::sync::oneshot; @@ -132,6 +133,9 @@ impl RuntimeWorker { .setup_defer_taskrun() .build(URING_NUM_ENTRIES) .expect("Failed to build IoUring instance"); + if FixedBufferPool::register_buffers_with_ring(&ring).is_err() { + log::warn!("Failed to register fixed buffers with runtime worker ring"); + } let mut tokens = VecDeque::::with_capacity(MAX_CONCURRENT_TASKS as usize); let mut inflight_tasks = Vec::>::with_capacity(MAX_CONCURRENT_TASKS as usize); for i in 0..MAX_CONCURRENT_TASKS { @@ -399,7 +403,6 @@ where pub(crate) async fn read( path: PathBuf, range: Option>, - direct_io: bool, ) -> Result { let file = OpenOptions::new() .read(true) @@ -415,7 +418,7 @@ pub(crate) async fn read( }; { - let read_task = FixedFileReadTask::build(effective_range.clone(), &file, direct_io); + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, true); if read_task.is_ok() { let rc = submit_async_task(read_task.unwrap()).await; return match Rc::try_unwrap(rc) { @@ -425,7 +428,7 @@ pub(crate) async fn read( } } // Fall back to normal read if fixed buffers are not available - let read_task = FileReadTask::build(effective_range, file, direct_io); + let read_task = FileReadTask::build(effective_range, file, true); submit_async_task(read_task).await.borrow_mut().get_result() } @@ -434,6 +437,7 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .create(true) .truncate(true) .write(true) + .custom_flags(libc::O_DIRECT) .open(path) .expect("failed to create file"); From 363f25b29d1dff4ff7e3ca0e3c4c975e0381244a Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 15:12:28 -0600 Subject: [PATCH 12/26] Fix bug in fixed buffer registration in arena --- src/common/src/memory/arena.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs index 8258ba63..339ddda3 100644 --- a/src/common/src/memory/arena.rs +++ b/src/common/src/memory/arena.rs @@ -27,11 +27,11 @@ impl Arena { let mem_start = Self::allocate_memory_from_os(capacity); assert_ne!(mem_start, null_mut()); let mem_end = mem_start.wrapping_add(capacity); - let ptr_aligned = (mem_start as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; - let mut slice_start = ptr_aligned; + let mut ptr_aligned = (mem_start as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; if ptr_aligned != (mem_start as usize) { - slice_start = ptr_aligned + SEGMENT_SIZE; + ptr_aligned += SEGMENT_SIZE; } + let mut slice_start = ptr_aligned; let mut slices = Vec::new(); while slice_start + SEGMENT_SIZE <= mem_end as usize { slices.push(Slice { @@ -100,7 +100,9 @@ impl Arena { } pub(crate) fn register_buffers_with_ring(self: &mut Self, ring: &IoUring) -> io::Result<()> { - let num_buffers = self.size >> FIXED_BUFFER_BITS; + let usable_bytes = self.size + .saturating_sub(self.aligned_start_ptr as usize - self.actual_start_ptr as usize); + let num_buffers = usable_bytes >> FIXED_BUFFER_BITS; let mut buffers = Vec::::new(); buffers.reserve(num_buffers); let mut base_ptr = self.aligned_start_ptr; From 1ef1abbb1f052e7caca343aeeaab16a7284cb633 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 15:13:01 -0600 Subject: [PATCH 13/26] Fix bug in runtime worker IO task completion --- src/parquet/src/io/io_uring/runtime.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index 8467894b..00791972 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -172,16 +172,19 @@ impl RuntimeWorker { .as_ref() .expect("Task not found in submitted tasks") .pending_completions; - - let mut submission = self.submitted_tasks[token] - .take() - .expect("Task not found in submitted tasks"); - submission.push_completion(cqe); if pending_completions == 1 { + let mut submission = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); submission.complete(); self.tokens.push_back(token as u16); self.io_performed += 1; } else { + let submission = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); submission.reduce_completions(); } } From 0ae5af9d52ed435bdc5e550e67b2af413d076bf8 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 2 Mar 2026 15:27:14 -0600 Subject: [PATCH 14/26] Record flamegraphs in benchmark --- benchmark/src/storage_runner.rs | 81 +++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 011e59ff..b3b5afae 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -18,6 +18,7 @@ use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder}; use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; use logforth::filter::EnvFilter; use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; +use std::fs::create_dir_all; use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; @@ -46,6 +47,10 @@ struct Args { /// Directory for the liquid-cache storage. Default: $TMPDIR/liquid_cache_storage_runner #[arg(long)] cache_dir: Option, + + /// Directory to write flamegraph SVG files to (one per query iteration). + #[arg(long = "flamegraph-dir")] + flamegraph_dir: Option, } /// ClickBench query descriptor: filter column(s) and predicate expression(s). @@ -242,13 +247,41 @@ fn run_single_iter( log::info!("Partitions: {}, Time: {:.3}s, Total rows: {}", num_partitions, elapsed.as_secs_f64(), total_rows); } +fn write_flamegraph( + profiler: &pprof::ProfilerGuard<'_>, + flamegraph_dir: &std::path::Path, + query_index: usize, + iteration: u32, +) -> Result<(), Box> { + let report = profiler.report().build()?; + let mut svg_data = Vec::new(); + report.flamegraph(&mut svg_data)?; + create_dir_all(flamegraph_dir)?; + + let now = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?; + let secs = now.as_secs(); + let hour = (secs / 3600) % 24; + let minute = (secs / 60) % 60; + let second = secs % 60; + + let filename = format!( + "{hour:02}h{minute:02}m{second:02}s_q{query_index:02}_i{iteration:02}.svg" + ); + let filepath = flamegraph_dir.join(filename); + std::fs::write(&filepath, svg_data)?; + log::info!("Flamegraph written to: {}", filepath.display()); + Ok(()) +} + fn run_bench( cache_dir: PathBuf, parquet_path: PathBuf, query: &FilterQuery, + query_index: usize, num_partitions: usize, num_iter: usize, num_workers: usize, + flamegraph_dir: Option, ) { let _ = std::fs::create_dir_all(&cache_dir); let io_context = Arc::new(SimpleIoContext::new( @@ -281,15 +314,34 @@ fn run_bench( }); - for _i in 0..num_iter { - run_single_iter(num_batches, - num_partitions, - &query, - storage.clone(), - &entry_ids, - &batch_lengths, - &mut executor + for i in 0..num_iter { + let profiler_guard = if flamegraph_dir.is_some() { + Some( + pprof::ProfilerGuardBuilder::default() + .frequency(500) + .blocklist(&["libpthread.so.0", "libm.so.6", "libgcc_s.so.1"]) + .build() + .expect("pprof ProfilerGuardBuilder::build"), + ) + } else { + None + }; + + run_single_iter( + num_batches, + num_partitions, + &query, + storage.clone(), + &entry_ids, + &batch_lengths, + &mut executor, ); + + if let (Some(profiler), Some(dir)) = (profiler_guard, flamegraph_dir.as_ref()) { + if let Err(e) = write_flamegraph(&profiler, dir, query_index, i as u32) { + log::warn!("Failed to write flamegraph for iteration {}: {}", i, e); + } + } } } @@ -434,11 +486,14 @@ fn main() { let cache_dir = args.cache_dir.unwrap_or_else(|| { std::env::temp_dir().join("lc_cache_dir") }); - run_bench(cache_dir, - args.parquet, - query, - args.partitions, + run_bench( + cache_dir, + args.parquet, + query, + args.query_index, + args.partitions, args.iterations, - args.worker_threads + args.worker_threads, + args.flamegraph_dir, ); } \ No newline at end of file From 5e739d45e8c45fc21bdb75bfd4f66c8c640a3a30 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 8 Mar 2026 21:31:41 -0500 Subject: [PATCH 15/26] Add support for projection columns --- benchmark/src/storage_runner.rs | 73 ++++++++++++++++++++-------- src/parquet/src/io/io_uring/tests.rs | 2 +- 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index b3b5afae..58aa7ff9 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -53,19 +53,31 @@ struct Args { flamegraph_dir: Option, } -/// ClickBench query descriptor: filter column(s) and predicate expression(s). -/// Each expression is evaluated on a single column (column index 0 in the cached array). -/// TODO(): Add support for columns that are projected. +/// ClickBench query descriptor: filter column(s), optional projection column(s), and predicate expression(s). +/// Each predicate is evaluated on a single column (column index 0 in the cached array). #[derive(Clone)] struct FilterQuery { - /// Column names to load and cache (in schema order). + /// Column names to load and cache for filtering (in schema order). filter_columns: Vec<&'static str>, - /// One predicate per filter column; each expects Column(0) op Literal. + /// Column names to load when there are no predicates (projection-only / full-scan queries). + projection_columns: Vec<&'static str>, + /// One predicate per filter column; each expects Column(0) op Literal. Empty for projection-only. predicates: Vec>, /// Number of expected rows in result expected_row_count: usize, } +impl FilterQuery { + /// Columns to load into cache: filter_columns when present, else projection_columns. + fn columns_to_load(&self) -> &[&'static str] { + if self.filter_columns.is_empty() { + &self.projection_columns + } else { + &self.filter_columns + } + } +} + fn all_filter_queries() -> Vec> { use datafusion::physical_plan::expressions::Literal as Lit; let col = || Arc::new(Column::new("col", 0)) as Arc; @@ -75,6 +87,7 @@ fn all_filter_queries() -> Vec> { // Q1: AdvEngineID <> 0 q[1] = Some(FilterQuery { filter_columns: vec!["AdvEngineID"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::NotEq, @@ -86,6 +99,7 @@ fn all_filter_queries() -> Vec> { // Q10: MobilePhoneModel <> '' q[10] = Some(FilterQuery { filter_columns: vec!["MobilePhoneModel"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::NotEq, @@ -97,6 +111,7 @@ fn all_filter_queries() -> Vec> { // Q12: SearchPhrase <> '' q[12] = Some(FilterQuery { filter_columns: vec!["SearchPhrase"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::NotEq, @@ -108,6 +123,7 @@ fn all_filter_queries() -> Vec> { // Q19: UserID = 3233473875476175636 (value that exists in hits_1) q[19] = Some(FilterQuery { filter_columns: vec!["UserID"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::Eq, @@ -116,19 +132,17 @@ fn all_filter_queries() -> Vec> { expected_row_count: 4, }); - q[20] = Some(FilterQuery { - filter_columns: vec!["URL"], - predicates: vec![Arc::new(BinaryExpr::new( - col(), - Operator::LikeMatch, - Arc::new(Lit::new(ScalarValue::Utf8(Some("%google%".to_string())))), - ))], + q[20] = Some(FilterQuery { + filter_columns: vec![], + projection_columns: vec!["URL"], + predicates: vec![], expected_row_count: 137, }); // Q27: URL <> '' q[27] = Some(FilterQuery { filter_columns: vec!["URL"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::NotEq, @@ -140,6 +154,7 @@ fn all_filter_queries() -> Vec> { // Q28: Referer <> '' q[28] = Some(FilterQuery { filter_columns: vec!["Referer"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::NotEq, @@ -151,6 +166,7 @@ fn all_filter_queries() -> Vec> { // Q30: SearchPhrase <> '' q[30] = Some(FilterQuery { filter_columns: vec!["SearchPhrase"], + projection_columns: vec![], predicates: vec![Arc::new(BinaryExpr::new( col(), Operator::NotEq, @@ -162,6 +178,7 @@ fn all_filter_queries() -> Vec> { // Q36: CounterID = 62, DontCountHits = 0, IsRefresh = 0, URL <> '' q[36] = Some(FilterQuery { filter_columns: vec!["CounterID", "EventDate", "DontCountHits", "IsRefresh", "URL"], + projection_columns: vec![], predicates: vec![ Arc::new(BinaryExpr::new( col(), @@ -206,7 +223,7 @@ fn run_single_iter( ) { // 2) Partition batch indices evenly across workers. let batches_per_partition = (num_batches + num_partitions - 1) / num_partitions; - let num_cols = query.filter_columns.len(); + let num_cols = query.columns_to_load().len(); // 3) Create futures for every partition let mut futures = Vec::new(); @@ -292,6 +309,7 @@ fn run_bench( let storage = LiquidCacheBuilder::new() .with_io_context(io_context) .with_cache_dir(cache_dir) + .with_max_cache_bytes(256 * 1024 * 1024) .build(); let mut executor = UringExecutor::new(num_workers); @@ -301,11 +319,12 @@ fn run_bench( async move { // 1) Load parquet into record batches (filter columns only) and insert into cache. let (entry_ids, batch_lengths) = load_and_insert(storage_clone.clone(), parquet_path, &query_owned).await; - let num_batches = entry_ids.len() / query_owned.filter_columns.len(); + let num_cols_loaded = query_owned.columns_to_load().len(); + let num_batches = entry_ids.len() / num_cols_loaded; log::info!( - "Populated cache: {} batches, {} filter columns, {} entries", + "Populated cache: {} batches, {} columns, {} entries", num_batches, - query_owned.filter_columns.len(), + num_cols_loaded, entry_ids.len() ); @@ -354,6 +373,16 @@ async fn run_partition( batch_lengths: Vec::, ) -> usize { let mut total_matched = 0usize; + + if predicates.is_empty() { + // No predicates: full scan, count all rows in the partition. + for batch_idx in batch_range.clone() { + let entry_idx = batch_idx * num_cols; + total_matched += batch_lengths[entry_idx]; + } + return total_matched; + } + for batch_idx in batch_range { let mut combined_mask: Option = None; for (col_idx, pred) in predicates.iter().enumerate() { @@ -363,7 +392,7 @@ async fn run_partition( let selection = BooleanBuffer::new_set(len); let result = storage .eval_predicate(entry_id, pred) - .with_selection(&selection) + .with_selection(&selection) // Is this necessary? .await; match result { Some(Ok(mask)) => { @@ -385,13 +414,16 @@ async fn run_partition( total_matched } -/// Load parquet with projection = query.filter_columns, insert each (batch, column) into cache. +/// Load parquet with projection = query.columns_to_load(), insert each (batch, column) into cache. /// Returns (entry_ids in order batch0_col0, batch0_col1, ..., batch1_col0, ...), (length per entry). async fn load_and_insert( storage: Arc, parquet_path: PathBuf, query: &FilterQuery, ) -> (Vec, Vec) { + let columns_to_load = query.columns_to_load(); + assert!(!columns_to_load.is_empty(), "query must have filter_columns or projection_columns"); + let Ok(parquet_file) = std::fs::File::open(parquet_path.clone()) else { panic!("Failed to open {:?}", parquet_path.to_str()); }; @@ -399,8 +431,7 @@ async fn load_and_insert( let builder = ParquetRecordBatchReaderBuilder::try_new(parquet_file).unwrap(); let schema = builder.parquet_schema(); let root_fields = schema.root_schema().get_fields(); - let projection_root_indices: Vec = query - .filter_columns + let projection_root_indices: Vec = columns_to_load .iter() .map(|name| { root_fields @@ -417,7 +448,7 @@ async fn load_and_insert( .build() .unwrap(); - let num_cols = query.filter_columns.len(); + let num_cols = columns_to_load.len(); let mut entry_ids = Vec::new(); let mut batch_lengths = Vec::new(); let mut batch_idx = 0usize; diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index 733cd11e..4c32e586 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -162,7 +162,7 @@ fn read_write_roundtrip_non_blocking_uring() { let (tmpdir, path) = seed_file(&original); let path_clone = path.clone(); let read_bytes = executor.run_to_completion(async move { - runtime::read(path_clone, None, false).await + runtime::read(path_clone, None).await }).unwrap_or_else(|err| panic!("read failed: {err}")); assert_eq!( read_bytes.as_ref(), From 6eec22076a32995d9f2ca1c04e6d2e7ff18e8ea1 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Sun, 8 Mar 2026 22:10:39 -0500 Subject: [PATCH 16/26] Add some tracepoints and disk usage tracking --- benchmark/src/lib.rs | 2 +- benchmark/src/storage_runner.rs | 60 ++++++++++++++++++++++++++ src/parquet/src/io/io_uring/runtime.rs | 30 ++++++++++++- 3 files changed, 89 insertions(+), 3 deletions(-) diff --git a/benchmark/src/lib.rs b/benchmark/src/lib.rs index 6e7da983..420652d1 100644 --- a/benchmark/src/lib.rs +++ b/benchmark/src/lib.rs @@ -23,7 +23,7 @@ pub mod client_runner; pub mod inprocess_runner; mod manifest; mod observability; -mod tracepoints; +pub mod tracepoints; pub mod utils; pub use client_runner::*; diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 58aa7ff9..93ff8bf3 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -22,6 +22,7 @@ use std::fs::create_dir_all; use std::path::PathBuf; use std::sync::Arc; use std::time::Instant; +use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System}; #[derive(Parser)] #[command(name = "storage_runner")] @@ -53,6 +54,51 @@ struct Args { flamegraph_dir: Option, } +/// Tracks process disk I/O (bytes read/written) between creation and stop(). +struct DiskIoGuard { + system: System, + pid: sysinfo::Pid, + start_read_total: u64, + start_written_total: u64, +} + +impl DiskIoGuard { + fn new() -> Self { + let mut system = System::new(); + let pid = sysinfo::get_current_pid().unwrap(); + system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + ProcessRefreshKind::nothing().with_disk_usage(), + ); + let p = system.process(pid).unwrap(); + let du = p.disk_usage(); + Self { + system, + pid, + start_read_total: du.total_read_bytes, + start_written_total: du.total_written_bytes, + } + } + + fn stop(mut self) -> (u64, u64) { + self.system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[self.pid]), + true, + ProcessRefreshKind::nothing().with_disk_usage(), + ); + if let Some(p) = self.system.process(self.pid) { + let du = p.disk_usage(); + ( + du.total_read_bytes.saturating_sub(self.start_read_total), + du.total_written_bytes.saturating_sub(self.start_written_total), + ) + } else { + (0, 0) + } + } +} + /// ClickBench query descriptor: filter column(s), optional projection column(s), and predicate expression(s). /// Each predicate is evaluated on a single column (column index 0 in the cached array). #[derive(Clone)] @@ -334,6 +380,8 @@ fn run_bench( for i in 0..num_iter { + liquid_cache_benchmarks::tracepoints::iteration_start(query_index as u32, i as u32); + let io_guard = DiskIoGuard::new(); let profiler_guard = if flamegraph_dir.is_some() { Some( pprof::ProfilerGuardBuilder::default() @@ -356,6 +404,14 @@ fn run_bench( &mut executor, ); + let (disk_read, disk_written) = io_guard.stop(); + log::info!( + "Iteration {}: disk read {} bytes, disk written {} bytes", + i, + disk_read, + disk_written + ); + if let (Some(profiler), Some(dir)) = (profiler_guard, flamegraph_dir.as_ref()) { if let Err(e) = write_flamegraph(&profiler, dir, query_index, i as u32) { log::warn!("Failed to write flamegraph for iteration {}: {}", i, e); @@ -378,6 +434,10 @@ async fn run_partition( // No predicates: full scan, count all rows in the partition. for batch_idx in batch_range.clone() { let entry_idx = batch_idx * num_cols; + let entry_id = &entry_ids[entry_idx]; + let _result = storage + .get(entry_id) + .await; total_matched += batch_lengths[entry_idx]; } return total_matched; diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index 00791972..dd849380 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -1,4 +1,4 @@ -use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, rc::Rc, sync::atomic::{AtomicBool, Ordering}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, time::{Duration, Instant}}; +use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, rc::Rc, sync::{atomic::{AtomicBool, Ordering}, OnceLock}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, time::{Duration, Instant}}; use async_executor::LocalExecutor; use bytes::Bytes; @@ -10,6 +10,23 @@ use tokio::sync::oneshot; use crate::io::io_uring::tasks::{FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; +#[usdt::provider] +mod liquid_uring_runtime { + fn io_submission(id: u64) {} + fn io_completion(id: u64) {} +} + +fn ensure_uring_trace_registered() -> bool { + static REGISTERED: OnceLock = OnceLock::new(); + *REGISTERED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register io_uring runtime USDT probes: {err}"); + false + } + }) +} + const URING_NUM_ENTRIES: u32 = 256; const MAX_CONCURRENT_TASKS: u32 = 128; @@ -211,7 +228,8 @@ impl RuntimeWorker { let token = self.tokens.pop_front().expect("No more tokens"); let sq = &mut self.ring.submission(); let sqes = task.inner.borrow_mut().prepare_sqe(); - task.set_completions(sqes.len()); + let num_sqes = sqes.len(); + task.set_completions(num_sqes); self.submitted_tasks[token as usize] = Some(task); let mut sqes_submitted = 0; @@ -342,6 +360,7 @@ where state: UringState, task: Rc>, completed: AtomicBool, + id: u64, } unsafe impl Send for UringFuture @@ -356,6 +375,7 @@ where state: UringState::Created, task: task, completed: AtomicBool::new(false), + id: rand::rng().random(), } } } @@ -379,10 +399,16 @@ where completions: Vec::new(), }; RuntimeWorker::add_task(async_task); + if ensure_uring_trace_registered() { + liquid_uring_runtime::io_submission!(|| self.id); + } self.state = UringState::Submitted; } UringState::Submitted => match self.completed.load(Ordering::Relaxed) { true => { + if ensure_uring_trace_registered() { + liquid_uring_runtime::io_completion!(|| self.id); + } return Poll::Ready(self.task.clone()); } false => { From f6e889c42773980fa83d609b0a1b6749b6019dee Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 9 Mar 2026 12:14:53 -0500 Subject: [PATCH 17/26] Avoid redundant writes in storage_runner --- benchmark/src/storage_runner.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 93ff8bf3..e45dbd61 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -14,7 +14,7 @@ use datafusion::physical_plan::expressions::{BinaryExpr, Column}; use datafusion::physical_plan::PhysicalExpr; use datafusion::scalar::ScalarValue; use liquid_cache_common::IoMode; -use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder}; +use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder, LiquidPolicy, NoHydration, TranscodeSqueezeEvict}; use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; use logforth::filter::EnvFilter; use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; @@ -182,7 +182,7 @@ fn all_filter_queries() -> Vec> { filter_columns: vec![], projection_columns: vec!["URL"], predicates: vec![], - expected_row_count: 137, + expected_row_count: 99997497, }); // Q27: URL <> '' @@ -356,6 +356,9 @@ fn run_bench( .with_io_context(io_context) .with_cache_dir(cache_dir) .with_max_cache_bytes(256 * 1024 * 1024) + .with_cache_policy(Box::new(LiquidPolicy::new())) + .with_hydration_policy(Box::new(NoHydration::new())) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .build(); let mut executor = UringExecutor::new(num_workers); From a0691191268ec921cff894ec2bf5b209edd0aef0 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 9 Mar 2026 16:41:22 -0500 Subject: [PATCH 18/26] Fix partitioning logic in storage_runner Partition batches more evenly, add support for other io modes, minor code cleanup --- benchmark/src/storage_runner.rs | 31 ++++++++++++++++++-------- src/parquet/src/io/io_uring/runtime.rs | 4 ++-- src/parquet/src/io/io_uring/tasks.rs | 16 ++----------- src/parquet/src/io/mod.rs | 2 +- src/parquet/src/lib.rs | 2 +- 5 files changed, 28 insertions(+), 27 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index e45dbd61..adada9a5 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -52,6 +52,10 @@ struct Args { /// Directory to write flamegraph SVG files to (one per query iteration). #[arg(long = "flamegraph-dir")] flamegraph_dir: Option, + + /// IO mode: uring-non-blocking (default) or std-blocking. + #[arg(long = "io-mode", default_value = "uring-non-blocking")] + io_mode: IoMode, } /// Tracks process disk I/O (bytes read/written) between creation and stop(). @@ -268,19 +272,25 @@ fn run_single_iter( executor: &mut UringExecutor ) { // 2) Partition batch indices evenly across workers. - let batches_per_partition = (num_batches + num_partitions - 1) / num_partitions; + let batches_per_partition = num_batches / num_partitions; let num_cols = query.columns_to_load().len(); - // 3) Create futures for every partition + // 3) Create futures for every partition (only for partitions that have at least one batch) let mut futures = Vec::new(); + let mut start_batch_idx = 0; for p in 0..num_partitions { - let start = p * batches_per_partition; - let end = (start + batches_per_partition).min(num_batches); - if start >= end { + let batch_count = if p < num_batches % num_partitions { + batches_per_partition + 1 + } else { + batches_per_partition + }; + let end = (start_batch_idx + batch_count).min(num_batches); + if start_batch_idx >= end { continue; } let storage_clone = Arc::clone(&storage); - let batch_range = start..end; + let batch_range = start_batch_idx..end; + start_batch_idx = end; let predicates = query.predicates.iter().map(Arc::clone).collect::>(); let entry_ids_clone = entry_ids.clone(); let batch_lengths_clone = batch_lengths.clone(); @@ -293,13 +303,14 @@ fn run_single_iter( batch_lengths_clone, )); } - + let num_tasks = futures.len(); + let start = Instant::now(); let receiver = executor.spawn_many(&mut futures); let mut tasks_completed = 0; let mut total_rows = 0; - while tasks_completed < num_partitions { + while tasks_completed < num_tasks { total_rows += receiver.recv().expect("Failed to receive result"); tasks_completed += 1; } @@ -345,11 +356,12 @@ fn run_bench( num_iter: usize, num_workers: usize, flamegraph_dir: Option, + io_mode: IoMode, ) { let _ = std::fs::create_dir_all(&cache_dir); let io_context = Arc::new(SimpleIoContext::new( cache_dir.clone(), - IoMode::UringNonBlocking, + io_mode, 4096, )); let storage = LiquidCacheBuilder::new() @@ -589,5 +601,6 @@ fn main() { args.iterations, args.worker_threads, args.flamegraph_dir, + args.io_mode, ); } \ No newline at end of file diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index dd849380..ecf2d80e 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -36,7 +36,7 @@ type ExecutorTask = Pin + Send>>; /// A dedicated runtime for io_uring, in which the worker threads are responsible for submitting IO and polling for completions. /// Each worker thread has its own ring, and an executor which is responsible for scheduling. pub struct UringExecutor { - workers: Vec>, + _workers: Vec>, /// One sender per worker; tasks are submitted to a worker's dedicated channel. senders: Vec>, } @@ -58,7 +58,7 @@ impl UringExecutor { workers.push(worker); } UringExecutor { - workers, + _workers: workers, senders, } } diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 3a10f6d4..e221f3c1 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -1,8 +1,8 @@ use std::{ - alloc::{Layout, alloc}, any::Any, cell::RefCell, error::Error, ffi::CString, fs, mem, ops::Range, os::{ + alloc::{Layout, alloc}, any::Any, ffi::CString, fs, mem, ops::Range, os::{ fd::{AsRawFd, FromRawFd, RawFd}, unix::ffi::OsStringExt, - }, path::PathBuf, rc::Rc + }, path::PathBuf }; use bytes::Bytes; @@ -60,18 +60,6 @@ impl FileOpenTask { let file = unsafe { fs::File::from_raw_fd(fd) }; Ok(file) } - - pub(crate) fn get_result(self: &mut Self) -> Result { - if let Some(err) = self.error.take() { - return Err(err); - } - let fd = self.fd.take().ok_or_else(|| { - std::io::Error::other("open operation completed without returning file descriptor") - })?; - // SAFETY: `fd` has been received from the kernel for this task and is uniquely owned here. - let file = unsafe { fs::File::from_raw_fd(fd) }; - Ok(file) - } } impl IoTask for FileOpenTask { diff --git a/src/parquet/src/io/mod.rs b/src/parquet/src/io/mod.rs index dd41e5c5..53782dbe 100644 --- a/src/parquet/src/io/mod.rs +++ b/src/parquet/src/io/mod.rs @@ -18,7 +18,7 @@ pub mod io_uring; pub mod io_backend; #[derive(Debug)] -pub struct ParquetIoContext { +pub(crate) struct ParquetIoContext { compressor_states: RwLock>>, expression_hints: RwLock>, base_dir: PathBuf, diff --git a/src/parquet/src/lib.rs b/src/parquet/src/lib.rs index 82d5055b..bbf65a0e 100644 --- a/src/parquet/src/lib.rs +++ b/src/parquet/src/lib.rs @@ -19,4 +19,4 @@ pub use utils::{boolean_buffer_and_then, extract_execution_metrics}; #[cfg(target_os = "linux")] pub use crate::io::io_uring::runtime::UringExecutor; -pub use crate::io::{ParquetIoContext, SimpleIoContext}; \ No newline at end of file +pub use crate::io::SimpleIoContext; \ No newline at end of file From 6f498d84e7ca0b51d64391b8fb6561ab0bc4e50c Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 9 Mar 2026 17:00:12 -0500 Subject: [PATCH 19/26] Changes - Add error handling in pool.rs - Run cargo fmt - Minor fixes --- benchmark/src/storage_runner.rs | 99 +++++++++++------- src/common/src/memory/arena.rs | 22 ++-- src/common/src/memory/global_pool.rs | 0 src/common/src/memory/mod.rs | 5 +- src/common/src/memory/page.rs | 50 +++++---- src/common/src/memory/pool.rs | 146 +++++++++++++++++++-------- src/common/src/memory/segment.rs | 89 ++++++++++------ src/common/src/memory/tcache.rs | 144 +++++++++++++++++--------- 8 files changed, 365 insertions(+), 190 deletions(-) delete mode 100644 src/common/src/memory/global_pool.rs diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index adada9a5..648ce7f7 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -5,19 +5,20 @@ * on a LiquidCache instance to bypass datafusion, which is strongly coupled with tokio. The benchmark is based on * the arrow benchmark (https://github.com/apache/arrow-rs/blob/main/parquet/benches/arrow_reader_clickbench.rs#L729) */ - use arrow::array::BooleanArray; use arrow::buffer::BooleanBuffer; use clap::Parser; use datafusion::logical_expr::Operator; -use datafusion::physical_plan::expressions::{BinaryExpr, Column}; use datafusion::physical_plan::PhysicalExpr; +use datafusion::physical_plan::expressions::{BinaryExpr, Column}; use datafusion::scalar::ScalarValue; use liquid_cache_common::IoMode; -use liquid_cache_storage::cache::{EntryID, LiquidCache, LiquidCacheBuilder, LiquidPolicy, NoHydration, TranscodeSqueezeEvict}; use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; +use liquid_cache_storage::cache::{ + EntryID, LiquidCache, LiquidCacheBuilder, LiquidPolicy, NoHydration, TranscodeSqueezeEvict, +}; use logforth::filter::EnvFilter; -use parquet::arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, ProjectionMask}; +use parquet::arrow::{ProjectionMask, arrow_reader::ParquetRecordBatchReaderBuilder}; use std::fs::create_dir_all; use std::path::PathBuf; use std::sync::Arc; @@ -95,7 +96,8 @@ impl DiskIoGuard { let du = p.disk_usage(); ( du.total_read_bytes.saturating_sub(self.start_read_total), - du.total_written_bytes.saturating_sub(self.start_written_total), + du.total_written_bytes + .saturating_sub(self.start_written_total), ) } else { (0, 0) @@ -227,7 +229,13 @@ fn all_filter_queries() -> Vec> { // Q36: CounterID = 62, DontCountHits = 0, IsRefresh = 0, URL <> '' q[36] = Some(FilterQuery { - filter_columns: vec!["CounterID", "EventDate", "DontCountHits", "IsRefresh", "URL"], + filter_columns: vec![ + "CounterID", + "EventDate", + "DontCountHits", + "IsRefresh", + "URL", + ], projection_columns: vec![], predicates: vec![ Arc::new(BinaryExpr::new( @@ -265,11 +273,11 @@ fn all_filter_queries() -> Vec> { fn run_single_iter( num_batches: usize, num_partitions: usize, - query: &FilterQuery, + query: &FilterQuery, storage: Arc, entry_ids: &Vec, batch_lengths: &Vec, - executor: &mut UringExecutor + executor: &mut UringExecutor, ) { // 2) Partition batch indices evenly across workers. let batches_per_partition = num_batches / num_partitions; @@ -295,12 +303,12 @@ fn run_single_iter( let entry_ids_clone = entry_ids.clone(); let batch_lengths_clone = batch_lengths.clone(); futures.push(run_partition( - storage_clone, - batch_range, - num_cols, - predicates, - entry_ids_clone, - batch_lengths_clone, + storage_clone, + batch_range, + num_cols, + predicates, + entry_ids_clone, + batch_lengths_clone, )); } let num_tasks = futures.len(); @@ -316,9 +324,18 @@ fn run_single_iter( } let elapsed = start.elapsed(); if total_rows != query.expected_row_count { - log::warn!("Expected row count doesn't match. Actual: {}, expected: {}", total_rows, query.expected_row_count); + log::warn!( + "Expected row count doesn't match. Actual: {}, expected: {}", + total_rows, + query.expected_row_count + ); } - log::info!("Partitions: {}, Time: {:.3}s, Total rows: {}", num_partitions, elapsed.as_secs_f64(), total_rows); + log::info!( + "Partitions: {}, Time: {:.3}s, Total rows: {}", + num_partitions, + elapsed.as_secs_f64(), + total_rows + ); } fn write_flamegraph( @@ -338,9 +355,8 @@ fn write_flamegraph( let minute = (secs / 60) % 60; let second = secs % 60; - let filename = format!( - "{hour:02}h{minute:02}m{second:02}s_q{query_index:02}_i{iteration:02}.svg" - ); + let filename = + format!("{hour:02}h{minute:02}m{second:02}s_q{query_index:02}_i{iteration:02}.svg"); let filepath = flamegraph_dir.join(filename); std::fs::write(&filepath, svg_data)?; log::info!("Flamegraph written to: {}", filepath.display()); @@ -359,10 +375,15 @@ fn run_bench( io_mode: IoMode, ) { let _ = std::fs::create_dir_all(&cache_dir); + let fb_pool_size = if io_mode == IoMode::UringNonBlocking { + 4096 + } else { + 0 + }; let io_context = Arc::new(SimpleIoContext::new( cache_dir.clone(), io_mode, - 4096, + fb_pool_size, )); let storage = LiquidCacheBuilder::new() .with_io_context(io_context) @@ -376,10 +397,10 @@ fn run_bench( let mut executor = UringExecutor::new(num_workers); let storage_clone = storage.clone(); let query_owned = query.clone(); - let (num_batches, entry_ids, batch_lengths) = executor.run_to_completion( - async move { + let (num_batches, entry_ids, batch_lengths) = executor.run_to_completion(async move { // 1) Load parquet into record batches (filter columns only) and insert into cache. - let (entry_ids, batch_lengths) = load_and_insert(storage_clone.clone(), parquet_path, &query_owned).await; + let (entry_ids, batch_lengths) = + load_and_insert(storage_clone.clone(), parquet_path, &query_owned).await; let num_cols_loaded = query_owned.columns_to_load().len(); let num_batches = entry_ids.len() / num_cols_loaded; log::info!( @@ -392,7 +413,6 @@ fn run_bench( storage_clone.flush_all_to_disk().await; (num_batches, entry_ids, batch_lengths) }); - for i in 0..num_iter { liquid_cache_benchmarks::tracepoints::iteration_start(query_index as u32, i as u32); @@ -439,9 +459,9 @@ async fn run_partition( storage: Arc, batch_range: std::ops::Range, num_cols: usize, - predicates: Vec::>, - entry_ids: Vec::, - batch_lengths: Vec::, + predicates: Vec>, + entry_ids: Vec, + batch_lengths: Vec, ) -> usize { let mut total_matched = 0usize; @@ -450,9 +470,7 @@ async fn run_partition( for batch_idx in batch_range.clone() { let entry_idx = batch_idx * num_cols; let entry_id = &entry_ids[entry_idx]; - let _result = storage - .get(entry_id) - .await; + let _result = storage.get(entry_id).await; total_matched += batch_lengths[entry_idx]; } return total_matched; @@ -467,7 +485,7 @@ async fn run_partition( let selection = BooleanBuffer::new_set(len); let result = storage .eval_predicate(entry_id, pred) - .with_selection(&selection) // Is this necessary? + .with_selection(&selection) // Is this necessary? .await; match result { Some(Ok(mask)) => { @@ -497,7 +515,10 @@ async fn load_and_insert( query: &FilterQuery, ) -> (Vec, Vec) { let columns_to_load = query.columns_to_load(); - assert!(!columns_to_load.is_empty(), "query must have filter_columns or projection_columns"); + assert!( + !columns_to_load.is_empty(), + "query must have filter_columns or projection_columns" + ); let Ok(parquet_file) = std::fs::File::open(parquet_path.clone()) else { panic!("Failed to open {:?}", parquet_path.to_str()); @@ -571,7 +592,11 @@ fn main() { } }, _ => { - eprintln!("Query index {} out of range (0..{}).", args.query_index, queries.len()); + eprintln!( + "Query index {} out of range (0..{}).", + args.query_index, + queries.len() + ); std::process::exit(1); } }; @@ -589,9 +614,9 @@ fn main() { ); std::process::exit(1); } - let cache_dir = args.cache_dir.unwrap_or_else(|| { - std::env::temp_dir().join("lc_cache_dir") - }); + let cache_dir = args + .cache_dir + .unwrap_or_else(|| std::env::temp_dir().join("lc_cache_dir")); run_bench( cache_dir, args.parquet, @@ -603,4 +628,4 @@ fn main() { args.flamegraph_dir, args.io_mode, ); -} \ No newline at end of file +} diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs index 339ddda3..09eab355 100644 --- a/src/common/src/memory/arena.rs +++ b/src/common/src/memory/arena.rs @@ -3,7 +3,9 @@ use std::{io, os::raw::c_void, ptr::null_mut}; use io_uring::IoUring; use crate::memory::{ - page::Slice, pool::{FIXED_BUFFER_BITS, FIXED_BUFFER_SIZE_BYTES}, segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, Segment} + page::Slice, + pool::{FIXED_BUFFER_BITS, FIXED_BUFFER_SIZE_BYTES}, + segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, Segment}, }; pub struct Arena { @@ -14,7 +16,7 @@ pub struct Arena { * Segments need to be aligned to 32MB boundaries. Hence the first segment's starting address * could be different from the starting address of the allocated memory */ - aligned_start_ptr: *mut u8, + aligned_start_ptr: *mut u8, actual_start_ptr: *mut u8, buffers_registered: bool, } @@ -100,22 +102,24 @@ impl Arena { } pub(crate) fn register_buffers_with_ring(self: &mut Self, ring: &IoUring) -> io::Result<()> { - let usable_bytes = self.size + let usable_bytes = self + .size .saturating_sub(self.aligned_start_ptr as usize - self.actual_start_ptr as usize); let num_buffers = usable_bytes >> FIXED_BUFFER_BITS; let mut buffers = Vec::::new(); buffers.reserve(num_buffers); let mut base_ptr = self.aligned_start_ptr; for _i in 0..num_buffers { - buffers.push(libc::iovec {iov_base: base_ptr as *mut std::ffi::c_void, iov_len: FIXED_BUFFER_SIZE_BYTES}); + buffers.push(libc::iovec { + iov_base: base_ptr as *mut std::ffi::c_void, + iov_len: FIXED_BUFFER_SIZE_BYTES, + }); base_ptr = base_ptr.wrapping_add(FIXED_BUFFER_SIZE_BYTES); } - let res = unsafe { - ring.submitter().register_buffers(&buffers) - }; + let res = unsafe { ring.submitter().register_buffers(&buffers) }; self.buffers_registered = res.is_ok(); res - } + } } impl Drop for Arena { @@ -124,4 +128,4 @@ impl Drop for Arena { libc::munmap(self.actual_start_ptr as *mut c_void, self.size); } } -} \ No newline at end of file +} diff --git a/src/common/src/memory/global_pool.rs b/src/common/src/memory/global_pool.rs deleted file mode 100644 index e69de29b..00000000 diff --git a/src/common/src/memory/mod.rs b/src/common/src/memory/mod.rs index 75406a8e..72ab6966 100644 --- a/src/common/src/memory/mod.rs +++ b/src/common/src/memory/mod.rs @@ -1,6 +1,5 @@ +mod arena; pub mod page; pub mod pool; mod segment; -mod arena; -mod global_pool; -mod tcache; \ No newline at end of file +mod tcache; diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs index 41162bff..edcb809c 100644 --- a/src/common/src/memory/page.rs +++ b/src/common/src/memory/page.rs @@ -1,11 +1,15 @@ -use std::{ptr::null_mut, sync::atomic::{AtomicU8, Ordering}, u8}; +use std::{ + ptr::null_mut, + sync::atomic::{AtomicU8, Ordering}, + u8, +}; use crossbeam::utils::CachePadded; use crate::memory::tcache::MIN_SIZE_FROM_PAGES; -pub const PAGE_SIZE: usize = 64<<10; // 64KB -const MAX_BLOCKS_PER_PAGE: usize = PAGE_SIZE/MIN_SIZE_FROM_PAGES; +pub const PAGE_SIZE: usize = 64 << 10; // 64KB +const MAX_BLOCKS_PER_PAGE: usize = PAGE_SIZE / MIN_SIZE_FROM_PAGES; struct LocalFreeList { head: u8, @@ -34,7 +38,12 @@ impl LocalFreeList { for i in 0..num_blocks { blocks[i] = i as u8; } - LocalFreeList { head: 0, tail: num_blocks as u8, num_blocks: num_blocks as u8, blocks: blocks } + LocalFreeList { + head: 0, + tail: num_blocks as u8, + num_blocks: num_blocks as u8, + blocks: blocks, + } } fn push(&mut self, block: u8) { @@ -49,7 +58,7 @@ impl LocalFreeList { fn pop(&mut self) -> Option { if self.head == self.tail { - return None + return None; } let ret = self.blocks[self.head as usize & (MAX_BLOCKS_PER_PAGE - 1)]; self.head = self.head.wrapping_add(1); @@ -82,18 +91,25 @@ impl MPSCQueue { let cur_tail = self.tail.load(Ordering::Relaxed); assert!(cur_tail.wrapping_sub(self.head) < self.num_blocks); let new_tail = cur_tail.wrapping_add(1); - if self.tail.compare_exchange(cur_tail, new_tail, Ordering::Relaxed, Ordering::Relaxed).is_ok() { + if self + .tail + .compare_exchange(cur_tail, new_tail, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { unsafe { - std::ptr::write_volatile(&mut self.blocks[cur_tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] as *mut u8, block); + std::ptr::write_volatile( + &mut self.blocks[cur_tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] as *mut u8, + block, + ); } - return + return; } } } fn pop(&mut self) -> Option { if self.head == self.tail.load(Ordering::Relaxed) { - return None + return None; } let idx = self.head as usize & (MAX_BLOCKS_PER_PAGE - 1); loop { @@ -114,15 +130,14 @@ impl MPSCQueue { } pub struct Page { - pub(crate) block_size: usize, // Size of objects that are being allocated to this page + pub(crate) block_size: usize, // Size of objects that are being allocated to this page free_list: LocalFreeList, pub(crate) used: usize, thread_free_list: MPSCQueue, - pub(crate) capacity: usize, - pub(crate) slice_count: usize, // No. of pages in the slice containing this page - pub(crate) slice_offset: usize, // Offset of this page from the start of this slice + pub(crate) slice_count: usize, // No. of pages in the slice containing this page + pub(crate) slice_offset: usize, // Offset of this page from the start of this slice pub(crate) page_start: *mut u8, - // Next and previous pages in the span which is a doubly-linked list + // Next and previous pages in the span which is a doubly-linked list pub(crate) next_page: *mut Page, pub(crate) previous_page: *mut Page, } @@ -133,8 +148,7 @@ impl Page { block_size: 0usize, free_list: LocalFreeList::empty(), used: 0, - thread_free_list: MPSCQueue::new(PAGE_SIZE/MIN_SIZE_FROM_PAGES), - capacity: slice.size, + thread_free_list: MPSCQueue::new(PAGE_SIZE / MIN_SIZE_FROM_PAGES), slice_count: 1, slice_offset: 0, page_start: slice.ptr, @@ -181,7 +195,7 @@ impl Page { /// Pointer freed on a different core #[inline(always)] pub(crate) fn foreign_free(self: &mut Self, ptr: *mut u8) { - let blk_idx = unsafe {ptr.offset_from(self.page_start) as usize / self.block_size}; + let blk_idx = unsafe { ptr.offset_from(self.page_start) as usize / self.block_size }; self.thread_free_list.push(blk_idx as u8); } @@ -213,4 +227,4 @@ impl Slice { }; (slice1, slice2) } -} \ No newline at end of file +} diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs index 4f193b2f..1640f8d4 100644 --- a/src/common/src/memory/pool.rs +++ b/src/common/src/memory/pool.rs @@ -1,12 +1,22 @@ extern crate io_uring; use core::slice; -use std::{cmp::min, sync::{Arc, Mutex, OnceLock, atomic::{AtomicBool, AtomicU64, Ordering}}}; +use std::{ + cmp::min, + sync::{ + Arc, Mutex, OnceLock, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, +}; use futures::io; use io_uring::IoUring; -use crate::memory::{arena::Arena, segment::Segment, tcache::{TCache, TCacheStats}}; +use crate::memory::{ + arena::Arena, + segment::Segment, + tcache::{TCache, TCacheStats}, +}; static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); @@ -45,7 +55,7 @@ pub struct FixedBufferPool { arena: Arc>, start_ptr: *mut u8, capacity: usize, - registered: AtomicBool, // Whether buffers have been registered + registered: AtomicBool, // Whether buffers have been registered foreign_free: AtomicU64, } @@ -55,7 +65,10 @@ unsafe impl Sync for FixedBufferPool {} impl FixedBufferPool { fn new(capacity_mb: usize) -> FixedBufferPool { - log::info!("Initializing fixed buffer pool with capacity: {} MB", capacity_mb); + log::info!( + "Initializing fixed buffer pool with capacity: {} MB", + capacity_mb + ); let num_cpus = std::thread::available_parallelism().unwrap(); let capacity = capacity_mb << 20; let arena = Self::allocate_arena(capacity.clone()); @@ -67,11 +80,11 @@ impl FixedBufferPool { for i in 0..num_cpus.get() { local_caches.push(Mutex::new(TCache::new(arena.clone(), i))); } - FixedBufferPool { - local_caches, - arena, - start_ptr, - capacity, + FixedBufferPool { + local_caches, + arena, + start_ptr, + capacity, registered: AtomicBool::new(false), foreign_free: AtomicU64::new(0), } @@ -102,7 +115,12 @@ impl FixedBufferPool { } pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { - let pool = FIXED_BUFFER_POOL.get().unwrap(); + let Some(pool) = FIXED_BUFFER_POOL.get() else { + return Err(io::Error::new( + io::ErrorKind::Other, + "fixed buffer pool not initialized", + )); + }; let mut arena_guard = pool.arena.lock().unwrap(); let res = arena_guard.register_buffers_with_ring(ring); if res.is_ok() { @@ -113,7 +131,9 @@ impl FixedBufferPool { } pub(crate) fn get_stats(cpu: usize) -> TCacheStats { - let pool = FIXED_BUFFER_POOL.get().unwrap(); + let Some(pool) = FIXED_BUFFER_POOL.get() else { + return TCacheStats::new(); + }; let tcache = pool.local_caches[cpu].lock().unwrap(); tcache.get_stats() } @@ -122,14 +142,19 @@ impl FixedBufferPool { let ptr = alloc.ptr; let size = alloc.size; let pool = FIXED_BUFFER_POOL.get().unwrap(); - debug_assert!(ptr >= pool.start_ptr && ptr < pool.start_ptr.wrapping_add(pool.capacity), - "Pointer doesn't lie within the arena"); + debug_assert!( + ptr >= pool.start_ptr && ptr < pool.start_ptr.wrapping_add(pool.capacity), + "Pointer doesn't lie within the arena" + ); let mut remaining = size; let mut vec = Vec::::new(); let mut current = ptr.clone(); - let mut buffer_id = (current.wrapping_sub(pool.start_ptr as usize) as usize) >> FIXED_BUFFER_BITS; + let mut buffer_id = + (current.wrapping_sub(pool.start_ptr as usize) as usize) >> FIXED_BUFFER_BITS; while remaining > 0 { - let next_buffer_start = pool.start_ptr.wrapping_add((buffer_id + 1) << FIXED_BUFFER_BITS); + let next_buffer_start = pool + .start_ptr + .wrapping_add((buffer_id + 1) << FIXED_BUFFER_BITS); let bytes = min(remaining, next_buffer_start as usize - current as usize); let fb = FixedBuffer { ptr: current, @@ -154,9 +179,14 @@ impl FixedBufferPool { let segment_ptr = Segment::get_segment_from_ptr(ptr); let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; let thread_id = unsafe { (*segment_ptr).thread_id }; - log::debug!("Freed pointer: {:?}, size: {}, owner thread id: {}", ptr, unsafe { (*page_ptr).block_size }, thread_id); - - // If page is local and unused after free, return it to segment + log::debug!( + "Freed pointer: {:?}, size: {}, owner thread id: {}", + ptr, + unsafe { (*page_ptr).block_size }, + thread_id + ); + + // If page is local and unused after free, return it to segment let cur_cpu = unsafe { libc::sched_getcpu() as usize }; if cur_cpu == thread_id { unsafe { @@ -169,7 +199,9 @@ impl FixedBufferPool { guard.retire_page(page_ptr); } } else { - unsafe { (*page_ptr).foreign_free(ptr); } + unsafe { + (*page_ptr).foreign_free(ptr); + } let pool = FIXED_BUFFER_POOL.get().unwrap(); pool.foreign_free.fetch_add(1, Ordering::Relaxed); } @@ -177,7 +209,7 @@ impl FixedBufferPool { pub fn print_stats() { if FIXED_BUFFER_POOL.get().is_none() { - return + return; } let num_cpus = std::thread::available_parallelism().unwrap(); let mut agg_stats = TCacheStats::new(); @@ -206,7 +238,11 @@ impl Drop for FixedBufferPool { mod tests { #[allow(unused_imports)] - use std::{io::Write, os::fd::AsRawFd, ptr::{null, null_mut}}; + use std::{ + io::Write, + os::fd::AsRawFd, + ptr::{null, null_mut}, + }; use bytes::Bytes; use io_uring::{IoUring, cqueue, opcode, squeue}; @@ -219,7 +255,7 @@ mod tests { fn test_basic_alloc_and_free() { FixedBufferPool::init(128); - let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes let mut ptrs = Vec::<*mut u8>::new(); for len in buffer_lengths { let ptr = FixedBufferPool::malloc(len); @@ -229,7 +265,7 @@ mod tests { let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; buffer[0] = 1; - buffer[len-1] = 1; + buffer[len - 1] = 1; ptrs.push(ptr); } @@ -250,7 +286,7 @@ mod tests { fn test_basic_alloc_and_free_bytes() { FixedBufferPool::init(128); - let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes // let mut ptrs = Vec::<*mut u8>::new(); let mut bytes_vec = Vec::::new(); for len in buffer_lengths { @@ -261,8 +297,11 @@ mod tests { let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; buffer[0] = 1; - buffer[len-1] = 1; - let alloc = FixedBufferAllocation {ptr: ptr, size: len}; + buffer[len - 1] = 1; + let alloc = FixedBufferAllocation { + ptr: ptr, + size: len, + }; let bytes = Bytes::from_owner(alloc); bytes_vec.push(bytes); } @@ -292,7 +331,7 @@ mod tests { let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; buffer[0] = 1; - buffer[len-1] = 1; + buffer[len - 1] = 1; buffers.push(buffer); } @@ -315,14 +354,14 @@ mod tests { #[test] fn test_large_alloc_and_free() { FixedBufferPool::init(128); - let len = 1024 * 1024; // 1 MB + let len = 1024 * 1024; // 1 MB let ptr = FixedBufferPool::malloc(len); assert_ne!(ptr, null_mut()); // 4096 byte alignment is necessary for direct IO assert_eq!(ptr as usize % 4096, 0); let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; buffer[0] = 1; - buffer[len-1] = 1; + buffer[len - 1] = 1; FixedBufferPool::free(ptr); let cur_cpu = unsafe { libc::sched_getcpu() as usize }; @@ -336,14 +375,14 @@ mod tests { #[test] fn test_large_alloc_and_free2() { FixedBufferPool::init(128); - let len = 3 * 1024 * 1024; // 1 MB + let len = 3 * 1024 * 1024; // 1 MB let ptr = FixedBufferPool::malloc(len); assert_ne!(ptr, null_mut()); // 4096 byte alignment is necessary for direct IO assert_eq!(ptr as usize % 4096, 0); let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; buffer[0] = 1; - buffer[len-1] = 1; + buffer[len - 1] = 1; FixedBufferPool::free(ptr); let cur_cpu = unsafe { libc::sched_getcpu() as usize }; @@ -357,7 +396,7 @@ mod tests { #[test] fn test_very_large_alloc_fails() { FixedBufferPool::init(128); - let len = 32 * 1024 * 1024; // 32 MB + let len = 32 * 1024 * 1024; // 32 MB let ptr = FixedBufferPool::malloc(len); assert_eq!(ptr, null_mut()); @@ -365,15 +404,23 @@ mod tests { #[test] fn test_with_uring_basic() { - let mut rlimit = libc::rlimit{ + let mut rlimit = libc::rlimit { rlim_cur: 0, rlim_max: 0, }; - unsafe { libc::getrlimit(libc::RLIMIT_MEMLOCK, &mut rlimit); } - assert!(64 * 1024 <= rlimit.rlim_max, "rlimit.MEMLOCK should be at least 64 MB to test the fixed-buffer pool. Current rlimit is: {} KB", rlimit.rlim_max); + unsafe { + libc::getrlimit(libc::RLIMIT_MEMLOCK, &mut rlimit); + } + assert!( + 64 * 1024 <= rlimit.rlim_max, + "rlimit.MEMLOCK should be at least 64 MB to test the fixed-buffer pool. Current rlimit is: {} KB", + rlimit.rlim_max + ); FixedBufferPool::init(64); - - let mut ring = IoUring::::builder().build(32).unwrap(); + + let mut ring = IoUring::::builder() + .build(32) + .unwrap(); let res = FixedBufferPool::register_buffers_with_ring(&ring); assert!(res.is_ok()); @@ -381,7 +428,10 @@ mod tests { let mut file = tempfile::tempfile().unwrap(); let ptr = FixedBufferPool::malloc(LEN); assert_ne!(ptr, null_mut()); - let alloc = FixedBufferAllocation {ptr: ptr, size: LEN}; + let alloc = FixedBufferAllocation { + ptr: ptr, + size: LEN, + }; let buffers = FixedBufferPool::get_fixed_buffers(&alloc); assert!(buffers.len() <= (LEN / FIXED_BUFFER_SIZE_BYTES) + 1); @@ -404,8 +454,10 @@ mod tests { io_uring::types::Fd(file.as_raw_fd()), fixed_buffer.ptr, fixed_buffer.bytes as u32, - fixed_buffer.buf_id as u16) - .offset(file_offset).build(); + fixed_buffer.buf_id as u16, + ) + .offset(file_offset) + .build(); file_offset += fixed_buffer.bytes as u64; let mut sq = ring.submission(); let res = unsafe { sq.push(&sqe) }; @@ -422,10 +474,18 @@ mod tests { let cqe = cq.next(); assert!(cqe.is_some()); let res = cqe.as_ref().unwrap().result(); - assert!( res > 0, "Read failed: {}", std::io::Error::from_raw_os_error(-cqe.unwrap().result())); + assert!( + res > 0, + "Read failed: {}", + std::io::Error::from_raw_os_error(-cqe.unwrap().result()) + ); total_bytes_read += res as usize; } - assert_eq!(total_bytes_read, LEN, "Expected to read {} bytes, but read {}", LEN, total_bytes_read); + assert_eq!( + total_bytes_read, LEN, + "Expected to read {} bytes, but read {}", + LEN, total_bytes_read + ); let buffer = Bytes::from_owner(alloc); assert_eq!(buffer, &random_bytes[..]); } @@ -451,4 +511,4 @@ mod tests { assert_eq!(stats.segments_retired, 1); // assert_eq } -} \ No newline at end of file +} diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs index 90b552f0..e5751d18 100644 --- a/src/common/src/memory/segment.rs +++ b/src/common/src/memory/segment.rs @@ -1,12 +1,13 @@ use std::ptr::{null_mut, write}; -use crate::memory::{page::{PAGE_SIZE, Page, Slice}}; +use crate::memory::page::{PAGE_SIZE, Page, Slice}; -pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; // 32 MB +pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; // 32 MB pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; // The metadata is stored at the beginning of the slice. So we don't get the entirety of it for pages -pub const PAGES_PER_SEGMENT: usize = (SEGMENT_SIZE - 3 * size_of::()) / (PAGE_SIZE + size_of::()); +pub const PAGES_PER_SEGMENT: usize = + (SEGMENT_SIZE - 3 * size_of::()) / (PAGE_SIZE + size_of::()); pub struct Segment { pub(crate) allocated: usize, @@ -28,7 +29,10 @@ impl Segment { // Use ptr::write after dropping to initialize new Pages write( pages_ptr.add(i), - Page::from_slice(Slice {ptr: start_ptr, size: PAGE_SIZE}) + Page::from_slice(Slice { + ptr: start_ptr, + size: PAGE_SIZE, + }), ); start_ptr = start_ptr.wrapping_add(PAGE_SIZE); } @@ -38,7 +42,7 @@ impl Segment { #[inline] pub fn full(self: &mut Self) -> bool { - self.allocated == self.num_slices + self.allocated == self.num_slices } pub fn reset(self: &mut Self) -> () { @@ -56,9 +60,7 @@ impl Segment { pub fn get_page_from_ptr(self: &mut Self, ptr: *mut u8) -> *mut Page { let base_page_ptr = self.pages[0].page_start; debug_assert!(ptr >= base_page_ptr); - let index = unsafe { - ptr.sub(base_page_ptr as usize) as usize / PAGE_SIZE - }; + let index = unsafe { ptr.sub(base_page_ptr as usize) as usize / PAGE_SIZE }; debug_assert!(index < PAGES_PER_SEGMENT); &mut self.pages[index] as *mut Page } @@ -72,17 +74,20 @@ impl Segment { let base_page_ptr = unsafe { (*page).page_start }; let base_segment_page_ptr = self.pages[0].page_start; debug_assert!(base_page_ptr >= base_segment_page_ptr); - let index = unsafe { - base_page_ptr.sub(base_segment_page_ptr as usize) as usize / PAGE_SIZE - }; - + let index = + unsafe { base_page_ptr.sub(base_segment_page_ptr as usize) as usize / PAGE_SIZE }; + // Read original slice_count before modifying anything let original_slice_count = unsafe { (*page).slice_count }; - debug_assert!(num_slices > 0 && num_slices < original_slice_count, - "num_slices: {}, slice_count: {}", num_slices, original_slice_count); + debug_assert!( + num_slices > 0 && num_slices < original_slice_count, + "num_slices: {}, slice_count: {}", + num_slices, + original_slice_count + ); debug_assert!(index + original_slice_count <= PAGES_PER_SEGMENT); // log::info!("[thread_id: {}, segment_id: {}] Splitting page with {} slices", self.thread_id, self.segment_id, original_slice_count); - + /* * ASSUMPTION: Pointer to the beginning of the slice is passed in. * We don't need to modify all the intermediate pages while splitting. Only update the following: @@ -95,7 +100,7 @@ impl Segment { // Update slice1: the original slice becomes the first part (*page).slice_offset = 0; (*page).slice_count = num_slices; - + let pages_ptr = self.pages.as_mut_ptr(); let last_page_in_slice1 = pages_ptr.add(index + num_slices - 1); (*last_page_in_slice1).slice_offset = num_slices - 1; @@ -105,26 +110,41 @@ impl Segment { let slice2 = pages_ptr.add(index + num_slices); (*slice2).slice_offset = 0; (*slice2).slice_count = slice2_count; - assert!((*slice2).block_size == 0, "block size: {}", (*slice2).block_size); - + assert!( + (*slice2).block_size == 0, + "block size: {}", + (*slice2).block_size + ); + let last_page_in_slice2 = pages_ptr.add(index + original_slice_count - 1); (*last_page_in_slice2).slice_offset = slice2_count - 1; - + slice2 } } pub fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { - debug_assert!(left_slice.page_start >= self.pages[0].page_start && - left_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); - debug_assert!(right_slice.page_start >= self.pages[0].page_start && - right_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start); - - let left_slice_idx = (left_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; - let right_slice_idx = (right_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; - debug_assert!(left_slice_idx + left_slice.slice_count == right_slice_idx, - "left slice count: {}, left slice idx: {}, right slice idx: {}, thread_id: {}", - left_slice.slice_count, left_slice_idx, right_slice_idx, self.thread_id); + debug_assert!( + left_slice.page_start >= self.pages[0].page_start + && left_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start + ); + debug_assert!( + right_slice.page_start >= self.pages[0].page_start + && right_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start + ); + + let left_slice_idx = + (left_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + let right_slice_idx = + (right_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + debug_assert!( + left_slice_idx + left_slice.slice_count == right_slice_idx, + "left slice count: {}, left slice idx: {}, right slice idx: {}, thread_id: {}", + left_slice.slice_count, + left_slice_idx, + right_slice_idx, + self.thread_id + ); debug_assert!(right_slice_idx + right_slice.slice_count <= PAGES_PER_SEGMENT); /* @@ -148,9 +168,14 @@ impl Segment { debug_assert!(page.slice_offset == 0 && idx + page.slice_count <= PAGES_PER_SEGMENT); let slice_count = page.slice_count; let last_page_in_slice = &mut self.pages[idx + slice_count - 1]; - debug_assert!(last_page_in_slice.slice_offset == slice_count - 1, - "slice count: {}, last page slice offset: {}, thread_id: {}", slice_count, last_page_in_slice.slice_offset, self.thread_id); + debug_assert!( + last_page_in_slice.slice_offset == slice_count - 1, + "slice count: {}, last page slice offset: {}, thread_id: {}", + slice_count, + last_page_in_slice.slice_offset, + self.thread_id + ); idx += slice_count; } } -} \ No newline at end of file +} diff --git a/src/common/src/memory/tcache.rs b/src/common/src/memory/tcache.rs index 21e9020a..ec2e0701 100644 --- a/src/common/src/memory/tcache.rs +++ b/src/common/src/memory/tcache.rs @@ -9,19 +9,13 @@ use crate::memory::{ segment::{PAGES_PER_SEGMENT, SEGMENT_SIZE, Segment}, }; -const SIZE_CLASSES: &'static [usize] = &[ - 4 << 10, - 8 << 10, - 16 << 10, - 32 << 10, - 64 << 10, -]; +const SIZE_CLASSES: &'static [usize] = &[4 << 10, 8 << 10, 16 << 10, 32 << 10, 64 << 10]; const NUM_SIZE_CLASSES: usize = SIZE_CLASSES.len(); pub(crate) const MIN_SIZE_FROM_PAGES: usize = SIZE_CLASSES[0]; -const SEGMENT_BINS: usize = (SEGMENT_SIZE/PAGE_SIZE).ilog2() as usize + 1; +const SEGMENT_BINS: usize = (SEGMENT_SIZE / PAGE_SIZE).ilog2() as usize + 1; #[derive(Default, Clone)] pub(crate) struct TCacheStats { @@ -29,8 +23,8 @@ pub(crate) struct TCacheStats { pub(crate) total_allocations: usize, pub(crate) unsuccessful_allocations: usize, pub(crate) total_segments_allocated: usize, - pub(crate) fast_allocations: usize, // Allocations from self.free_pages - pub(crate) allocations_from_pages: usize, // Allocations from self.used_pages + pub(crate) fast_allocations: usize, // Allocations from self.free_pages + pub(crate) allocations_from_pages: usize, // Allocations from self.used_pages pub(crate) allocations_from_segment: usize, pub(crate) allocations_from_arena: usize, @@ -48,10 +42,16 @@ impl TCacheStats { #[allow(unused)] pub(crate) fn print(self: &Self) { println!("Total allocations: {}", self.total_allocations); - println!("Unsuccessful allocations: {}", self.unsuccessful_allocations); + println!( + "Unsuccessful allocations: {}", + self.unsuccessful_allocations + ); println!("Fast allocations: {}", self.fast_allocations); println!("Allocations from pages: {}", self.allocations_from_pages); - println!("Allocations from segment: {}", self.allocations_from_segment); + println!( + "Allocations from segment: {}", + self.allocations_from_segment + ); println!("Allocations from arena: {}", self.allocations_from_arena); println!("Pages retired: {}", self.pages_retired); println!("Segments retired: {}", self.segments_retired); @@ -84,7 +84,10 @@ impl TCache { TCache { free_pages: [const { null_mut() }; NUM_SIZE_CLASSES], used_pages: [const { Vec::<*mut Page>::new() }; NUM_SIZE_CLASSES + 1], - spans: [Span { first: null_mut(), last: null_mut() }; SEGMENT_BINS], + spans: [Span { + first: null_mut(), + last: null_mut(), + }; SEGMENT_BINS], arena: arena.clone(), thread_id, stats: TCacheStats::new(), @@ -112,10 +115,12 @@ impl TCache { debug_assert!(span.last == null_mut()); span.first = slice as *mut Page; span.last = slice as *mut Page; - return + return; } debug_assert!(span.last != null_mut()); - unsafe { (*span.last).next_page = slice; } + unsafe { + (*span.last).next_page = slice; + } slice.previous_page = span.last; span.last = slice as *mut Page; } @@ -126,28 +131,38 @@ impl TCache { if span.first == slice as *mut Page { span.first = slice.next_page; if slice.next_page != null_mut() { - unsafe { (*slice.next_page).previous_page = null_mut(); } + unsafe { + (*slice.next_page).previous_page = null_mut(); + } } else { span.last = null_mut(); } } else if span.last == slice as *mut Page { span.last = slice.previous_page; debug_assert!(slice.previous_page != null_mut()); - unsafe { (*span.last).next_page = null_mut(); } + unsafe { + (*span.last).next_page = null_mut(); + } } else { debug_assert!(slice.previous_page != null_mut()); debug_assert!(slice.next_page != null_mut()); - unsafe { (*slice.previous_page).next_page = slice.next_page; } - unsafe { (*slice.next_page).previous_page = slice.previous_page; } + unsafe { + (*slice.previous_page).next_page = slice.next_page; + } + unsafe { + (*slice.next_page).previous_page = slice.previous_page; + } } - + slice.next_page = null_mut(); slice.previous_page = null_mut(); } fn retire_segment(self: &mut Self, segment: *mut Segment) { // log::info!("Retiring segment from thread with id: {}", self.thread_id); - unsafe { (*segment).check_valid_segment(); } + unsafe { + (*segment).check_valid_segment(); + } self.stats.segments_retired += 1; let pages = unsafe { &mut (*segment).pages }; let mut slice_idx: usize = 0; @@ -169,7 +184,7 @@ impl TCache { for i in 0..self.used_pages[size_class].len() { if self.used_pages[size_class][i] == page_ptr { self.used_pages[size_class].remove(i); - return + return; } } } @@ -202,7 +217,11 @@ impl TCache { if next_slice <= (&mut segment.pages[PAGES_PER_SEGMENT - 1]) as *mut Page { let next_slice_ref = unsafe { &mut (*next_slice) }; if next_slice_ref.block_size == 0 { - log::debug!("[thread_id: {}] Merging released slice with next slice. Slice count of next slice: {}", self.thread_id, next_slice_ref.slice_count); + log::debug!( + "[thread_id: {}] Merging released slice with next slice. Slice count of next slice: {}", + self.thread_id, + next_slice_ref.slice_count + ); // Page is not in use, remove it self.remove_slice_from_span(next_slice_ref); segment.coalesce_slices(page_ref, unsafe { &mut (*next_slice) }); @@ -217,19 +236,33 @@ impl TCache { let prev_slice_ref = unsafe { &mut (*prev_slice) }; if prev_slice_ref.block_size == 0 { // Merge with the previous slice - log::debug!("[thread_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", self.thread_id, prev_slice_ref.slice_count); + log::debug!( + "[thread_id: {}] Merging slice with previous slice. Slice count of previous slice: {}", + self.thread_id, + prev_slice_ref.slice_count + ); self.remove_slice_from_span(prev_slice_ref); segment.coalesce_slices(prev_slice_ref, page_ref); let span_idx = Self::get_span_idx_from_slice_count(prev_slice_ref.slice_count); Self::add_slice_to_span(&mut self.spans[span_idx], prev_slice_ref); - log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, prev_slice_ref.slice_count, span_idx); + log::debug!( + "[thread_id: {}] Added page with slice count {} to span with index: {}", + self.thread_id, + prev_slice_ref.slice_count, + span_idx + ); merged_with_prev = true; } } if !merged_with_prev { let span_idx = Self::get_span_idx_from_slice_count(page_ref.slice_count); Self::add_slice_to_span(&mut self.spans[span_idx], page_ref); - log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, page_ref.slice_count, span_idx); + log::debug!( + "[thread_id: {}] Added page with slice count {} to span with index: {}", + self.thread_id, + page_ref.slice_count, + span_idx + ); } segment.check_valid_segment(); } @@ -273,13 +306,17 @@ impl TCache { let page = self.used_pages[bin].remove(i); let block = (*page).get_free_block(); self.free_pages[bin] = page; - return block + return block; } } null_mut() } - fn find_page_from_spans(self: &mut Self, num_slices_required: usize, block_size: usize) -> *mut Page { + fn find_page_from_spans( + self: &mut Self, + num_slices_required: usize, + block_size: usize, + ) -> *mut Page { debug_assert!(block_size >= MIN_SIZE_FROM_PAGES); let min_bin = Self::get_span_idx_from_slice_count(num_slices_required); for i in min_bin..SEGMENT_BINS { @@ -289,7 +326,9 @@ impl TCache { let num_slices_original = unsafe { (*slice).slice_count }; debug_assert!(num_slices_original >= 1 << i); if num_slices_original < num_slices_required { - unsafe { slice = (*slice).next_page; } + unsafe { + slice = (*slice).next_page; + } continue; } self.remove_slice_from_span(unsafe { &mut *slice }); @@ -301,12 +340,21 @@ impl TCache { if num_slices_original > num_slices_required { // split slice let next_slice = unsafe { (*segment).split_page(slice, num_slices_required) }; - debug_assert!(unsafe { (*slice).slice_count == num_slices_required}); + debug_assert!(unsafe { (*slice).slice_count == num_slices_required }); #[cfg(debug_assertions)] - unsafe { (*segment).check_valid_segment() } ; - let bin = Self::get_span_idx_from_slice_count(num_slices_original - num_slices_required); - Self::add_slice_to_span(&mut self.spans[bin], unsafe { &mut (*next_slice) } ); - log::debug!("[thread_id: {}] Added page with slice count {} to span with index: {}", self.thread_id, num_slices_original - num_slices_required, bin); + unsafe { + (*segment).check_valid_segment() + }; + let bin = Self::get_span_idx_from_slice_count( + num_slices_original - num_slices_required, + ); + Self::add_slice_to_span(&mut self.spans[bin], unsafe { &mut (*next_slice) }); + log::debug!( + "[thread_id: {}] Added page with slice count {} to span with index: {}", + self.thread_id, + num_slices_original - num_slices_required, + bin + ); } unsafe { (*slice).set_block_size(block_size); @@ -343,7 +391,7 @@ impl TCache { unsafe { (*segment_opt.unwrap()).thread_id = thread_id; } - + self.add_segment_to_spans(segment_opt.unwrap()); true } @@ -357,7 +405,7 @@ impl TCache { self.stats.allocations_from_segment += 1; self.used_pages[NUM_SIZE_CLASSES].push(free_page); let free_block = unsafe { (*free_page).get_free_block() }; - return free_block + return free_block; } self.cleanup_pages(); // Retry after cleanup @@ -366,25 +414,25 @@ impl TCache { self.stats.allocations_from_segment += 1; self.used_pages[NUM_SIZE_CLASSES].push(free_page); let free_block = unsafe { (*free_page).get_free_block() }; - return free_block + return free_block; } let res = self.allocate_segment_from_arena(self.thread_id); if !res { self.stats.unsuccessful_allocations += 1; - return null_mut() + return null_mut(); } self.stats.allocations_from_arena += 1; free_page = self.find_page_from_spans(num_pages, block_size); if free_page == null_mut() { self.stats.unsuccessful_allocations += 1; - return null_mut() + return null_mut(); } self.used_pages[NUM_SIZE_CLASSES].push(free_page); debug_assert_ne!(free_page, null_mut()); let free_block = unsafe { (*free_page).get_free_block() }; debug_assert_ne!(free_block, null_mut()); - return free_block + return free_block; } pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 { @@ -394,7 +442,7 @@ impl TCache { self.cleanup_pages(); } if size > PAGE_SIZE { - return self.allocate_large(size) + return self.allocate_large(size); } let size_class = Self::get_size_class(size); debug_assert!(size_class < NUM_SIZE_CLASSES); @@ -402,18 +450,18 @@ impl TCache { let block_size = SIZE_CLASSES[size_class]; let mut free_page = self.free_pages[size_class]; if !free_page.is_null() { - debug_assert_eq!(unsafe {(*free_page).block_size}, block_size); + debug_assert_eq!(unsafe { (*free_page).block_size }, block_size); // allocate from free page let page = free_page.clone(); unsafe { if !(*page).is_full() { self.stats.fast_allocations += 1; - return (*page).get_free_block() + return (*page).get_free_block(); } else { // Try collecting frees from other threads and retrying (*page).collect_foreign_frees(); if !(*page).is_full() { - return (*page).get_free_block() + return (*page).get_free_block(); } self.used_pages[size_class].push(page); self.free_pages[size_class] = null_mut(); @@ -423,7 +471,7 @@ impl TCache { let block = self.find_page_from_used(size_class); if !block.is_null() { self.stats.allocations_from_pages += 1; - return block + return block; } free_page = self.find_page_from_spans(1, block_size); if free_page != null_mut() { @@ -437,18 +485,18 @@ impl TCache { let res = self.allocate_segment_from_arena(self.thread_id); if !res { self.stats.unsuccessful_allocations += 1; - return null_mut() + return null_mut(); } self.stats.allocations_from_arena += 1; free_page = self.find_page_from_spans(1, block_size); assert_ne!(free_page, null_mut()); let free_block = unsafe { (*free_page).get_free_block() }; self.free_pages[size_class] = free_page; - return free_block + return free_block; } #[allow(unused)] pub(crate) fn get_stats(self: &Self) -> TCacheStats { self.stats.clone() } -} \ No newline at end of file +} From dc19baa832b5f81489156ab6a0949ac18eaa867c Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 16 Mar 2026 18:27:16 -0500 Subject: [PATCH 20/26] Formatting changes --- src/parquet/src/cache/mod.rs | 6 +- src/parquet/src/io/io_backend.rs | 2 +- .../src/io/io_uring/multi_async_uring.rs | 6 +- src/parquet/src/io/io_uring/runtime.rs | 81 ++++++++++++------- src/parquet/src/io/io_uring/single_uring.rs | 6 +- src/parquet/src/io/io_uring/tasks.rs | 76 ++++++++++++----- src/parquet/src/io/io_uring/tests.rs | 26 +++--- .../src/io/io_uring/thread_pool_uring.rs | 57 ++++++++----- src/parquet/src/lib.rs | 2 +- src/parquet/src/optimizers/lineage_opt.rs | 2 +- 10 files changed, 177 insertions(+), 87 deletions(-) diff --git a/src/parquet/src/cache/mod.rs b/src/parquet/src/cache/mod.rs index fc37a960..f73ffc5d 100644 --- a/src/parquet/src/cache/mod.rs +++ b/src/parquet/src/cache/mod.rs @@ -251,7 +251,11 @@ impl LiquidCacheParquet { fixed_buffer_pool_size_mb: usize, ) -> Self { assert!(batch_size.is_power_of_two()); - let io_context = Arc::new(ParquetIoContext::new(cache_dir.clone(), io_mode, fixed_buffer_pool_size_mb)); + let io_context = Arc::new(ParquetIoContext::new( + cache_dir.clone(), + io_mode, + fixed_buffer_pool_size_mb, + )); let cache_storage_builder = LiquidCacheBuilder::new() .with_batch_size(batch_size) .with_max_cache_bytes(max_cache_bytes) diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index 54d3f28c..fa309947 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -89,7 +89,7 @@ pub(super) async fn write( data: Bytes, ) -> Result<(), std::io::Error> { match io_mode { - IoMode::Uring => { + IoMode::Uring => { #[cfg(target_os = "linux")] { super::io_uring::thread_pool_uring::write(path, &data, false, false).await diff --git a/src/parquet/src/io/io_uring/multi_async_uring.rs b/src/parquet/src/io/io_uring/multi_async_uring.rs index 89809a1f..3e40a463 100644 --- a/src/parquet/src/io/io_uring/multi_async_uring.rs +++ b/src/parquet/src/io/io_uring/multi_async_uring.rs @@ -256,7 +256,11 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: bool, +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/runtime.rs index ecf2d80e..a7bc5b70 100644 --- a/src/parquet/src/io/io_uring/runtime.rs +++ b/src/parquet/src/io/io_uring/runtime.rs @@ -1,4 +1,20 @@ -use std::{cell::RefCell, collections::VecDeque, fs::OpenOptions, ops::Range, os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, rc::Rc, sync::{atomic::{AtomicBool, Ordering}, OnceLock}, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, time::{Duration, Instant}}; +use std::{ + cell::RefCell, + collections::VecDeque, + fs::OpenOptions, + ops::Range, + os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, + path::PathBuf, + pin::Pin, + rc::Rc, + sync::{ + OnceLock, + atomic::{AtomicBool, Ordering}, + }, + task::{Context, Poll, Waker}, + thread::{self, JoinHandle}, + time::{Duration, Instant}, +}; use async_executor::LocalExecutor; use bytes::Bytes; @@ -65,7 +81,10 @@ impl UringExecutor { /// Spawns a task in the uring runtime by sending it to a randomly chosen worker's channel. /// The result is received through a oneshot channel. - pub fn spawn(self: &mut Self, future: F) -> oneshot::Receiver + pub fn spawn( + self: &mut Self, + future: F, + ) -> oneshot::Receiver where F::Output: Send + 'static, { @@ -86,7 +105,10 @@ impl UringExecutor { } /// Spawn a batch of tasks on the io_uring runtime, balancing across workers (round-robin). - pub fn spawn_many(self: &mut Self, futures: &mut Vec) -> crossbeam_channel::Receiver + pub fn spawn_many( + self: &mut Self, + futures: &mut Vec, + ) -> crossbeam_channel::Receiver where F::Output: Send + 'static, { @@ -97,7 +119,9 @@ impl UringExecutor { let f = Box::pin(f); let task = async move { let output = f.await; - sender_clone.send(output).expect("Failed to send back result"); + sender_clone + .send(output) + .expect("Failed to send back result"); }; let idx = i % num_workers; self.senders[idx] @@ -132,7 +156,7 @@ struct RuntimeWorker { submitted_tasks: Vec>, /** * When using fixed buffers, a single task can produce multiple submission queue entries. - * It is possible that we aren't able to submit all of them at one go. Hold them in an + * It is possible that we aren't able to submit all of them at one go. Hold them in an * intermediate queue in that case */ queued_entries: VecDeque, @@ -146,7 +170,7 @@ impl RuntimeWorker { pub fn new() -> RuntimeWorker { let mut builder = IoUring::::builder(); let ring = builder - .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_single_issuer() // Only the worker thread will issue IO and poll completions .setup_defer_taskrun() .build(URING_NUM_ENTRIES) .expect("Failed to build IoUring instance"); @@ -154,14 +178,15 @@ impl RuntimeWorker { log::warn!("Failed to register fixed buffers with runtime worker ring"); } let mut tokens = VecDeque::::with_capacity(MAX_CONCURRENT_TASKS as usize); - let mut inflight_tasks = Vec::>::with_capacity(MAX_CONCURRENT_TASKS as usize); + let mut inflight_tasks = + Vec::>::with_capacity(MAX_CONCURRENT_TASKS as usize); for i in 0..MAX_CONCURRENT_TASKS { tokens.push_back(i as u16); inflight_tasks.push(None); } - + RuntimeWorker { - ring, + ring, submitted_tasks: inflight_tasks, tokens, queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), @@ -212,7 +237,7 @@ impl RuntimeWorker { fn drain_intermediate_queue(&mut self) { { - let sq = &mut self.ring.submission(); + let sq = &mut self.ring.submission(); while !sq.is_full() && !self.queued_entries.is_empty() { let sqe = self.queued_entries.pop_front().unwrap(); unsafe { @@ -234,9 +259,7 @@ impl RuntimeWorker { let mut sqes_submitted = 0; for sqe in sqes.iter() { - let res = unsafe { - sq.push(&sqe.clone().user_data(token as u64)) - }; + let res = unsafe { sq.push(&sqe.clone().user_data(token as u64)) }; if res.is_err() { // submission queue is full break; @@ -246,7 +269,8 @@ impl RuntimeWorker { sq.sync(); } for i in sqes_submitted..sqes.len() { - self.queued_entries.push_back(sqes[i].clone().user_data(token as u64)); + self.queued_entries + .push_back(sqes[i].clone().user_data(token as u64)); } } @@ -282,10 +306,10 @@ fn worker_main_loop(receiver: crossbeam_channel::Receiver) { loop { let res = unsafe { worker.ring.submitter().enter::( - worker.queued_submissions as u32, - 0, - flags.bits(), - None + worker.queued_submissions as u32, + 0, + flags.bits(), + None, ) }; match res { @@ -316,14 +340,16 @@ struct AsyncTask { pub inner: Rc>, waker: Waker, completed: *mut AtomicBool, - pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker + pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker completions: Vec, } impl AsyncTask { #[inline] fn complete(self) { - self.inner.borrow_mut().complete(self.completions.iter().collect()); + self.inner + .borrow_mut() + .complete(self.completions.iter().collect()); unsafe { (*self.completed).store(true, Ordering::Relaxed); } @@ -346,8 +372,7 @@ impl AsyncTask { } } -enum UringState -{ +enum UringState { Undecided, Created, Submitted, @@ -363,8 +388,7 @@ where id: u64, } -unsafe impl Send for UringFuture -where T: IoTask + 'static, {} +unsafe impl Send for UringFuture where T: IoTask + 'static {} impl UringFuture where @@ -415,7 +439,7 @@ where self.state = UringState::Submitted; return Poll::Pending; } - } + }, UringState::Undecided => unreachable!("state cannot be undecided during poll"), } } @@ -471,5 +495,8 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .expect("failed to create file"); let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), true, false); - submit_async_task(write_task).await.borrow_mut().get_result() -} \ No newline at end of file + submit_async_task(write_task) + .await + .borrow_mut() + .get_result() +} diff --git a/src/parquet/src/io/io_uring/single_uring.rs b/src/parquet/src/io/io_uring/single_uring.rs index 2f6c24f3..0ff1bf1b 100644 --- a/src/parquet/src/io/io_uring/single_uring.rs +++ b/src/parquet/src/io/io_uring/single_uring.rs @@ -331,7 +331,11 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: bool, +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index e221f3c1..828c6a0e 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -1,8 +1,14 @@ use std::{ - alloc::{Layout, alloc}, any::Any, ffi::CString, fs, mem, ops::Range, os::{ + alloc::{Layout, alloc}, + any::Any, + ffi::CString, + fs, mem, + ops::Range, + os::{ fd::{AsRawFd, FromRawFd, RawFd}, unix::ffi::OsStringExt, - }, path::PathBuf + }, + path::PathBuf, }; use bytes::Bytes; @@ -77,7 +83,11 @@ impl IoTask for FileOpenTask { #[inline] fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { - debug_assert_eq!(cqe.len(), 1, "Should receive a single completion for a file open task"); + debug_assert_eq!( + cqe.len(), + 1, + "Should receive a single completion for a file open task" + ); let result = cqe[0].result(); if result < 0 { self.error = Some(std::io::Error::from_raw_os_error(-result)); @@ -214,14 +224,20 @@ impl IoTask for FileReadTask { num_bytes_aligned as u32, ); - vec![read_op - .offset(self.range.start - start_padding as u64) - .build()] + vec![ + read_op + .offset(self.range.start - start_padding as u64) + .build(), + ] } #[inline] fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { - debug_assert_eq!(cqe.len(), 1, "Should receive a single completion for a FileRead task"); + debug_assert_eq!( + cqe.len(), + 1, + "Should receive a single completion for a FileRead task" + ); let result = cqe[0].result(); if result < 0 { self.error = Some(std::io::Error::from_raw_os_error(-result)); @@ -264,7 +280,11 @@ impl FixedFileReadTask { Self::compute_padding(&self.range, self.direct_io) } - pub(crate) fn build(range: Range, file: &fs::File, direct_io: bool) -> Result { + pub(crate) fn build( + range: Range, + file: &fs::File, + direct_io: bool, + ) -> Result { let (start_padding, end_padding) = Self::compute_padding(&range, direct_io); let requested_bytes = (range.end - range.start) as usize; let num_bytes_aligned = requested_bytes + start_padding + end_padding; @@ -274,7 +294,10 @@ impl FixedFileReadTask { if ptr.is_null() { return Err(std::io::Error::from(std::io::ErrorKind::OutOfMemory)); } - let alloc = FixedBufferAllocation {ptr, size: num_bytes_aligned}; + let alloc = FixedBufferAllocation { + ptr, + size: num_bytes_aligned, + }; Ok(FixedFileReadTask { fixed_buffer: alloc, @@ -319,7 +342,7 @@ impl FixedFileReadTask { impl IoTask for FixedFileReadTask { #[inline] - fn prepare_sqe(&mut self) -> Vec { + fn prepare_sqe(&mut self) -> Vec { let buffers = FixedBufferPool::get_fixed_buffers(&self.fixed_buffer); let mut sqes = Vec::::new(); let (start_padding, _) = self.padding(); @@ -329,9 +352,11 @@ impl IoTask for FixedFileReadTask { io_uring::types::Fd(self.file), buffer.ptr, buffer.bytes as u32, - buffer.buf_id as u16) - .offset(file_offset).build(); - file_offset += buffer.bytes as u64; + buffer.buf_id as u16, + ) + .offset(file_offset) + .build(); + file_offset += buffer.bytes as u64; sqes.push(sqe); } sqes @@ -362,13 +387,19 @@ pub struct FileWriteTask { unsafe impl Send for FileWriteTask {} impl FileWriteTask { - pub(crate) fn build(data: Bytes, fd: RawFd, direct_io: bool, use_fixed_buffers: bool) -> FileWriteTask { + pub(crate) fn build( + data: Bytes, + fd: RawFd, + direct_io: bool, + use_fixed_buffers: bool, + ) -> FileWriteTask { let mut ptr = data.as_ptr(); let bytes = data.len(); let mut padding = 0; if direct_io { padding = (4096 - (data.len() & 4095)) & 4095; - let layout = Layout::from_size_align(data.len() + padding, 4096).expect("Failed to create layout"); + let layout = Layout::from_size_align(data.len() + padding, 4096) + .expect("Failed to create layout"); assert!((data.len() + padding) % 4096 == 0); unsafe { let new_ptr = alloc(layout); @@ -404,18 +435,19 @@ impl FileWriteTask { impl IoTask for FileWriteTask { #[inline] fn prepare_sqe(&mut self) -> Vec { - let write_op = opcode::Write::new( - io_uring::types::Fd(self.fd), - self.data, - self.size as u32, - ); + let write_op = + opcode::Write::new(io_uring::types::Fd(self.fd), self.data, self.size as u32); - vec![write_op.offset(0u64).build()] + vec![write_op.offset(0u64).build()] } #[inline] fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { - debug_assert_eq!(cqes.len(), 1, "Should receive a single completion for a FileWrite task"); + debug_assert_eq!( + cqes.len(), + 1, + "Should receive a single completion for a FileWrite task" + ); let result = cqes[0].result(); if result != self.size as i32 { self.error = Some(std::io::Error::from_raw_os_error(-result)); diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index 4c32e586..e0546d7b 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -81,13 +81,17 @@ impl BackendKind { BackendKind::MultiBlocking => { async move { multi_blocking_uring::read(path, range, direct_io) }.boxed() } - BackendKind::ThreadPool => thread_pool_uring::read(path, range, direct_io, true).boxed(), + BackendKind::ThreadPool => { + thread_pool_uring::read(path, range, direct_io, true).boxed() + } } } fn write_future(self, path: PathBuf, data: Bytes) -> IoFuture<()> { match self { - BackendKind::Shared => async move { single_uring::write(path, &data, false).await }.boxed(), + BackendKind::Shared => { + async move { single_uring::write(path, &data, false).await }.boxed() + } BackendKind::MultiAsync => { async move { multi_async_uring::write(path, &data, false).await }.boxed() } @@ -161,9 +165,9 @@ fn read_write_roundtrip_non_blocking_uring() { let (tmpdir, path) = seed_file(&original); let path_clone = path.clone(); - let read_bytes = executor.run_to_completion(async move { - runtime::read(path_clone, None).await - }).unwrap_or_else(|err| panic!("read failed: {err}")); + let read_bytes = executor + .run_to_completion(async move { runtime::read(path_clone, None).await }) + .unwrap_or_else(|err| panic!("read failed: {err}")); assert_eq!( read_bytes.as_ref(), original.as_slice(), @@ -173,16 +177,12 @@ fn read_write_roundtrip_non_blocking_uring() { let new_payload: Vec = (0..64).map(|i| (i as u8).wrapping_add(1)).collect(); let bytes = Bytes::from(new_payload.clone()); let path_clone = path.clone(); - executor.run_to_completion(async move { - runtime::write(path_clone, &bytes.clone()).await - }).unwrap_or_else(|err| panic!("write failed: {err}")); + executor + .run_to_completion(async move { runtime::write(path_clone, &bytes.clone()).await }) + .unwrap_or_else(|err| panic!("write failed: {err}")); let on_disk = fs::read(&path).expect("failed to read updated file"); - assert_eq!( - on_disk, - new_payload, - "wrote unexpected data", - ); + assert_eq!(on_disk, new_payload, "wrote unexpected data",); drop(tmpdir); } diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index 2319dace..972ef604 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -1,8 +1,19 @@ use std::{ - collections::VecDeque, fs::OpenOptions, future::Future, io, ops::Range, os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, sync::{ + collections::VecDeque, + fs::OpenOptions, + future::Future, + io, + ops::Range, + os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, + path::PathBuf, + pin::Pin, + sync::{ OnceLock, atomic::{AtomicBool, AtomicUsize, Ordering}, - }, task::{Context, Poll}, thread, time::{Duration, Instant} + }, + task::{Context, Poll}, + thread, + time::{Duration, Instant}, }; use bytes::Bytes; @@ -42,7 +53,7 @@ static ENABLED: AtomicBool = AtomicBool::new(true); struct Submission { task: Box, completion_tx: oneshot::Sender>, - pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker + pending_completions: usize, // No. of pending completions. Will be populated later by the uring worker completions: Vec, } @@ -164,7 +175,7 @@ struct UringWorker { submitted_tasks: Vec>, /** * When using fixed buffers, a single task can produce multiple submission queue entries. - * It is possible that we aren't able to submit all of them at one go. Hold them in an + * It is possible that we aren't able to submit all of them at one go. Hold them in an * intermediate queue in that case */ queued_entries: VecDeque, @@ -176,10 +187,13 @@ struct UringWorker { impl UringWorker { #[allow(clippy::new_ret_no_self)] - fn new(channel: crossbeam_channel::Receiver, register_buffers: bool) -> UringWorker { + fn new( + channel: crossbeam_channel::Receiver, + register_buffers: bool, + ) -> UringWorker { let mut builder = IoUring::::builder(); let ring = builder - .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_single_issuer() // Only the worker thread will issue IO and poll completions .setup_defer_taskrun() // .setup_iopoll() // .setup_sqpoll(50000) @@ -222,7 +236,7 @@ impl UringWorker { fn drain_intermediate_queue(&mut self) { { - let sq = &mut self.ring.submission(); + let sq = &mut self.ring.submission(); while !sq.is_full() && !self.queued_entries.is_empty() { let sqe = self.queued_entries.pop_front().unwrap(); unsafe { @@ -251,11 +265,9 @@ impl UringWorker { self.queued_submissions += sqes.len() as u32; submission.set_completions(sqes.len()); let mut tasks_submitted = 0; - + for sqe in sqes.iter_mut() { - let res = unsafe { - sq.push(&sqe.clone().user_data(token as u64)) - }; + let res = unsafe { sq.push(&sqe.clone().user_data(token as u64)) }; if res.is_err() { break; } @@ -263,7 +275,8 @@ impl UringWorker { sq.sync(); } for i in tasks_submitted..sqes.len() { - self.queued_entries.push_back(sqes[i].clone().user_data(token as u64)); + self.queued_entries + .push_back(sqes[i].clone().user_data(token as u64)); } self.submitted_tasks[token as usize] = Some(submission); } @@ -276,7 +289,12 @@ impl UringWorker { flags.insert(EnterFlags::GETEVENTS); loop { let res = unsafe { - self.ring.submitter().enter::(self.queued_submissions, 0, flags.bits(), None) + self.ring.submitter().enter::( + self.queued_submissions, + 0, + flags.bits(), + None, + ) }; match res { Ok(_num_entries) => { @@ -307,7 +325,7 @@ impl UringWorker { .as_ref() .expect("Task not found in submitted tasks") .pending_completions; - + if pending_completions == 1 { let mut submission = self.submitted_tasks[token] .take() @@ -384,7 +402,7 @@ where UringState::Submitted(mut receiver) => match Pin::new(&mut receiver).poll(cx) { Poll::Ready(Ok(task)) => { if ensure_registered() { - liquid_parquet::io_completed!(|| self.id); + liquid_parquet::io_completed!(|| self.id); } let typed_task = task .into_any() @@ -437,18 +455,18 @@ pub(crate) async fn read( let read_task = FixedFileReadTask::build(effective_range.clone(), &file, direct_io); // Fall back to normal read if fixed buffers are not available if read_task.is_ok() { - return submit_async_task(read_task.unwrap()).await.into_result() + return submit_async_task(read_task.unwrap()).await.into_result(); } } let read_task = FileReadTask::build(effective_range, file, direct_io); - return submit_async_task(read_task).await.into_result() + return submit_async_task(read_task).await.into_result(); } pub(crate) async fn write( path: PathBuf, data: &Bytes, direct_io: bool, - use_fixed_buffers: bool + use_fixed_buffers: bool, ) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) @@ -458,6 +476,7 @@ pub(crate) async fn write( .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, use_fixed_buffers); + let write_task = + FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, use_fixed_buffers); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/lib.rs b/src/parquet/src/lib.rs index bbf65a0e..0fa309f0 100644 --- a/src/parquet/src/lib.rs +++ b/src/parquet/src/lib.rs @@ -19,4 +19,4 @@ pub use utils::{boolean_buffer_and_then, extract_execution_metrics}; #[cfg(target_os = "linux")] pub use crate::io::io_uring::runtime::UringExecutor; -pub use crate::io::SimpleIoContext; \ No newline at end of file +pub use crate::io::SimpleIoContext; diff --git a/src/parquet/src/optimizers/lineage_opt.rs b/src/parquet/src/optimizers/lineage_opt.rs index caa8240a..1276beb3 100644 --- a/src/parquet/src/optimizers/lineage_opt.rs +++ b/src/parquet/src/optimizers/lineage_opt.rs @@ -1102,7 +1102,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, - 0 + 0, ))) } From 2c62fbd6d983812edd694af34ea0c76401b29f6c Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Tue, 31 Mar 2026 23:16:38 -0500 Subject: [PATCH 21/26] First attempt at a work-stealing runtime - Steal tasks only from the global queue - Basic admission control: Only a fixed number of tasks are admitted per worker --- benchmark/src/storage_runner.rs | 6 +- src/parquet/Cargo.toml | 6 + src/parquet/src/io/io_backend.rs | 4 +- .../io_uring/{runtime.rs => local_runtime.rs} | 0 src/parquet/src/io/io_uring/mod.rs | 5 +- src/parquet/src/io/io_uring/tests.rs | 37 +- src/parquet/src/io/io_uring/work_stealing.rs | 527 ++++++++++++++++++ src/parquet/src/lib.rs | 4 +- 8 files changed, 579 insertions(+), 10 deletions(-) rename src/parquet/src/io/io_uring/{runtime.rs => local_runtime.rs} (100%) create mode 100644 src/parquet/src/io/io_uring/work_stealing.rs diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 648ce7f7..e1cc51b2 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -13,7 +13,7 @@ use datafusion::physical_plan::PhysicalExpr; use datafusion::physical_plan::expressions::{BinaryExpr, Column}; use datafusion::scalar::ScalarValue; use liquid_cache_common::IoMode; -use liquid_cache_parquet::{SimpleIoContext, UringExecutor}; +use liquid_cache_parquet::{SimpleIoContext, WorkStealingUringRuntime}; use liquid_cache_storage::cache::{ EntryID, LiquidCache, LiquidCacheBuilder, LiquidPolicy, NoHydration, TranscodeSqueezeEvict, }; @@ -277,7 +277,7 @@ fn run_single_iter( storage: Arc, entry_ids: &Vec, batch_lengths: &Vec, - executor: &mut UringExecutor, + executor: &mut WorkStealingUringRuntime, ) { // 2) Partition batch indices evenly across workers. let batches_per_partition = num_batches / num_partitions; @@ -394,7 +394,7 @@ fn run_bench( .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .build(); - let mut executor = UringExecutor::new(num_workers); + let mut executor = WorkStealingUringRuntime::new(num_workers); let storage_clone = storage.clone(); let query_owned = query.clone(); let (num_batches, entry_ids, batch_lengths) = executor.run_to_completion(async move { diff --git a/src/parquet/Cargo.toml b/src/parquet/Cargo.toml index a13e3c4e..6fbcde64 100644 --- a/src/parquet/Cargo.toml +++ b/src/parquet/Cargo.toml @@ -36,6 +36,12 @@ io-uring = "0.7.10" libc = "0.2.177" crossbeam-channel = "0.5.15" crossbeam-queue = "0.3.11" +async-task = "4" +concurrent-queue = "2" +futures-lite = "2" +pin-project-lite = "0.2" +slab = "0.4" +fastrand = "2" [dev-dependencies] tempfile = "3.23.0" diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index fa309947..8baeb524 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -58,7 +58,7 @@ pub(super) async fn read( IoMode::UringNonBlocking => { #[cfg(target_os = "linux")] { - super::io_uring::runtime::read(path, range).await + super::io_uring::work_stealing::read(path, range).await } #[cfg(not(target_os = "linux"))] { @@ -132,7 +132,7 @@ pub(super) async fn write( IoMode::UringNonBlocking => { #[cfg(target_os = "linux")] { - super::io_uring::runtime::write(path, &data).await + super::io_uring::work_stealing::write(path, &data).await } #[cfg(not(target_os = "linux"))] { diff --git a/src/parquet/src/io/io_uring/runtime.rs b/src/parquet/src/io/io_uring/local_runtime.rs similarity index 100% rename from src/parquet/src/io/io_uring/runtime.rs rename to src/parquet/src/io/io_uring/local_runtime.rs diff --git a/src/parquet/src/io/io_uring/mod.rs b/src/parquet/src/io/io_uring/mod.rs index f9827703..0e49b9a0 100644 --- a/src/parquet/src/io/io_uring/mod.rs +++ b/src/parquet/src/io/io_uring/mod.rs @@ -7,7 +7,10 @@ pub(crate) use thread_pool_uring::initialize_uring_pool; pub(crate) mod single_uring; -pub(crate) mod runtime; +pub(crate) mod local_runtime; + +mod executor; +pub(crate) mod work_stealing; #[cfg(test)] mod tests; diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index e0546d7b..3c7b8bde 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -1,6 +1,7 @@ #![cfg(target_os = "linux")] -use crate::io::io_uring::runtime::{self, UringExecutor}; +use crate::io::io_uring::local_runtime::{self, UringExecutor}; +use crate::io::io_uring::work_stealing::{self, WorkStealingUringRuntime}; use super::{ initialize_uring_pool, multi_async_uring, multi_blocking_uring, single_uring, thread_pool_uring, @@ -157,6 +158,36 @@ fn read_write_roundtrip_all_backends() { } } +/// Work-stealing uring runtime +#[test] +fn read_write_roundtrip_work_stealing_uring() { + let original: Vec = (0..128).map(|i| (i as u8).wrapping_mul(3)).collect(); + let runtime = WorkStealingUringRuntime::new(2); + + let (tmpdir, path) = seed_file(&original); + let path_clone = path.clone(); + let read_bytes = runtime + .run_to_completion(async move { work_stealing::read(path_clone, None).await }) + .unwrap_or_else(|err| panic!("ws read failed: {err}")); + assert_eq!( + read_bytes.as_ref(), + original.as_slice(), + "ws read returned unexpected payload", + ); + + let new_payload: Vec = (0..64).map(|i| (i as u8).wrapping_add(7)).collect(); + let bytes = Bytes::from(new_payload.clone()); + let path_clone = path.clone(); + runtime + .run_to_completion(async move { work_stealing::write(path_clone, &bytes).await }) + .unwrap_or_else(|err| panic!("ws write failed: {err}")); + + let on_disk = fs::read(&path).expect("failed to read updated file"); + assert_eq!(on_disk, new_payload, "ws wrote unexpected data"); + + drop(tmpdir); +} + /// Non-blocking uring requires a dedicated runtime #[test] fn read_write_roundtrip_non_blocking_uring() { @@ -166,7 +197,7 @@ fn read_write_roundtrip_non_blocking_uring() { let (tmpdir, path) = seed_file(&original); let path_clone = path.clone(); let read_bytes = executor - .run_to_completion(async move { runtime::read(path_clone, None).await }) + .run_to_completion(async move { local_runtime::read(path_clone, None).await }) .unwrap_or_else(|err| panic!("read failed: {err}")); assert_eq!( read_bytes.as_ref(), @@ -178,7 +209,7 @@ fn read_write_roundtrip_non_blocking_uring() { let bytes = Bytes::from(new_payload.clone()); let path_clone = path.clone(); executor - .run_to_completion(async move { runtime::write(path_clone, &bytes.clone()).await }) + .run_to_completion(async move { local_runtime::write(path_clone, &bytes.clone()).await }) .unwrap_or_else(|err| panic!("write failed: {err}")); let on_disk = fs::read(&path).expect("failed to read updated file"); diff --git a/src/parquet/src/io/io_uring/work_stealing.rs b/src/parquet/src/io/io_uring/work_stealing.rs new file mode 100644 index 00000000..7699755e --- /dev/null +++ b/src/parquet/src/io/io_uring/work_stealing.rs @@ -0,0 +1,527 @@ + +use std::{ + cell::{Cell, RefCell}, + collections::VecDeque, + fs::OpenOptions, + ops::Range, + os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, + path::PathBuf, + pin::Pin, + rc::Rc, + sync::{ + Arc, Mutex, OnceLock, atomic::{AtomicBool, Ordering}, + }, + task::{Context, Poll, Waker}, + thread::{self, JoinHandle}, + time::{Duration, Instant}, +}; + +use async_task::Runnable; +use bytes::Bytes; +use futures::Future; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; +use liquid_cache_common::memory::pool::FixedBufferPool; +use rand::Rng; +use tokio::sync::oneshot; + +use super::tasks::{FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; + +#[usdt::provider] +mod ws_uring_runtime { + fn io_submission(id: u64) {} + fn io_completion(id: u64) {} +} + +fn ensure_uring_trace_registered() -> bool { + static REGISTERED: OnceLock = OnceLock::new(); + *REGISTERED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register work-stealing io_uring USDT probes: {err}"); + false + } + }) +} + +type ExecutorTask = Pin + Send>>; + +const URING_NUM_ENTRIES: u32 = 256; +const MAX_CONCURRENT_IO: u32 = 128; +const URING_BATCH_SIZE: u32 = 8; +const URING_SYSCALL_INTERVAL_US: u64 = 5; +const MAX_ACTIVE_TASKS_PER_THREAD: u32 = 5; + +pub struct WorkStealingUringRuntime { + _workers: Vec>, + sender: crossbeam_channel::Sender, +} + +impl WorkStealingUringRuntime { + /// Spawn `num_threads` worker threads, each with its own io_uring ring. + pub fn new(num_threads: usize) -> Self { + let (sender, receiver) = crossbeam_channel::unbounded(); + + let mut workers = Vec::new(); + for i in 0..num_threads { + let receiver_clone = receiver.clone(); + let worker = thread::Builder::new() + .name(format!("ws-io-worker-{}", i)) + .spawn(move || worker_main_loop(receiver_clone)) + .expect("Failed to spawn worker"); + workers.push(worker); + } + + WorkStealingUringRuntime { + _workers: workers, + sender + } + } + + /// Spawn a future on the runtime; the result is returned through a oneshot channel. + pub fn spawn( + &self, + future: F, + ) -> oneshot::Receiver + where + F::Output: Send + 'static, + { + let (tx, rx) = oneshot::channel(); + + let wrapped_fut = async move { + let output = future.await; + let _ = tx.send(output); + }; + self.sender.send(Box::pin(wrapped_fut)).expect("Failed to send task"); + rx + } + + /// Spawn a batch of futures, returning results via a crossbeam channel. + pub fn spawn_many( + &self, + futures: &mut Vec, + ) -> crossbeam_channel::Receiver + where + F::Output: Send + 'static, + { + let (tx, rx) = crossbeam_channel::bounded::(futures.len()); + for f in futures.drain(..) { + let tx = tx.clone(); + let wrapped_fut = async move { + let output = f.await; + tx.send(output).expect("Failed to send result"); + }; + self.sender.send(Box::pin(wrapped_fut)).expect("Failed to send task"); + } + rx + } + + /// Spawn a future and block the caller until it completes. + pub fn run_to_completion( + &self, + future: F, + ) -> F::Output + where + F::Output: Send + 'static, + { + let receiver = self.spawn(future); + receiver.blocking_recv().expect("Failed to receive result") + } +} + +struct IoDriver { + ring: IoUring, + submitted_tasks: Vec>, + queued_entries: VecDeque, + last_syscall: Instant, + tokens: VecDeque, + io_performed: u64, + queued_submissions: u64, + fixed_buffers_available: bool, +} + +impl IoDriver { + fn new() -> IoDriver { + let ring = IoUring::::builder() + .setup_single_issuer() + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + + let fixed_buffers_available = + FixedBufferPool::register_buffers_with_ring(&ring).is_ok(); + + let mut tokens = VecDeque::with_capacity(MAX_CONCURRENT_IO as usize); + let mut submitted_tasks = Vec::with_capacity(MAX_CONCURRENT_IO as usize); + for i in 0..MAX_CONCURRENT_IO { + tokens.push_back(i as u16); + submitted_tasks.push(None); + } + + IoDriver { + ring, + submitted_tasks, + tokens, + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), + last_syscall: Instant::now(), + io_performed: 0, + queued_submissions: 0, + fixed_buffers_available, + } + } + + #[inline] + fn need_syscall(&self) -> bool { + let is_batch_full = self.queued_entries.len() >= URING_BATCH_SIZE as usize; + is_batch_full + || self.last_syscall.elapsed() > Duration::from_micros(URING_SYSCALL_INTERVAL_US) + } + + fn poll_completions(&mut self) { + let cq = &mut self.ring.completion(); + loop { + cq.sync(); + match cq.next() { + Some(cqe) => { + let token = cqe.user_data() as usize; + let pending = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + if pending == 1 { + let mut task = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + task.push_completion(cqe); + task.complete(); + self.tokens.push_back(token as u16); + self.io_performed += 1; + } else { + let task = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + task.push_completion(cqe); + task.reduce_completions(); + } + } + None => break, + } + } + } + + fn drain_intermediate_queue(&mut self) { + let sq = &mut self.ring.submission(); + while !sq.is_full() && !self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.queued_submissions += 1; + } + } + + fn submit_task(&mut self, mut task: AsyncIoTask) { + let token = self.tokens.pop_front().expect("No more IO tokens"); + let sq = &mut self.ring.submission(); + let sqes = task.inner.lock().unwrap().prepare_sqe(); + let num_sqes = sqes.len(); + task.set_completions(num_sqes); + self.submitted_tasks[token as usize] = Some(task); + let mut sqes_submitted = 0; + + for sqe in sqes.iter() { + let res = unsafe { sq.push(&sqe.clone().user_data(token as u64)) }; + if res.is_err() { + break; + } + sqes_submitted += 1; + self.queued_submissions += 1; + sq.sync(); + } + for i in sqes_submitted..sqes.len() { + self.queued_entries + .push_back(sqes[i].clone().user_data(token as u64)); + } + } + + fn add_task(task: AsyncIoTask) { + IO_REACTOR.with(|reactor| { + reactor.borrow_mut().submit_task(task); + }); + } +} + +fn worker_main_loop(receiver: crossbeam_channel::Receiver) { + EXECUTOR.with(|worker| { + let mut worker = worker.borrow_mut(); + worker.set_receiver(receiver); + }); + loop { + EXECUTOR.with(|worker| { + let worker = &mut worker.borrow_mut(); + worker.try_tick(); + }); + IO_REACTOR.with(|reactor| { + let reactor = &mut reactor.borrow_mut(); + reactor.drain_intermediate_queue(); + if reactor.need_syscall() { + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); + loop { + let res = unsafe { + reactor.ring.submitter().enter::( + reactor.queued_submissions as u32, + 0, + flags.bits(), + None, + ) + }; + match res { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == std::io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {}", e.to_string()); + } + } + } + reactor.queued_submissions = 0; + reactor.last_syscall = Instant::now(); + } + reactor.poll_completions(); + }); + } +} + +thread_local! { + static EXECUTOR: RefCell = RefCell::new(RuntimeWorker::new()); + static IO_REACTOR: RefCell = RefCell::new(IoDriver::new()); +} + +struct RuntimeWorker { + task_receiver: Option>, + local: VecDeque, + active_tasks: Rc>, + /// Runnables woken while polling; drained into `local` at the start of each `try_tick`. + pending: Rc>>, +} + +impl RuntimeWorker { + fn new() -> RuntimeWorker { + RuntimeWorker { + task_receiver: None, + local: VecDeque::with_capacity(64), + active_tasks: Rc::new(Cell::new(0)), + pending: Rc::new(RefCell::new(VecDeque::new())), + } + } + + fn set_receiver(&mut self, receiver: crossbeam_channel::Receiver) { + self.task_receiver = Some(receiver); + } + + fn try_tick(&mut self) { + self.local + .extend(self.pending.borrow_mut().drain(..)); + + let mut runnable = self.local.pop_front(); + if runnable.is_none() && self.active_tasks.get() < MAX_ACTIVE_TASKS_PER_THREAD { + if let Ok(future) = self.task_receiver.as_mut().unwrap().try_recv() { + self.active_tasks.set(self.active_tasks.get().saturating_add(1)); + let active_tasks = Rc::clone(&self.active_tasks); + let pending = Rc::clone(&self.pending); + let wrapped = async move { + future.await; + active_tasks.set(active_tasks.get().saturating_sub(1)); + }; + let schedule = move |r: Runnable| { + pending.borrow_mut().push_back(r); + }; + let (r, _) = unsafe { async_task::spawn_unchecked(wrapped, schedule) }; + runnable = Some(r); + } + } + if let Some(r) = runnable { + r.run(); + } + } +} + + +/// Thread-safe wrapper around an `IoTask`. Unlike the local runtime's version +/// which uses `Rc>`, this uses `Arc>` so that the task +/// can be submitted on one thread and completed/read on another. +struct AsyncIoTask { + inner: Arc>, + waker: Waker, + completed: Arc, + pending_completions: usize, + completions: Vec, +} + +unsafe impl Send for AsyncIoTask {} + +impl AsyncIoTask { + #[inline] + fn complete(self) { + self.inner + .lock() + .unwrap() + .complete(self.completions.iter().collect()); + self.completed.store(true, Ordering::Release); + self.waker.wake(); + } + + #[inline] + fn set_completions(&mut self, count: usize) { + self.pending_completions = count; + } + + #[inline] + fn reduce_completions(&mut self) { + self.pending_completions -= 1; + } + + #[inline] + fn push_completion(&mut self, cqe: cqueue::Entry) { + self.completions.push(cqe); + } +} + +enum UringState { + Undecided, + Created, + Submitted, +} + +pub(crate) struct UringFuture +where + T: IoTask + 'static, +{ + state: UringState, + task: Arc>, + completed: Arc, + id: u64, +} + +unsafe impl Send for UringFuture where T: IoTask + 'static {} + +impl UringFuture +where + T: IoTask + 'static, +{ + fn new(task: Arc>) -> Self { + UringFuture { + state: UringState::Created, + task, + completed: Arc::new(AtomicBool::new(false)), + id: rand::rng().random(), + } + } +} + +impl Future for UringFuture +where + T: IoTask + 'static, +{ + type Output = Arc>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + loop { + let state = std::mem::replace(&mut self.state, UringState::Undecided); + match state { + UringState::Created => { + let async_task = AsyncIoTask { + inner: self.task.clone(), + waker: cx.waker().clone(), + completed: self.completed.clone(), + pending_completions: 0, + completions: Vec::new(), + }; + IoDriver::add_task(async_task); + if ensure_uring_trace_registered() { + ws_uring_runtime::io_submission!(|| self.id); + } + self.state = UringState::Submitted; + } + UringState::Submitted => { + if self.completed.load(Ordering::Acquire) { + if ensure_uring_trace_registered() { + ws_uring_runtime::io_completion!(|| self.id); + } + return Poll::Ready(self.task.clone()); + } + self.state = UringState::Submitted; + return Poll::Pending; + } + UringState::Undecided => unreachable!("state cannot be undecided during poll"), + } + } + } +} + +fn submit_async_task(task: T) -> UringFuture +where + T: IoTask + 'static, +{ + UringFuture::new(Arc::new(Mutex::new(task))) +} + +pub(crate) async fn read( + path: PathBuf, + range: Option>, +) -> Result { + let direct_io = IO_REACTOR.with(|w| w.borrow().fixed_buffers_available); + + let mut opts = OpenOptions::new(); + opts.read(true); + if direct_io { + opts.custom_flags(libc::O_DIRECT); + } + let file = opts.open(&path).expect("failed to open file"); + + let effective_range = if let Some(range) = range { + range + } else { + let len = file.metadata()?.len(); + 0..len + }; + + if direct_io { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, true); + if let Ok(task) = read_task { + let arc = submit_async_task(task).await; + return match Arc::try_unwrap(arc) { + Ok(mutex) => { + FixedFileReadTask::into_result(Box::new(mutex.into_inner().unwrap())) + } + Err(arc) => arc.lock().unwrap().get_result(), + }; + } + } + + let read_task = FileReadTask::build(effective_range, file, direct_io); + let arc = submit_async_task(read_task).await; + match Arc::try_unwrap(arc) { + Ok(mutex) => FileReadTask::into_result(Box::new(mutex.into_inner().unwrap())), + Err(arc) => arc.lock().unwrap().get_result(), + } +} + +pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&path) + .expect("failed to create file"); + + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false, false); + let arc = submit_async_task(write_task).await; + match Arc::try_unwrap(arc) { + Ok(mutex) => mutex.into_inner().unwrap().get_result(), + Err(arc) => arc.lock().unwrap().get_result(), + } +} diff --git a/src/parquet/src/lib.rs b/src/parquet/src/lib.rs index 0fa309f0..bfa39257 100644 --- a/src/parquet/src/lib.rs +++ b/src/parquet/src/lib.rs @@ -17,6 +17,8 @@ pub use reader::{FilterCandidateBuilder, LiquidParquetSource, LiquidPredicate, L pub use utils::{boolean_buffer_and_then, extract_execution_metrics}; #[cfg(target_os = "linux")] -pub use crate::io::io_uring::runtime::UringExecutor; +pub use crate::io::io_uring::local_runtime::UringExecutor; +#[cfg(target_os = "linux")] +pub use crate::io::io_uring::work_stealing::WorkStealingUringRuntime; pub use crate::io::SimpleIoContext; From 93e5dab38f2128e585b76d4e4f6516299f987f5b Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Tue, 31 Mar 2026 23:35:22 -0500 Subject: [PATCH 22/26] Bug fix --- src/parquet/src/io/io_uring/work_stealing.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/parquet/src/io/io_uring/work_stealing.rs b/src/parquet/src/io/io_uring/work_stealing.rs index 7699755e..d4a5a279 100644 --- a/src/parquet/src/io/io_uring/work_stealing.rs +++ b/src/parquet/src/io/io_uring/work_stealing.rs @@ -302,19 +302,16 @@ thread_local! { struct RuntimeWorker { task_receiver: Option>, - local: VecDeque, active_tasks: Rc>, - /// Runnables woken while polling; drained into `local` at the start of each `try_tick`. - pending: Rc>>, + local: Rc>>, } impl RuntimeWorker { fn new() -> RuntimeWorker { RuntimeWorker { task_receiver: None, - local: VecDeque::with_capacity(64), active_tasks: Rc::new(Cell::new(0)), - pending: Rc::new(RefCell::new(VecDeque::new())), + local: Rc::new(RefCell::new(VecDeque::new())), } } @@ -323,23 +320,22 @@ impl RuntimeWorker { } fn try_tick(&mut self) { - self.local - .extend(self.pending.borrow_mut().drain(..)); - - let mut runnable = self.local.pop_front(); + let mut runnable = self.local.borrow_mut().pop_front(); if runnable.is_none() && self.active_tasks.get() < MAX_ACTIVE_TASKS_PER_THREAD { if let Ok(future) = self.task_receiver.as_mut().unwrap().try_recv() { self.active_tasks.set(self.active_tasks.get().saturating_add(1)); let active_tasks = Rc::clone(&self.active_tasks); - let pending = Rc::clone(&self.pending); + let local_clone = Rc::clone(&self.local); let wrapped = async move { future.await; active_tasks.set(active_tasks.get().saturating_sub(1)); }; let schedule = move |r: Runnable| { - pending.borrow_mut().push_back(r); + local_clone.borrow_mut().push_back(r); }; - let (r, _) = unsafe { async_task::spawn_unchecked(wrapped, schedule) }; + let (r, task) = unsafe { async_task::spawn_unchecked(wrapped, schedule) }; + // Dropping `Task` would cancel the future and drop the oneshot sender (RecvError). + task.detach(); runnable = Some(r); } } From 07923ded9d761c6b105e82b638148228e0384c32 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Wed, 1 Apr 2026 19:15:12 -0500 Subject: [PATCH 23/26] Fixes --- src/parquet/src/io/io_uring/mod.rs | 1 - src/parquet/src/io/io_uring/work_stealing.rs | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parquet/src/io/io_uring/mod.rs b/src/parquet/src/io/io_uring/mod.rs index 0e49b9a0..d7452388 100644 --- a/src/parquet/src/io/io_uring/mod.rs +++ b/src/parquet/src/io/io_uring/mod.rs @@ -9,7 +9,6 @@ pub(crate) mod single_uring; pub(crate) mod local_runtime; -mod executor; pub(crate) mod work_stealing; #[cfg(test)] diff --git a/src/parquet/src/io/io_uring/work_stealing.rs b/src/parquet/src/io/io_uring/work_stealing.rs index d4a5a279..42758a98 100644 --- a/src/parquet/src/io/io_uring/work_stealing.rs +++ b/src/parquet/src/io/io_uring/work_stealing.rs @@ -143,6 +143,7 @@ impl IoDriver { fn new() -> IoDriver { let ring = IoUring::::builder() .setup_single_issuer() + .setup_defer_taskrun() .build(URING_NUM_ENTRIES) .expect("Failed to build IoUring instance"); From 709be25bc88ec3e3fd4d2ff7df6d14eb6530fabf Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Wed, 1 Apr 2026 19:30:39 -0500 Subject: [PATCH 24/26] Measure cpu time in benchmark --- benchmark/src/storage_runner.rs | 46 +++++++++++++-- src/parquet/src/io/io_uring/work_stealing.rs | 59 ++++++++++++++++++-- 2 files changed, 93 insertions(+), 12 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index e1cc51b2..23792074 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -277,8 +277,8 @@ fn run_single_iter( storage: Arc, entry_ids: &Vec, batch_lengths: &Vec, - executor: &mut WorkStealingUringRuntime, -) { + executor: &WorkStealingUringRuntime, +) -> (std::time::Duration, usize) { // 2) Partition batch indices evenly across workers. let batches_per_partition = num_batches / num_partitions; let num_cols = query.columns_to_load().len(); @@ -331,11 +331,12 @@ fn run_single_iter( ); } log::info!( - "Partitions: {}, Time: {:.3}s, Total rows: {}", + "Partitions: {}, wall: {:.3}s, total rows: {}", num_partitions, elapsed.as_secs_f64(), total_rows ); + (elapsed, total_rows) } fn write_flamegraph( @@ -394,7 +395,7 @@ fn run_bench( .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .build(); - let mut executor = WorkStealingUringRuntime::new(num_workers); + let executor = WorkStealingUringRuntime::new(num_workers); let storage_clone = storage.clone(); let query_owned = query.clone(); let (num_batches, entry_ids, batch_lengths) = executor.run_to_completion(async move { @@ -414,6 +415,10 @@ fn run_bench( (num_batches, entry_ids, batch_lengths) }); + // Baseline after cache load so iteration deltas exclude setup work on the same workers. + let mut prev_runnable_wall_total_ns = executor.total_runnable_wall_nanos(); + let mut prev_per_worker_wall_ns = executor.worker_runnable_wall_nanos(); + for i in 0..num_iter { liquid_cache_benchmarks::tracepoints::iteration_start(query_index as u32, i as u32); let io_guard = DiskIoGuard::new(); @@ -429,14 +434,43 @@ fn run_bench( None }; - run_single_iter( + let (iter_wall, _rows) = run_single_iter( num_batches, num_partitions, &query, storage.clone(), &entry_ids, &batch_lengths, - &mut executor, + &executor, + ); + + let runnable_wall_total_ns = executor.total_runnable_wall_nanos(); + let runnable_this_iter_ns = + runnable_wall_total_ns.saturating_sub(prev_runnable_wall_total_ns); + prev_runnable_wall_total_ns = runnable_wall_total_ns; + + let per_worker_now = executor.worker_runnable_wall_nanos(); + let per_worker_delta_ms: Vec = per_worker_now + .iter() + .zip(prev_per_worker_wall_ns.iter()) + .map(|(now, prev)| now.saturating_sub(*prev) as f64 / 1e6) + .collect(); + prev_per_worker_wall_ns = per_worker_now; + + let wall_ns = iter_wall.as_nanos() as u64; + // Summed across workers; can exceed wall clock when workers run in parallel. + let wall_minus_runnable_sum_ns = wall_ns.saturating_sub(runnable_this_iter_ns); + + log::info!( + "Iteration {}: Runnable::run wall +{:.3} ms this iter (cumulative {:.3} ms); \ + iteration wall {:.3} s; wall minus summed run delta (saturating) {:.3} ms; \ + per-worker +delta ms {:?}", + i, + runnable_this_iter_ns as f64 / 1e6, + runnable_wall_total_ns as f64 / 1e6, + iter_wall.as_secs_f64(), + wall_minus_runnable_sum_ns as f64 / 1e6, + per_worker_delta_ms, ); let (disk_read, disk_written) = io_guard.stop(); diff --git a/src/parquet/src/io/io_uring/work_stealing.rs b/src/parquet/src/io/io_uring/work_stealing.rs index 42758a98..f5f0b08c 100644 --- a/src/parquet/src/io/io_uring/work_stealing.rs +++ b/src/parquet/src/io/io_uring/work_stealing.rs @@ -9,7 +9,8 @@ use std::{ pin::Pin, rc::Rc, sync::{ - Arc, Mutex, OnceLock, atomic::{AtomicBool, Ordering}, + Arc, Mutex, OnceLock, + atomic::{AtomicBool, AtomicU64, Ordering}, }, task::{Context, Poll, Waker}, thread::{self, JoinHandle}, @@ -51,9 +52,19 @@ const URING_BATCH_SIZE: u32 = 8; const URING_SYSCALL_INTERVAL_US: u64 = 5; const MAX_ACTIVE_TASKS_PER_THREAD: u32 = 5; +/// Local io_uring + work-stealing executor. +/// +/// ## Runnable timing +/// +/// [`WorkStealingUringRuntime::worker_runnable_wall_nanos`] accumulates **wall-clock** time each worker +/// spends inside [`Runnable::run`] (one async-task poll). It does **not** include idle time between +/// ticks, io_uring `enter`, or time blocked in syscalls outside `run`. It is **not** OS thread CPU time +/// (`CLOCK_THREAD_CPUTIME_ID`). pub struct WorkStealingUringRuntime { _workers: Vec>, sender: crossbeam_channel::Sender, + /// One counter per worker; same `Arc` as installed on that worker’s [`RuntimeWorker`]. + worker_runnable_wall_nanos: Vec>, } impl WorkStealingUringRuntime { @@ -62,21 +73,43 @@ impl WorkStealingUringRuntime { let (sender, receiver) = crossbeam_channel::unbounded(); let mut workers = Vec::new(); + let mut worker_runnable_wall_nanos = Vec::with_capacity(num_threads); for i in 0..num_threads { + let counter = Arc::new(AtomicU64::new(0)); + worker_runnable_wall_nanos.push(Arc::clone(&counter)); let receiver_clone = receiver.clone(); let worker = thread::Builder::new() .name(format!("ws-io-worker-{}", i)) - .spawn(move || worker_main_loop(receiver_clone)) + .spawn(move || worker_main_loop(receiver_clone, counter)) .expect("Failed to spawn worker"); workers.push(worker); } WorkStealingUringRuntime { _workers: workers, - sender + sender, + worker_runnable_wall_nanos, } } + /// Wall time each worker has spent inside `Runnable::run()`, **nanoseconds**, indexed by worker id. + /// + /// See struct-level docs for semantics. + pub fn worker_runnable_wall_nanos(&self) -> Vec { + self.worker_runnable_wall_nanos + .iter() + .map(|c| c.load(Ordering::Relaxed)) + .collect() + } + + /// Sum of [`Self::worker_runnable_wall_nanos`] across workers. + pub fn total_runnable_wall_nanos(&self) -> u64 { + self.worker_runnable_wall_nanos + .iter() + .map(|c| c.load(Ordering::Relaxed)) + .sum() + } + /// Spawn a future on the runtime; the result is returned through a oneshot channel. pub fn spawn( &self, @@ -251,10 +284,13 @@ impl IoDriver { } } -fn worker_main_loop(receiver: crossbeam_channel::Receiver) { +fn worker_main_loop( + receiver: crossbeam_channel::Receiver, + runnable_wall_nanos: Arc, +) { EXECUTOR.with(|worker| { let mut worker = worker.borrow_mut(); - worker.set_receiver(receiver); + worker.set_context(receiver, runnable_wall_nanos); }); loop { EXECUTOR.with(|worker| { @@ -305,6 +341,7 @@ struct RuntimeWorker { task_receiver: Option>, active_tasks: Rc>, local: Rc>>, + runnable_wall_nanos: Option>, } impl RuntimeWorker { @@ -313,11 +350,17 @@ impl RuntimeWorker { task_receiver: None, active_tasks: Rc::new(Cell::new(0)), local: Rc::new(RefCell::new(VecDeque::new())), + runnable_wall_nanos: None, } } - fn set_receiver(&mut self, receiver: crossbeam_channel::Receiver) { + fn set_context( + &mut self, + receiver: crossbeam_channel::Receiver, + runnable_wall_nanos: Arc, + ) { self.task_receiver = Some(receiver); + self.runnable_wall_nanos = Some(runnable_wall_nanos); } fn try_tick(&mut self) { @@ -341,7 +384,11 @@ impl RuntimeWorker { } } if let Some(r) = runnable { + let start = Instant::now(); r.run(); + if let Some(c) = self.runnable_wall_nanos.as_ref() { + c.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed); + } } } } From 4fab99ceb2b464ad596cb4cc611bb7413b152e40 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 13 Apr 2026 00:52:09 -0500 Subject: [PATCH 25/26] Changes to storage_runner - Table formatting for results - Use a tokio runtime if io mode is std-blocking --- benchmark/src/lib.rs | 75 ++++++++++++++++ benchmark/src/storage_runner.rs | 149 +++++++++++++++++++++----------- 2 files changed, 173 insertions(+), 51 deletions(-) diff --git a/benchmark/src/lib.rs b/benchmark/src/lib.rs index 420652d1..7f6556d8 100644 --- a/benchmark/src/lib.rs +++ b/benchmark/src/lib.rs @@ -487,6 +487,81 @@ impl Display for IterationResult { } } +/// Table layout matching [`IterationResult`]'s [`Display`] (borders, row style, disk formatting). +/// When `uring_runnable` is `Some`, includes work-stealing executor `Runnable::run` timing (see storage runner). +pub fn format_storage_iteration_metrics( + iteration: usize, + iteration_wall: Duration, + disk_read: u64, + disk_written: u64, + uring_runnable: Option<(f64, f64)>, +) -> String { + struct StorageIterationTable { + iteration: usize, + iteration_wall_ms: u64, + uring_runnable: Option<(f64, f64)>, + disk_read: u64, + disk_written: u64, + } + + impl std::fmt::Display for StorageIterationTable { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + const INNER: usize = 50; + write_border_top(f, INNER)?; + write_kv_row( + f, + INNER, + "Iteration:", + &format!("{}", self.iteration), + )?; + write_kv_row( + f, + INNER, + "Iteration wall:", + &format!("{} ms", format_number(self.iteration_wall_ms)), + )?; + if let Some((runnable_wall_ms, wall_minus_runnable_ms)) = self.uring_runnable { + write_kv_row( + f, + INNER, + "Runnable wall (sum):", + &format!("{:.3} ms", runnable_wall_ms), + )?; + write_kv_row( + f, + INNER, + "Wall minus runnable:", + &format!("{:.3} ms", wall_minus_runnable_ms), + )?; + } + write_border_sep(f, INNER)?; + write_kv_row( + f, + INNER, + "Disk (Read/Write):", + &format!( + "{} / {}", + format_bytes(self.disk_read), + format_bytes(self.disk_written) + ), + )?; + write_border_bottom(f, INNER) + } + } + + let iteration_wall_ms = (iteration_wall.as_secs_f64() * 1000.0).round() as u64; + format!( + "{}", + StorageIterationTable { + iteration, + iteration_wall_ms, + uring_runnable, + disk_read, + disk_written, + } + ) +} + fn format_number(n: u64) -> String { let s = n.to_string(); let chars: Vec = s.chars().collect(); diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 23792074..9832d642 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -9,6 +9,7 @@ use arrow::array::BooleanArray; use arrow::buffer::BooleanBuffer; use clap::Parser; use datafusion::logical_expr::Operator; +use futures::future::join_all; use datafusion::physical_plan::PhysicalExpr; use datafusion::physical_plan::expressions::{BinaryExpr, Column}; use datafusion::scalar::ScalarValue; @@ -21,6 +22,7 @@ use logforth::filter::EnvFilter; use parquet::arrow::{ProjectionMask, arrow_reader::ParquetRecordBatchReaderBuilder}; use std::fs::create_dir_all; use std::path::PathBuf; +use std::future::Future; use std::sync::Arc; use std::time::Instant; use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System}; @@ -32,10 +34,11 @@ struct Args { #[arg(long)] query_index: usize, - /// Number of partitions (tasks to spawn on UringExecutor). + /// Number of partitions (tasks per iteration). #[arg(long)] partitions: usize, + /// Worker threads: io_uring work-stealing runtime size, or Tokio worker threads when `--io-mode std-blocking`. #[arg(long)] worker_threads: usize, @@ -54,7 +57,7 @@ struct Args { #[arg(long = "flamegraph-dir")] flamegraph_dir: Option, - /// IO mode: uring-non-blocking (default) or std-blocking. + /// IO mode: uring-non-blocking (default) or std-blocking. With std-blocking, partition futures run on a multi-thread Tokio runtime (`worker_threads` workers). #[arg(long = "io-mode", default_value = "uring-non-blocking")] io_mode: IoMode, } @@ -185,9 +188,13 @@ fn all_filter_queries() -> Vec> { }); q[20] = Some(FilterQuery { - filter_columns: vec![], - projection_columns: vec!["URL"], - predicates: vec![], + filter_columns: vec!["URL"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::LikeMatch, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::from("%google%"))))) + ))], expected_row_count: 99997497, }); @@ -270,6 +277,48 @@ fn all_filter_queries() -> Vec> { q } +/// Partition futures run either on the work-stealing io_uring executor or, for `std-blocking` IO, on Tokio. +enum StorageBenchRuntime { + Uring(WorkStealingUringRuntime), + Tokio(tokio::runtime::Runtime), +} + +impl StorageBenchRuntime { + fn new(io_mode: IoMode, num_workers: usize) -> Self { + let num_workers = num_workers.max(1); + if io_mode == IoMode::StdBlocking { + Self::Tokio( + tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers) + .thread_name("storage-bench-tokio") + .enable_all() + .build() + .expect("build tokio multi-thread runtime"), + ) + } else { + Self::Uring(WorkStealingUringRuntime::new(num_workers)) + } + } + + fn run_to_completion(&self, future: F) -> T + where + F: Future + Send + 'static, + T: Send + 'static, + { + match self { + Self::Uring(e) => e.run_to_completion(future), + Self::Tokio(rt) => rt.block_on(future), + } + } + + fn total_runnable_wall_nanos(&self) -> Option { + match self { + Self::Uring(e) => Some(e.total_runnable_wall_nanos()), + Self::Tokio(_) => None, + } + } +} + fn run_single_iter( num_batches: usize, num_partitions: usize, @@ -277,7 +326,7 @@ fn run_single_iter( storage: Arc, entry_ids: &Vec, batch_lengths: &Vec, - executor: &WorkStealingUringRuntime, + runtime: &StorageBenchRuntime, ) -> (std::time::Duration, usize) { // 2) Partition batch indices evenly across workers. let batches_per_partition = num_batches / num_partitions; @@ -314,14 +363,22 @@ fn run_single_iter( let num_tasks = futures.len(); let start = Instant::now(); - let receiver = executor.spawn_many(&mut futures); - - let mut tasks_completed = 0; - let mut total_rows = 0; - while tasks_completed < num_tasks { - total_rows += receiver.recv().expect("Failed to receive result"); - tasks_completed += 1; - } + let total_rows = match runtime { + StorageBenchRuntime::Uring(executor) => { + let receiver = executor.spawn_many(&mut futures); + let mut tasks_completed = 0; + let mut total_rows = 0; + while tasks_completed < num_tasks { + total_rows += receiver.recv().expect("Failed to receive result"); + tasks_completed += 1; + } + total_rows + } + StorageBenchRuntime::Tokio(rt) => rt + .block_on(join_all(futures)) + .into_iter() + .sum::(), + }; let elapsed = start.elapsed(); if total_rows != query.expected_row_count { log::warn!( @@ -395,10 +452,10 @@ fn run_bench( .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .build(); - let executor = WorkStealingUringRuntime::new(num_workers); + let runtime = StorageBenchRuntime::new(io_mode, num_workers); let storage_clone = storage.clone(); let query_owned = query.clone(); - let (num_batches, entry_ids, batch_lengths) = executor.run_to_completion(async move { + let (num_batches, entry_ids, batch_lengths) = runtime.run_to_completion(async move { // 1) Load parquet into record batches (filter columns only) and insert into cache. let (entry_ids, batch_lengths) = load_and_insert(storage_clone.clone(), parquet_path, &query_owned).await; @@ -416,8 +473,7 @@ fn run_bench( }); // Baseline after cache load so iteration deltas exclude setup work on the same workers. - let mut prev_runnable_wall_total_ns = executor.total_runnable_wall_nanos(); - let mut prev_per_worker_wall_ns = executor.worker_runnable_wall_nanos(); + let mut prev_runnable_wall_total_ns = runtime.total_runnable_wall_nanos().unwrap_or(0); for i in 0..num_iter { liquid_cache_benchmarks::tracepoints::iteration_start(query_index as u32, i as u32); @@ -441,44 +497,35 @@ fn run_bench( storage.clone(), &entry_ids, &batch_lengths, - &executor, + &runtime, ); - let runnable_wall_total_ns = executor.total_runnable_wall_nanos(); - let runnable_this_iter_ns = - runnable_wall_total_ns.saturating_sub(prev_runnable_wall_total_ns); - prev_runnable_wall_total_ns = runnable_wall_total_ns; - - let per_worker_now = executor.worker_runnable_wall_nanos(); - let per_worker_delta_ms: Vec = per_worker_now - .iter() - .zip(prev_per_worker_wall_ns.iter()) - .map(|(now, prev)| now.saturating_sub(*prev) as f64 / 1e6) - .collect(); - prev_per_worker_wall_ns = per_worker_now; - - let wall_ns = iter_wall.as_nanos() as u64; - // Summed across workers; can exceed wall clock when workers run in parallel. - let wall_minus_runnable_sum_ns = wall_ns.saturating_sub(runnable_this_iter_ns); - - log::info!( - "Iteration {}: Runnable::run wall +{:.3} ms this iter (cumulative {:.3} ms); \ - iteration wall {:.3} s; wall minus summed run delta (saturating) {:.3} ms; \ - per-worker +delta ms {:?}", - i, - runnable_this_iter_ns as f64 / 1e6, - runnable_wall_total_ns as f64 / 1e6, - iter_wall.as_secs_f64(), - wall_minus_runnable_sum_ns as f64 / 1e6, - per_worker_delta_ms, - ); + let uring_runnable = match runtime.total_runnable_wall_nanos() { + Some(total_now) => { + let runnable_this_iter_ns = + total_now.saturating_sub(prev_runnable_wall_total_ns); + prev_runnable_wall_total_ns = total_now; + let wall_ns = iter_wall.as_nanos() as u64; + // Summed across workers; can exceed wall clock when workers run in parallel. + let wall_minus_runnable_sum_ns = wall_ns.saturating_sub(runnable_this_iter_ns); + Some(( + runnable_this_iter_ns as f64 / 1e6, + wall_minus_runnable_sum_ns as f64 / 1e6, + )) + } + None => None, + }; let (disk_read, disk_written) = io_guard.stop(); log::info!( - "Iteration {}: disk read {} bytes, disk written {} bytes", - i, - disk_read, - disk_written + "{}", + liquid_cache_benchmarks::format_storage_iteration_metrics( + i, + iter_wall, + disk_read, + disk_written, + uring_runnable, + ) ); if let (Some(profiler), Some(dir)) = (profiler_guard, flamegraph_dir.as_ref()) { From 89a8d3af8f55420c5b4c4a06c9114098c0e1c723 Mon Sep 17 00:00:00 2001 From: Proteet Paul Date: Mon, 13 Apr 2026 01:02:18 -0500 Subject: [PATCH 26/26] Fix perf slowdown when running bench in std-blocking mode --- benchmark/src/storage_runner.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs index 9832d642..2f4f684d 100644 --- a/benchmark/src/storage_runner.rs +++ b/benchmark/src/storage_runner.rs @@ -374,10 +374,13 @@ fn run_single_iter( } total_rows } - StorageBenchRuntime::Tokio(rt) => rt - .block_on(join_all(futures)) - .into_iter() - .sum::(), + StorageBenchRuntime::Tokio(rt) => { + let handles: Vec<_> = futures.into_iter().map(|f| rt.spawn(f)).collect(); + rt.block_on(join_all(handles)) + .into_iter() + .map(|r| r.expect("partition task failed")) + .sum::() + } }; let elapsed = start.elapsed(); if total_rows != query.expected_row_count {