diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml index 4e15517c..71404b89 100644 --- a/benchmark/Cargo.toml +++ b/benchmark/Cargo.toml @@ -12,6 +12,7 @@ liquid-cache-storage = { workspace = true } liquid-cache-common = { workspace = true } liquid-cache-local = { workspace = true } datafusion = { workspace = true } +futures = { workspace = true } tokio = { workspace = true } log = { workspace = true } arrow-flight = { workspace = true } @@ -67,3 +68,7 @@ path = "bench_server.rs" [[bin]] name = "in_process" path = "in_process.rs" + +[[bin]] +name = "storage_runner" +path = "src/storage_runner.rs" diff --git a/benchmark/bench_server.rs b/benchmark/bench_server.rs index 17563ede..753f1939 100644 --- a/benchmark/bench_server.rs +++ b/benchmark/bench_server.rs @@ -51,6 +51,9 @@ struct CliArgs { /// IO mode, available options: uring, uring-direct, std-blocking, tokio, std-spawn-blocking #[arg(long = "io-mode", default_value = "uring-multi-async")] io_mode: IoMode, + + #[arg(long = "fixed-buffer-pool-size-mb", default_value = "0")] + fixed_buffer_pool_size_mb: usize, } #[tokio::main] @@ -81,6 +84,7 @@ async fn main() -> Result<(), Box> { squeeze_policy, Box::new(NoHydration::new()), Some(args.io_mode), + args.fixed_buffer_pool_size_mb, )?; let liquid_cache_server = Arc::new(liquid_cache_server); diff --git a/benchmark/in_process.rs b/benchmark/in_process.rs index 8ff6259a..444b963f 100644 --- a/benchmark/in_process.rs +++ b/benchmark/in_process.rs @@ -4,7 +4,7 @@ use fastrace::prelude::*; use liquid_cache_benchmarks::{ BenchmarkManifest, InProcessBenchmarkMode, InProcessBenchmarkRunner, setup_observability, }; -use liquid_cache_common::IoMode; +use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use mimalloc::MiMalloc; use serde::Serialize; use std::path::PathBuf; @@ -70,6 +70,9 @@ struct InProcessBenchmark { /// IO mode, available options: uring, uring-direct, std-blocking, tokio, std-spawn-blocking #[arg(long = "io-mode", default_value = 
"uring-multi-async")] io_mode: IoMode, + + #[arg(long = "fixed-buffer-pool-size-mb", default_value = "0")] + fixed_buffer_pool_size_mb: usize, } impl InProcessBenchmark { @@ -88,7 +91,8 @@ impl InProcessBenchmark { .with_cache_dir(self.cache_dir.clone()) .with_query_filter(self.query_index) .with_io_mode(self.io_mode) - .with_output_dir(self.output_dir.clone()); + .with_output_dir(self.output_dir.clone()) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb); runner.run(manifest, self, output).await?; Ok(()) } @@ -102,6 +106,7 @@ async fn main() -> Result<()> { let _guard = root.set_local_parent(); benchmark.run().await?; + FixedBufferPool::print_stats(); fastrace::flush(); Ok(()) } diff --git a/benchmark/src/inprocess_runner.rs b/benchmark/src/inprocess_runner.rs index 5e535933..c571ba56 100644 --- a/benchmark/src/inprocess_runner.rs +++ b/benchmark/src/inprocess_runner.rs @@ -202,6 +202,7 @@ pub struct InProcessBenchmarkRunner { pub io_mode: IoMode, pub output_dir: Option, pub collect_perf_events: bool, + pub fixed_buffer_pool_size_mb: usize, } impl Default for InProcessBenchmarkRunner { @@ -224,6 +225,7 @@ impl InProcessBenchmarkRunner { io_mode: IoMode::default(), output_dir: None, collect_perf_events: false, + fixed_buffer_pool_size_mb: 0, } } @@ -282,6 +284,11 @@ impl InProcessBenchmarkRunner { self } + pub fn with_fixed_buffer_pool_size_mb(mut self, fixed_buffer_pool_size_mb: usize) -> Self { + self.fixed_buffer_pool_size_mb = fixed_buffer_pool_size_mb; + self + } + #[fastrace::trace] async fn setup_context( &self, @@ -348,6 +355,7 @@ impl InProcessBenchmarkRunner { .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .with_io_mode(self.io_mode) .with_eager_shredding(true) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb) .build(session_config)?; (v.0, Some(v.1)) } @@ -359,6 +367,7 @@ impl InProcessBenchmarkRunner { .with_hydration_policy(Box::new(NoHydration::new())) .with_squeeze_policy(Box::new(TranscodeEvict)) 
.with_io_mode(self.io_mode) + .with_fixed_buffer_pool_size_mb(self.fixed_buffer_pool_size_mb) .build(session_config)?; (v.0, Some(v.1)) } diff --git a/benchmark/src/lib.rs b/benchmark/src/lib.rs index 6e7da983..7f6556d8 100644 --- a/benchmark/src/lib.rs +++ b/benchmark/src/lib.rs @@ -23,7 +23,7 @@ pub mod client_runner; pub mod inprocess_runner; mod manifest; mod observability; -mod tracepoints; +pub mod tracepoints; pub mod utils; pub use client_runner::*; @@ -487,6 +487,81 @@ impl Display for IterationResult { } } +/// Table layout matching [`IterationResult`]'s [`Display`] (borders, row style, disk formatting). +/// When `uring_runnable` is `Some`, includes work-stealing executor `Runnable::run` timing (see storage runner). +pub fn format_storage_iteration_metrics( + iteration: usize, + iteration_wall: Duration, + disk_read: u64, + disk_written: u64, + uring_runnable: Option<(f64, f64)>, +) -> String { + struct StorageIterationTable { + iteration: usize, + iteration_wall_ms: u64, + uring_runnable: Option<(f64, f64)>, + disk_read: u64, + disk_written: u64, + } + + impl std::fmt::Display for StorageIterationTable { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + const INNER: usize = 50; + write_border_top(f, INNER)?; + write_kv_row( + f, + INNER, + "Iteration:", + &format!("{}", self.iteration), + )?; + write_kv_row( + f, + INNER, + "Iteration wall:", + &format!("{} ms", format_number(self.iteration_wall_ms)), + )?; + if let Some((runnable_wall_ms, wall_minus_runnable_ms)) = self.uring_runnable { + write_kv_row( + f, + INNER, + "Runnable wall (sum):", + &format!("{:.3} ms", runnable_wall_ms), + )?; + write_kv_row( + f, + INNER, + "Wall minus runnable:", + &format!("{:.3} ms", wall_minus_runnable_ms), + )?; + } + write_border_sep(f, INNER)?; + write_kv_row( + f, + INNER, + "Disk (Read/Write):", + &format!( + "{} / {}", + format_bytes(self.disk_read), + format_bytes(self.disk_written) + ), + )?; + write_border_bottom(f, INNER) + } + } + + 
let iteration_wall_ms = (iteration_wall.as_secs_f64() * 1000.0).round() as u64; + format!( + "{}", + StorageIterationTable { + iteration, + iteration_wall_ms, + uring_runnable, + disk_read, + disk_written, + } + ) +} + fn format_number(n: u64) -> String { let s = n.to_string(); let chars: Vec = s.chars().collect(); diff --git a/benchmark/src/storage_runner.rs b/benchmark/src/storage_runner.rs new file mode 100644 index 00000000..2f4f684d --- /dev/null +++ b/benchmark/src/storage_runner.rs @@ -0,0 +1,715 @@ +#![cfg(target_os = "linux")] + +/** + * Benchmark to test the performance of io_uring runtime for clickbench queries. The queries are executed directly + * on a LiquidCache instance to bypass datafusion, which is strongly coupled with tokio. The benchmark is based on + * the arrow benchmark (https://github.com/apache/arrow-rs/blob/main/parquet/benches/arrow_reader_clickbench.rs#L729) + */ +use arrow::array::BooleanArray; +use arrow::buffer::BooleanBuffer; +use clap::Parser; +use datafusion::logical_expr::Operator; +use futures::future::join_all; +use datafusion::physical_plan::PhysicalExpr; +use datafusion::physical_plan::expressions::{BinaryExpr, Column}; +use datafusion::scalar::ScalarValue; +use liquid_cache_common::IoMode; +use liquid_cache_parquet::{SimpleIoContext, WorkStealingUringRuntime}; +use liquid_cache_storage::cache::{ + EntryID, LiquidCache, LiquidCacheBuilder, LiquidPolicy, NoHydration, TranscodeSqueezeEvict, +}; +use logforth::filter::EnvFilter; +use parquet::arrow::{ProjectionMask, arrow_reader::ParquetRecordBatchReaderBuilder}; +use std::fs::create_dir_all; +use std::path::PathBuf; +use std::future::Future; +use std::sync::Arc; +use std::time::Instant; +use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, System}; + +#[derive(Parser)] +#[command(name = "storage_runner")] +struct Args { + /// ClickBench query index (0-based). Only queries with filters are supported (e.g. 1, 10, 19, 20). 
+ #[arg(long)] + query_index: usize, + + /// Number of partitions (tasks per iteration). + #[arg(long)] + partitions: usize, + + /// Worker threads: io_uring work-stealing runtime size, or Tokio worker threads when `--io-mode std-blocking`. + #[arg(long)] + worker_threads: usize, + + /// Number of measured iterations to run once the cache has been populated. + #[arg(long)] + iterations: usize, + + /// Path to hits.parquet. Default: benchmark/clickbench/data/hits.parquet + #[arg(long, default_value = "benchmark/clickbench/data/hits.parquet")] + parquet: PathBuf, + + /// Directory for the liquid-cache storage. Default: `lc_cache_dir` inside the system temp directory (`std::env::temp_dir()`). + #[arg(long)] + cache_dir: Option, + + /// Directory to write flamegraph SVG files to (one per query iteration). + #[arg(long = "flamegraph-dir")] + flamegraph_dir: Option, + + /// IO mode: uring-non-blocking (default) or std-blocking. With std-blocking, partition futures run on a multi-thread Tokio runtime (`worker_threads` workers). + #[arg(long = "io-mode", default_value = "uring-non-blocking")] + io_mode: IoMode, +} + +/// Tracks process disk I/O (bytes read/written) between creation and stop(). 
+struct DiskIoGuard { + system: System, + pid: sysinfo::Pid, + start_read_total: u64, + start_written_total: u64, +} + +impl DiskIoGuard { + fn new() -> Self { + let mut system = System::new(); + let pid = sysinfo::get_current_pid().unwrap(); + system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[pid]), + true, + ProcessRefreshKind::nothing().with_disk_usage(), + ); + let p = system.process(pid).unwrap(); + let du = p.disk_usage(); + Self { + system, + pid, + start_read_total: du.total_read_bytes, + start_written_total: du.total_written_bytes, + } + } + + fn stop(mut self) -> (u64, u64) { + self.system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[self.pid]), + true, + ProcessRefreshKind::nothing().with_disk_usage(), + ); + if let Some(p) = self.system.process(self.pid) { + let du = p.disk_usage(); + ( + du.total_read_bytes.saturating_sub(self.start_read_total), + du.total_written_bytes + .saturating_sub(self.start_written_total), + ) + } else { + (0, 0) + } + } +} + +/// ClickBench query descriptor: filter column(s), optional projection column(s), and predicate expression(s). +/// Each predicate is evaluated on a single column (column index 0 in the cached array). +#[derive(Clone)] +struct FilterQuery { + /// Column names to load and cache for filtering (in schema order). + filter_columns: Vec<&'static str>, + /// Column names to load when there are no predicates (projection-only / full-scan queries). + projection_columns: Vec<&'static str>, + /// One predicate per filter column; each expects Column(0) op Literal. Empty for projection-only. + predicates: Vec>, + /// Number of expected rows in result + expected_row_count: usize, +} + +impl FilterQuery { + /// Columns to load into cache: filter_columns when present, else projection_columns. 
+ fn columns_to_load(&self) -> &[&'static str] { + if self.filter_columns.is_empty() { + &self.projection_columns + } else { + &self.filter_columns + } + } +} + +/// Builds a 43-slot table indexed by ClickBench query number; slots without a supported filter query stay `None`. +fn all_filter_queries() -> Vec> { + use datafusion::physical_plan::expressions::Literal as Lit; + let col = || Arc::new(Column::new("col", 0)) as Arc; + + let mut q: Vec> = (0..43).map(|_| None).collect(); + + // Q1: AdvEngineID <> 0 + q[1] = Some(FilterQuery { + filter_columns: vec!["AdvEngineID"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::UInt64(Some(0)))), + ))], + expected_row_count: 3312, + }); + + // Q10: MobilePhoneModel <> '' + q[10] = Some(FilterQuery { + filter_columns: vec!["MobilePhoneModel"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 34276, + }); + + // Q12: SearchPhrase <> '' + q[12] = Some(FilterQuery { + filter_columns: vec!["SearchPhrase"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 131559, + }); + + // Q19: UserID = 3233473875476175636 (value that exists in hits_1) + q[19] = Some(FilterQuery { + filter_columns: vec!["UserID"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt64(Some(3233473875476175636)))), + ))], + expected_row_count: 4, + }); + + // Q20: URL LIKE '%google%' + // NOTE(review): 99997497 exceeds Q27's expected count of rows with a non-empty URL (999978), yet every '%google%' match has a non-empty URL - confirm this value. + q[20] = Some(FilterQuery { + filter_columns: vec!["URL"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::LikeMatch, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::from("%google%"))))) + ))], + expected_row_count: 99997497, + }); + + // Q27: URL <> '' + q[27] = Some(FilterQuery { + filter_columns: vec!["URL"], + 
projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 999978, + }); + + // Q28: Referer <> '' + q[28] = Some(FilterQuery { + filter_columns: vec!["Referer"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 925813, + }); + + // Q30: SearchPhrase <> '' + q[30] = Some(FilterQuery { + filter_columns: vec!["SearchPhrase"], + projection_columns: vec![], + predicates: vec![Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + ))], + expected_row_count: 131559, + }); + + // Q36: predicates pair with filter_columns by position: CounterID = 62, EventDate = 0, DontCountHits = 0, IsRefresh = 0, URL <> '' + // NOTE(review): upstream ClickBench Q36 bounds EventDate by a date range rather than `= 0` - confirm the EventDate predicate is intended. + q[36] = Some(FilterQuery { + filter_columns: vec![ + "CounterID", + "EventDate", + "DontCountHits", + "IsRefresh", + "URL", + ], + projection_columns: vec![], + predicates: vec![ + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt32(Some(62)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::Int16(Some(0)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt8(Some(0)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::Eq, + Arc::new(Lit::new(ScalarValue::UInt8(Some(0)))), + )), + Arc::new(BinaryExpr::new( + col(), + Operator::NotEq, + Arc::new(Lit::new(ScalarValue::Utf8(Some(String::new())))), + )), + ], + expected_row_count: 181198, + }); + + q +} + +/// Partition futures run either on the work-stealing io_uring executor or, for `std-blocking` IO, on Tokio. 
+enum StorageBenchRuntime { + Uring(WorkStealingUringRuntime), + Tokio(tokio::runtime::Runtime), +} + +impl StorageBenchRuntime { + fn new(io_mode: IoMode, num_workers: usize) -> Self { + let num_workers = num_workers.max(1); + if io_mode == IoMode::StdBlocking { + Self::Tokio( + tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers) + .thread_name("storage-bench-tokio") + .enable_all() + .build() + .expect("build tokio multi-thread runtime"), + ) + } else { + Self::Uring(WorkStealingUringRuntime::new(num_workers)) + } + } + + fn run_to_completion(&self, future: F) -> T + where + F: Future + Send + 'static, + T: Send + 'static, + { + match self { + Self::Uring(e) => e.run_to_completion(future), + Self::Tokio(rt) => rt.block_on(future), + } + } + + fn total_runnable_wall_nanos(&self) -> Option { + match self { + Self::Uring(e) => Some(e.total_runnable_wall_nanos()), + Self::Tokio(_) => None, + } + } +} + +fn run_single_iter( + num_batches: usize, + num_partitions: usize, + query: &FilterQuery, + storage: Arc, + entry_ids: &Vec, + batch_lengths: &Vec, + runtime: &StorageBenchRuntime, +) -> (std::time::Duration, usize) { + // 2) Partition batch indices evenly across workers. 
+ let batches_per_partition = num_batches / num_partitions; + let num_cols = query.columns_to_load().len(); + + // 3) Create futures for every partition (only for partitions that have at least one batch) + let mut futures = Vec::new(); + let mut start_batch_idx = 0; + for p in 0..num_partitions { + let batch_count = if p < num_batches % num_partitions { + batches_per_partition + 1 + } else { + batches_per_partition + }; + let end = (start_batch_idx + batch_count).min(num_batches); + if start_batch_idx >= end { + continue; + } + let storage_clone = Arc::clone(&storage); + let batch_range = start_batch_idx..end; + start_batch_idx = end; + let predicates = query.predicates.iter().map(Arc::clone).collect::>(); + let entry_ids_clone = entry_ids.clone(); + let batch_lengths_clone = batch_lengths.clone(); + futures.push(run_partition( + storage_clone, + batch_range, + num_cols, + predicates, + entry_ids_clone, + batch_lengths_clone, + )); + } + let num_tasks = futures.len(); + + let start = Instant::now(); + let total_rows = match runtime { + StorageBenchRuntime::Uring(executor) => { + let receiver = executor.spawn_many(&mut futures); + let mut tasks_completed = 0; + let mut total_rows = 0; + while tasks_completed < num_tasks { + total_rows += receiver.recv().expect("Failed to receive result"); + tasks_completed += 1; + } + total_rows + } + StorageBenchRuntime::Tokio(rt) => { + let handles: Vec<_> = futures.into_iter().map(|f| rt.spawn(f)).collect(); + rt.block_on(join_all(handles)) + .into_iter() + .map(|r| r.expect("partition task failed")) + .sum::() + } + }; + let elapsed = start.elapsed(); + if total_rows != query.expected_row_count { + log::warn!( + "Expected row count doesn't match. 
Actual: {}, expected: {}", + total_rows, + query.expected_row_count + ); + } + log::info!( + "Partitions: {}, wall: {:.3}s, total rows: {}", + num_partitions, + elapsed.as_secs_f64(), + total_rows + ); + (elapsed, total_rows) +} + +fn write_flamegraph( + profiler: &pprof::ProfilerGuard<'_>, + flamegraph_dir: &std::path::Path, + query_index: usize, + iteration: u32, +) -> Result<(), Box> { + let report = profiler.report().build()?; + let mut svg_data = Vec::new(); + report.flamegraph(&mut svg_data)?; + create_dir_all(flamegraph_dir)?; + + let now = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)?; + let secs = now.as_secs(); + let hour = (secs / 3600) % 24; + let minute = (secs / 60) % 60; + let second = secs % 60; + + let filename = + format!("{hour:02}h{minute:02}m{second:02}s_q{query_index:02}_i{iteration:02}.svg"); + let filepath = flamegraph_dir.join(filename); + std::fs::write(&filepath, svg_data)?; + log::info!("Flamegraph written to: {}", filepath.display()); + Ok(()) +} + +fn run_bench( + cache_dir: PathBuf, + parquet_path: PathBuf, + query: &FilterQuery, + query_index: usize, + num_partitions: usize, + num_iter: usize, + num_workers: usize, + flamegraph_dir: Option, + io_mode: IoMode, +) { + let _ = std::fs::create_dir_all(&cache_dir); + let fb_pool_size = if io_mode == IoMode::UringNonBlocking { + 4096 + } else { + 0 + }; + let io_context = Arc::new(SimpleIoContext::new( + cache_dir.clone(), + io_mode, + fb_pool_size, + )); + let storage = LiquidCacheBuilder::new() + .with_io_context(io_context) + .with_cache_dir(cache_dir) + .with_max_cache_bytes(256 * 1024 * 1024) + .with_cache_policy(Box::new(LiquidPolicy::new())) + .with_hydration_policy(Box::new(NoHydration::new())) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) + .build(); + + let runtime = StorageBenchRuntime::new(io_mode, num_workers); + let storage_clone = storage.clone(); + let query_owned = query.clone(); + let (num_batches, entry_ids, batch_lengths) = 
runtime.run_to_completion(async move { + // 1) Load parquet into record batches (filter columns only) and insert into cache. + let (entry_ids, batch_lengths) = + load_and_insert(storage_clone.clone(), parquet_path, &query_owned).await; + let num_cols_loaded = query_owned.columns_to_load().len(); + let num_batches = entry_ids.len() / num_cols_loaded; + log::info!( + "Populated cache: {} batches, {} columns, {} entries", + num_batches, + num_cols_loaded, + entry_ids.len() + ); + + storage_clone.flush_all_to_disk().await; + (num_batches, entry_ids, batch_lengths) + }); + + // Baseline after cache load so iteration deltas exclude setup work on the same workers. + let mut prev_runnable_wall_total_ns = runtime.total_runnable_wall_nanos().unwrap_or(0); + + for i in 0..num_iter { + liquid_cache_benchmarks::tracepoints::iteration_start(query_index as u32, i as u32); + let io_guard = DiskIoGuard::new(); + let profiler_guard = if flamegraph_dir.is_some() { + Some( + pprof::ProfilerGuardBuilder::default() + .frequency(500) + .blocklist(&["libpthread.so.0", "libm.so.6", "libgcc_s.so.1"]) + .build() + .expect("pprof ProfilerGuardBuilder::build"), + ) + } else { + None + }; + + let (iter_wall, _rows) = run_single_iter( + num_batches, + num_partitions, + &query, + storage.clone(), + &entry_ids, + &batch_lengths, + &runtime, + ); + + let uring_runnable = match runtime.total_runnable_wall_nanos() { + Some(total_now) => { + let runnable_this_iter_ns = + total_now.saturating_sub(prev_runnable_wall_total_ns); + prev_runnable_wall_total_ns = total_now; + let wall_ns = iter_wall.as_nanos() as u64; + // Summed across workers; can exceed wall clock when workers run in parallel. 
+ let wall_minus_runnable_sum_ns = wall_ns.saturating_sub(runnable_this_iter_ns); + Some(( + runnable_this_iter_ns as f64 / 1e6, + wall_minus_runnable_sum_ns as f64 / 1e6, + )) + } + None => None, + }; + + let (disk_read, disk_written) = io_guard.stop(); + log::info!( + "{}", + liquid_cache_benchmarks::format_storage_iteration_metrics( + i, + iter_wall, + disk_read, + disk_written, + uring_runnable, + ) + ); + + if let (Some(profiler), Some(dir)) = (profiler_guard, flamegraph_dir.as_ref()) { + if let Err(e) = write_flamegraph(&profiler, dir, query_index, i as u32) { + log::warn!("Failed to write flamegraph for iteration {}: {}", i, e); + } + } + } +} + +async fn run_partition( + storage: Arc, + batch_range: std::ops::Range, + num_cols: usize, + predicates: Vec>, + entry_ids: Vec, + batch_lengths: Vec, +) -> usize { + let mut total_matched = 0usize; + + if predicates.is_empty() { + // No predicates: full scan, count all rows in the partition. + for batch_idx in batch_range.clone() { + let entry_idx = batch_idx * num_cols; + let entry_id = &entry_ids[entry_idx]; + let _result = storage.get(entry_id).await; + total_matched += batch_lengths[entry_idx]; + } + return total_matched; + } + + for batch_idx in batch_range { + let mut combined_mask: Option = None; + for (col_idx, pred) in predicates.iter().enumerate() { + let entry_idx = batch_idx * num_cols + col_idx; + let entry_id = &entry_ids[entry_idx]; + let len = batch_lengths[entry_idx]; + let selection = BooleanBuffer::new_set(len); + let result = storage + .eval_predicate(entry_id, pred) + .with_selection(&selection) // Is this necessary? + .await; + match result { + Some(Ok(mask)) => { + combined_mask = Some(match combined_mask.take() { + Some(prev) => arrow::compute::and(&prev, &mask).unwrap(), + None => mask, + }); + } + Some(Err(_)) | None => { + // Predicate could not be evaluated in cache; treat as no match for this batch. 
+ combined_mask = Some(BooleanArray::from(vec![false; len])); + } + } + } + if let Some(m) = combined_mask { + total_matched += m.true_count(); + } + } + total_matched +} + +/// Load parquet with projection = query.columns_to_load(), insert each (batch, column) into cache. +/// Returns (entry_ids in order batch0_col0, batch0_col1, ..., batch1_col0, ...), (length per entry). +async fn load_and_insert( + storage: Arc, + parquet_path: PathBuf, + query: &FilterQuery, +) -> (Vec, Vec) { + let columns_to_load = query.columns_to_load(); + assert!( + !columns_to_load.is_empty(), + "query must have filter_columns or projection_columns" + ); + + let Ok(parquet_file) = std::fs::File::open(parquet_path.clone()) else { + panic!("Failed to open {:?}", parquet_path.to_str()); + }; + + let builder = ParquetRecordBatchReaderBuilder::try_new(parquet_file).unwrap(); + let schema = builder.parquet_schema(); + let root_fields = schema.root_schema().get_fields(); + let projection_root_indices: Vec = columns_to_load + .iter() + .map(|name| { + root_fields + .iter() + .position(|f| f.name() == *name) + .unwrap_or_else(|| panic!("parquet schema has no column '{name}'")) + }) + .collect(); + let projection_mask = ProjectionMask::roots(schema, projection_root_indices); + + let mut reader = builder + .with_batch_size(8192) + .with_projection(projection_mask) + .build() + .unwrap(); + + let num_cols = columns_to_load.len(); + let mut entry_ids = Vec::new(); + let mut batch_lengths = Vec::new(); + let mut batch_idx = 0usize; + + while let Some(batch_res) = reader.next() { + let batch = batch_res.expect("parquet read batch"); + let nrows = batch.num_rows(); + for col_idx in 0..num_cols { + let entry_id = EntryID::from(batch_idx * num_cols + col_idx); + let array = batch.column(col_idx).clone(); + storage.insert(entry_id, array).await; + entry_ids.push(entry_id); + batch_lengths.push(nrows); + } + batch_idx += 1; + } + + (entry_ids, batch_lengths) +} + +fn setup_logging() { + let mut builder = 
logforth::builder(); + builder = builder.dispatch(|d| { + d.filter(EnvFilter::from_default_env()) + .append(logforth::append::Stdout::default()) + }); + builder.apply(); +} + +fn main() { + let args = Args::parse(); + setup_logging(); + + let queries = all_filter_queries(); + let query = match args.query_index { + i if i < queries.len() => match &queries[i] { + Some(q) => q, + None => { + eprintln!( + "Query index {} has no filters. Only filter queries are supported. \ + Try e.g. 1, 10, 12, 19, 27, 28, 30, 36.", + args.query_index + ); + std::process::exit(1); + } + }, + _ => { + eprintln!( + "Query index {} out of range (0..{}).", + args.query_index, + queries.len() + ); + std::process::exit(1); + } + }; + + if args.partitions == 0 { + eprintln!("partitions must be >= 1."); + std::process::exit(1); + } + + if !args.parquet.exists() { + eprintln!( + "Parquet file not found: {}. Download e.g. wget https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet -O {}", + args.parquet.display(), + args.parquet.display() + ); + std::process::exit(1); + } + let cache_dir = args + .cache_dir + .unwrap_or_else(|| std::env::temp_dir().join("lc_cache_dir")); + run_bench( + cache_dir, + args.parquet, + query, + args.query_index, + args.partitions, + args.iterations, + args.worker_threads, + args.flamegraph_dir, + args.io_mode, + ); +} diff --git a/dev/README.md b/dev/README.md index 1991c206..bfecdf63 100644 --- a/dev/README.md +++ b/dev/README.md @@ -22,7 +22,6 @@ LiquidCache exports OpenTelemetry traces. Spin up a Jaeger v2 ```bash docker run \ --name jaeger \ - --replace \ -e COLLECTOR_OTLP_ENABLED=true \ -p 16686:16686 \ -p 4317:4317 \ @@ -30,6 +29,8 @@ docker run \ cr.jaegertracing.io/jaegertracing/jaeger:2.11.0 ``` +If a container named `jaeger` already exists, remove it first: `docker rm -f jaeger` (or `podman rm -f jaeger`). + This image contains the Jaeger v2 distribution. Port 16686 exposes the frontend UI at http://localhost:16686. 
4317 and 4318 expose OTLP over gRPC and HTTP respectively. @@ -76,6 +77,21 @@ This will trace the execution of `iteration = 2` (`arg1 == 2`) and print the `io [512, 1K) 194 |@@@ | ``` +```bash +sudo bpftrace -e ' + usdt:./target/release/in_process:liquid_benchmark:iteration_start /arg1 == 2/ {@enable = 1;} + usdt:./target/release/in_process:liquid_benchmark:iteration_start /arg1 > 2/ {@enable = 0;} + usdt:./target/release/in_process:io_submitted /@enable/ { + @t[arg0] = nsecs; + } + usdt:./target/release/in_process:io_completed /@enable && @t[arg0]/ { + $us = (nsecs - @t[arg0]) / 1000; + @lat = hist($us); + delete(@t[arg0]); + } + ' +``` + If you're using blocking io mode, try this: ```bash sudo bpftrace -e ' diff --git a/examples/example_server.rs b/examples/example_server.rs index 7fcc4c1e..d16d9563 100644 --- a/examples/example_server.rs +++ b/examples/example_server.rs @@ -17,6 +17,7 @@ async fn main() -> Result<(), Box> { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), Some(IoMode::default()), + 0, )?; let flight = FlightServiceServer::new(liquid_cache); diff --git a/src/common/Cargo.toml b/src/common/Cargo.toml index 5c059ccd..18dfbe26 100644 --- a/src/common/Cargo.toml +++ b/src/common/Cargo.toml @@ -12,10 +12,16 @@ arrow-flight = { workspace = true } async-trait = { workspace = true } bytes = { workspace = true } chrono = "0.4.42" +crossbeam = "0.8.4" futures = { workspace = true } +io-uring = "0.7.11" +libc = "0.2.177" +log.workspace = true object_store = { workspace = true } prost = { workspace = true } +rand = "0.9.2" serde = { workspace = true } +tempfile.workspace = true thiserror = "2.0.17" tokio = { workspace = true } url = { workspace = true } diff --git a/src/common/src/io_mode.rs b/src/common/src/io_mode.rs index 747f0d5c..75b11295 100644 --- a/src/common/src/io_mode.rs +++ b/src/common/src/io_mode.rs @@ -26,6 +26,10 @@ pub enum IoMode { #[serde(rename = "uring-blocking")] UringBlocking, + /// Uses an io_uring runtime + 
#[serde(rename = "uring-non-blocking")] + UringNonBlocking, + /// Uses rust's std::fs::File, this is blocking IO. /// On Linux, this is essentially `pread/pwrite` /// This is the default on non-Linux platforms. @@ -57,6 +61,7 @@ impl Display for IoMode { IoMode::StdBlocking => "std-blocking", IoMode::TokioIO => "tokio", IoMode::StdSpawnBlocking => "std-spawn-blocking", + IoMode::UringNonBlocking => "uring-non-blocking", } ) } @@ -75,6 +80,7 @@ impl FromStr for IoMode { "std-blocking" => IoMode::StdBlocking, "tokio" => IoMode::TokioIO, "std-spawn-blocking" => IoMode::StdSpawnBlocking, + "uring-non-blocking" => IoMode::UringNonBlocking, _ => return Err(format!("Invalid IO mode: {s}")), }) } diff --git a/src/common/src/lib.rs b/src/common/src/lib.rs index 05c5d68e..9d9682df 100644 --- a/src/common/src/lib.rs +++ b/src/common/src/lib.rs @@ -5,3 +5,4 @@ pub mod mock_store; pub mod rpc; pub mod utils; pub use io_mode::IoMode; +pub mod memory; diff --git a/src/common/src/memory/arena.rs b/src/common/src/memory/arena.rs new file mode 100644 index 00000000..09eab355 --- /dev/null +++ b/src/common/src/memory/arena.rs @@ -0,0 +1,131 @@ +use std::{io, os::raw::c_void, ptr::null_mut}; + +use io_uring::IoUring; + +use crate::memory::{ + page::Slice, + pool::{FIXED_BUFFER_BITS, FIXED_BUFFER_SIZE_BYTES}, + segment::{SEGMENT_SIZE, SEGMENT_SIZE_BITS, Segment}, +}; + +pub struct Arena { + size: usize, + slices: Vec, + used_bitmap: Vec, + /** + * Segments need to be aligned to 32MB boundaries. 
Hence the first segment's starting address + * could be different from the starting address of the allocated memory + */ + aligned_start_ptr: *mut u8, + actual_start_ptr: *mut u8, + buffers_registered: bool, +} + +unsafe impl Send for Arena {} +unsafe impl Sync for Arena {} + +impl Arena { + pub fn new(capacity: usize) -> Arena { + let mem_start = Self::allocate_memory_from_os(capacity); + assert_ne!(mem_start, null_mut()); + let mem_end = mem_start.wrapping_add(capacity); + let mut ptr_aligned = (mem_start as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; + if ptr_aligned != (mem_start as usize) { + ptr_aligned += SEGMENT_SIZE; + } + let mut slice_start = ptr_aligned; + let mut slices = Vec::new(); + while slice_start + SEGMENT_SIZE <= mem_end as usize { + slices.push(Slice { + ptr: slice_start as *mut u8, + size: SEGMENT_SIZE, + }); + slice_start += SEGMENT_SIZE; + } + let mut used_bitmap = Vec::new(); + used_bitmap.resize(slices.len(), 0); + + Arena { + size: capacity, + slices: slices, + used_bitmap: used_bitmap, + aligned_start_ptr: ptr_aligned as *mut u8, + actual_start_ptr: mem_start, + buffers_registered: false, + } + } + + fn allocate_memory_from_os(capacity: usize) -> *mut u8 { + let prot = libc::PROT_READ | libc::PROT_WRITE; + let flags = libc::MAP_ANONYMOUS | libc::MAP_PRIVATE; + unsafe { libc::mmap64(null_mut(), capacity, prot, flags, -1, 0) as *mut u8 } + } + + pub fn allocate_segment(self: &mut Self, size: usize) -> Option<*mut Segment> { + let num_slices = (size + SEGMENT_SIZE - 1) / SEGMENT_SIZE; + let mut contiguous = 0; + let mut result: i32 = -1; + + for index in 0..self.used_bitmap.len() { + let bit = self.used_bitmap[index]; + if bit == 0 { + contiguous += 1; + if contiguous == num_slices { + result = (index + 1 - contiguous) as i32; + break; + } + } else { + contiguous = 0; + } + } + if result == -1 { + return None; + } + for i in 0..contiguous { + self.used_bitmap[result as usize + i] = 1; + } + let combined_slice = Slice { + ptr: 
self.slices[result as usize].ptr, + size: num_slices * SEGMENT_SIZE, + }; + Some(Segment::new_from_slice(combined_slice)) + } + + pub(crate) fn start_ptr(self: &Self) -> *mut u8 { + self.aligned_start_ptr + } + + pub(crate) fn retire_segment(self: &mut Self, segment: *mut Segment) { + debug_assert!((self.slices[0].ptr as usize) <= segment as usize); + let segment_idx = (segment as usize - self.slices[0].ptr as usize) / SEGMENT_SIZE; + self.used_bitmap[segment_idx] = 0; + } + + pub(crate) fn register_buffers_with_ring(self: &mut Self, ring: &IoUring) -> io::Result<()> { + let usable_bytes = self + .size + .saturating_sub(self.aligned_start_ptr as usize - self.actual_start_ptr as usize); + let num_buffers = usable_bytes >> FIXED_BUFFER_BITS; + let mut buffers = Vec::::new(); + buffers.reserve(num_buffers); + let mut base_ptr = self.aligned_start_ptr; + for _i in 0..num_buffers { + buffers.push(libc::iovec { + iov_base: base_ptr as *mut std::ffi::c_void, + iov_len: FIXED_BUFFER_SIZE_BYTES, + }); + base_ptr = base_ptr.wrapping_add(FIXED_BUFFER_SIZE_BYTES); + } + let res = unsafe { ring.submitter().register_buffers(&buffers) }; + self.buffers_registered = res.is_ok(); + res + } +} + +impl Drop for Arena { + fn drop(self: &mut Self) { + unsafe { + libc::munmap(self.actual_start_ptr as *mut c_void, self.size); + } + } +} diff --git a/src/common/src/memory/mod.rs b/src/common/src/memory/mod.rs new file mode 100644 index 00000000..72ab6966 --- /dev/null +++ b/src/common/src/memory/mod.rs @@ -0,0 +1,5 @@ +mod arena; +pub mod page; +pub mod pool; +mod segment; +mod tcache; diff --git a/src/common/src/memory/page.rs b/src/common/src/memory/page.rs new file mode 100644 index 00000000..edcb809c --- /dev/null +++ b/src/common/src/memory/page.rs @@ -0,0 +1,230 @@ +use std::{ + ptr::null_mut, + sync::atomic::{AtomicU8, Ordering}, + u8, +}; + +use crossbeam::utils::CachePadded; + +use crate::memory::tcache::MIN_SIZE_FROM_PAGES; + +pub const PAGE_SIZE: usize = 64 << 10; // 64KB 
+const MAX_BLOCKS_PER_PAGE: usize = PAGE_SIZE / MIN_SIZE_FROM_PAGES; + +struct LocalFreeList { + head: u8, + tail: u8, + num_blocks: u8, + /** + * Stores the block indices within the page for a compact representation, rather than storing pointers. + * That is, if block index=i, it represents ith block from the start of the page. + */ + blocks: [u8; MAX_BLOCKS_PER_PAGE], +} + +impl LocalFreeList { + fn empty() -> LocalFreeList { + LocalFreeList { + head: 0, + tail: 0, + num_blocks: 0, + blocks: [0; MAX_BLOCKS_PER_PAGE], + } + } + + fn new(num_blocks: usize) -> LocalFreeList { + debug_assert!(num_blocks <= MAX_BLOCKS_PER_PAGE); + let mut blocks = [0u8; MAX_BLOCKS_PER_PAGE]; + for i in 0..num_blocks { + blocks[i] = i as u8; + } + LocalFreeList { + head: 0, + tail: num_blocks as u8, + num_blocks: num_blocks as u8, + blocks: blocks, + } + } + + fn push(&mut self, block: u8) { + debug_assert!(self.tail.wrapping_sub(self.head) < self.num_blocks); + self.blocks[self.tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] = block; + self.tail = self.tail.wrapping_add(1); + } + + fn is_empty(&self) -> bool { + self.head == self.tail + } + + fn pop(&mut self) -> Option { + if self.head == self.tail { + return None; + } + let ret = self.blocks[self.head as usize & (MAX_BLOCKS_PER_PAGE - 1)]; + self.head = self.head.wrapping_add(1); + Some(ret) + } +} + +struct MPSCQueue { + head: u8, + tail: CachePadded, + num_blocks: u8, + blocks: [u8; MAX_BLOCKS_PER_PAGE], +} + +impl MPSCQueue { + const HAZARD: u8 = u8::MAX; + + fn new(num_blocks: usize) -> MPSCQueue { + debug_assert!(num_blocks <= MAX_BLOCKS_PER_PAGE); + MPSCQueue { + head: 0, + num_blocks: num_blocks as u8, + tail: CachePadded::new(AtomicU8::new(0)), + blocks: [Self::HAZARD; MAX_BLOCKS_PER_PAGE], + } + } + + fn push(&mut self, block: u8) { + loop { + let cur_tail = self.tail.load(Ordering::Relaxed); + assert!(cur_tail.wrapping_sub(self.head) < self.num_blocks); + let new_tail = cur_tail.wrapping_add(1); + if self + .tail + 
.compare_exchange(cur_tail, new_tail, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + unsafe { + std::ptr::write_volatile( + &mut self.blocks[cur_tail as usize & (MAX_BLOCKS_PER_PAGE - 1)] as *mut u8, + block, + ); + } + return; + } + } + } + + fn pop(&mut self) -> Option { + if self.head == self.tail.load(Ordering::Relaxed) { + return None; + } + let idx = self.head as usize & (MAX_BLOCKS_PER_PAGE - 1); + loop { + let ret = unsafe { std::ptr::read_volatile(&self.blocks[idx] as *const u8) }; + /* + * The hazard value prevents the following race condition: + * The producer has reserved a slot, but before it can write to the slot, the consumer calls pop. + */ + if ret != Self::HAZARD { + unsafe { + std::ptr::write_volatile(&mut self.blocks[idx] as *mut u8, Self::HAZARD); + } + self.head = self.head.wrapping_add(1); + return Some(ret); + } + } + } +} + +pub struct Page { + pub(crate) block_size: usize, // Size of objects that are being allocated to this page + free_list: LocalFreeList, + pub(crate) used: usize, + thread_free_list: MPSCQueue, + pub(crate) slice_count: usize, // No. 
of pages in the slice containing this page + pub(crate) slice_offset: usize, // Offset of this page from the start of this slice + pub(crate) page_start: *mut u8, + // Next and previous pages in the span which is a doubly-linked list + pub(crate) next_page: *mut Page, + pub(crate) previous_page: *mut Page, +} + +impl Page { + pub fn from_slice(slice: Slice) -> Page { + Page { + block_size: 0usize, + free_list: LocalFreeList::empty(), + used: 0, + thread_free_list: MPSCQueue::new(PAGE_SIZE / MIN_SIZE_FROM_PAGES), + slice_count: 1, + slice_offset: 0, + page_start: slice.ptr, + next_page: null_mut(), + previous_page: null_mut(), + } + } + + pub fn set_block_size(self: &mut Self, block_size: usize) { + self.block_size = block_size; + let num_blocks = (self.slice_count * PAGE_SIZE) / block_size; + self.free_list = LocalFreeList::new(num_blocks); + } + + #[inline] + pub fn get_free_block(self: &mut Self) -> *mut u8 { + let block_idx = self.free_list.pop(); + let block_idx = match block_idx { + Some(i) => i, + None => return null_mut(), + }; + self.used += 1; + unsafe { self.page_start.add(block_idx as usize * self.block_size) } + } + + #[inline(always)] + pub fn is_full(self: &Self) -> bool { + self.free_list.is_empty() + } + + #[inline(always)] + pub fn is_unused(self: &Self) -> bool { + self.used == 0 + } + + /// Pointer freed on the same core + #[inline(always)] + pub fn free(self: &mut Self, ptr: *mut u8) { + let block_idx = (ptr as usize - self.page_start as usize) / self.block_size; + self.free_list.push(block_idx as u8); + self.used -= 1; + } + + /// Pointer freed on a different core + #[inline(always)] + pub(crate) fn foreign_free(self: &mut Self, ptr: *mut u8) { + let blk_idx = unsafe { ptr.offset_from(self.page_start) as usize / self.block_size }; + self.thread_free_list.push(blk_idx as u8); + } + + /// Collect pointers freed by other threads + #[inline] + pub(crate) fn collect_foreign_frees(self: &mut Self) { + while let Some(blk) = self.thread_free_list.pop() 
{ + self.free_list.push(blk as u8); + self.used -= 1; + } + } +} + +pub struct Slice { + pub ptr: *mut u8, + pub size: usize, +} + +impl Slice { + pub fn split(self: Self) -> (Slice, Slice) { + let new_size = self.size >> 1; + let slice1 = Slice { + ptr: self.ptr, + size: new_size, + }; + let slice2 = Slice { + ptr: self.ptr.wrapping_add(new_size), + size: new_size, + }; + (slice1, slice2) + } +} diff --git a/src/common/src/memory/pool.rs b/src/common/src/memory/pool.rs new file mode 100644 index 00000000..1640f8d4 --- /dev/null +++ b/src/common/src/memory/pool.rs @@ -0,0 +1,514 @@ +extern crate io_uring; + +use core::slice; +use std::{ + cmp::min, + sync::{ + Arc, Mutex, OnceLock, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, +}; + +use futures::io; +use io_uring::IoUring; + +use crate::memory::{ + arena::Arena, + segment::Segment, + tcache::{TCache, TCacheStats}, +}; + +static FIXED_BUFFER_POOL: OnceLock = OnceLock::new(); + +pub const FIXED_BUFFER_SIZE_BYTES: usize = 1 << 20; +pub const FIXED_BUFFER_BITS: u32 = FIXED_BUFFER_SIZE_BYTES.trailing_zeros(); + +#[derive(Debug)] +pub struct FixedBuffer { + pub ptr: *mut u8, + pub buf_id: usize, + pub bytes: usize, +} + +#[derive(Debug)] +pub struct FixedBufferAllocation { + pub ptr: *mut u8, + pub size: usize, +} + +unsafe impl Send for FixedBufferAllocation {} + +impl AsRef<[u8]> for FixedBufferAllocation { + fn as_ref(&self) -> &[u8] { + unsafe { slice::from_raw_parts(self.ptr, self.size) } + } +} + +impl Drop for FixedBufferAllocation { + fn drop(&mut self) { + FixedBufferPool::free(self.ptr); + } +} + +pub struct FixedBufferPool { + local_caches: Vec>, + arena: Arc>, + start_ptr: *mut u8, + capacity: usize, + registered: AtomicBool, // Whether buffers have been registered + foreign_free: AtomicU64, +} + +unsafe impl Send for FixedBufferPool {} + +unsafe impl Sync for FixedBufferPool {} + +impl FixedBufferPool { + fn new(capacity_mb: usize) -> FixedBufferPool { + log::info!( + "Initializing fixed buffer pool 
with capacity: {} MB", + capacity_mb + ); + let num_cpus = std::thread::available_parallelism().unwrap(); + let capacity = capacity_mb << 20; + let arena = Self::allocate_arena(capacity.clone()); + let start_ptr = { + let guard = arena.try_lock().unwrap(); + guard.start_ptr() + }; + let mut local_caches = Vec::>::new(); + for i in 0..num_cpus.get() { + local_caches.push(Mutex::new(TCache::new(arena.clone(), i))); + } + FixedBufferPool { + local_caches, + arena, + start_ptr, + capacity, + registered: AtomicBool::new(false), + foreign_free: AtomicU64::new(0), + } + } + + pub fn allocate_arena(capacity: usize) -> Arc> { + Arc::new(Mutex::new(Arena::new(capacity))) + } + + pub fn init(capacity_mb: usize) { + FIXED_BUFFER_POOL.get_or_init(|| FixedBufferPool::new(capacity_mb)); + } + + fn get_thread_local_cache() -> &'static Mutex { + let cpu = unsafe { libc::sched_getcpu() }; + &FIXED_BUFFER_POOL.get().unwrap().local_caches[cpu as usize] + } + + pub fn malloc(size: usize) -> *mut u8 { + let cpu = unsafe { libc::sched_getcpu() }; + let local_cache = Self::get_thread_local_cache(); + let ptr = local_cache.lock().unwrap().allocate(size); + log::debug!("Allocated pointer: {:?}, size: {}, cpu: {}", ptr, size, cpu); + if ptr.is_null() { + log::info!("Unsuccessful allocation of {} bytes", size); + } + ptr + } + + pub fn register_buffers_with_ring(ring: &IoUring) -> io::Result<()> { + let Some(pool) = FIXED_BUFFER_POOL.get() else { + return Err(io::Error::new( + io::ErrorKind::Other, + "fixed buffer pool not initialized", + )); + }; + let mut arena_guard = pool.arena.lock().unwrap(); + let res = arena_guard.register_buffers_with_ring(ring); + if res.is_ok() { + log::info!("Registered buffers with io-uring ring"); + pool.registered.store(true, Ordering::Relaxed); + } + res + } + + pub(crate) fn get_stats(cpu: usize) -> TCacheStats { + let Some(pool) = FIXED_BUFFER_POOL.get() else { + return TCacheStats::new(); + }; + let tcache = pool.local_caches[cpu].lock().unwrap(); + 
tcache.get_stats() + } + + pub fn get_fixed_buffers(alloc: &FixedBufferAllocation) -> Vec { + let ptr = alloc.ptr; + let size = alloc.size; + let pool = FIXED_BUFFER_POOL.get().unwrap(); + debug_assert!( + ptr >= pool.start_ptr && ptr < pool.start_ptr.wrapping_add(pool.capacity), + "Pointer doesn't lie within the arena" + ); + let mut remaining = size; + let mut vec = Vec::::new(); + let mut current = ptr.clone(); + let mut buffer_id = + (current.wrapping_sub(pool.start_ptr as usize) as usize) >> FIXED_BUFFER_BITS; + while remaining > 0 { + let next_buffer_start = pool + .start_ptr + .wrapping_add((buffer_id + 1) << FIXED_BUFFER_BITS); + let bytes = min(remaining, next_buffer_start as usize - current as usize); + let fb = FixedBuffer { + ptr: current, + buf_id: buffer_id, + bytes: bytes, + }; + current = next_buffer_start; + vec.push(fb); + remaining -= bytes; + buffer_id += 1; + } + vec + } + + #[inline] + pub fn buffers_registered() -> bool { + let pool = FIXED_BUFFER_POOL.get().unwrap(); + pool.registered.load(Ordering::Relaxed) + } + + fn free(ptr: *mut u8) { + let segment_ptr = Segment::get_segment_from_ptr(ptr); + let page_ptr = unsafe { (*segment_ptr).get_page_from_ptr(ptr) }; + let thread_id = unsafe { (*segment_ptr).thread_id }; + log::debug!( + "Freed pointer: {:?}, size: {}, owner thread id: {}", + ptr, + unsafe { (*page_ptr).block_size }, + thread_id + ); + + // If page is local and unused after free, return it to segment + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + if cur_cpu == thread_id { + unsafe { + (*page_ptr).free(ptr); + } + let should_free_page = unsafe { (*page_ptr).is_unused() }; + if should_free_page { + let local_cache = Self::get_thread_local_cache(); + let mut guard = local_cache.lock().unwrap(); + guard.retire_page(page_ptr); + } + } else { + unsafe { + (*page_ptr).foreign_free(ptr); + } + let pool = FIXED_BUFFER_POOL.get().unwrap(); + pool.foreign_free.fetch_add(1, Ordering::Relaxed); + } + } + + pub fn print_stats() { + 
if FIXED_BUFFER_POOL.get().is_none() { + return; + } + let num_cpus = std::thread::available_parallelism().unwrap(); + let mut agg_stats = TCacheStats::new(); + for i in 0..num_cpus.get() { + let stats = Self::get_stats(i); + agg_stats.allocations_from_arena += stats.allocations_from_arena; + agg_stats.allocations_from_pages += stats.allocations_from_pages; + agg_stats.allocations_from_segment += stats.allocations_from_segment; + agg_stats.fast_allocations += stats.fast_allocations; + agg_stats.pages_retired += stats.pages_retired; + agg_stats.segments_retired += stats.segments_retired; + agg_stats.total_segments_allocated += stats.total_segments_allocated; + agg_stats.unsuccessful_allocations += stats.unsuccessful_allocations; + agg_stats.total_allocations += stats.total_allocations; + } + agg_stats.print(); + } +} + +impl Drop for FixedBufferPool { + fn drop(self: &mut Self) { + let arena = self.arena.lock().unwrap(); + drop(arena); + } +} + +mod tests { + #[allow(unused_imports)] + use std::{ + io::Write, + os::fd::AsRawFd, + ptr::{null, null_mut}, + }; + + use bytes::Bytes; + use io_uring::{IoUring, cqueue, opcode, squeue}; + use libc::rlimit; + use rand::RngCore as _; + + use crate::memory::pool::{FIXED_BUFFER_SIZE_BYTES, FixedBufferAllocation, FixedBufferPool}; + + #[test] + fn test_basic_alloc_and_free() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + let mut ptrs = Vec::<*mut u8>::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len - 1] = 1; + ptrs.push(ptr); + } + + for ptr in ptrs { + FixedBufferPool::free(ptr); + } + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + 
assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.fast_allocations, 1); + assert_eq!(stats.pages_retired, 2); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_basic_alloc_and_free_bytes() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096, 4096 * 4]; // 2 different size classes + // let mut ptrs = Vec::<*mut u8>::new(); + let mut bytes_vec = Vec::::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len - 1] = 1; + let alloc = FixedBufferAllocation { + ptr: ptr, + size: len, + }; + let bytes = Bytes::from_owner(alloc); + bytes_vec.push(bytes); + } + + drop(bytes_vec); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.fast_allocations, 1); + assert_eq!(stats.pages_retired, 2); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_free_from_different_thread() { + FixedBufferPool::init(128); + + let buffer_lengths = [4096, 4096 * 4]; + let mut buffers = Vec::<&mut [u8]>::new(); + for len in buffer_lengths { + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len - 1] = 1; + buffers.push(buffer); + } + + std::thread::spawn(move || { + for buffer in buffers { + let ptr = buffer.as_mut_ptr(); + FixedBufferPool::free(ptr); + } + }); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.allocations_from_segment, 1); + 
assert_eq!(stats.fast_allocations, 0); + assert_eq!(stats.pages_retired, 0); + assert_eq!(stats.segments_retired, 0); + } + + #[test] + fn test_large_alloc_and_free() { + FixedBufferPool::init(128); + let len = 1024 * 1024; // 1 MB + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len - 1] = 1; + FixedBufferPool::free(ptr); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 1); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_large_alloc_and_free2() { + FixedBufferPool::init(128); + let len = 3 * 1024 * 1024; // 1 MB + let ptr = FixedBufferPool::malloc(len); + assert_ne!(ptr, null_mut()); + // 4096 byte alignment is necessary for direct IO + assert_eq!(ptr as usize % 4096, 0); + let buffer = unsafe { std::slice::from_raw_parts_mut(ptr, len) }; + buffer[0] = 1; + buffer[len - 1] = 1; + FixedBufferPool::free(ptr); + + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 1); + assert_eq!(stats.segments_retired, 1); + } + + #[test] + fn test_very_large_alloc_fails() { + FixedBufferPool::init(128); + let len = 32 * 1024 * 1024; // 32 MB + let ptr = FixedBufferPool::malloc(len); + + assert_eq!(ptr, null_mut()); + } + + #[test] + fn test_with_uring_basic() { + let mut rlimit = libc::rlimit { + rlim_cur: 0, + rlim_max: 0, + }; + unsafe { + libc::getrlimit(libc::RLIMIT_MEMLOCK, &mut rlimit); + } + assert!( + 64 * 1024 <= rlimit.rlim_max, + "rlimit.MEMLOCK should be at least 64 MB to test the fixed-buffer pool. 
Current rlimit is: {} KB", + rlimit.rlim_max + ); + FixedBufferPool::init(64); + + let mut ring = IoUring::::builder() + .build(32) + .unwrap(); + let res = FixedBufferPool::register_buffers_with_ring(&ring); + assert!(res.is_ok()); + + const LEN: usize = 1 << 20; // 1 MB + let mut file = tempfile::tempfile().unwrap(); + let ptr = FixedBufferPool::malloc(LEN); + assert_ne!(ptr, null_mut()); + let alloc = FixedBufferAllocation { + ptr: ptr, + size: LEN, + }; + let buffers = FixedBufferPool::get_fixed_buffers(&alloc); + assert!(buffers.len() <= (LEN / FIXED_BUFFER_SIZE_BYTES) + 1); + + let mut total = 0; + for fixed_buffer in buffers.iter().as_ref() { + total += fixed_buffer.bytes; + } + assert_eq!(total, LEN); + + let mut random_bytes = [0u8; LEN]; + let mut rng = rand::rng(); + rng.fill_bytes(&mut random_bytes); + let mut res = file.write(&random_bytes); + assert!(res.is_ok(), "Failed to write to temp file"); + assert_eq!(res.unwrap(), LEN, "Failed to write to temp file"); + + let mut file_offset = 0; + for fixed_buffer in buffers.iter().as_ref() { + let sqe = opcode::ReadFixed::new( + io_uring::types::Fd(file.as_raw_fd()), + fixed_buffer.ptr, + fixed_buffer.bytes as u32, + fixed_buffer.buf_id as u16, + ) + .offset(file_offset) + .build(); + file_offset += fixed_buffer.bytes as u64; + let mut sq = ring.submission(); + let res = unsafe { sq.push(&sqe) }; + assert!(res.is_ok(), "Failed to submit to io uring"); + sq.sync(); + } + + res = ring.submit_and_wait(buffers.len()); + assert!(res.is_ok(), "Failed to submit"); + let mut total_bytes_read = 0; + + for _i in 0..buffers.len() { + let mut cq = ring.completion(); + let cqe = cq.next(); + assert!(cqe.is_some()); + let res = cqe.as_ref().unwrap().result(); + assert!( + res > 0, + "Read failed: {}", + std::io::Error::from_raw_os_error(-cqe.unwrap().result()) + ); + total_bytes_read += res as usize; + } + assert_eq!( + total_bytes_read, LEN, + "Expected to read {} bytes, but read {}", + LEN, total_bytes_read + ); + let 
buffer = Bytes::from_owner(alloc); + assert_eq!(buffer, &random_bytes[..]); + } + + #[test] + fn test_edge_case() { + FixedBufferPool::init(128); + let len = 4 * 1024; + let ptr1 = FixedBufferPool::malloc(len); + let ptr2 = FixedBufferPool::malloc(len << 1); + let ptr3 = FixedBufferPool::malloc(len << 2); + let ptr4 = FixedBufferPool::malloc(len << 4); + + FixedBufferPool::free(ptr1); + FixedBufferPool::free(ptr3); + FixedBufferPool::free(ptr2); + FixedBufferPool::free(ptr4); + let cur_cpu = unsafe { libc::sched_getcpu() as usize }; + let stats = FixedBufferPool::get_stats(cur_cpu); + + assert_eq!(stats.allocations_from_arena, 1); + assert_eq!(stats.pages_retired, 4); + assert_eq!(stats.segments_retired, 1); + // assert_eq + } +} diff --git a/src/common/src/memory/segment.rs b/src/common/src/memory/segment.rs new file mode 100644 index 00000000..e5751d18 --- /dev/null +++ b/src/common/src/memory/segment.rs @@ -0,0 +1,181 @@ +use std::ptr::{null_mut, write}; + +use crate::memory::page::{PAGE_SIZE, Page, Slice}; + +pub const SEGMENT_SIZE: usize = 32 * 1024 * 1024; // 32 MB +pub const SEGMENT_SIZE_BITS: usize = SEGMENT_SIZE.ilog2() as usize; + +// The metadata is stored at the beginning of the slice. 
So we don't get the entirety of it for pages +pub const PAGES_PER_SEGMENT: usize = + (SEGMENT_SIZE - 3 * size_of::()) / (PAGE_SIZE + size_of::()); + +pub struct Segment { + pub(crate) allocated: usize, + pub(crate) num_slices: usize, + pub(crate) pages: [Page; PAGES_PER_SEGMENT], + pub(crate) thread_id: usize, +} + +impl Segment { + pub fn new_from_slice(slice: Slice) -> *mut Segment { + let segment_ptr = slice.ptr as *mut Segment; + let segment_end_ptr = slice.ptr.wrapping_add(SEGMENT_SIZE); + let mut start_ptr = unsafe { segment_end_ptr.sub(PAGES_PER_SEGMENT * PAGE_SIZE) }; + unsafe { + let pages_ptr = (*segment_ptr).pages.as_mut_ptr(); + (*segment_ptr).allocated = 0; + (*segment_ptr).num_slices = PAGES_PER_SEGMENT; + for i in 0..PAGES_PER_SEGMENT { + // Use ptr::write after dropping to initialize new Pages + write( + pages_ptr.add(i), + Page::from_slice(Slice { + ptr: start_ptr, + size: PAGE_SIZE, + }), + ); + start_ptr = start_ptr.wrapping_add(PAGE_SIZE); + } + } + segment_ptr + } + + #[inline] + pub fn full(self: &mut Self) -> bool { + self.allocated == self.num_slices + } + + pub fn reset(self: &mut Self) -> () { + for page in self.pages.iter_mut() { + page.slice_count = 1; + page.slice_offset = 0; + } + } + + pub fn get_segment_from_ptr(ptr: *mut u8) -> *mut Segment { + let aligned_ptr = (ptr as usize >> SEGMENT_SIZE_BITS) << SEGMENT_SIZE_BITS; + aligned_ptr as *mut Segment + } + + pub fn get_page_from_ptr(self: &mut Self, ptr: *mut u8) -> *mut Page { + let base_page_ptr = self.pages[0].page_start; + debug_assert!(ptr >= base_page_ptr); + let index = unsafe { ptr.sub(base_page_ptr as usize) as usize / PAGE_SIZE }; + debug_assert!(index < PAGES_PER_SEGMENT); + &mut self.pages[index] as *mut Page + } + + /** + * Split `page` into 2, with the first partition having `num_slices` pages. + * Returns a pointer to the first page of the second slice. 
+ */ + pub fn split_page(self: &mut Self, page: *mut Page, num_slices: usize) -> *mut Page { + debug_assert_ne!(page, null_mut()); + let base_page_ptr = unsafe { (*page).page_start }; + let base_segment_page_ptr = self.pages[0].page_start; + debug_assert!(base_page_ptr >= base_segment_page_ptr); + let index = + unsafe { base_page_ptr.sub(base_segment_page_ptr as usize) as usize / PAGE_SIZE }; + + // Read original slice_count before modifying anything + let original_slice_count = unsafe { (*page).slice_count }; + debug_assert!( + num_slices > 0 && num_slices < original_slice_count, + "num_slices: {}, slice_count: {}", + num_slices, + original_slice_count + ); + debug_assert!(index + original_slice_count <= PAGES_PER_SEGMENT); + // log::info!("[thread_id: {}, segment_id: {}] Splitting page with {} slices", self.thread_id, self.segment_id, original_slice_count); + + /* + * ASSUMPTION: Pointer to the beginning of the slice is passed in. + * We don't need to modify all the intermediate pages while splitting. Only update the following: + * - slice_offset for the first page of each slice (should be 0). + * - slice_offset for the last page of each slice. + * - slice_count for the first page of each slice. 
+ */ + // Use raw pointers to avoid borrow checker issues with multiple mutable references + unsafe { + // Update slice1: the original slice becomes the first part + (*page).slice_offset = 0; + (*page).slice_count = num_slices; + + let pages_ptr = self.pages.as_mut_ptr(); + let last_page_in_slice1 = pages_ptr.add(index + num_slices - 1); + (*last_page_in_slice1).slice_offset = num_slices - 1; + + // Update slice2: the remaining pages become the second slice + let slice2_count = original_slice_count - num_slices; + let slice2 = pages_ptr.add(index + num_slices); + (*slice2).slice_offset = 0; + (*slice2).slice_count = slice2_count; + assert!( + (*slice2).block_size == 0, + "block size: {}", + (*slice2).block_size + ); + + let last_page_in_slice2 = pages_ptr.add(index + original_slice_count - 1); + (*last_page_in_slice2).slice_offset = slice2_count - 1; + + slice2 + } + } + + pub fn coalesce_slices(self: &mut Self, left_slice: &mut Page, right_slice: &mut Page) { + debug_assert!( + left_slice.page_start >= self.pages[0].page_start + && left_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start + ); + debug_assert!( + right_slice.page_start >= self.pages[0].page_start + && right_slice.page_start <= self.pages[PAGES_PER_SEGMENT - 1].page_start + ); + + let left_slice_idx = + (left_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + let right_slice_idx = + (right_slice.page_start as usize - self.pages[0].page_start as usize) / PAGE_SIZE; + debug_assert!( + left_slice_idx + left_slice.slice_count == right_slice_idx, + "left slice count: {}, left slice idx: {}, right slice idx: {}, thread_id: {}", + left_slice.slice_count, + left_slice_idx, + right_slice_idx, + self.thread_id + ); + debug_assert!(right_slice_idx + right_slice.slice_count <= PAGES_PER_SEGMENT); + + /* + * ASSUMPTION: Pointer to the beginning of the slice is passed in free(). + * We don't need to modify all the intermediate pages while coalescing. 
/// Counters describing the activity of a single thread cache.
#[derive(Default, Clone)]
pub(crate) struct TCacheStats {
    // Allocation stats
    pub(crate) total_allocations: usize,
    pub(crate) unsuccessful_allocations: usize,
    pub(crate) total_segments_allocated: usize,
    pub(crate) fast_allocations: usize, // Allocations from self.free_pages
    pub(crate) allocations_from_pages: usize, // Allocations from self.used_pages
    pub(crate) allocations_from_segment: usize,
    pub(crate) allocations_from_arena: usize,

    // Deallocation stats
    pub(crate) pages_retired: usize,
    pub(crate) segments_retired: usize,
    // TODO(): Add more stats such as number of local frees and frees from another thread
}

impl TCacheStats {
    /// Create a stats record with every counter set to zero.
    pub(crate) fn new() -> TCacheStats {
        TCacheStats::default()
    }

    /// Print every counter to stdout, one per line.
    #[allow(unused)]
    pub(crate) fn print(self: &Self) {
        println!("{self}");
    }
}

/// Human-readable report: one `label: value` line per counter, in field order.
/// The last line carries no trailing newline.
impl std::fmt::Display for TCacheStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "Total allocations: {}", self.total_allocations)?;
        writeln!(
            f,
            "Unsuccessful allocations: {}",
            self.unsuccessful_allocations
        )?;
        // This counter was tracked but never reported before.
        writeln!(
            f,
            "Total segments allocated: {}",
            self.total_segments_allocated
        )?;
        writeln!(f, "Fast allocations: {}", self.fast_allocations)?;
        writeln!(f, "Allocations from pages: {}", self.allocations_from_pages)?;
        writeln!(
            f,
            "Allocations from segment: {}",
            self.allocations_from_segment
        )?;
        writeln!(f, "Allocations from arena: {}", self.allocations_from_arena)?;
        writeln!(f, "Pages retired: {}", self.pages_retired)?;
        write!(f, "Segments retired: {}", self.segments_retired)
    }
}
/**
 * Map a slice length (in pages) to the index of the span bin it belongs to.
 *
 * Bin `i` covers slice lengths in `[2^i, 2^(i + 1))`, so the result is
 * `floor(log2(slice_count))`. A bin can therefore contain slices that are
 * shorter than a request which maps to the same bin; code that searches a
 * bin must still compare each candidate's length against the request.
 *
 * Panics if `slice_count` is zero (a slice always spans at least one page).
 */
#[inline]
fn get_span_idx_from_slice_count(slice_count: usize) -> usize {
    debug_assert!(slice_count > 0, "a slice must span at least one page");
    slice_count.ilog2() as usize
}
(*segment).pages }; + let mut slice_idx: usize = 0; + while slice_idx < PAGES_PER_SEGMENT { + if pages[slice_idx].block_size == 0 { + self.remove_slice_from_span(&mut pages[slice_idx]); + } + slice_idx += pages[slice_idx].slice_count; + } + let mut guard = self.arena.lock().unwrap(); + guard.retire_segment(segment); + } + + fn remove_page_from_used_queue(self: &mut Self, page_ptr: *mut Page) { + let mut size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + if size_class >= NUM_SIZE_CLASSES { + size_class = NUM_SIZE_CLASSES; + } + for i in 0..self.used_pages[size_class].len() { + if self.used_pages[size_class][i] == page_ptr { + self.used_pages[size_class].remove(i); + return; + } + } + } + + fn remove_page_from_free_queue(self: &mut Self, page_ptr: *mut Page) { + let size_class = Self::get_size_class(unsafe { (*page_ptr).block_size }); + if size_class < NUM_SIZE_CLASSES && self.free_pages[size_class] == page_ptr { + self.free_pages[size_class] = null_mut(); + } + } + + pub(crate) fn retire_page(self: &mut Self, page: *mut Page) { + assert!(unsafe { (*page).is_unused() }); + self.stats.pages_retired += 1; + self.remove_page_from_used_queue(page); + self.remove_page_from_free_queue(page); + let page_ref = unsafe { &mut (*page) }; + + let segment_ptr = Segment::get_segment_from_ptr(page as *mut u8); + let segment = unsafe { &mut *segment_ptr }; + segment.allocated -= page_ref.slice_count; + if segment.allocated == 0 { + // Return segment to arena + self.retire_segment(segment_ptr); + return; + } + page_ref.block_size = 0; + + let next_slice = page.wrapping_add(page_ref.slice_count); + if next_slice <= (&mut segment.pages[PAGES_PER_SEGMENT - 1]) as *mut Page { + let next_slice_ref = unsafe { &mut (*next_slice) }; + if next_slice_ref.block_size == 0 { + log::debug!( + "[thread_id: {}] Merging released slice with next slice. 
/**
 * Sweep every page tracked by this cache and retire the ones that are unused.
 *
 * For each page, `collect_foreign_frees` is called first (presumably this
 * folds in blocks released by other threads - confirm in `Page`), then
 * `is_unused` decides whether the page is handed to `retire_page`.
 */
fn cleanup_pages(self: &mut Self) {
    // Pass 1: the current "free" page of every size class.
    for i in 0..self.free_pages.len() {
        let page = self.free_pages[i];
        if page != null_mut() {
            unsafe {
                (*page).collect_foreign_frees();
                if (*page).is_unused() {
                    self.retire_page(page);
                    // `retire_page` already clears a matching free-page slot;
                    // this keeps the slot empty regardless.
                    self.free_pages[i] = null_mut();
                }
            }
        }
    }
    // Pass 2: every used-page list, including the trailing large-allocation list.
    for i in 0..self.used_pages.len() {
        let mut page_idx = 0;
        while page_idx < self.used_pages[i].len() {
            let page = self.used_pages[i][page_idx];
            unsafe {
                (*page).collect_foreign_frees();
                if (*page).is_unused() {
                    // `retire_page` removes the page from its used list, which shifts
                    // the next candidate into `page_idx`; the index must not advance.
                    // NOTE(review): if the page were filed under a list that does not
                    // match its block size, nothing is removed and this loop would spin.
                    self.retire_page(page);
                } else {
                    page_idx += 1;
                }
            }
        }
    }
}
(*slice).set_block_size(block_size); + } + return slice; + } + } + null_mut() + } + + fn add_segment_to_spans(self: &mut Self, segment: *mut Segment) { + let segment_ref = unsafe { &mut (*segment) }; + let slice_count = segment_ref.num_slices; + let span_idx = Self::get_span_idx_from_slice_count(slice_count); + let page = &mut segment_ref.pages[0]; + page.slice_count = slice_count; + page.slice_offset = 0; + Self::add_slice_to_span(&mut self.spans[span_idx], page); + + let last_page = &mut segment_ref.pages[PAGES_PER_SEGMENT - 1]; + last_page.slice_offset = PAGES_PER_SEGMENT - 1; + } + + fn allocate_segment_from_arena(self: &mut Self, thread_id: usize) -> bool { + self.stats.total_segments_allocated += 1; + let segment_opt = { + let mut guard = self.arena.lock().unwrap(); + guard.allocate_segment(SEGMENT_SIZE) + }; + if segment_opt.is_none() { + return false; + } + // log::info!("Allocating segment to thread with id: {}", thread_id); + unsafe { + (*segment_opt.unwrap()).thread_id = thread_id; + } + + self.add_segment_to_spans(segment_opt.unwrap()); + true + } + + fn allocate_large(self: &mut Self, size: usize) -> *mut u8 { + // Directly get page from segment + let num_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + let block_size = num_pages * PAGE_SIZE; + let mut free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + self.used_pages[NUM_SIZE_CLASSES].push(free_page); + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block; + } + self.cleanup_pages(); + // Retry after cleanup + free_page = self.find_page_from_spans(num_pages, block_size); + if free_page != null_mut() { + self.stats.allocations_from_segment += 1; + self.used_pages[NUM_SIZE_CLASSES].push(free_page); + let free_block = unsafe { (*free_page).get_free_block() }; + return free_block; + } + + let res = self.allocate_segment_from_arena(self.thread_id); + if !res { + self.stats.unsuccessful_allocations 
/**
 * Allocate a block of at least `size` bytes from this thread cache.
 *
 * Sources are tried in order: the current free page of the size class, a
 * used page that has room again, a free slice from the span bins, and
 * finally a brand-new segment from the arena.
 *
 * Returns a null pointer when the arena cannot provide a new segment
 * (`unsuccessful_allocations` is incremented in that case).
 *
 * Panics if a freshly added segment yields no page for the request.
 */
pub(crate) fn allocate(self: &mut Self, size: usize) -> *mut u8 {
    self.stats.total_allocations = self.stats.total_allocations.wrapping_add(1);
    // Every 128th call (low seven bits of the counter are zero) triggers a sweep.
    if self.stats.total_allocations & 0x7f == 0 {
        // Periodically cleanup pages
        self.cleanup_pages();
    }
    // Requests above one page take the multi-page path.
    if size > PAGE_SIZE {
        return self.allocate_large(size);
    }
    let size_class = Self::get_size_class(size);
    // NOTE(review): holds only if PAGE_SIZE does not exceed the largest entry of
    // SIZE_CLASSES; otherwise the indexing below goes out of bounds - confirm.
    debug_assert!(size_class < NUM_SIZE_CLASSES);

    let block_size = SIZE_CLASSES[size_class];
    let mut free_page = self.free_pages[size_class];
    if !free_page.is_null() {
        debug_assert_eq!(unsafe { (*free_page).block_size }, block_size);
        // allocate from free page
        let page = free_page.clone();
        unsafe {
            if !(*page).is_full() {
                self.stats.fast_allocations += 1;
                return (*page).get_free_block();
            } else {
                // Try collecting frees from other threads and retrying
                (*page).collect_foreign_frees();
                if !(*page).is_full() {
                    // This retry path does not bump any allocation-source counter.
                    return (*page).get_free_block();
                }
                // Still full: park the page on the used list and clear the slot.
                self.used_pages[size_class].push(page);
                self.free_pages[size_class] = null_mut();
            }
        }
    }
    // Look for a used page of this class that has room; on success it becomes
    // the new free page (done inside `find_page_from_used`).
    let block = self.find_page_from_used(size_class);
    if !block.is_null() {
        self.stats.allocations_from_pages += 1;
        return block;
    }
    // Carve a single-page slice out of the span bins.
    free_page = self.find_page_from_spans(1, block_size);
    if free_page != null_mut() {
        self.stats.allocations_from_segment += 1;
        let free_block = unsafe { (*free_page).get_free_block() };
        debug_assert_ne!(free_block, null_mut());
        self.free_pages[size_class] = free_page;
        return free_block;
    }
    // No space available in segments, allocate a new one
    let res = self.allocate_segment_from_arena(self.thread_id);
    if !res {
        self.stats.unsuccessful_allocations += 1;
        return null_mut();
    }
    self.stats.allocations_from_arena += 1;
    free_page = self.find_page_from_spans(1, block_size);
    // A segment was just added to the span bins, so a page is expected here.
    assert_ne!(free_page, null_mut());
    let free_block = unsafe { (*free_page).get_free_block() };
    self.free_pages[size_class] = free_page;
    return free_block;
}
"0.9.2" [target.'cfg(target_os = "linux")'.dependencies] io-uring = "0.7.10" libc = "0.2.177" crossbeam-channel = "0.5.15" crossbeam-queue = "0.3.11" +async-task = "4" +concurrent-queue = "2" +futures-lite = "2" +pin-project-lite = "0.2" +slab = "0.4" +fastrand = "2" [dev-dependencies] tempfile = "3.23.0" divan = "0.1" -rand = "0.9.2" shuttle = "0.8.1" tokio-test = "0.4" serde_json = { workspace = true } diff --git a/src/parquet/bench/filter_pushdown.rs b/src/parquet/bench/filter_pushdown.rs index 5504de22..49d0172b 100644 --- a/src/parquet/bench/filter_pushdown.rs +++ b/src/parquet/bench/filter_pushdown.rs @@ -49,6 +49,7 @@ fn setup_cache(tmp_dir: &TempDir) -> Arc { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let field = Arc::new(Field::new("test_column", DataType::Int32, false)); let schema = Arc::new(Schema::new(vec![field.clone()])); diff --git a/src/parquet/src/cache/mod.rs b/src/parquet/src/cache/mod.rs index 4abc790b..f73ffc5d 100644 --- a/src/parquet/src/cache/mod.rs +++ b/src/parquet/src/cache/mod.rs @@ -248,9 +248,14 @@ impl LiquidCacheParquet { squeeze_policy: Box, hydration_policy: Box, io_mode: IoMode, + fixed_buffer_pool_size_mb: usize, ) -> Self { assert!(batch_size.is_power_of_two()); - let io_context = Arc::new(ParquetIoContext::new(cache_dir.clone(), io_mode)); + let io_context = Arc::new(ParquetIoContext::new( + cache_dir.clone(), + io_mode, + fixed_buffer_pool_size_mb, + )); let cache_storage_builder = LiquidCacheBuilder::new() .with_batch_size(batch_size) .with_max_cache_bytes(max_cache_bytes) @@ -387,6 +392,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let file = cache.register_or_get_file("test".to_string(), schema); file.create_row_group(0, vec![]) diff --git a/src/parquet/src/cache/stats.rs b/src/parquet/src/cache/stats.rs index b82257eb..a1355cd8 100644 --- a/src/parquet/src/cache/stats.rs +++ b/src/parquet/src/cache/stats.rs @@ -193,6 
+193,7 @@ mod tests { Box::new(Evict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let fields: Vec = (0..8) .map(|i| Field::new(format!("test_{i}"), DataType::Int32, false)) diff --git a/src/parquet/src/io/io_backend.rs b/src/parquet/src/io/io_backend.rs index f915ded3..8baeb524 100644 --- a/src/parquet/src/io/io_backend.rs +++ b/src/parquet/src/io/io_backend.rs @@ -16,7 +16,7 @@ pub(super) async fn read( IoMode::Uring => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, false).await + super::io_uring::thread_pool_uring::read(path, range, false, false).await } #[cfg(not(target_os = "linux"))] { @@ -38,7 +38,7 @@ pub(super) async fn read( IoMode::UringDirect => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::read(path, range, true).await + super::io_uring::thread_pool_uring::read(path, range, true, true).await } #[cfg(not(target_os = "linux"))] { @@ -55,6 +55,16 @@ pub(super) async fn read( panic!("io_uring modes are only supported on Linux"); } } + IoMode::UringNonBlocking => { + #[cfg(target_os = "linux")] + { + super::io_uring::work_stealing::read(path, range).await + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } IoMode::UringMultiAsync => { #[cfg(target_os = "linux")] { @@ -79,10 +89,20 @@ pub(super) async fn write( data: Bytes, ) -> Result<(), std::io::Error> { match io_mode { - IoMode::Uring | IoMode::UringDirect => { + IoMode::Uring => { + #[cfg(target_os = "linux")] + { + super::io_uring::thread_pool_uring::write(path, &data, false, false).await + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } + IoMode::UringDirect => { #[cfg(target_os = "linux")] { - super::io_uring::thread_pool_uring::write(path, &data).await + super::io_uring::thread_pool_uring::write(path, &data, true, false).await } #[cfg(not(target_os = "linux"))] { @@ -92,7 +112,7 @@ pub(super) async fn write( 
IoMode::UringShared => { #[cfg(target_os = "linux")] { - super::io_uring::single_uring::write(path, &data).await + super::io_uring::single_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { @@ -102,7 +122,17 @@ pub(super) async fn write( IoMode::UringBlocking => { #[cfg(target_os = "linux")] { - super::io_uring::multi_blocking_uring::write(path, &data) + super::io_uring::multi_blocking_uring::write(path, &data, false) + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_uring modes are only supported on Linux"); + } + } + IoMode::UringNonBlocking => { + #[cfg(target_os = "linux")] + { + super::io_uring::work_stealing::write(path, &data).await } #[cfg(not(target_os = "linux"))] { @@ -112,7 +142,7 @@ pub(super) async fn write( IoMode::UringMultiAsync => { #[cfg(target_os = "linux")] { - super::io_uring::multi_async_uring::write(path, &data).await + super::io_uring::multi_async_uring::write(path, &data, false).await } #[cfg(not(target_os = "linux"))] { diff --git a/src/parquet/src/io/io_uring/local_runtime.rs b/src/parquet/src/io/io_uring/local_runtime.rs new file mode 100644 index 00000000..a7bc5b70 --- /dev/null +++ b/src/parquet/src/io/io_uring/local_runtime.rs @@ -0,0 +1,502 @@ +use std::{ + cell::RefCell, + collections::VecDeque, + fs::OpenOptions, + ops::Range, + os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, + path::PathBuf, + pin::Pin, + rc::Rc, + sync::{ + OnceLock, + atomic::{AtomicBool, Ordering}, + }, + task::{Context, Poll, Waker}, + thread::{self, JoinHandle}, + time::{Duration, Instant}, +}; + +use async_executor::LocalExecutor; +use bytes::Bytes; +use futures::Future; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; +use liquid_cache_common::memory::pool::FixedBufferPool; +use rand::Rng; +use tokio::sync::oneshot; + +use crate::io::io_uring::tasks::{FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; + +#[usdt::provider] +mod liquid_uring_runtime { + fn io_submission(id: u64) {} + fn io_completion(id: u64) {} +} 
+ +fn ensure_uring_trace_registered() -> bool { + static REGISTERED: OnceLock = OnceLock::new(); + *REGISTERED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register io_uring runtime USDT probes: {err}"); + false + } + }) +} + +const URING_NUM_ENTRIES: u32 = 256; + +const MAX_CONCURRENT_TASKS: u32 = 128; + +type ExecutorTask = Pin + Send>>; + +/// A dedicated runtime for io_uring, in which the worker threads are responsible for submitting IO and polling for completions. +/// Each worker thread has its own ring, and an executor which is responsible for scheduling. +pub struct UringExecutor { + _workers: Vec>, + /// One sender per worker; tasks are submitted to a worker's dedicated channel. + senders: Vec>, +} + +impl UringExecutor { + /// Spawn worker threads; each worker has its own channel to receive tasks. + pub fn new(num_threads: usize) -> UringExecutor { + let mut workers = Vec::new(); + let mut senders = Vec::with_capacity(num_threads); + for i in 0..num_threads { + let (sender, receiver) = crossbeam_channel::unbounded::(); + senders.push(sender); + let worker = thread::Builder::new() + .name(std::format!("lc-io-worker-{}", i)) + .spawn(move || { + worker_main_loop(receiver); + }) + .expect("Failed to spawn IO runtime worker"); + workers.push(worker); + } + UringExecutor { + _workers: workers, + senders, + } + } + + /// Spawns a task in the uring runtime by sending it to a randomly chosen worker's channel. + /// The result is received through a oneshot channel. 
+ pub fn spawn( + self: &mut Self, + future: F, + ) -> oneshot::Receiver + where + F::Output: Send + 'static, + { + let (sender, receiver) = oneshot::channel(); + let f = async move { + let output = future.await; + let _res = sender.send(output); + if !_res.is_ok() { + panic!("Failed to send task result back"); + } + }; + let task = Box::pin(f); + let idx = rand::rng().random_range(0..self.senders.len()); + self.senders[idx] + .send(task) + .expect("UringExecutor failed to send task"); + receiver + } + + /// Spawn a batch of tasks on the io_uring runtime, balancing across workers (round-robin). + pub fn spawn_many( + self: &mut Self, + futures: &mut Vec, + ) -> crossbeam_channel::Receiver + where + F::Output: Send + 'static, + { + let (sender, receiver) = crossbeam_channel::bounded::(futures.len()); + let num_workers = self.senders.len(); + for (i, f) in futures.drain(..).enumerate() { + let sender_clone = sender.clone(); + let f = Box::pin(f); + let task = async move { + let output = f.await; + sender_clone + .send(output) + .expect("Failed to send back result"); + }; + let idx = i % num_workers; + self.senders[idx] + .send(Box::pin(task)) + .expect("UringExecutor failed to send task"); + } + receiver + } + + /// Spawns a task on the io_uring runtime and blocks on it + pub fn run_to_completion(self: &mut Self, future: F) -> F::Output + where + F::Output: Send + 'static, + { + let receiver = self.spawn(future); + receiver.blocking_recv().expect("Failed to receive result") + } +} + +thread_local! { + static LOCAL_WORKER: RefCell = RefCell::new(RuntimeWorker::new()); +} + +const URING_BATCH_SIZE: u32 = 8; + +const URING_SYSCALL_INTERVAL_US: u64 = 5; + +const RUNTIME_TASK_BATCH_SIZE: u32 = 4; + +struct RuntimeWorker { + ring: io_uring::IoUring, + submitted_tasks: Vec>, + /** + * When using fixed buffers, a single task can produce multiple submission queue entries. + * It is possible that we aren't able to submit all of them at one go. 
Hold them in an + * intermediate queue in that case + */ + queued_entries: VecDeque, + last_syscall: Instant, + tokens: VecDeque, + io_performed: u64, + queued_submissions: u64, +} + +impl RuntimeWorker { + pub fn new() -> RuntimeWorker { + let mut builder = IoUring::::builder(); + let ring = builder + .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_defer_taskrun() + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + if FixedBufferPool::register_buffers_with_ring(&ring).is_err() { + log::warn!("Failed to register fixed buffers with runtime worker ring"); + } + let mut tokens = VecDeque::::with_capacity(MAX_CONCURRENT_TASKS as usize); + let mut inflight_tasks = + Vec::>::with_capacity(MAX_CONCURRENT_TASKS as usize); + for i in 0..MAX_CONCURRENT_TASKS { + tokens.push_back(i as u16); + inflight_tasks.push(None); + } + + RuntimeWorker { + ring, + submitted_tasks: inflight_tasks, + tokens, + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), + last_syscall: Instant::now(), + io_performed: 0, + queued_submissions: 0, + } + } + + #[inline] + fn need_syscall(self: &Self) -> bool { + let time_from_last_submit = self.last_syscall.elapsed(); + let is_batch_full = self.queued_entries.len() >= URING_BATCH_SIZE as usize; + is_batch_full || time_from_last_submit > Duration::from_micros(URING_SYSCALL_INTERVAL_US) + } + + fn poll_completions(self: &mut Self) { + let cq = &mut self.ring.completion(); + loop { + cq.sync(); + match cq.next() { + Some(cqe) => { + let token = cqe.user_data() as usize; + let pending_completions = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + if pending_completions == 1 { + let mut submission = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); + submission.complete(); + self.tokens.push_back(token as u16); + self.io_performed += 1; + 
} else { + let submission = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); + submission.reduce_completions(); + } + } + None => break, + } + } + } + + fn drain_intermediate_queue(&mut self) { + { + let sq = &mut self.ring.submission(); + while !sq.is_full() && !self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.queued_submissions += 1; + } + } + } + + fn submit_task(self: &mut Self, mut task: AsyncTask) { + let token = self.tokens.pop_front().expect("No more tokens"); + let sq = &mut self.ring.submission(); + let sqes = task.inner.borrow_mut().prepare_sqe(); + let num_sqes = sqes.len(); + task.set_completions(num_sqes); + self.submitted_tasks[token as usize] = Some(task); + let mut sqes_submitted = 0; + + for sqe in sqes.iter() { + let res = unsafe { sq.push(&sqe.clone().user_data(token as u64)) }; + if res.is_err() { + // submission queue is full + break; + } + sqes_submitted += 1; + self.queued_submissions += 1; + sq.sync(); + } + for i in sqes_submitted..sqes.len() { + self.queued_entries + .push_back(sqes[i].clone().user_data(token as u64)); + } + } + + pub fn add_task(task: AsyncTask) { + LOCAL_WORKER.with(|worker| { + let mut worker = worker.borrow_mut(); + worker.submit_task(task); + }); + } +} + +fn worker_main_loop(receiver: crossbeam_channel::Receiver) { + let executor = LocalExecutor::new(); + loop { + let mut tasks_submitted = 0; + // Need some form of admission control here + while tasks_submitted < RUNTIME_TASK_BATCH_SIZE && !receiver.is_empty() { + let task = receiver.try_recv(); + if task.is_err() { + continue; + } + executor.spawn(task.unwrap()).detach(); + tasks_submitted += 1; + } + // Can we batch the ticks? 
+ let _task_found = executor.try_tick(); + LOCAL_WORKER.with(|worker| { + let mut worker = worker.borrow_mut(); + worker.drain_intermediate_queue(); + if worker.need_syscall() { + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); + loop { + let res = unsafe { + worker.ring.submitter().enter::( + worker.queued_submissions as u32, + 0, + flags.bits(), + None, + ) + }; + match res { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == std::io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {}", e.to_string()); + } + } + } + worker.queued_submissions = 0; + worker.last_syscall = Instant::now(); + } + // else if !task_found && worker.tokens.len() < MAX_CONCURRENT_TASKS as usize { + // worker.ring.submit_and_wait(1).expect("Failed to submit"); + // } + worker.poll_completions(); + }); + } +} + +struct AsyncTask { + // Note: Should change this to Arc in case of a work-stealing scheduler + pub inner: Rc>, + waker: Waker, + completed: *mut AtomicBool, + pending_completions: usize, // No. of pending completions. 
/// Bookkeeping helpers used by the worker while a task's submissions are in flight.
impl AsyncTask {
    /// Finish the task: hand every collected completion entry to the wrapped IO task,
    /// raise the completion flag and wake the future that is waiting on it.
    #[inline]
    fn complete(self) {
        self.inner
            .borrow_mut()
            .complete(self.completions.iter().collect());
        // SAFETY: `completed` is set from the `completed` field of the `UringFuture` that
        // created this task. NOTE(review): nothing visible here keeps that future alive
        // until completion; if it can be dropped while IO is in flight this pointer would
        // dangle - confirm the cancellation story.
        unsafe {
            (*self.completed).store(true, Ordering::Relaxed);
        }
        self.waker.wake();
    }

    /// Record how many completion entries this task is expected to produce
    /// (one per submission entry it prepared).
    #[inline]
    fn set_completions(&mut self, count: usize) {
        self.pending_completions = count;
    }

    /// Note that one expected completion has arrived. The worker calls this for every
    /// completion except the last one, which goes through `complete` instead.
    #[inline]
    fn reduce_completions(&mut self) {
        self.pending_completions -= 1;
    }

    /// Store a completion entry until the task is finished.
    #[inline]
    fn push_completion(&mut self, cqe: cqueue::Entry) {
        self.completions.push(cqe);
    }
}
Poll::Pending; + } + }, + UringState::Undecided => unreachable!("state cannot be undecided during poll"), + } + } + } +} + +fn submit_async_task(task: T) -> UringFuture +where + T: IoTask + 'static, +{ + UringFuture::new(Rc::new(RefCell::new(task))) +} + +pub(crate) async fn read( + path: PathBuf, + range: Option>, +) -> Result { + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .expect("failed to open file"); + + let effective_range = if let Some(range) = range { + range + } else { + let len = file.metadata()?.len(); + 0..len + }; + + { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, true); + if read_task.is_ok() { + let rc = submit_async_task(read_task.unwrap()).await; + return match Rc::try_unwrap(rc) { + Ok(cell) => FixedFileReadTask::into_result(Box::new(cell.into_inner())), + Err(rc) => rc.borrow_mut().get_result(), + }; + } + } + // Fall back to normal read if fixed buffers are not available + let read_task = FileReadTask::build(effective_range, file, true); + submit_async_task(read_task).await.borrow_mut().get_result() +} + +pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .expect("failed to create file"); + + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), true, false); + submit_async_task(write_task) + .await + .borrow_mut() + .get_result() +} diff --git a/src/parquet/src/io/io_uring/mod.rs b/src/parquet/src/io/io_uring/mod.rs index 692fc8d1..d7452388 100644 --- a/src/parquet/src/io/io_uring/mod.rs +++ b/src/parquet/src/io/io_uring/mod.rs @@ -7,5 +7,9 @@ pub(crate) use thread_pool_uring::initialize_uring_pool; pub(crate) mod single_uring; +pub(crate) mod local_runtime; + +pub(crate) mod work_stealing; + #[cfg(test)] mod tests; diff --git a/src/parquet/src/io/io_uring/multi_async_uring.rs 
b/src/parquet/src/io/io_uring/multi_async_uring.rs index e2c26ec4..3e40a463 100644 --- a/src/parquet/src/io/io_uring/multi_async_uring.rs +++ b/src/parquet/src/io/io_uring/multi_async_uring.rs @@ -39,7 +39,7 @@ impl AsyncRing { fn submit_task(&mut self, task: &mut dyn IoTask) { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(0); + let entry = task.prepare_sqe()[0].clone().user_data(0); unsafe { sq.push(&entry) .expect("failed to push entry to io-uring submission queue"); @@ -207,7 +207,7 @@ where } State::Pending { mut ring, mut task } => { if let Some(cqe) = ring.as_mut().take_completion() { - task.complete(&cqe); + task.complete(vec![&cqe]); return Poll::Ready(task); } this.state = State::Pending { ring, task }; @@ -256,7 +256,11 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: bool, +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -264,6 +268,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/multi_blocking_uring.rs b/src/parquet/src/io/io_uring/multi_blocking_uring.rs index 86d2f6a5..1a677e3a 100644 --- a/src/parquet/src/io/io_uring/multi_blocking_uring.rs +++ b/src/parquet/src/io/io_uring/multi_blocking_uring.rs @@ -29,7 +29,7 @@ impl BlockingRing { { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(0); + let entry = task.prepare_sqe()[0].clone().user_data(0); unsafe { sq.push(&entry).expect("Failed to push to submission 
queue"); } @@ -44,7 +44,7 @@ impl BlockingRing { let cqe = cq .next() .ok_or_else(|| io::Error::other("io-uring completion queue empty"))?; - task.complete(&cqe); + task.complete(vec![&cqe]); } Ok(task) @@ -168,7 +168,7 @@ pub(crate) fn read( run_blocking_task(Box::new(read_task))?.into_result() } -pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) fn write(path: PathBuf, data: &Bytes, direct_io: bool) -> Result<(), std::io::Error> { use std::fs::OpenOptions; let file = OpenOptions::new() @@ -176,6 +176,6 @@ pub(crate) fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { .truncate(true) .write(true) .open(path)?; - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); run_blocking_task(Box::new(write_task))?.into_result() } diff --git a/src/parquet/src/io/io_uring/single_uring.rs b/src/parquet/src/io/io_uring/single_uring.rs index 6f5cd55d..0ff1bf1b 100644 --- a/src/parquet/src/io/io_uring/single_uring.rs +++ b/src/parquet/src/io/io_uring/single_uring.rs @@ -161,7 +161,7 @@ impl SharedRingInner { fn submit_task(&mut self, task: &mut dyn IoTask, token: u16) { { let mut sq = self.ring.submission(); - let entry = task.prepare_sqe().user_data(token as u64); + let entry = task.prepare_sqe()[0].clone().user_data(token as u64); unsafe { sq.push(&entry) .expect("Failed to push entry to io-uring submission queue"); @@ -264,7 +264,7 @@ where match state { State::Pending { token, mut task } => { if let Some(cqe) = ring.take_completion(token) { - task.complete(&cqe); + task.complete(vec![&cqe]); return Poll::Ready(task); } // Not ready yet, restore state @@ -331,7 +331,11 @@ pub(crate) async fn read( submit_async_task(read_task).await.into_result() } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: 
bool, +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) @@ -339,6 +343,6 @@ pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Er .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, false); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/tasks.rs b/src/parquet/src/io/io_uring/tasks.rs index 9f79bc5a..828c6a0e 100644 --- a/src/parquet/src/io/io_uring/tasks.rs +++ b/src/parquet/src/io/io_uring/tasks.rs @@ -1,4 +1,5 @@ use std::{ + alloc::{Layout, alloc}, any::Any, ffi::CString, fs, mem, @@ -12,23 +13,25 @@ use std::{ use bytes::Bytes; use io_uring::{cqueue, opcode, squeue}; +use liquid_cache_common::memory::pool::{FixedBufferAllocation, FixedBufferPool}; pub(crate) const BLOCK_ALIGN: usize = 4096; /// Represents an IO request to the uring worker thread. -pub(crate) trait IoTask: Send + Any + std::fmt::Debug { +pub trait IoTask: Send + Any + std::fmt::Debug { /// Convert the request to an io-uring submission queue entry. - fn prepare_sqe(&mut self) -> squeue::Entry; + fn prepare_sqe(&mut self) -> Vec; + // TODO(): Can we pass completion queue entries on the stack? /// Record the outcome of the completion queue entry. - fn complete(&mut self, cqe: &cqueue::Entry); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>); /// Convert the boxed task to a boxed `Any` so callers can recover the original type. 
fn into_any(self: Box) -> Box; } #[derive(Debug)] -pub(crate) struct FileOpenTask { +pub struct FileOpenTask { path: CString, direct_io: bool, fd: Option, @@ -67,7 +70,7 @@ impl FileOpenTask { impl IoTask for FileOpenTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let mut flags = libc::O_RDONLY | libc::O_CLOEXEC; if self.direct_io { flags |= libc::O_DIRECT; @@ -75,12 +78,17 @@ impl IoTask for FileOpenTask { let open_op = opcode::OpenAt::new(io_uring::types::Fd(libc::AT_FDCWD), self.path.as_ptr()) .flags(flags); - open_op.build() + vec![open_op.build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - let result = cqe.result(); + fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { + debug_assert_eq!( + cqe.len(), + 1, + "Should receive a single completion for a file open task" + ); + let result = cqe[0].result(); if result < 0 { self.error = Some(std::io::Error::from_raw_os_error(-result)); } else { @@ -104,7 +112,7 @@ impl Drop for FileOpenTask { } #[derive(Debug)] -pub(crate) struct FileReadTask { +pub struct FileReadTask { buffer: Vec, aligned_offset: usize, file: fs::File, @@ -180,11 +188,29 @@ impl FileReadTask { Ok(bytes.slice(data_start..data_end)) } + + /// Return a bytes object holding the result of the read operation. 
+ #[inline] + pub(crate) fn get_result(self: &mut Self) -> Result { + if let Some(err) = self.error.take() { + return Err(err); + } + + let (start_padding, _) = self.padding(); + let range_len = (self.range.end - self.range.start) as usize; + let data_start = self.aligned_offset + start_padding; + let data_end = data_start + range_len; + + let buffer = mem::take(&mut self.buffer); + let bytes = Bytes::from(buffer); + + Ok(bytes.slice(data_start..data_end)) + } } impl IoTask for FileReadTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { + fn prepare_sqe(&mut self) -> Vec { let num_bytes = (self.range.end - self.range.start) as usize; let (start_padding, end_padding) = self.padding(); let num_bytes_aligned = num_bytes + start_padding + end_padding; @@ -198,15 +224,150 @@ impl IoTask for FileReadTask { num_bytes_aligned as u32, ); - read_op - .offset(self.range.start - start_padding as u64) - .build() + vec![ + read_op + .offset(self.range.start - start_padding as u64) + .build(), + ] + } + + #[inline] + fn complete(&mut self, cqe: Vec<&cqueue::Entry>) { + debug_assert_eq!( + cqe.len(), + 1, + "Should receive a single completion for a FileRead task" + ); + let result = cqe[0].result(); + if result < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-result)); + } + } + + fn into_any(self: Box) -> Box { + self + } +} + +#[derive(Debug)] +pub(crate) struct FixedFileReadTask { + fixed_buffer: FixedBufferAllocation, + file: RawFd, + range: Range, + direct_io: bool, + error: Option, +} + +impl FixedFileReadTask { + #[inline] + fn compute_padding(range: &Range, direct_io: bool) -> (usize, usize) { + if direct_io { + let start_padding = range.start as usize & (BLOCK_ALIGN - 1); + let end_mod = range.end as usize & (BLOCK_ALIGN - 1); + let end_padding = if end_mod == 0 { + 0 + } else { + BLOCK_ALIGN - end_mod + }; + (start_padding, end_padding) + } else { + (0, 0) + } + } + + #[inline] + fn padding(&self) -> (usize, usize) { + 
Self::compute_padding(&self.range, self.direct_io) + } + + pub(crate) fn build( + range: Range, + file: &fs::File, + direct_io: bool, + ) -> Result { + let (start_padding, end_padding) = Self::compute_padding(&range, direct_io); + let requested_bytes = (range.end - range.start) as usize; + let num_bytes_aligned = requested_bytes + start_padding + end_padding; + + // Fixed buffers are aligned to the block size. Don't worry about alignment here + let ptr = FixedBufferPool::malloc(num_bytes_aligned); + if ptr.is_null() { + return Err(std::io::Error::from(std::io::ErrorKind::OutOfMemory)); + } + let alloc = FixedBufferAllocation { + ptr, + size: num_bytes_aligned, + }; + + Ok(FixedFileReadTask { + fixed_buffer: alloc, + file: file.as_raw_fd(), + range, + direct_io, + error: None, + }) + } + + /// Return a bytes object holding the result of the read operation (consumes the task). + #[inline] + pub(crate) fn into_result(self: Box) -> Result { + let mut this = self; + if let Some(err) = this.error.take() { + return Err(err); + } + + let (start_padding, _) = this.padding(); + let range_len = (this.range.end - this.range.start) as usize; + let data_end = start_padding + range_len; + let bytes = Bytes::from_owner(this.fixed_buffer); + + Ok(bytes.slice(start_padding..data_end)) + } + + /// Return a bytes object holding the result of the read operation (by copy, for use with RefCell). 
+ #[inline] + pub(crate) fn get_result(&mut self) -> Result { + if let Some(err) = self.error.take() { + return Err(err); + } + + let (start_padding, _) = self.padding(); + let range_len = (self.range.end - self.range.start) as usize; + let data_end = start_padding + range_len; + let slice = &self.fixed_buffer.as_ref()[start_padding..data_end]; + + Ok(Bytes::copy_from_slice(slice)) + } +} + +impl IoTask for FixedFileReadTask { + #[inline] + fn prepare_sqe(&mut self) -> Vec { + let buffers = FixedBufferPool::get_fixed_buffers(&self.fixed_buffer); + let mut sqes = Vec::::new(); + let (start_padding, _) = self.padding(); + let mut file_offset = self.range.start - start_padding as u64; + for buffer in buffers { + let sqe = opcode::ReadFixed::new( + io_uring::types::Fd(self.file), + buffer.ptr, + buffer.bytes as u32, + buffer.buf_id as u16, + ) + .offset(file_offset) + .build(); + file_offset += buffer.bytes as u64; + sqes.push(sqe); + } + sqes } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - if cqe.result() < 0 { - self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { + for cqe in cqes.iter().as_ref() { + if cqe.result() < 0 { + self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + } } } @@ -216,17 +377,40 @@ impl IoTask for FileReadTask { } #[derive(Debug)] -pub(crate) struct FileWriteTask { - data: Bytes, +pub struct FileWriteTask { + data: *const u8, fd: RawFd, + size: usize, error: Option, } +unsafe impl Send for FileWriteTask {} + impl FileWriteTask { - pub(crate) fn build(data: Bytes, fd: RawFd) -> FileWriteTask { + pub(crate) fn build( + data: Bytes, + fd: RawFd, + direct_io: bool, + use_fixed_buffers: bool, + ) -> FileWriteTask { + let mut ptr = data.as_ptr(); + let bytes = data.len(); + let mut padding = 0; + if direct_io { + padding = (4096 - (data.len() & 4095)) & 4095; + let layout = Layout::from_size_align(data.len() + padding, 4096) + .expect("Failed to 
create layout"); + assert!((data.len() + padding) % 4096 == 0); + unsafe { + let new_ptr = alloc(layout); + std::ptr::copy_nonoverlapping(ptr, new_ptr, data.len()); + ptr = new_ptr; + } + } FileWriteTask { - data, + data: ptr, fd, + size: bytes + padding, error: None, } } @@ -238,24 +422,35 @@ impl FileWriteTask { } Ok(()) } + + #[inline] + pub(crate) fn get_result(self: &mut Self) -> Result<(), std::io::Error> { + if let Some(err) = self.error.take() { + return Err(err); + } + Ok(()) + } } impl IoTask for FileWriteTask { #[inline] - fn prepare_sqe(&mut self) -> squeue::Entry { - let write_op = opcode::Write::new( - io_uring::types::Fd(self.fd), - self.data.as_ptr(), - self.data.len() as u32, - ); + fn prepare_sqe(&mut self) -> Vec { + let write_op = + opcode::Write::new(io_uring::types::Fd(self.fd), self.data, self.size as u32); - write_op.offset(0u64).build() + vec![write_op.offset(0u64).build()] } #[inline] - fn complete(&mut self, cqe: &cqueue::Entry) { - if cqe.result() < 0 { - self.error = Some(std::io::Error::from_raw_os_error(-cqe.result())); + fn complete(&mut self, cqes: Vec<&cqueue::Entry>) { + debug_assert_eq!( + cqes.len(), + 1, + "Should receive a single completion for a FileWrite task" + ); + let result = cqes[0].result(); + if result != self.size as i32 { + self.error = Some(std::io::Error::from_raw_os_error(-result)); } } diff --git a/src/parquet/src/io/io_uring/tests.rs b/src/parquet/src/io/io_uring/tests.rs index 94cb808d..3c7b8bde 100644 --- a/src/parquet/src/io/io_uring/tests.rs +++ b/src/parquet/src/io/io_uring/tests.rs @@ -1,5 +1,8 @@ #![cfg(target_os = "linux")] +use crate::io::io_uring::local_runtime::{self, UringExecutor}; +use crate::io::io_uring::work_stealing::{self, WorkStealingUringRuntime}; + use super::{ initialize_uring_pool, multi_async_uring, multi_blocking_uring, single_uring, thread_pool_uring, }; @@ -55,7 +58,7 @@ impl BackendKind { } else { IoMode::Uring }; - initialize_uring_pool(mode); + initialize_uring_pool(mode, false); 
}); } BackendKind::MultiBlocking => { @@ -79,21 +82,25 @@ impl BackendKind { BackendKind::MultiBlocking => { async move { multi_blocking_uring::read(path, range, direct_io) }.boxed() } - BackendKind::ThreadPool => thread_pool_uring::read(path, range, direct_io).boxed(), + BackendKind::ThreadPool => { + thread_pool_uring::read(path, range, direct_io, true).boxed() + } } } fn write_future(self, path: PathBuf, data: Bytes) -> IoFuture<()> { match self { - BackendKind::Shared => async move { single_uring::write(path, &data).await }.boxed(), + BackendKind::Shared => { + async move { single_uring::write(path, &data, false).await }.boxed() + } BackendKind::MultiAsync => { - async move { multi_async_uring::write(path, &data).await }.boxed() + async move { multi_async_uring::write(path, &data, false).await }.boxed() } BackendKind::MultiBlocking => { - async move { multi_blocking_uring::write(path, &data) }.boxed() + async move { multi_blocking_uring::write(path, &data, false) }.boxed() } BackendKind::ThreadPool => { - async move { thread_pool_uring::write(path, &data).await }.boxed() + async move { thread_pool_uring::write(path, &data, false, false).await }.boxed() } } } @@ -150,3 +157,63 @@ fn read_write_roundtrip_all_backends() { drop(tmpdir); } } + +/// Work-stealing uring runtime +#[test] +fn read_write_roundtrip_work_stealing_uring() { + let original: Vec = (0..128).map(|i| (i as u8).wrapping_mul(3)).collect(); + let runtime = WorkStealingUringRuntime::new(2); + + let (tmpdir, path) = seed_file(&original); + let path_clone = path.clone(); + let read_bytes = runtime + .run_to_completion(async move { work_stealing::read(path_clone, None).await }) + .unwrap_or_else(|err| panic!("ws read failed: {err}")); + assert_eq!( + read_bytes.as_ref(), + original.as_slice(), + "ws read returned unexpected payload", + ); + + let new_payload: Vec = (0..64).map(|i| (i as u8).wrapping_add(7)).collect(); + let bytes = Bytes::from(new_payload.clone()); + let path_clone = path.clone(); + 
runtime + .run_to_completion(async move { work_stealing::write(path_clone, &bytes).await }) + .unwrap_or_else(|err| panic!("ws write failed: {err}")); + + let on_disk = fs::read(&path).expect("failed to read updated file"); + assert_eq!(on_disk, new_payload, "ws wrote unexpected data"); + + drop(tmpdir); +} + +/// Non-blocking uring requires a dedicated runtime +#[test] +fn read_write_roundtrip_non_blocking_uring() { + let original: Vec = (0..128).map(|i| (i as u8).wrapping_mul(3)).collect(); + let mut executor = UringExecutor::new(1); + + let (tmpdir, path) = seed_file(&original); + let path_clone = path.clone(); + let read_bytes = executor + .run_to_completion(async move { local_runtime::read(path_clone, None).await }) + .unwrap_or_else(|err| panic!("read failed: {err}")); + assert_eq!( + read_bytes.as_ref(), + original.as_slice(), + "read returned unexpected payload", + ); + + let new_payload: Vec = (0..64).map(|i| (i as u8).wrapping_add(1)).collect(); + let bytes = Bytes::from(new_payload.clone()); + let path_clone = path.clone(); + executor + .run_to_completion(async move { local_runtime::write(path_clone, &bytes.clone()).await }) + .unwrap_or_else(|err| panic!("write failed: {err}")); + + let on_disk = fs::read(&path).expect("failed to read updated file"); + assert_eq!(on_disk, new_payload, "wrote unexpected data",); + + drop(tmpdir); +} diff --git a/src/parquet/src/io/io_uring/thread_pool_uring.rs b/src/parquet/src/io/io_uring/thread_pool_uring.rs index b63cd2e1..972ef604 100644 --- a/src/parquet/src/io/io_uring/thread_pool_uring.rs +++ b/src/parquet/src/io/io_uring/thread_pool_uring.rs @@ -2,8 +2,9 @@ use std::{ collections::VecDeque, fs::OpenOptions, future::Future, + io, ops::Range, - os::fd::AsRawFd, + os::{fd::AsRawFd, unix::fs::OpenOptionsExt}, path::PathBuf, pin::Pin, sync::{ @@ -12,22 +13,48 @@ use std::{ }, task::{Context, Poll}, thread, + time::{Duration, Instant}, }; use bytes::Bytes; -use io_uring::{IoUring, cqueue, squeue}; -use 
liquid_cache_common::IoMode; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; +use liquid_cache_common::{IoMode, memory::pool::FixedBufferPool}; use tokio::sync::oneshot; -use super::tasks::{FileOpenTask, FileReadTask, FileWriteTask, IoTask}; +use crate::io::io_uring::tasks::FixedFileReadTask; +use rand::Rng; + +#[usdt::provider] +mod liquid_parquet { + fn io_submitted(id: u64) {} + fn io_completed(id: u64) {} +} + +static REGISTRATION_SUCCEEDED: OnceLock = OnceLock::new(); + +fn ensure_registered() -> bool { + *REGISTRATION_SUCCEEDED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register USDT probes: {err}"); + false + } + }) +} + +use super::tasks::{FileReadTask, FileWriteTask, IoTask}; pub(crate) const URING_NUM_ENTRIES: u32 = 256; +const URING_BATCH_SIZE: u32 = 32; + static ENABLED: AtomicBool = AtomicBool::new(true); struct Submission { task: Box, completion_tx: oneshot::Sender>, + pending_completions: usize, // No. of pending completions. 
Will be populated later by the uring worker + completions: Vec, } impl Submission { @@ -35,15 +62,32 @@ impl Submission { Submission { task, completion_tx, + pending_completions: 0, + completions: Vec::new(), } } - fn send_back(mut self, cqe: &cqueue::Entry) { - self.task.complete(cqe); + fn send_back(mut self) { + self.task.complete(self.completions.iter().collect()); self.completion_tx .send(self.task) .expect("Failed to send task back to caller"); } + + #[inline] + fn set_completions(&mut self, count: usize) { + self.pending_completions = count; + } + + #[inline] + fn reduce_completions(&mut self) { + self.pending_completions -= 1; + } + + #[inline] + fn push_completion(&mut self, cqe: cqueue::Entry) { + self.completions.push(cqe); + } } struct JoinOnDropHandle(Option>); @@ -74,9 +118,9 @@ unsafe impl Sync for IoUringThreadpool {} static IO_URING_THREAD_POOL_INST: OnceLock = OnceLock::new(); -pub(crate) fn initialize_uring_pool(io_mode: IoMode) { +pub(crate) fn initialize_uring_pool(io_mode: IoMode, register_buffers: bool) { if matches!(io_mode, IoMode::Uring | IoMode::UringDirect) { - IO_URING_THREAD_POOL_INST.get_or_init(|| IoUringThreadpool::new(io_mode)); + IO_URING_THREAD_POOL_INST.get_or_init(|| IoUringThreadpool::new(io_mode, register_buffers)); } if matches!(io_mode, IoMode::UringBlocking) { super::multi_blocking_uring::initialize_blocking_rings(); @@ -84,18 +128,13 @@ pub(crate) fn initialize_uring_pool(io_mode: IoMode) { } impl IoUringThreadpool { - fn new(io_type: IoMode) -> IoUringThreadpool { + fn new(io_type: IoMode, register_buffers: bool) -> IoUringThreadpool { let (sender, receiver) = crossbeam_channel::unbounded::(); - let builder = IoUring::::builder(); - let ring = builder - .build(URING_NUM_ENTRIES) - .expect("Failed to build IoUring instance"); - let worker = thread::Builder::new() .name("lc-io-worker".to_string()) .spawn(move || { - let mut uring_worker = UringWorker::new(receiver, ring); + let mut uring_worker = UringWorker::new(receiver, 
register_buffers); uring_worker.thread_loop(); }) .expect("Failed to spawn io-uring worker thread"); @@ -134,12 +173,40 @@ struct UringWorker { ring: IoUring, tokens: VecDeque, submitted_tasks: Vec>, + /** + * When using fixed buffers, a single task can produce multiple submission queue entries. + * It is possible that we aren't able to submit all of them at one go. Hold them in an + * intermediate queue in that case + */ + queued_entries: VecDeque, io_performed: AtomicUsize, + last_syscall: Instant, + // Number of entries that will be submitted upon calling io_uring_enter + queued_submissions: u32, } impl UringWorker { #[allow(clippy::new_ret_no_self)] - fn new(channel: crossbeam_channel::Receiver, ring: IoUring) -> UringWorker { + fn new( + channel: crossbeam_channel::Receiver, + register_buffers: bool, + ) -> UringWorker { + let mut builder = IoUring::::builder(); + let ring = builder + .setup_single_issuer() // Only the worker thread will issue IO and poll completions + .setup_defer_taskrun() + // .setup_iopoll() + // .setup_sqpoll(50000) + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + + if register_buffers { + let res = FixedBufferPool::register_buffers_with_ring(&ring); + if res.is_err() { + log::error!("Failed to register buffers with io-uring ring: {:?}", res); + } + } + let tokens = (0..URING_NUM_ENTRIES as u16).collect(); let mut tasks = Vec::with_capacity(URING_NUM_ENTRIES as usize); tasks.resize_with(URING_NUM_ENTRIES as usize, || None); @@ -149,6 +216,9 @@ impl UringWorker { tokens, submitted_tasks: tasks, io_performed: AtomicUsize::new(0), + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as usize), + last_syscall: Instant::now(), + queued_submissions: 0, } } @@ -158,31 +228,88 @@ impl UringWorker { break; } + self.drain_intermediate_queue(); self.drain_submissions(); self.poll_completions(); } } + fn drain_intermediate_queue(&mut self) { + { + let sq = &mut self.ring.submission(); + while !sq.is_full() && 
!self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.queued_submissions += 1; + } + } + } + #[inline(never)] fn drain_submissions(&mut self) { - let mut need_submit = false; while !self.receiver.is_empty() && !self.tokens.is_empty() { - let mut submission = self.receiver.recv().unwrap(); + let sq = &mut self.ring.submission(); + sq.sync(); + if sq.is_full() { + // A single token might have multiple associated sqes. Free token doesn't always imply that we have free submission slots + break; + } + let token = self.tokens.pop_front().unwrap(); - { - let sq = &mut self.ring.submission(); - let task = submission.task.as_mut(); - let sqe = task.prepare_sqe().user_data(token as u64); - unsafe { - sq.push(&sqe).expect("Failed to push to submission queue"); + let mut submission = self.receiver.recv().unwrap(); + let task = submission.task.as_mut(); + let mut sqes = task.prepare_sqe(); + self.queued_submissions += sqes.len() as u32; + submission.set_completions(sqes.len()); + let mut tasks_submitted = 0; + + for sqe in sqes.iter_mut() { + let res = unsafe { sq.push(&sqe.clone().user_data(token as u64)) }; + if res.is_err() { + break; } + tasks_submitted += 1; sq.sync(); } + for i in tasks_submitted..sqes.len() { + self.queued_entries + .push_back(sqes[i].clone().user_data(token as u64)); + } self.submitted_tasks[token as usize] = Some(submission); - need_submit = true; } - if need_submit { - self.ring.submit().expect("Failed to submit"); + // let need_poll = self.tokens.len() < URING_NUM_ENTRIES as usize; + let time_from_last_submit = self.last_syscall.elapsed(); + let is_batch_full = self.queued_submissions >= URING_BATCH_SIZE; + let need_syscall = is_batch_full || time_from_last_submit > Duration::from_micros(20); + if need_syscall { + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); + loop { + let res = unsafe { + 
self.ring.submitter().enter::( + self.queued_submissions, + 0, + flags.bits(), + None, + ) + }; + match res { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {}", e.to_string()); + } + } + } + self.last_syscall = Instant::now(); + self.queued_submissions = 0; } } @@ -194,12 +321,26 @@ impl UringWorker { match cq.next() { Some(cqe) => { let token = cqe.user_data() as usize; - let submission = self.submitted_tasks[token] - .take() - .expect("Task not found in submitted tasks"); - submission.send_back(&cqe); - self.tokens.push_back(token as u16); - self.io_performed.fetch_add(1, Ordering::Relaxed); + let pending_completions = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + + if pending_completions == 1 { + let mut submission = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + submission.push_completion(cqe); + submission.send_back(); + self.tokens.push_back(token as u16); + self.io_performed.fetch_add(1, Ordering::Relaxed); + } else { + let submission = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + submission.reduce_completions(); + submission.push_completion(cqe); + } } None => break, } @@ -221,6 +362,7 @@ where T: IoTask + 'static, { state: UringState, + id: u64, } impl UringFuture @@ -230,6 +372,7 @@ where fn new(task: Box) -> UringFuture { UringFuture { state: UringState::Created(task), + id: rand::rng().random(), } } } @@ -248,6 +391,9 @@ where let pool = IO_URING_THREAD_POOL_INST .get() .expect("Uring threadpool not initialized"); + if ensure_registered() { + liquid_parquet::io_submitted!(|| self.id); + } let (tx, rx) = oneshot::channel::>(); let boxed_task: Box = task; pool.submit_task(boxed_task, tx); @@ -255,6 +401,9 @@ where } UringState::Submitted(mut receiver) => match Pin::new(&mut receiver).poll(cx) { 
Poll::Ready(Ok(task)) => { + if ensure_registered() { + liquid_parquet::io_completed!(|| self.id); + } let typed_task = task .into_any() .downcast::() @@ -286,9 +435,14 @@ pub(crate) async fn read( path: PathBuf, range: Option>, direct_io: bool, + use_fixed_buffers: bool, ) -> Result { - let open_task = FileOpenTask::build(path, direct_io)?; - let file = submit_async_task(open_task).await.into_result()?; + // Perform open operations in a blocking manner as they are not compatible with a io_uring instance that uses polled mode IO + let file = OpenOptions::new() + .read(true) + .custom_flags(libc::O_DIRECT) + .open(path) + .expect("failed to create file"); let effective_range = if let Some(range) = range { range @@ -297,18 +451,32 @@ pub(crate) async fn read( 0..len }; + if use_fixed_buffers { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, direct_io); + // Fall back to normal read if fixed buffers are not available + if read_task.is_ok() { + return submit_async_task(read_task.unwrap()).await.into_result(); + } + } let read_task = FileReadTask::build(effective_range, file, direct_io); - submit_async_task(read_task).await.into_result() + return submit_async_task(read_task).await.into_result(); } -pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { +pub(crate) async fn write( + path: PathBuf, + data: &Bytes, + direct_io: bool, + use_fixed_buffers: bool, +) -> Result<(), std::io::Error> { let file = OpenOptions::new() .create(true) .truncate(true) .write(true) + .custom_flags(libc::O_DIRECT) .open(path) .expect("failed to create file"); - let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd()); + let write_task = + FileWriteTask::build(data.clone(), file.as_raw_fd(), direct_io, use_fixed_buffers); submit_async_task(write_task).await.into_result() } diff --git a/src/parquet/src/io/io_uring/work_stealing.rs b/src/parquet/src/io/io_uring/work_stealing.rs new file mode 100644 index 
00000000..f5f0b08c --- /dev/null +++ b/src/parquet/src/io/io_uring/work_stealing.rs @@ -0,0 +1,571 @@ + +use std::{ + cell::{Cell, RefCell}, + collections::VecDeque, + fs::OpenOptions, + ops::Range, + os::{fd::AsRawFd as _, unix::fs::OpenOptionsExt}, + path::PathBuf, + pin::Pin, + rc::Rc, + sync::{ + Arc, Mutex, OnceLock, + atomic::{AtomicBool, AtomicU64, Ordering}, + }, + task::{Context, Poll, Waker}, + thread::{self, JoinHandle}, + time::{Duration, Instant}, +}; + +use async_task::Runnable; +use bytes::Bytes; +use futures::Future; +use io_uring::{EnterFlags, IoUring, cqueue, squeue}; +use liquid_cache_common::memory::pool::FixedBufferPool; +use rand::Rng; +use tokio::sync::oneshot; + +use super::tasks::{FileReadTask, FileWriteTask, FixedFileReadTask, IoTask}; + +#[usdt::provider] +mod ws_uring_runtime { + fn io_submission(id: u64) {} + fn io_completion(id: u64) {} +} + +fn ensure_uring_trace_registered() -> bool { + static REGISTERED: OnceLock = OnceLock::new(); + *REGISTERED.get_or_init(|| match usdt::register_probes() { + Ok(()) => true, + Err(err) => { + log::debug!("failed to register work-stealing io_uring USDT probes: {err}"); + false + } + }) +} + +type ExecutorTask = Pin + Send>>; + +const URING_NUM_ENTRIES: u32 = 256; +const MAX_CONCURRENT_IO: u32 = 128; +const URING_BATCH_SIZE: u32 = 8; +const URING_SYSCALL_INTERVAL_US: u64 = 5; +const MAX_ACTIVE_TASKS_PER_THREAD: u32 = 5; + +/// Local io_uring + work-stealing executor. +/// +/// ## Runnable timing +/// +/// [`WorkStealingUringRuntime::worker_runnable_wall_nanos`] accumulates **wall-clock** time each worker +/// spends inside [`Runnable::run`] (one async-task poll). It does **not** include idle time between +/// ticks, io_uring `enter`, or time blocked in syscalls outside `run`. It is **not** OS thread CPU time +/// (`CLOCK_THREAD_CPUTIME_ID`). 
+pub struct WorkStealingUringRuntime { + _workers: Vec>, + sender: crossbeam_channel::Sender, + /// One counter per worker; same `Arc` as installed on that worker’s [`RuntimeWorker`]. + worker_runnable_wall_nanos: Vec>, +} + +impl WorkStealingUringRuntime { + /// Spawn `num_threads` worker threads, each with its own io_uring ring. + /// + /// Threads are named `ws-io-worker-{i}`. Each one receives a clone of the single shared task + /// receiver plus its own wall-time counter. The join handles are stored but never joined by this type. + /// + /// # Panics + /// + /// Panics if a worker thread cannot be spawned. + pub fn new(num_threads: usize) -> Self { + let (sender, receiver) = crossbeam_channel::unbounded(); + + let mut workers = Vec::new(); + let mut worker_runnable_wall_nanos = Vec::with_capacity(num_threads); + for i in 0..num_threads { + let counter = Arc::new(AtomicU64::new(0)); + worker_runnable_wall_nanos.push(Arc::clone(&counter)); + let receiver_clone = receiver.clone(); + let worker = thread::Builder::new() + .name(format!("ws-io-worker-{}", i)) + .spawn(move || worker_main_loop(receiver_clone, counter)) + .expect("Failed to spawn worker"); + workers.push(worker); + } + + WorkStealingUringRuntime { + _workers: workers, + sender, + worker_runnable_wall_nanos, + } + } + + /// Wall time each worker has spent inside `Runnable::run()`, **nanoseconds**, indexed by worker id. + /// + /// See struct-level docs for semantics. + pub fn worker_runnable_wall_nanos(&self) -> Vec { + self.worker_runnable_wall_nanos + .iter() + .map(|c| c.load(Ordering::Relaxed)) + .collect() + } + + /// Sum of [`Self::worker_runnable_wall_nanos`] across workers. + pub fn total_runnable_wall_nanos(&self) -> u64 { + self.worker_runnable_wall_nanos + .iter() + .map(|c| c.load(Ordering::Relaxed)) + .sum() + } + + /// Spawn a future on the runtime; the result is returned through a oneshot channel.
+ pub fn spawn( + &self, + future: F, + ) -> oneshot::Receiver + where + F::Output: Send + 'static, + { + let (tx, rx) = oneshot::channel(); + + let wrapped_fut = async move { + let output = future.await; + let _ = tx.send(output); + }; + self.sender.send(Box::pin(wrapped_fut)).expect("Failed to send task"); + rx + } + + /// Spawn a batch of futures, returning results via a crossbeam channel. + /// + /// `futures` is drained. The result channel is bounded to the batch size, so there is room for + /// every output. Each future sends its own output as soon as it finishes, so the receive order + /// need not match the submission order. If the receiver has already been dropped the output is + /// discarded, exactly as [`Self::spawn`] does. + /// + /// # Panics + /// + /// Panics if a task cannot be enqueued on the runtime's task channel. + pub fn spawn_many( + &self, + futures: &mut Vec, + ) -> crossbeam_channel::Receiver + where + F::Output: Send + 'static, + { + let (tx, rx) = crossbeam_channel::bounded::(futures.len()); + for f in futures.drain(..) { + let tx = tx.clone(); + let wrapped_fut = async move { + let output = f.await; + // This runs on a worker thread: a dropped receiver must not panic (and kill) the worker. + let _ = tx.send(output); + }; + self.sender.send(Box::pin(wrapped_fut)).expect("Failed to send task"); + } + rx + } + + /// Spawn a future and block the caller until it completes. + pub fn run_to_completion( + &self, + future: F, + ) -> F::Output + where + F::Output: Send + 'static, + { + let receiver = self.spawn(future); + receiver.blocking_recv().expect("Failed to receive result") + } +} + +/// Per-thread io_uring reactor: owns the ring, the slots of in-flight tasks (indexed by token), +/// the free-token list, and the overflow queue of SQEs that did not fit into the submission queue. +struct IoDriver { + ring: IoUring, + submitted_tasks: Vec>, + queued_entries: VecDeque, + last_syscall: Instant, + tokens: VecDeque, + io_performed: u64, + queued_submissions: u64, + fixed_buffers_available: bool, +} + +impl IoDriver { + /// Builds the ring (single issuer, deferred task-run), records whether the fixed buffers could + /// be registered with it, and fills the free-token list with `0..MAX_CONCURRENT_IO`. + /// + /// # Panics + /// + /// Panics if the io_uring instance cannot be built. + fn new() -> IoDriver { + let ring = IoUring::::builder() + .setup_single_issuer() + .setup_defer_taskrun() + .build(URING_NUM_ENTRIES) + .expect("Failed to build IoUring instance"); + + let fixed_buffers_available = + FixedBufferPool::register_buffers_with_ring(&ring).is_ok(); + + let mut tokens = VecDeque::with_capacity(MAX_CONCURRENT_IO as usize); + let mut submitted_tasks = Vec::with_capacity(MAX_CONCURRENT_IO as usize); + for i in 0..MAX_CONCURRENT_IO { + tokens.push_back(i as u16); + submitted_tasks.push(None); + } + + IoDriver { + ring, + submitted_tasks, + tokens, + queued_entries: VecDeque::with_capacity(URING_NUM_ENTRIES as
usize), + last_syscall: Instant::now(), + io_performed: 0, + queued_submissions: 0, + fixed_buffers_available, + } + } + + /// True when the overflow queue holds at least `URING_BATCH_SIZE` entries, or more than + /// `URING_SYSCALL_INTERVAL_US` microseconds have passed since `last_syscall`. + #[inline] + fn need_syscall(&self) -> bool { + let is_batch_full = self.queued_entries.len() >= URING_BATCH_SIZE as usize; + is_batch_full + || self.last_syscall.elapsed() > Duration::from_micros(URING_SYSCALL_INTERVAL_US) + } + + /// Drains the completion queue. + /// + /// A CQE's `user_data` is the token, i.e. the slot of its task in `submitted_tasks`. When the + /// task was waiting for its last CQE it is taken out of the slot, completed, and the token is + /// returned to the free list; otherwise the CQE is stored on the task and its pending count drops by one. + /// + /// # Panics + /// + /// Panics if a CQE names a slot that holds no task. + fn poll_completions(&mut self) { + let cq = &mut self.ring.completion(); + loop { + cq.sync(); + match cq.next() { + Some(cqe) => { + let token = cqe.user_data() as usize; + let pending = self.submitted_tasks[token] + .as_ref() + .expect("Task not found in submitted tasks") + .pending_completions; + if pending == 1 { + let mut task = self.submitted_tasks[token] + .take() + .expect("Task not found in submitted tasks"); + task.push_completion(cqe); + task.complete(); + self.tokens.push_back(token as u16); + self.io_performed += 1; + } else { + let task = self.submitted_tasks[token] + .as_mut() + .expect("Task not found in submitted tasks"); + task.push_completion(cqe); + task.reduce_completions(); + } + } + None => break, + } + } + } + + /// Moves SQEs from the overflow queue into the submission queue until the queue is full or the + /// overflow is empty, counting each moved entry in `queued_submissions`. + fn drain_intermediate_queue(&mut self) { + let sq = &mut self.ring.submission(); + while !sq.is_full() && !self.queued_entries.is_empty() { + let sqe = self.queued_entries.pop_front().unwrap(); + unsafe { + sq.push(&sqe).expect("Failed to push to submission queue"); + } + sq.sync(); + self.queued_submissions += 1; + } + } + + /// Takes a free token, asks the task for its SQEs, tags every SQE with the token and pushes as + /// many as fit into the submission queue; the remainder goes to the overflow queue. + /// + /// NOTE(review): a task whose `prepare_sqe` returns no entries is stored with zero pending + /// completions and would never be completed — confirm this cannot happen. + /// + /// # Panics + /// + /// Panics when no token is free, i.e. `MAX_CONCURRENT_IO` tasks are already in flight on this reactor. + fn submit_task(&mut self, mut task: AsyncIoTask) { + let token = self.tokens.pop_front().expect("No more IO tokens"); + let sq = &mut self.ring.submission(); + let sqes = task.inner.lock().unwrap().prepare_sqe(); + let num_sqes = sqes.len(); + task.set_completions(num_sqes); + self.submitted_tasks[token as usize] = Some(task); + let mut sqes_submitted = 0; + + for sqe in sqes.iter() { + let res = unsafe { sq.push(&sqe.clone().user_data(token as u64)) }; + if res.is_err() { + break; + } + sqes_submitted += 1; + self.queued_submissions += 1; + sq.sync(); + } + for i
in sqes_submitted..sqes.len() { + self.queued_entries + .push_back(sqes[i].clone().user_data(token as u64)); + } + } + + /// Submits `task` on the calling thread's thread-local reactor. + fn add_task(task: AsyncIoTask) { + IO_REACTOR.with(|reactor| { + reactor.borrow_mut().submit_task(task); + }); + } +} + +/// Body of every worker thread. +/// +/// Installs `receiver` and the wall-time counter on this thread's executor, then loops forever: +/// one executor tick, then one reactor pass (flush the overflow queue, `enter` the ring when +/// `need_syscall` says so, reap completions). An interrupted `enter` is retried. +/// +/// # Panics +/// +/// Panics if `enter` fails with any error other than `Interrupted`. +fn worker_main_loop( + receiver: crossbeam_channel::Receiver, + runnable_wall_nanos: Arc, +) { + EXECUTOR.with(|worker| { + let mut worker = worker.borrow_mut(); + worker.set_context(receiver, runnable_wall_nanos); + }); + loop { + EXECUTOR.with(|worker| { + let worker = &mut worker.borrow_mut(); + worker.try_tick(); + }); + IO_REACTOR.with(|reactor| { + let reactor = &mut reactor.borrow_mut(); + reactor.drain_intermediate_queue(); + if reactor.need_syscall() { + let mut flags = EnterFlags::empty(); + flags.insert(EnterFlags::GETEVENTS); + loop { + let res = unsafe { + reactor.ring.submitter().enter::( + reactor.queued_submissions as u32, + 0, + flags.bits(), + None, + ) + }; + match res { + Ok(_num_entries) => { + break; + } + Err(e) => { + if e.kind() == std::io::ErrorKind::Interrupted { + continue; + } + panic!("Failed to submit: {e}"); + } + } + } + reactor.queued_submissions = 0; + reactor.last_syscall = Instant::now(); + } + reactor.poll_completions(); + }); + } +} + +thread_local!
{ + static EXECUTOR: RefCell = RefCell::new(RuntimeWorker::new()); + static IO_REACTOR: RefCell = RefCell::new(IoDriver::new()); +} + +/// Per-thread executor state: the shared task receiver, the count of futures currently active on +/// this thread, the local queue of ready runnables, and the wall-time counter for `Runnable::run`. +struct RuntimeWorker { + task_receiver: Option>, + active_tasks: Rc>, + local: Rc>>, + runnable_wall_nanos: Option>, +} + +impl RuntimeWorker { + /// Creates a worker with no receiver and no counter; `set_context` must supply both before `try_tick`. + fn new() -> RuntimeWorker { + RuntimeWorker { + task_receiver: None, + active_tasks: Rc::new(Cell::new(0)), + local: Rc::new(RefCell::new(VecDeque::new())), + runnable_wall_nanos: None, + } + } + + /// Installs the shared task receiver and this worker's wall-time counter. + fn set_context( + &mut self, + receiver: crossbeam_channel::Receiver, + runnable_wall_nanos: Arc, + ) { + self.task_receiver = Some(receiver); + self.runnable_wall_nanos = Some(runnable_wall_nanos); + } + + /// Runs at most one runnable. + /// + /// The local ready queue is tried first. If it is empty and fewer than + /// `MAX_ACTIVE_TASKS_PER_THREAD` futures are active here, one new future is taken from the + /// shared channel without blocking, wrapped so that `active_tasks` drops again when it finishes, + /// and polled once. The time spent inside `run` is added to the worker's counter. + /// + /// # Panics + /// + /// Panics if `set_context` has not been called (`task_receiver` is `None`). + fn try_tick(&mut self) { + let mut runnable = self.local.borrow_mut().pop_front(); + if runnable.is_none() && self.active_tasks.get() < MAX_ACTIVE_TASKS_PER_THREAD { + if let Ok(future) = self.task_receiver.as_mut().unwrap().try_recv() { + self.active_tasks.set(self.active_tasks.get().saturating_add(1)); + let active_tasks = Rc::clone(&self.active_tasks); + let local_clone = Rc::clone(&self.local); + let wrapped = async move { + future.await; + active_tasks.set(active_tasks.get().saturating_sub(1)); + }; + let schedule = move |r: Runnable| { + local_clone.borrow_mut().push_back(r); + }; + // NOTE(review): `spawn_unchecked` is needed because `wrapped` and `schedule` capture `Rc`s; + // presumably sound because runnables are only scheduled and run on this thread — confirm + // against async_task's safety contract. + let (r, task) = unsafe { async_task::spawn_unchecked(wrapped, schedule) }; + // Dropping `Task` would cancel the future and drop the oneshot sender (RecvError). + task.detach(); + runnable = Some(r); + } + } + if let Some(r) = runnable { + let start = Instant::now(); + r.run(); + if let Some(c) = self.runnable_wall_nanos.as_ref() { + c.fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed); + } + } + } +} + + +/// Thread-safe wrapper around an `IoTask`. Unlike the local runtime's version +/// which uses `Rc>`, this uses `Arc>` so that the task +/// can be submitted on one thread and completed/read on another.
+struct AsyncIoTask { + inner: Arc>, + waker: Waker, + completed: Arc, + pending_completions: usize, + completions: Vec, +} + +// NOTE(review): no safety justification for this `Send` impl is visible here — confirm which field requires it. +unsafe impl Send for AsyncIoTask {} + +impl AsyncIoTask { + /// Hands the collected CQEs to the wrapped task, publishes completion with a `Release` store + /// (read with `Acquire` in `UringFuture::poll`), then wakes the waiting future. + #[inline] + fn complete(self) { + self.inner + .lock() + .unwrap() + .complete(self.completions.iter().collect()); + self.completed.store(true, Ordering::Release); + self.waker.wake(); + } + + /// Sets how many CQEs are still expected for this task. + #[inline] + fn set_completions(&mut self, count: usize) { + self.pending_completions = count; + } + + /// Records that one expected CQE has arrived. + #[inline] + fn reduce_completions(&mut self) { + self.pending_completions -= 1; + } + + /// Stores a CQE until the task is completed. + #[inline] + fn push_completion(&mut self, cqe: cqueue::Entry) { + self.completions.push(cqe); + } +} + +/// Poll state of a [`UringFuture`]. `Undecided` is only the placeholder that `poll` writes with +/// `mem::replace` while it inspects the real state. +enum UringState { + Undecided, + Created, + Submitted, +} + +/// Future that hands an `IoTask` to the polling thread's reactor on its first poll and resolves to +/// the shared task handle once the reactor has set the `completed` flag. `id` is a random value +/// used only as the argument of the USDT probes. +pub(crate) struct UringFuture +where + T: IoTask + 'static, +{ + state: UringState, + task: Arc>, + completed: Arc, + id: u64, +} + +// NOTE(review): no safety justification for this `Send` impl is visible here — confirm it holds for every `T`. +unsafe impl Send for UringFuture where T: IoTask + 'static {} + +impl UringFuture +where + T: IoTask + 'static, +{ + /// Creates a not-yet-submitted future for `task` with a fresh completion flag and a random id. + fn new(task: Arc>) -> Self { + UringFuture { + state: UringState::Created, + task, + completed: Arc::new(AtomicBool::new(false)), + id: rand::rng().random(), + } + } +} + +impl Future for UringFuture +where + T: IoTask + 'static, +{ + type Output = Arc>; + + /// First poll: submits the task and falls through to the `Submitted` check. Later polls only + /// test the completion flag. + /// + /// NOTE(review): the waker is captured once at submission and not refreshed on later polls — + /// confirm the executor always polls this future with an equivalent waker. + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + loop { + let state = std::mem::replace(&mut self.state, UringState::Undecided); + match state { + UringState::Created => { + let async_task = AsyncIoTask { + inner: self.task.clone(), + waker: cx.waker().clone(), + completed: self.completed.clone(), + pending_completions: 0, + completions: Vec::new(), + }; + IoDriver::add_task(async_task); + if ensure_uring_trace_registered() { + ws_uring_runtime::io_submission!(|| self.id); + } + self.state = UringState::Submitted; + } + UringState::Submitted => { + if self.completed.load(Ordering::Acquire) { + if ensure_uring_trace_registered() { + ws_uring_runtime::io_completion!(|| self.id); + } + return Poll::Ready(self.task.clone()); + } +
self.state = UringState::Submitted; + return Poll::Pending; + } + UringState::Undecided => unreachable!("state cannot be undecided during poll"), + } + } + } +} + +/// Wraps `task` for shared access and returns the future that submits it on its first poll. +fn submit_async_task(task: T) -> UringFuture +where + T: IoTask + 'static, +{ + UringFuture::new(Arc::new(Mutex::new(task))) +} + +/// Reads `range` of the file at `path` (the whole file when `range` is `None`) through the calling +/// thread's reactor. +/// +/// When the reactor registered fixed buffers the file is opened with `O_DIRECT` and a +/// `FixedFileReadTask` is tried first; if that task cannot be built, a plain `FileReadTask` is used. +/// NOTE(review): `O_DIRECT` is enabled exactly when fixed buffers are available — confirm that +/// coupling is intended. +/// +/// # Errors +/// +/// Returns the `std::io::Error` from opening the file or reading its metadata, or the error +/// reported by the IO task. +pub(crate) async fn read( + path: PathBuf, + range: Option>, +) -> Result { + let direct_io = IO_REACTOR.with(|w| w.borrow().fixed_buffers_available); + + let mut opts = OpenOptions::new(); + opts.read(true); + if direct_io { + opts.custom_flags(libc::O_DIRECT); + } + let file = opts.open(&path)?; + + let effective_range = if let Some(range) = range { + range + } else { + let len = file.metadata()?.len(); + 0..len + }; + + if direct_io { + let read_task = FixedFileReadTask::build(effective_range.clone(), &file, true); + if let Ok(task) = read_task { + let arc = submit_async_task(task).await; + return match Arc::try_unwrap(arc) { + Ok(mutex) => { + FixedFileReadTask::into_result(Box::new(mutex.into_inner().unwrap())) + } + Err(arc) => arc.lock().unwrap().get_result(), + }; + } + } + + let read_task = FileReadTask::build(effective_range, file, direct_io); + let arc = submit_async_task(read_task).await; + match Arc::try_unwrap(arc) { + Ok(mutex) => FileReadTask::into_result(Box::new(mutex.into_inner().unwrap())), + Err(arc) => arc.lock().unwrap().get_result(), + } +} + +/// Creates (or truncates) the file at `path` and writes `data` to it through the calling thread's +/// reactor, with both trailing `FileWriteTask::build` flags set to `false`. +/// +/// Only the raw fd is handed to the task; `file` stays alive until the task has completed because +/// it is dropped at the end of this function. +/// +/// # Errors +/// +/// Returns the `std::io::Error` from creating the file, or the error reported by the write task. +pub(crate) async fn write(path: PathBuf, data: &Bytes) -> Result<(), std::io::Error> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&path)?; + + let write_task = FileWriteTask::build(data.clone(), file.as_raw_fd(), false, false); + let arc = submit_async_task(write_task).await; + match Arc::try_unwrap(arc) { + Ok(mutex) => mutex.into_inner().unwrap().get_result(), + Err(arc) => arc.lock().unwrap().get_result(), + } +} diff --git a/src/parquet/src/io/mod.rs b/src/parquet/src/io/mod.rs index 01ce70b6..53782dbe
100644 --- a/src/parquet/src/io/mod.rs +++ b/src/parquet/src/io/mod.rs @@ -13,9 +13,9 @@ use liquid_cache_storage::cache::{CacheExpression, EntryID, IoContext, LiquidCom use crate::cache::{ColumnAccessPath, ParquetArrayID}; #[cfg(target_os = "linux")] -mod io_uring; +pub mod io_uring; -mod io_backend; +pub mod io_backend; #[derive(Debug)] pub(crate) struct ParquetIoContext { @@ -26,19 +26,25 @@ pub(crate) struct ParquetIoContext { } impl ParquetIoContext { - pub fn new(base_dir: PathBuf, io_mode: IoMode) -> Self { + /// Creates the context. For an io_uring mode on Linux this initialises the fixed-buffer pool + /// (only when `fixed_buffer_pool_size_mb > 0`) and then the uring pool. + /// + /// # Panics + /// + /// Panics for an io_uring mode on a non-Linux target, and when a fixed-buffer pool is requested + /// for a mode that is not an io_uring mode. + pub fn new(base_dir: PathBuf, io_mode: IoMode, fixed_buffer_pool_size_mb: usize) -> Self { if matches!( io_mode, - IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking + IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking | IoMode::UringNonBlocking ) { #[cfg(target_os = "linux")] { - crate::io::io_uring::initialize_uring_pool(io_mode); + use liquid_cache_common::memory::pool::FixedBufferPool; + if fixed_buffer_pool_size_mb > 0 { + FixedBufferPool::init(fixed_buffer_pool_size_mb); + } + crate::io::io_uring::initialize_uring_pool(io_mode, fixed_buffer_pool_size_mb > 0); } #[cfg(not(target_os = "linux"))] { panic!("io_mode {:?} is only supported on Linux", io_mode); } + } else if fixed_buffer_pool_size_mb > 0 { + panic!("Fixed buffers are only supported for UringDirect, Uring, UringBlocking and UringNonBlocking"); } Self { @@ -133,6 +139,87 @@ impl IoContext for ParquetIoContext { } } +/// Simple [IoContext] with IO mode selection (tokio, blocking, io_uring, etc.). +/// Uses simple EntryID-based paths and a single compressor, like storage's [liquid_cache_storage::cache::DefaultIoContext], +/// but delegates read/write to [io_backend] so all [IoMode]s are supported. +#[derive(Debug)] +pub struct SimpleIoContext { + compressor_state: Arc, + squeeze_hints: RwLock>>, + base_dir: PathBuf, + io_mode: IoMode, +} + +impl SimpleIoContext { + /// Create a new [SimpleIoContext] with the given base directory and IO mode.
+ pub fn new(base_dir: PathBuf, io_mode: IoMode, fixed_buffer_pool_size_mb: usize) -> Self { + if matches!( + io_mode, + IoMode::UringDirect | IoMode::Uring | IoMode::UringBlocking | IoMode::UringNonBlocking + ) { + #[cfg(target_os = "linux")] + { + use liquid_cache_common::memory::pool::FixedBufferPool; + if fixed_buffer_pool_size_mb > 0 { + FixedBufferPool::init(fixed_buffer_pool_size_mb); + } + crate::io::io_uring::initialize_uring_pool(io_mode, fixed_buffer_pool_size_mb > 0); + } + #[cfg(not(target_os = "linux"))] + { + panic!("io_mode {:?} is only supported on Linux", io_mode); + } + } else if fixed_buffer_pool_size_mb > 0 { + panic!("Fixed buffers are only supported for UringDirect, Uring and UringBlocking"); + } + + Self { + compressor_state: Arc::new(LiquidCompressorStates::new()), + squeeze_hints: RwLock::new(AHashMap::new()), + base_dir, + io_mode, + } + } +} + +#[async_trait::async_trait] +impl IoContext for SimpleIoContext { + fn add_squeeze_hint(&self, entry_id: &EntryID, expression: Arc) { + let mut guard = self.squeeze_hints.write().unwrap(); + guard.insert(*entry_id, expression); + } + + fn squeeze_hint(&self, entry_id: &EntryID) -> Option> { + let guard = self.squeeze_hints.read().unwrap(); + guard.get(entry_id).cloned() + } + + fn get_compressor(&self, _entry_id: &EntryID) -> Arc { + self.compressor_state.clone() + } + + fn disk_path(&self, entry_id: &EntryID) -> PathBuf { + self.base_dir + .join(format!("{:016x}.liquid", usize::from(*entry_id))) + } + + #[inline(never)] + #[fastrace::trace] + async fn read( + &self, + path: PathBuf, + range: Option>, + ) -> Result { + io_backend::read(self.io_mode, path, range).await + } + + #[inline(never)] + #[fastrace::trace] + async fn write_file(&self, path: PathBuf, data: Bytes) -> Result<(), std::io::Error> { + io_backend::write(self.io_mode, path, data).await + } +} + #[cfg(test)] mod tests { use super::*; @@ -147,7 +234,7 @@ mod tests { #[test] fn squeeze_hint_tracks_majority() { let tmp = 
tempdir().unwrap(); - let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking); + let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking, 0); let e = entry(1, 2, 3); let month = Arc::new(CacheExpression::extract_date32(Date32Field::Month)); let year = Arc::new(CacheExpression::extract_date32(Date32Field::Year)); @@ -163,7 +250,7 @@ mod tests { #[test] fn squeeze_hint_prefers_recent_on_tie() { let tmp = tempdir().unwrap(); - let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking); + let ctx = ParquetIoContext::new(tmp.path().to_path_buf(), IoMode::StdBlocking, 0); let e = entry(9, 9, 9); let year = Arc::new(CacheExpression::extract_date32(Date32Field::Year)); let day = Arc::new(CacheExpression::extract_date32(Date32Field::Day)); diff --git a/src/parquet/src/lib.rs b/src/parquet/src/lib.rs index 69f8bbcd..bfa39257 100644 --- a/src/parquet/src/lib.rs +++ b/src/parquet/src/lib.rs @@ -15,3 +15,10 @@ pub use liquid_cache_storage as storage; pub use reader::variant_udf::{VariantGetUdf, VariantPretty, VariantToJsonUdf}; pub use reader::{FilterCandidateBuilder, LiquidParquetSource, LiquidPredicate, LiquidRowFilter}; pub use utils::{boolean_buffer_and_then, extract_execution_metrics}; + +#[cfg(target_os = "linux")] +pub use crate::io::io_uring::local_runtime::UringExecutor; +#[cfg(target_os = "linux")] +pub use crate::io::io_uring::work_stealing::WorkStealingUringRuntime; + +pub use crate::io::SimpleIoContext; diff --git a/src/parquet/src/optimizers/lineage_opt.rs b/src/parquet/src/optimizers/lineage_opt.rs index 58f25cb3..1276beb3 100644 --- a/src/parquet/src/optimizers/lineage_opt.rs +++ b/src/parquet/src/optimizers/lineage_opt.rs @@ -1102,6 +1102,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ))) } diff --git a/src/parquet/src/optimizers/mod.rs b/src/parquet/src/optimizers/mod.rs index 2fdfecd9..2cd26e38 100644 --- a/src/parquet/src/optimizers/mod.rs 
+++ b/src/parquet/src/optimizers/mod.rs @@ -339,6 +339,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, )); let rewritten = rewrite_data_source_plan(plan, &liquid_cache, true); diff --git a/src/parquet/src/reader/runtime/liquid_cache_reader.rs b/src/parquet/src/reader/runtime/liquid_cache_reader.rs index 725cf8ba..8a92f547 100644 --- a/src/parquet/src/reader/runtime/liquid_cache_reader.rs +++ b/src/parquet/src/reader/runtime/liquid_cache_reader.rs @@ -301,6 +301,7 @@ mod tests { Box::new(Evict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let field = Arc::new(Field::new("col0", DataType::Int32, false)); let schema = Arc::new(Schema::new(vec![field.clone()])); diff --git a/src/parquet/src/reader/runtime/liquid_stream.rs b/src/parquet/src/reader/runtime/liquid_stream.rs index 359068de..ee2ff130 100644 --- a/src/parquet/src/reader/runtime/liquid_stream.rs +++ b/src/parquet/src/reader/runtime/liquid_stream.rs @@ -716,6 +716,7 @@ mod tests { Box::new(Evict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let file = cache.register_or_get_file("test.parquet".to_string(), schema); file.create_row_group(0, vec![]) diff --git a/src/server/src/lib.rs b/src/server/src/lib.rs index a4af6733..ac0f2221 100644 --- a/src/server/src/lib.rs +++ b/src/server/src/lib.rs @@ -122,6 +122,7 @@ impl LiquidCacheService { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), None, + 0, ) } @@ -141,6 +142,7 @@ impl LiquidCacheService { squeeze_policy: Box, hydration_policy: Box, io_mode: Option, + fixed_buffer_pool_size_mb: usize, ) -> anyhow::Result { let disk_cache_dir = match disk_cache_dir { Some(dir) => dir, @@ -163,6 +165,7 @@ impl LiquidCacheService { squeeze_policy, hydration_policy, io_mode, + fixed_buffer_pool_size_mb, ), }) } diff --git a/src/server/src/service.rs b/src/server/src/service.rs index ec41586e..cc240fad 100644 --- a/src/server/src/service.rs +++ b/src/server/src/service.rs @@ -52,6 +52,7 
@@ impl LiquidCacheServiceInner { squeeze_policy: Box, hydration_policy: Box, io_mode: IoMode, + fixed_buffer_pool_size_mb: usize, ) -> Self { let batch_size = default_ctx.state().config().batch_size(); @@ -66,6 +67,7 @@ impl LiquidCacheServiceInner { squeeze_policy, hydration_policy, io_mode, + fixed_buffer_pool_size_mb, )); Self { @@ -224,6 +226,7 @@ mod tests { Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); let url = Url::parse("file:///").unwrap(); server diff --git a/src/server/src/tests/mod.rs b/src/server/src/tests/mod.rs index 1e9db2b8..4bbae70d 100644 --- a/src/server/src/tests/mod.rs +++ b/src/server/src/tests/mod.rs @@ -46,6 +46,7 @@ async fn run_sql( squeeze_policy, Box::new(AlwaysHydrate::new()), IoMode::Uring, + 0, ); async fn get_result(service: &LiquidCacheServiceInner, sql: &str) -> String { let handle = Uuid::new_v4();