From 2064d3d7ae036f288e0228ab4d5f8113b88b747c Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 27 Feb 2026 17:11:59 +0000 Subject: [PATCH 1/8] chore: clean up `WeightLoader` `Source` management Clean up unnecessary code and simplifying the `WeightLoader` struct --- lib/rust/hmll/examples/basic.rs | 7 +- lib/rust/hmll/examples/multi_files.rs | 2 +- lib/rust/hmll/src/loader.rs | 162 ++++++++++++-------------- lib/rust/hmll/src/source.rs | 13 ++- 4 files changed, 91 insertions(+), 93 deletions(-) diff --git a/lib/rust/hmll/examples/basic.rs b/lib/rust/hmll/examples/basic.rs index 0a2fd8b..579ed2a 100644 --- a/lib/rust/hmll/examples/basic.rs +++ b/lib/rust/hmll/examples/basic.rs @@ -30,18 +30,19 @@ fn main() -> Result<(), Box> { ); // Store in an array to ensure proper lifetime - let sources = [source]; + let source_size = source.size(); + let sources = vec![source]; // Create a weight loader println!("\nCreating weight loader..."); - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto)?; + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto)?; println!("✓ Loader created successfully"); println!(" Device: {}", loader.device()); println!(" Number of sources: {}", loader.num_sources()); // Fetch some data from the beginning of the file let fetch_size = end - start; - let actual_fetch_size = fetch_size.min(sources[0].size()); + let actual_fetch_size = fetch_size.min(source_size); println!( "\nFetching {} bytes ({:.2} MB)...", actual_fetch_size, diff --git a/lib/rust/hmll/examples/multi_files.rs b/lib/rust/hmll/examples/multi_files.rs index 88ebb18..fb10f9a 100644 --- a/lib/rust/hmll/examples/multi_files.rs +++ b/lib/rust/hmll/examples/multi_files.rs @@ -49,7 +49,7 @@ fn main() -> Result<(), Box> { // Create a weight loader for all sources println!("\nCreating weight loader for {} sources...", sources.len()); - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto)?; + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto)?; println!("✓ Loader created successfully"); // Display information about each source diff --git a/lib/rust/hmll/src/loader.rs b/lib/rust/hmll/src/loader.rs index d552fdf..21877a4 100644 --- a/lib/rust/hmll/src/loader.rs +++ b/lib/rust/hmll/src/loader.rs @@ -2,12 +2,9 @@ use hmll_sys::hmll_iobuf; -use crate::source::SourceHandle; use crate::{Buffer, Device, Error, Range, Result, Source}; use std::collections::HashSet; -use std::marker::PhantomData; use std::ptr; -use std::sync::Arc; /// Loader backend kind. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -74,17 +71,13 @@ impl Default for LoaderKind { /// # Ok(()) /// # } /// ``` -pub struct WeightLoader<'a> { +pub struct WeightLoader { context: Box, - /// Raw sources passed to C layer - sources: Vec, - /// Arc handles for each source - keeps mmap alive while views exist - source_handles: Vec>, + sources: Vec, device: Device, - _marker: PhantomData<&'a ()>, } -impl<'a> WeightLoader<'a> { +impl WeightLoader { /// Create a new weight loader. /// /// # Arguments @@ -96,15 +89,11 @@ impl<'a> WeightLoader<'a> { /// # Errors /// /// Returns an error if the loader initialization fails. - pub fn new(sources: &'a [Source], device: Device, kind: LoaderKind) -> Result { + pub fn new(sources: Vec, device: Device, kind: LoaderKind) -> Result { if sources.is_empty() { return Err(Error::InvalidRange); } - // Clone Arc handles to keep sources alive while views exist - let source_handles: Vec> = - sources.iter().map(|s| s.handle().clone()).collect(); - let mut sources_vec: Vec = sources.iter().map(|s| *s.as_raw()).collect(); let mut context = Box::new(hmll_sys::hmll { @@ -130,19 +119,16 @@ impl<'a> WeightLoader<'a> { // For mmap backend, close fds immediately - mmap is independent of fd. // NOTE: Could detect Auto resolving to mmap and close too, but left out for now. if kind == LoaderKind::Mmap { - for handle in &source_handles { - let inner_ptr = &handle.inner as *const _ as *mut hmll_sys::hmll_source; - hmll_sys::hmll_source_close(inner_ptr); + for source in &sources { + source.close_fd(); } } } Ok(Self { context, - sources: sources_vec, - source_handles, + sources, device, - _marker: PhantomData, }) } @@ -189,7 +175,7 @@ impl<'a> WeightLoader<'a> { /// # } /// ``` pub fn fetchv(&mut self, ranges: &[Range], file_index: usize) -> Result> { - if file_index >= self.sources.len() { + if file_index >= self.num_sources() { return Err(Error::InvalidFileIndex(file_index)); } @@ -301,7 +287,7 @@ impl<'a> WeightLoader<'a> { pub fn fetch>(&mut self, range: R, file_index: usize) -> Result { let range = range.into(); - if file_index >= self.sources.len() { + if file_index >= self.num_sources() { return Err(Error::InvalidFileIndex(file_index)); } @@ -378,10 +364,6 @@ impl<'a> WeightLoader<'a> { pub fn fetch_view>(&mut self, range: R, file_index: usize) -> Result { let range = range.into(); - if file_index >= self.sources.len() { - return Err(Error::InvalidFileIndex(file_index)); - } - if range.is_empty() { return Ok(Buffer::empty(self.device)); } @@ -392,10 +374,14 @@ impl<'a> WeightLoader<'a> { } // Clone the Arc to keep the source (and its mmap) alive - let source_handle = self.source_handles[file_index].clone(); + let source = self + .sources + .get(file_index) + .cloned() + .ok_or(Error::InvalidFileIndex(file_index))?; // Get the mmap'd content pointer directly from the source - let content_ptr = source_handle.inner.content; + let content_ptr = source.as_raw().content; if content_ptr.is_null() { return Err(Error::MmapFailed); } @@ -406,7 +392,7 @@ impl<'a> WeightLoader<'a> { let view_size = range.len(); // Create a view buffer - Arc keeps source (and mmap) alive - Ok(unsafe { Buffer::from_source_view(view_ptr, view_size, self.device, source_handle) }) + Ok(unsafe { Buffer::from_source_view(view_ptr, view_size, self.device, source.handle()) }) } /// Get the device this loader is configured for. @@ -424,11 +410,13 @@ impl<'a> WeightLoader<'a> { /// Get information about a specific source file. #[inline] pub fn source_info(&self, index: usize) -> Option { - self.sources.get(index).map(|s| SourceInfo { size: s.size }) + self.sources + .get(index) + .map(|s| SourceInfo { size: s.size() }) } } -impl<'a> Drop for WeightLoader<'a> { +impl Drop for WeightLoader { fn drop(&mut self) { unsafe { hmll_sys::hmll_destroy(self.context.as_mut()); @@ -459,7 +447,7 @@ mod tests { #[test] fn test_empty_sources() { - let result = WeightLoader::new(&[], Device::Cpu, LoaderKind::Auto); + let result = WeightLoader::new(vec![], Device::Cpu, LoaderKind::Auto); assert!(result.is_err()); } @@ -479,9 +467,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); assert_eq!(loader.device(), Device::Cpu); @@ -497,9 +485,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let buffer = loader @@ -519,9 +507,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let buffer = loader @@ -540,9 +528,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let buffer = loader.fetch(5..5, 0).expect("Failed to fetch empty range"); @@ -557,9 +545,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let result = loader.fetch(0..10, 99); @@ -580,9 +568,9 @@ mod tests { let source2 = Source::open(temp2.path()).expect("Failed to open source 2"); let source3 = Source::open(temp3.path()).expect("Failed to open source 3"); - let sources = [source1, source2, source3]; + let sources = vec![source1, source2, source3]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); assert_eq!(loader.num_sources(), 3); @@ -609,9 +597,9 @@ mod tests { let temp_file = create_test_file(&content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let buffer = loader @@ -630,9 +618,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let info = loader.source_info(0); @@ -649,9 +637,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let buffer = loader.fetch(0..content.len(), 0).expect("Failed to fetch"); @@ -666,9 +654,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); let buffer = loader @@ -684,9 +672,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); let view = loader @@ -705,9 +693,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); // Get a view of just the uppercase letters @@ -726,9 +714,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); let view = loader @@ -745,9 +733,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); let result = loader.fetch_view(0..10, 99); @@ -762,9 +750,9 @@ mod tests { let view = { let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); loader @@ -784,9 +772,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); // Create multiple views from the same source @@ -817,9 +805,9 @@ mod tests { let source1 = Source::open(temp1.path()).expect("Failed to open source 1"); let source2 = Source::open(temp2.path()).expect("Failed to open source 2"); - let sources = [source1, source2]; + let sources = vec![source1, source2]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); let view1 = loader @@ -843,9 +831,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges = vec![Range::new(0, 10), Range::new(10, 20), Range::new(20, 36)]; @@ -864,9 +852,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges = vec![Range::new(0, content.len())]; @@ -882,9 +870,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let buffers = loader.fetchv(&[], 0).expect("Failed to fetchv"); @@ -897,9 +885,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges = vec![ @@ -924,9 +912,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges = vec![Range::new(0, 0), Range::new(5, 5), Range::new(10, 3)]; @@ -943,9 +931,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges = vec![Range::new(0, 10)]; @@ -963,9 +951,9 @@ mod tests { let source1 = Source::open(temp1.path()).expect("Failed to open source 1"); let source2 = Source::open(temp2.path()).expect("Failed to open source 2"); - let sources = [source1, source2]; + let sources = vec![source1, source2]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges1 = vec![Range::new(0, content1.len())]; @@ -985,9 +973,9 @@ mod tests { let temp_file = create_test_file(&content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); // Split into 16 equal chunks @@ -1012,9 +1000,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto) .expect("Failed to create loader"); let ranges = vec![Range::new(0, 10), Range::new(10, 20), Range::new(36, 46)]; @@ -1044,9 +1032,9 @@ mod tests { let temp_file = create_test_file(content); let source = Source::open(temp_file.path()).expect("Failed to open source"); - let sources = [source]; + let sources = vec![source]; - let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap) + let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap) .expect("Failed to create mmap loader"); let ranges = vec![Range::new(0, 10), Range::new(10, 20)]; diff --git a/lib/rust/hmll/src/source.rs b/lib/rust/hmll/src/source.rs index b826cf9..6b041e2 100644 --- a/lib/rust/hmll/src/source.rs +++ b/lib/rust/hmll/src/source.rs @@ -99,8 +99,17 @@ impl Source { /// /// This is used internally for creating views that outlive the loader. #[inline(always)] - pub(crate) fn handle(&self) -> &Arc { - &self.handle + pub(crate) fn handle(&self) -> Arc { + self.handle.clone() + } + + /// Close the file descriptor associated to the [`Source`]. + /// Useful for mmap when we don't need a dangling file descriptor. + pub(crate) fn close_fd(&self) { + unsafe { + let inner_ptr = &self.handle.inner as *const _ as *mut hmll_sys::hmll_source; + hmll_sys::hmll_source_close(inner_ptr); + } } } From d3efa5a7ff931a9d55a33e532002726fcf6f2ab9 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 27 Feb 2026 17:20:24 +0000 Subject: [PATCH 2/8] fix: code doc examples were referecing old code --- lib/rust/hmll/src/lib.rs | 12 ++++++------ lib/rust/hmll/src/loader.rs | 18 ++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/lib/rust/hmll/src/lib.rs b/lib/rust/hmll/src/lib.rs index f5bcbf5..3161eec 100644 --- a/lib/rust/hmll/src/lib.rs +++ b/lib/rust/hmll/src/lib.rs @@ -22,16 +22,16 @@ //! ```no_run //! # use hmll::{Source, WeightLoader, Device, LoaderKind}; //! # fn main() -> Result<(), Box> { -//! let sources = [ -//! Source::open("shard-00001.bin")?, -//! Source::open("shard-00002.bin")?, +//! let sources = vec![ +//! Source::open("model-00001.bin")?, +//! Source::open("model-00002.bin")?, //! ]; //! -//! let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto)?; +//! let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto)?; //! //! // Fetch bytes from specific file by index -//! let data = loader.fetch(0..1024, 0)?; // from shard 1 -//! let data = loader.fetch(0..1024, 1)?; // from shard 2 +//! let data = loader.fetch(0..1024, 0)?; // from source 1 +//! let data = loader.fetch(0..1024, 1)?; // from source 2 //! # Ok(()) //! # } //! ``` diff --git a/lib/rust/hmll/src/loader.rs b/lib/rust/hmll/src/loader.rs index 21877a4..f3e8aa8 100644 --- a/lib/rust/hmll/src/loader.rs +++ b/lib/rust/hmll/src/loader.rs @@ -33,8 +33,6 @@ impl LoaderKind { impl Default for LoaderKind { /// Default loader kind is Auto. - /// - /// Hot path - inline always for zero-cost default. #[inline(always)] fn default() -> Self { LoaderKind::Auto @@ -56,11 +54,11 @@ impl Default for LoaderKind { /// let source1 = Source::open("model-00001-of-00003.safetensors")?; /// let source2 = Source::open("model-00002-of-00003.safetensors")?; /// let source3 = Source::open("model-00003-of-00003.safetensors")?; -/// let sources = [source1, source2, source3]; +/// let sources = vec![source1, source2, source3]; /// /// // Create a loader /// let mut loader = WeightLoader::new( -/// &sources, +/// sources, /// Device::Cpu, /// LoaderKind::Auto /// )?; @@ -158,8 +156,8 @@ impl WeightLoader { /// # use hmll::{Source, WeightLoader, Device, LoaderKind, Range}; /// # fn main() -> Result<(), Box> { /// # let source = Source::open("model.safetensors")?; - /// # let sources = [source]; - /// # let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto)?; + /// # let sources = vec![source]; + /// # let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto)?; /// /// // Fetch multiple weight tensors in a single batch /// let ranges = vec![ @@ -275,8 +273,8 @@ impl WeightLoader { /// # use hmll::{Source, WeightLoader, Device, LoaderKind}; /// # fn main() -> Result<(), Box> { /// # let source = Source::open("model.safetensors")?; - /// # let sources = [source]; - /// # let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Auto)?; + /// # let sources = vec![source]; + /// # let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Auto)?; /// /// // Fetch first 1MB from the first file /// let data = loader.fetch(0..1024 * 1024, 0)?; @@ -350,8 +348,8 @@ impl WeightLoader { /// # use hmll::{Source, WeightLoader, Device, LoaderKind}; /// # fn main() -> Result<(), Box> { /// # let source = Source::open("model.safetensors")?; - /// # let sources = [source]; - /// # let mut loader = WeightLoader::new(&sources, Device::Cpu, LoaderKind::Mmap)?; + /// # let sources = vec![source]; + /// # let mut loader = WeightLoader::new(sources, Device::Cpu, LoaderKind::Mmap)?; /// /// // Get a zero-copy view into the first 1MB /// let view = loader.fetch_view(0..1024 * 1024, 0)?; From acee5d5ef7bf4f0404c0c159c49051c5562dc12a Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 2 Mar 2026 22:19:20 +0000 Subject: [PATCH 3/8] refactor: derive `Debug` for `WeightLoader` --- lib/rust/hmll/src/loader.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/rust/hmll/src/loader.rs b/lib/rust/hmll/src/loader.rs index f3e8aa8..bc5492d 100644 --- a/lib/rust/hmll/src/loader.rs +++ b/lib/rust/hmll/src/loader.rs @@ -69,6 +69,7 @@ impl Default for LoaderKind { /// # Ok(()) /// # } /// ``` +#[derive(Debug)] pub struct WeightLoader { context: Box, sources: Vec, From e2f76226b0a1d63193788f898cc3a8d6cf27c925 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 6 Mar 2026 16:44:27 +0000 Subject: [PATCH 4/8] feat: wrap buffer in arc for easy shared ownership --- lib/rust/hmll/src/buffer.rs | 77 ++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/lib/rust/hmll/src/buffer.rs b/lib/rust/hmll/src/buffer.rs index 8486095..f0aaa33 100644 --- a/lib/rust/hmll/src/buffer.rs +++ b/lib/rust/hmll/src/buffer.rs @@ -90,16 +90,41 @@ impl From for ops::Range { /// - **Empty**: Zero-length buffer with no memory. /// - **Owned**: Allocated memory that is freed when the buffer is dropped. /// - **SourceView**: Zero-copy pointer into mmap'd memory, kept alive via Arc. -pub struct Buffer { +pub struct BufferInner { buf: hmll_iobuf, kind: BufferKind, } +impl Drop for BufferInner { + fn drop(&mut self) { + if let BufferKind::Owned = self.kind { + if !self.buf.ptr.is_null() { + unsafe { hmll_free_buffer(&mut self.buf) }; + } + } + // For SourceView: the Arc is dropped automatically, decrementing refcount. + // When the last Arc is dropped, SourceHandle::drop() unmaps the memory. + } +} + +// SAFETY: BufferInner is safe to send across threads because: +// - The buffer data is immutable after creation (read-only access) +// - Owned buffers: memory is allocated by hmll, only freed on drop +// - SourceView buffers: memory is mmap'd and kept alive by Arc +// - No internal mutation occurs after construction +// +// Callers must NOT mutate data through `as_ptr()` - doing so would be UB. +unsafe impl Send for BufferInner {} +unsafe impl Sync for BufferInner {} + +#[derive(Clone)] +pub struct Buffer(Arc); + impl std::fmt::Debug for Buffer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Buffer") - .field("size", &self.buf.size) - .field("ptr", &self.buf.ptr) + .field("size", &self.0.buf.size) + .field("ptr", &self.0.buf.ptr) .field("device", &self.device()) .field("owned", &self.is_owned()) .finish() @@ -112,14 +137,14 @@ impl Buffer { /// This is useful when you need to represent a zero-length fetch result. #[inline(always)] pub fn empty(device: Device) -> Self { - Self { + Self(Arc::new(BufferInner { buf: hmll_iobuf { size: 0, ptr: std::ptr::null_mut(), device: device.to_raw(), }, kind: BufferKind::Empty, - } + })) } /// Create a new owned buffer from an `hmll_iobuf`. @@ -132,10 +157,10 @@ impl Buffer { /// and that the memory was allocated via hmll allocation functions. #[inline(always)] pub(crate) unsafe fn from_raw_owned(buf: hmll_iobuf) -> Self { - Self { + Self(Arc::new(BufferInner { buf, kind: BufferKind::Owned, - } + })) } /// Create a zero-copy view into mmap'd source memory. @@ -153,28 +178,28 @@ impl Buffer { device: Device, source_handle: Arc, ) -> Self { - Self { + Self(Arc::new(BufferInner { buf: hmll_iobuf { size, ptr, device: device.to_raw(), }, kind: BufferKind::SourceView(source_handle), - } + })) } /// Get the buffer as a byte slice (CPU only). #[inline] pub fn as_slice(&self) -> Option<&[u8]> { if self.device() == Device::Cpu { - if self.buf.ptr.is_null() || self.buf.size == 0 { + if self.0.buf.ptr.is_null() || self.0.buf.size == 0 { // Return empty slice for empty/null buffers Some(&[]) } else { unsafe { Some(std::slice::from_raw_parts( - self.buf.ptr as *const u8, - self.buf.size, + self.0.buf.ptr as *const u8, + self.0.buf.size, )) } } @@ -185,26 +210,26 @@ impl Buffer { /// Get the size of the buffer in bytes. #[inline(always)] - pub const fn len(&self) -> usize { - self.buf.size + pub fn len(&self) -> usize { + self.0.buf.size } /// Check if the buffer is empty. #[inline(always)] - pub const fn is_empty(&self) -> bool { - self.buf.size == 0 + pub fn is_empty(&self) -> bool { + self.0.buf.size == 0 } /// Get the device where the buffer is located. #[inline(always)] pub fn device(&self) -> Device { - Device::from_raw(self.buf.device) + Device::from_raw(self.0.buf.device) } /// Get a raw pointer to the buffer. #[inline(always)] - pub const fn as_ptr(&self) -> *const u8 { - self.buf.ptr as *const u8 + pub fn as_ptr(&self) -> *const u8 { + self.0.buf.ptr as *const u8 } /// Convert to a Vec (copies data if on CPU). @@ -223,18 +248,6 @@ impl Buffer { /// and are kept alive by an Arc reference to the source. #[inline(always)] pub fn is_owned(&self) -> bool { - matches!(self.kind, BufferKind::Owned) - } -} - -impl Drop for Buffer { - fn drop(&mut self) { - if let BufferKind::Owned = self.kind { - if !self.buf.ptr.is_null() { - unsafe { hmll_free_buffer(&mut self.buf) }; - } - } - // For SourceView: the Arc is dropped automatically, decrementing refcount. - // When the last Arc is dropped, SourceHandle::drop() unmaps the memory. + matches!(self.0.kind, BufferKind::Owned) } } From c17d43e87ddc1e6604c3712b42a8bb183d46b0b1 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Wed, 11 Mar 2026 20:32:15 +0000 Subject: [PATCH 5/8] fix: various bugs, including sqe partial init EFAULT --- include/hmll/linux/backend/iouring.h | 9 +++-- lib/linux/backend/iouring.c | 53 ++++++++++++++++++---------- lib/rust/hmll/src/error.rs | 16 +++++++-- lib/unix/memory.c | 9 +++-- 4 files changed, 61 insertions(+), 26 deletions(-) diff --git a/include/hmll/linux/backend/iouring.h b/include/hmll/linux/backend/iouring.h index 85c6a8d..7f575f8 100644 --- a/include/hmll/linux/backend/iouring.h +++ b/include/hmll/linux/backend/iouring.h @@ -107,8 +107,13 @@ static inline int hmll_io_uring_slot_find_available(const struct hmll_iouring_io { for (unsigned i = 0; i < HMLL_URING_IOBUSY_WORDS; ++i) { const int pos = __builtin_ffsll(~iobusy.bits[i]); - if (pos > 0) - return (int)(i * 64) + pos - 1; + if (pos > 0) { + const int slot = (int)(i * 64) + pos - 1; + // Ensure we don't return slots beyond QUEUE_DEPTH + if (slot >= (int)HMLL_URING_QUEUE_DEPTH) + return -1; + return slot; + } } return -1; } diff --git a/lib/linux/backend/iouring.c b/lib/linux/backend/iouring.c index 661d7e0..6a520d3 100644 --- a/lib/linux/backend/iouring.c +++ b/lib/linux/backend/iouring.c @@ -17,15 +17,16 @@ static inline int hmll_io_uring_get_setup_flags(void) { - int flags = IORING_SETUP_SQPOLL; - - // retrieve the current kernel version so we can adjust io_uring flags - struct utsname unamedata; - uname(&unamedata); - - int major, minor, revision = 0; - if (sscanf(unamedata.release, "%d.%d.%d", &major, &minor, &revision)) { - if (major >= 6) flags |= IORING_SETUP_SINGLE_ISSUER; + struct utsname uts; + int flags = 0; + + if (uname(&uts) == 0) { + int major = 0, minor = 0; + sscanf(uts.release, "%d.%d", &major, &minor); + // SQPOLL requires root or CAP_SYS_NICE, skip for now + // SINGLE_ISSUER available since 6.0 + if (major > 6 || (major == 6 && minor >= 0)) + flags |= IORING_SETUP_SINGLE_ISSUER; } return flags; @@ -42,7 +43,9 @@ static struct hmll_error hmll_io_uring_register_staging_buffers( return ctx->error; } - unsigned char *arena = hmll_alloc(HMLL_URING_QUEUE_DEPTH * HMLL_URING_BUFFER_SIZE, device, HMLL_MEM_STAGING); + const size_t staging_size = HMLL_URING_QUEUE_DEPTH * HMLL_URING_BUFFER_SIZE; + + unsigned char *arena = hmll_alloc(staging_size, device, HMLL_MEM_STAGING); if (!arena) { ctx->error = HMLL_ERR(HMLL_ERR_ALLOCATION_FAILED); return ctx->error; @@ -96,13 +99,14 @@ static inline void hmll_io_uring_reclaim_slots( // TODO(mfuntowicz): Should we directly store `slots` which are doing memcpy currently to avoid full scan? for (size_t i = 0; i < HMLL_URING_QUEUE_DEPTH; ++i) { struct hmll_io_uring_cuda_context *cd = dctx + i; - if (hmll_io_uring_slot_is_busy(fetcher->iobusy, i)) { - if (cd->state == HMLL_CUDA_STREAM_MEMCPY && cudaEventQuery(cd->done) == cudaSuccess) { + if (hmll_io_uring_slot_is_busy(fetcher->iobusy, i) && cd->state == HMLL_CUDA_STREAM_MEMCPY) { + if (cudaEventQuery(cd->done) == cudaSuccess) { hmll_io_uring_cuda_stream_set_idle(&cd->state); hmll_io_uring_slot_set_available(&fetcher->iobusy, cd->slot); } } } + #else HMLL_UNUSED(fetcher); HMLL_UNUSED(device); @@ -134,7 +138,9 @@ static inline void hmll_io_uring_prep_sqe( struct hmll_io_uring_cuda_context *dctx = fetcher->device_ctx; void *buf = fetcher->iovecs[slot].iov_base; + dctx[slot].offset = offset; + hmll_io_uring_cuda_stream_set_idle(&dctx[slot].state); io_uring_prep_read_fixed(sqe, iofile, buf, len, offset, slot); io_uring_sqe_set_data(sqe, dctx + slot); } @@ -239,7 +245,7 @@ static ssize_t hmll_io_uring_fetch_range_impl( if (unlikely(io_uring_submit_and_wait(&fetcher->ioring, nwait) < 0)) { // todo: do we need to reset the cca? hmll_io_uring_cca_init(&fetcher->iocca) - ctx->error = HMLL_ERR(HMLL_ERR_IO_ERROR); + ctx->error = HMLL_SYS_ERR(errno); return -1; } clock_gettime(CLOCK_MONOTONIC, &ts_end); @@ -344,14 +350,15 @@ static ssize_t hmll_io_uring_fetchv_range_impl( while (n_active > 0 || n_in_flight > 0) { while (n_active > 0) { - struct io_uring_sqe *sqe = io_uring_get_sqe(&fetcher->ioring); - if (!sqe) break; - if (active_cursor >= n_active) active_cursor = 0; const uint32_t current_idx = active_indices[active_cursor]; struct fetch_state *st = &states[current_idx]; + // Handle fadvise first - doesn't need a buffer slot if (unlikely(!st->fadvise_sent)) { + struct io_uring_sqe *sqe = io_uring_get_sqe(&fetcher->ioring); + if (!sqe) break; + io_uring_prep_fadvise(sqe, iofile, offsets[current_idx], st->size, POSIX_FADV_SEQUENTIAL | POSIX_FADV_WILLNEED); io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); io_uring_sqe_set_data64(sqe, BIT_FADVISE); @@ -359,6 +366,8 @@ static ssize_t hmll_io_uring_fetchv_range_impl( continue; } + // For read operations, check slot availability BEFORE getting SQE + // to avoid leaving uninitialized SQEs in the submission queue int slot = hmll_io_uring_slot_find_available(fetcher->iobusy); if (slot == -1) { hmll_io_uring_reclaim_slots(fetcher, dsts[0].device); @@ -366,6 +375,9 @@ static ssize_t hmll_io_uring_fetchv_range_impl( if (slot == -1) break; } + struct io_uring_sqe *sqe = io_uring_get_sqe(&fetcher->ioring); + if (!sqe) break; + hmll_io_uring_slot_set_busy(&fetcher->iobusy, slot); slot_offsets[slot] = st->submitted; @@ -386,6 +398,8 @@ static ssize_t hmll_io_uring_fetchv_range_impl( io_uring_sqe_set_data64(sqe, ((uint64_t)current_idx << SHIFT_RANGE) | slot); + // Track submission for debug (removed static vars that were causing confusion) + st->submitted += to_read; n_in_flight++; @@ -408,7 +422,7 @@ static ssize_t hmll_io_uring_fetchv_range_impl( clock_gettime(CLOCK_MONOTONIC, &ts_start); if (unlikely(io_uring_submit_and_wait(&fetcher->ioring, nwait) < 0)) { - ctx->error = HMLL_ERR(HMLL_ERR_IO_ERROR); + ctx->error = HMLL_SYS_ERR(errno); goto cleanup; } clock_gettime(CLOCK_MONOTONIC, &ts_end); @@ -426,7 +440,7 @@ static ssize_t hmll_io_uring_fetchv_range_impl( n_in_flight--; if (unlikely(cqe->res < 0)) { - ctx->error = HMLL_ERR(HMLL_ERR_IO_ERROR); + ctx->error = HMLL_SYS_ERR(-cqe->res); io_uring_cq_advance(&fetcher->ioring, count); goto cleanup; } @@ -572,7 +586,8 @@ void hmll_io_uring_destroy(void *ptr) } } - munmap(backend->iovecs[0].iov_base, HMLL_URING_QUEUE_DEPTH * sizeof(struct iovec)); + // Staging arena was allocated with cudaHostAlloc + cudaFreeHost(backend->iovecs[0].iov_base); free(backend->device_ctx); backend->device_ctx = NULL; } diff --git a/lib/rust/hmll/src/error.rs b/lib/rust/hmll/src/error.rs index adc3026..5bab340 100644 --- a/lib/rust/hmll/src/error.rs +++ b/lib/rust/hmll/src/error.rs @@ -39,8 +39,8 @@ pub enum Error { #[error("Buffer too small")] BufferTooSmall, - #[error("I/O error")] - IoError, + #[error("I/O error: {0}")] + IoError(String), #[error("No source provided")] NoSourceProvided, @@ -132,7 +132,17 @@ impl Error { HMLL_ERR_INVALID_RANGE => Error::InvalidRange, HMLL_ERR_BUFFER_ADDR_NOT_ALIGNED => Error::BufferAddrNotAligned, HMLL_ERR_BUFFER_TOO_SMALL => Error::BufferTooSmall, - HMLL_ERR_IO_ERROR => Error::IoError, + HMLL_ERR_IO_ERROR => { + let msg = unsafe { + let ptr = hmll_strerr(err); + if ptr.is_null() { + format!("errno {}", err.sys_err) + } else { + CStr::from_ptr(ptr).to_string_lossy().into_owned() + } + }; + Error::IoError(msg) + } HMLL_ERR_NO_SOURCE_PROVIDED => Error::NoSourceProvided, HMLL_ERR_FILE_NOT_FOUND => Error::FileNotFound(String::new()), HMLL_ERR_FILE_EMPTY => Error::FileEmpty, diff --git a/lib/unix/memory.c b/lib/unix/memory.c index 5d79e5c..1ebc79b 100644 --- a/lib/unix/memory.c +++ b/lib/unix/memory.c @@ -1,6 +1,8 @@ // // Created by mfuntowicz on 1/13/26. // +#include +#include #include #include "hmll/hmll.h" @@ -43,8 +45,11 @@ void *hmll_alloc(const size_t size, const struct hmll_device device, const int f if (hmll_device_is_cuda(device) && flags == HMLL_MEM_DEVICE) cudaMalloc(&ptr, size); - if (hmll_device_is_cuda(device) && flags == HMLL_MEM_STAGING) - cudaHostAlloc(&ptr, size, cudaHostAllocDefault | cudaHostAllocPortable); + if (hmll_device_is_cuda(device) && flags == HMLL_MEM_STAGING) { + cudaError_t err = cudaHostAlloc(&ptr, size, cudaHostAllocDefault | cudaHostAllocPortable); + if (err != cudaSuccess) + ptr = NULL; + } #endif return ptr; From 21f6a7f561bbf6034d1104c6c82b4609cc2f08fa Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Thu, 12 Mar 2026 09:44:49 +0000 Subject: [PATCH 6/8] refactor: revert var extraction --- lib/linux/backend/iouring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/linux/backend/iouring.c b/lib/linux/backend/iouring.c index c453850..fc10e76 100644 --- a/lib/linux/backend/iouring.c +++ b/lib/linux/backend/iouring.c @@ -79,9 +79,7 @@ static struct hmll_error hmll_io_uring_register_staging_buffers( return ctx->error; } - const size_t staging_size = HMLL_URING_QUEUE_DEPTH * HMLL_URING_BUFFER_SIZE; - - unsigned char *arena = hmll_alloc(staging_size, device, HMLL_MEM_STAGING); + unsigned char *arena = hmll_alloc(HMLL_URING_QUEUE_DEPTH * HMLL_URING_BUFFER_SIZE, device, HMLL_MEM_STAGING); if (!arena) { ctx->error = HMLL_ERR(HMLL_ERR_ALLOCATION_FAILED); return ctx->error; From ca2c42829178d17ed37ecf24f079d9fd8af5f878 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Thu, 12 Mar 2026 09:49:58 +0000 Subject: [PATCH 7/8] refactor: remove unneeded comments --- lib/linux/backend/iouring.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/linux/backend/iouring.c b/lib/linux/backend/iouring.c index fc10e76..8f4853d 100644 --- a/lib/linux/backend/iouring.c +++ b/lib/linux/backend/iouring.c @@ -554,8 +554,6 @@ static ssize_t hmll_io_uring_fetchv_loop( io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE); io_uring_sqe_set_data64(sqe, ((uint64_t)bidx << FETCHV_BIDX_SHIFT) | (uint64_t)slot); - // Track submission for debug (removed static vars that were causing confusion) - st->submitted += to_read; n_in_flight++; @@ -891,7 +889,6 @@ void hmll_io_uring_destroy(void *ptr) } } - // Staging arena was allocated with cudaHostAlloc cudaFreeHost(backend->iovecs[0].iov_base); free(backend->device_ctx); backend->device_ctx = NULL; From c67e7c04c2b0fe8afba4aaa0aa725ffa3cce2c23 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 20 Mar 2026 10:51:14 +0000 Subject: [PATCH 8/8] feat: add NUMA node awareness and pin threads accordingly --- lib/linux/backend/iouring.c | 99 ++++++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/lib/linux/backend/iouring.c b/lib/linux/backend/iouring.c index 8f4853d..7e56e8e 100644 --- a/lib/linux/backend/iouring.c +++ b/lib/linux/backend/iouring.c @@ -1,4 +1,7 @@ +#include #include +#include +#include #include #include "hmll/hmll.h" #include "hmll/memory.h" @@ -14,6 +17,61 @@ #include #endif +/* ── NUMA topology helpers ──────────────────────────────────────────── */ + +/** + * Get the NUMA node for a CUDA device by reading sysfs via the PCI bus ID. + * Returns -1 on failure. + */ +static int hmll_get_gpu_numa_node(const int device_idx) +{ +#if defined(__HMLL_CUDA_ENABLED__) + char pci_bus_id[64] = {0}; + if (cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), device_idx) != cudaSuccess) + return -1; + + /* Convert to lowercase for sysfs path (CUDA returns uppercase hex) */ + for (char *p = pci_bus_id; *p; p++) + *p = (*p >= 'A' && *p <= 'Z') ? (*p + 32) : *p; + + char path[256]; + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/numa_node", pci_bus_id); + + FILE *f = fopen(path, "r"); + if (!f) return -1; + + int node = -1; + if (fscanf(f, "%d", &node) != 1) node = -1; + fclose(f); + + return node; +#else + (void)device_idx; + return -1; +#endif +} + +/** + * Get the first CPU core on a given NUMA node by parsing sysfs. + * Returns -1 on failure. + */ +static int hmll_get_first_cpu_on_node(const int numa_node) +{ + if (numa_node < 0) return -1; + + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/node/node%d/cpulist", numa_node); + + FILE *f = fopen(path, "r"); + if (!f) return -1; + + int first_cpu = -1; + if (fscanf(f, "%d", &first_cpu) != 1) first_cpu = -1; + fclose(f); + + return first_cpu; +} + /* ── runtime kernel version detection ───────────────────────────────── */ static inline unsigned hmll_kernel_version_internal(unsigned maj, unsigned min) { @@ -80,6 +138,7 @@ static struct hmll_error hmll_io_uring_register_staging_buffers( } unsigned char *arena = hmll_alloc(HMLL_URING_QUEUE_DEPTH * HMLL_URING_BUFFER_SIZE, device, HMLL_MEM_STAGING); + if (!arena) { ctx->error = HMLL_ERR(HMLL_ERR_ALLOCATION_FAILED); return ctx->error; @@ -760,8 +819,19 @@ static struct hmll_error hmll_io_uring_queue_init( const struct hmll_device device ) { (void)ctx; + + /* Detect NUMA node for the target device and pin SQPOLL thread accordingly */ + int numa_node = -1; + int sq_cpu = 0; + + if (hmll_device_is_cuda(device)) { + numa_node = hmll_get_gpu_numa_node(device.idx); + int cpu = hmll_get_first_cpu_on_node(numa_node); + if (cpu >= 0) sq_cpu = cpu; + } + struct io_uring_params params = { - .sq_thread_cpu = 0, + .sq_thread_cpu = (unsigned)sq_cpu, .flags = hmll_io_uring_get_setup_flags(), .sq_thread_idle = 500 }; @@ -773,6 +843,33 @@ static struct hmll_error hmll_io_uring_queue_init( return HMLL_ERR(HMLL_ERR_CUDA_SET_DEVICE_FAILED); } + /* Pin this thread to the GPU's NUMA node for optimal memory allocation */ + if (numa_node >= 0) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/node/node%d/cpulist", numa_node); + FILE *f = fopen(path, "r"); + if (f) { + char buf[1024] = {0}; + if (fgets(buf, sizeof(buf), f)) { + /* Parse cpulist format: "0-23,48-71" */ + char *tok = strtok(buf, ",\n"); + while (tok) { + int lo, hi; + if (sscanf(tok, "%d-%d", &lo, &hi) == 2) { + for (int c = lo; c <= hi; c++) CPU_SET(c, &cpuset); + } else if (sscanf(tok, "%d", &lo) == 1) { + CPU_SET(lo, &cpuset); + } + tok = strtok(NULL, ",\n"); + } + } + fclose(f); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + } + } + struct hmll_io_uring_cuda_context *data = calloc(HMLL_URING_QUEUE_DEPTH, sizeof(struct hmll_io_uring_cuda_context)); backend->device_ctx = (void *)data;