From 2e218a0250416f8cdeacb1472aca62b75c8c649f Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 10:33:37 +0200 Subject: [PATCH 01/27] Batch builder API --- iceberg_rust_ffi/Cargo.toml | 2 +- iceberg_rust_ffi/src/batch_builder.rs | 686 +++++++++++++++++++++++++ iceberg_rust_ffi/src/lib.rs | 3 + iceberg_rust_ffi/src/writer.rs | 172 +------ iceberg_rust_ffi/src/writer_columns.rs | 324 +----------- src/RustyIceberg.jl | 2 +- src/writer.jl | 336 +++++------- test/writer_tests.jl | 303 ++++++++--- 8 files changed, 1115 insertions(+), 713 deletions(-) create mode 100644 iceberg_rust_ffi/src/batch_builder.rs diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml index f8fd7b9..9b33363 100644 --- a/iceberg_rust_ffi/Cargo.toml +++ b/iceberg_rust_ffi/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iceberg_rust_ffi" -version = "0.7.21" +version = "0.7.22" edition = "2021" [lib] diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs new file mode 100644 index 0000000..fcb225f --- /dev/null +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -0,0 +1,686 @@ +/// Incremental column batch builder for zero-copy-from-Julia writes. +/// +/// Julia calls `iceberg_batch_builder_append_slice` once per operator slice, passing +/// one `SliceRef` per column. Each slice's data is appended directly into a pre-allocated +/// `MutableBuffer` that becomes the Arrow column buffer at finalize time — no intermediate +/// Vec or second copy is needed. Julia can reuse source memory as soon as the call returns. +/// +/// Null bits are populated lazily: all-valid slices are skipped entirely. The first null +/// slice triggers a one-time backfill of all prior rows as valid, then subsequent slices +/// are processed normally. If no null slice ever arrives, no NullBuffer is emitted. +/// +/// When a coalesce window is full, Julia calls `iceberg_batch_builder_write` which +/// finalises all per-column buffers into Arrow arrays, assembles a `RecordBatch`, and +/// submits it to the async encode pool — then resets the builder in-place for reuse. +/// Reset swaps in a fresh pre-allocated `MutableBuffer` (same capacity) so the next +/// window never reallocates. +use std::sync::Arc; + +use arrow_array::{ + types::*, + ArrayRef, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, StringArray, +}; +use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_schema::SchemaRef as ArrowSchemaRef; + +use crate::writer::{submit_batch, IcebergDataFileWriter, GLOBAL_ENCODE_POOL}; +use crate::writer_columns::{ + SliceRef, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DATE, COLUMN_TYPE_DECIMAL_INT128, + COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, + COLUMN_TYPE_FLOAT64, COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_JULIA_DATE, + COLUMN_TYPE_JULIA_TIMESTAMP, COLUMN_TYPE_JULIA_TIMESTAMPTZ, COLUMN_TYPE_JULIA_TIMESTAMP_NS, + COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS, COLUMN_TYPE_STRING, COLUMN_TYPE_TIMESTAMP, + COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_UUID, +}; + +/// Days from Julia Date epoch (0001-01-01, Rata Die day 1) to Unix epoch (1970-01-01). +/// Julia Date stores days using 1-based Rata Die: Dates.value(Date(1970,1,1)) == 719163. +const JULIA_DATE_OFFSET: i64 = 719_163; +/// Milliseconds from Julia DateTime epoch (0001-01-01) to Unix epoch (1970-01-01). +const JULIA_TIMESTAMP_OFFSET_MS: i64 = 719_163 * 86_400_000; + +// Default coalesce_rows — must match Julia's DEFAULT_COALESCE_ROWS. +const DEFAULT_COALESCE_ROWS: usize = 1_048_576; + +/// Bytes per row for numeric column types (0 for Bool/Str which are not Numeric). +fn column_bytes_per_row(column_type: i32) -> usize { + match column_type { + COLUMN_TYPE_INT32 | COLUMN_TYPE_DATE | COLUMN_TYPE_FLOAT32 + | COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_JULIA_DATE => 4, + COLUMN_TYPE_INT64 | COLUMN_TYPE_TIMESTAMP | COLUMN_TYPE_TIMESTAMPTZ + | COLUMN_TYPE_FLOAT64 | COLUMN_TYPE_DECIMAL_INT64 + | COLUMN_TYPE_JULIA_TIMESTAMP | COLUMN_TYPE_JULIA_TIMESTAMPTZ + | COLUMN_TYPE_JULIA_TIMESTAMP_NS | COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => 8, + COLUMN_TYPE_DECIMAL_INT128 | COLUMN_TYPE_UUID => 16, + _ => 0, + } +} + +// --------------------------------------------------------------------------- +// Per-column value buffer +// +// All numeric and UUID variants use `MutableBuffer` (byte-oriented, Arrow-native). +// On append we write typed bytes directly into it; on finalize we call `.into()` to +// get an Arrow `Buffer` — ownership transfer, no copy. The buffer is then swapped out +// for a fresh pre-allocated one so the next window never reallocates. + +enum ColumnValues { + Numeric(MutableBuffer), // I32/I64/F32/F64/I128/UUID — bytes per row varies by type + Bool(Vec), // BOOLEAN — 1 byte per row; bit-packed at finalize + Str { + bytes: Vec, + offsets: Vec, // Arrow offset buffer; offsets[0] = 0 always + }, +} + +impl ColumnValues { + fn new(column_type: i32, coalesce_rows: usize) -> Self { + match column_type { + COLUMN_TYPE_BOOLEAN => ColumnValues::Bool(Vec::with_capacity(coalesce_rows)), + COLUMN_TYPE_STRING => ColumnValues::Str { + bytes: Vec::new(), + offsets: { + let mut v = Vec::with_capacity(coalesce_rows + 1); + v.push(0i32); + v + }, + }, + _ => { + let bpr = column_bytes_per_row(column_type).max(8); // fallback 8 for unknown + ColumnValues::Numeric(MutableBuffer::with_capacity(coalesce_rows * bpr)) + } + } + } +} + +// --------------------------------------------------------------------------- +// Per-column builder state + +struct ColumnBuilderState { + column_type: i32, + bytes_per_row: usize, // for Numeric variant: bytes per element + is_nullable: bool, + values: ColumnValues, + /// Lazily-populated validity bitmap. Empty until the first null slice. + null_bits: Vec, + rows: usize, + has_nulls: bool, +} + +impl ColumnBuilderState { + fn new(column_type: i32, is_nullable: bool, coalesce_rows: usize) -> Self { + Self { + column_type, + bytes_per_row: column_bytes_per_row(column_type), + is_nullable, + values: ColumnValues::new(column_type, coalesce_rows), + null_bits: Vec::new(), + rows: 0, + has_nulls: false, + } + } +} + +// --------------------------------------------------------------------------- +// Public builder type + +pub struct ColumnBatchBuilder { + columns: Vec, + arrow_schema: ArrowSchemaRef, + coalesce_rows: usize, +} + +impl ColumnBatchBuilder { + pub(crate) fn new( + arrow_schema: ArrowSchemaRef, + col_types: &[i32], + coalesce_rows: usize, + ) -> Result { + if col_types.len() != arrow_schema.fields().len() { + return Err(anyhow::anyhow!( + "col_types length {} != schema field count {}", + col_types.len(), + arrow_schema.fields().len() + )); + } + let columns = col_types + .iter() + .zip(arrow_schema.fields().iter()) + .map(|(&ct, field)| { + ColumnBuilderState::new(ct, field.is_nullable(), coalesce_rows) + }) + .collect(); + Ok(Self { columns, arrow_schema, coalesce_rows }) + } + + pub(crate) unsafe fn append_slice(&mut self, slices: &[SliceRef]) -> Result<(), anyhow::Error> { + if slices.len() != self.columns.len() { + return Err(anyhow::anyhow!( + "slice count {} != column count {}", + slices.len(), + self.columns.len() + )); + } + for (state, slice) in self.columns.iter_mut().zip(slices.iter()) { + unsafe { append_to_state(state, slice) }?; + } + Ok(()) + } + + pub(crate) fn write_and_reset( + &mut self, + writer_ref: &IcebergDataFileWriter, + pool: &crate::writer::GlobalWorkerPool, + ) -> Result<(), anyhow::Error> { + let mut arrays: Vec = Vec::with_capacity(self.columns.len()); + for (i, state) in self.columns.iter_mut().enumerate() { + let field = self.arrow_schema.field(i); + arrays.push(finalize_and_reset(state, field, self.coalesce_rows)?); + } + let batch = arrow_array::RecordBatch::try_new(self.arrow_schema.clone(), arrays) + .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))?; + submit_batch(writer_ref, pool, batch) + } +} + +// --------------------------------------------------------------------------- +// Append logic + +unsafe fn append_to_state( + state: &mut ColumnBuilderState, + slice: &SliceRef, +) -> Result<(), anyhow::Error> { + let len = slice.len; + + // ---- null bits (lazy) ------------------------------------------------- + // Skip entirely for all-valid slices when no nulls have been seen yet. + // On the first null slice, backfill all prior rows as valid, then copy bits. + // For all-valid slices after nulls have been seen, extend the bitmap with 1s. + if state.is_nullable { + if !slice.validity_ptr.is_null() { + let out_start = state.rows; + let needed = (out_start + len + 7) / 8; + if !state.has_nulls { + // First null slice: backfill all prior rows as valid. + state.null_bits.resize(needed, 0u8); + set_bits_range(&mut state.null_bits, 0, out_start); + state.has_nulls = true; + } else { + if state.null_bits.len() < needed { + state.null_bits.resize(needed, 0u8); + } + } + for i in 0..len { + let b = unsafe { (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 }; + let pos = out_start + i; + state.null_bits[pos / 8] |= b << (pos % 8); + } + } else if state.has_nulls { + // All-valid slice but nulls seen earlier — extend bitmap with 1s. + let out_start = state.rows; + let needed = (out_start + len + 7) / 8; + if state.null_bits.len() < needed { + state.null_bits.resize(needed, 0u8); + } + set_bits_range(&mut state.null_bits, out_start, out_start + len); + } + // else: all-valid slice, no nulls yet — nothing to do. + } + + // ---- values ----------------------------------------------------------- + match &mut state.values { + ColumnValues::Numeric(buf) => { + append_numeric(buf, slice, state.column_type, len)?; + } + ColumnValues::Bool(v) => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len) }; + if slice.sel_ptr.is_null() { + v.extend_from_slice(src); + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + v.push(src[(idx - 1) as usize]); + } + } + } + ColumnValues::Str { bytes, offsets } => { + if slice.lengths_ptr.is_null() { + return Err(anyhow::anyhow!("String column: lengths_ptr is null")); + } + let ptrs = unsafe { + std::slice::from_raw_parts(slice.data_ptr as *const *const u8, len) + }; + let lens = unsafe { std::slice::from_raw_parts(slice.lengths_ptr, len) }; + let out_start = state.rows; + for i in 0..len { + let is_null = state.is_nullable + && state.has_nulls + && { + let pos = out_start + i; + (state.null_bits[pos / 8] >> (pos % 8)) & 1 == 0 + }; + if !is_null && !ptrs[i].is_null() { + bytes.extend_from_slice(unsafe { + std::slice::from_raw_parts(ptrs[i], lens[i] as usize) + }); + } + offsets.push(bytes.len() as i32); + } + } + } + + state.rows += len; + Ok(()) +} + +/// Append numeric slice data directly into a `MutableBuffer`. +/// Identity (sequential) slices use a bulk byte copy; scattered slices loop element-wise. +unsafe fn append_numeric( + buf: &mut MutableBuffer, + slice: &SliceRef, + column_type: i32, + len: usize, +) -> Result<(), anyhow::Error> { + match column_type { + COLUMN_TYPE_INT32 | COLUMN_TYPE_DATE => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i32, len) }; + if slice.sel_ptr.is_null() { + buf.extend_from_slice(as_bytes(src)); + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); + } + } + } + COLUMN_TYPE_INT64 | COLUMN_TYPE_TIMESTAMP | COLUMN_TYPE_TIMESTAMPTZ => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; + if slice.sel_ptr.is_null() { + buf.extend_from_slice(as_bytes(src)); + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); + } + } + } + COLUMN_TYPE_FLOAT32 => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const f32, len) }; + if slice.sel_ptr.is_null() { + buf.extend_from_slice(as_bytes(src)); + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); + } + } + } + COLUMN_TYPE_FLOAT64 => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const f64, len) }; + if slice.sel_ptr.is_null() { + buf.extend_from_slice(as_bytes(src)); + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); + } + } + } + COLUMN_TYPE_DECIMAL_INT32 => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i32, len) }; + if slice.sel_ptr.is_null() { + for &x in src { + buf.extend_from_slice(&(x as i128).to_ne_bytes()); + } + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + buf.extend_from_slice(&(src[(idx - 1) as usize] as i128).to_ne_bytes()); + } + } + } + COLUMN_TYPE_DECIMAL_INT64 => { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; + if slice.sel_ptr.is_null() { + for &x in src { + buf.extend_from_slice(&(x as i128).to_ne_bytes()); + } + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + buf.extend_from_slice(&(src[(idx - 1) as usize] as i128).to_ne_bytes()); + } + } + } + COLUMN_TYPE_DECIMAL_INT128 | COLUMN_TYPE_UUID => { + // 16-byte elements + let src = + unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len * 16) }; + if slice.sel_ptr.is_null() { + buf.extend_from_slice(src); + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + let off = (idx - 1) as usize * 16; + buf.extend_from_slice(&src[off..off + 16]); + } + } + } + COLUMN_TYPE_JULIA_DATE => { + // Source: i64[] of Julia days (since 0001-01-01). Destination: i32 Date32 (since Unix epoch). + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; + if slice.sel_ptr.is_null() { + for &v in src { + buf.extend_from_slice(&((v - JULIA_DATE_OFFSET) as i32).to_ne_bytes()); + } + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + let v = src[(idx - 1) as usize]; + buf.extend_from_slice(&((v - JULIA_DATE_OFFSET) as i32).to_ne_bytes()); + } + } + } + COLUMN_TYPE_JULIA_TIMESTAMP | COLUMN_TYPE_JULIA_TIMESTAMPTZ => { + // Source: i64[] of Julia ms (since 0001-01-01). Destination: i64 μs since Unix epoch. + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; + if slice.sel_ptr.is_null() { + for &v in src { + buf.extend_from_slice(&((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000).to_ne_bytes()); + } + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + let v = src[(idx - 1) as usize]; + buf.extend_from_slice(&((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000).to_ne_bytes()); + } + } + } + COLUMN_TYPE_JULIA_TIMESTAMP_NS | COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => { + // Source: i64[] of Julia ms (since 0001-01-01). Destination: i64 ns since Unix epoch. + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; + if slice.sel_ptr.is_null() { + for &v in src { + buf.extend_from_slice( + &((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000_000).to_ne_bytes(), + ); + } + } else { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for &idx in sel { + let v = src[(idx - 1) as usize]; + buf.extend_from_slice( + &((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000_000).to_ne_bytes(), + ); + } + } + } + _ => { + return Err(anyhow::anyhow!("unsupported column type {}", column_type)); + } + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// Finalize + +/// Build an Arrow array from `state`'s accumulated data and reset all buffers +/// in-place for the next coalesce window. +fn finalize_and_reset( + state: &mut ColumnBuilderState, + schema_field: &arrow_schema::Field, + coalesce_rows: usize, +) -> Result { + let rows = state.rows; + state.rows = 0; + state.has_nulls = false; + + // Build NullBuffer from lazily-accumulated bits (None if no nulls seen). + // Preserve the Vec's capacity for the next window to avoid repeated allocation. + let null_buf: Option = if state.is_nullable && !state.null_bits.is_empty() { + let cap = state.null_bits.capacity(); + let bits = std::mem::replace(&mut state.null_bits, Vec::with_capacity(cap)); + Some(NullBuffer::new(BooleanBuffer::new( + Buffer::from_vec(bits), + 0, + rows, + ))) + } else { + state.null_bits.clear(); + None + }; + + let array: ArrayRef = match &mut state.values { + ColumnValues::Numeric(buf) => { + // Swap in a fresh pre-allocated buffer; take the old one as Arrow Buffer. + let old = std::mem::replace( + buf, + MutableBuffer::with_capacity(coalesce_rows * state.bytes_per_row), + ); + let arrow_buf: Buffer = old.into(); + build_numeric_array(state.column_type, arrow_buf, rows, null_buf, schema_field)? + } + ColumnValues::Bool(v) => { + let cap = v.capacity(); + let taken = std::mem::replace(v, Vec::with_capacity(cap)); + let mut bits = vec![0u8; (rows + 7) / 8]; + for (i, &b) in taken.iter().enumerate().take(rows) { + if b != 0 { + bits[i / 8] |= 1u8 << (i % 8); + } + } + Arc::new(BooleanArray::new( + BooleanBuffer::new(Buffer::from_vec(bits), 0, rows), + null_buf, + )) + } + ColumnValues::Str { bytes, offsets } => { + let taken_bytes = std::mem::replace(bytes, Vec::with_capacity(bytes.capacity())); + let taken_offsets = std::mem::replace(offsets, { + let mut v = Vec::with_capacity(coalesce_rows + 1); + v.push(0i32); + v + }); + let arr = unsafe { + StringArray::new_unchecked( + OffsetBuffer::new(ScalarBuffer::from(taken_offsets)), + Buffer::from_vec(taken_bytes), + null_buf, + ) + }; + Arc::new(arr) + } + }; + + Ok(array) +} + +fn build_numeric_array( + column_type: i32, + buf: Buffer, + rows: usize, + null_buf: Option, + schema_field: &arrow_schema::Field, +) -> Result { + Ok(match column_type { + COLUMN_TYPE_INT32 => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), + COLUMN_TYPE_DATE => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), + COLUMN_TYPE_INT64 => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), + COLUMN_TYPE_TIMESTAMP => Arc::new( + PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + ), + ), + COLUMN_TYPE_TIMESTAMPTZ => Arc::new( + PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + ) + .with_timezone("UTC"), + ), + COLUMN_TYPE_FLOAT32 => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), + COLUMN_TYPE_FLOAT64 => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), + COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_DECIMAL_INT64 | COLUMN_TYPE_DECIMAL_INT128 => { + let (precision, scale) = match schema_field.data_type() { + arrow_schema::DataType::Decimal128(p, s) => (*p, *s), + dt => { + return Err(anyhow::anyhow!( + "Expected Decimal128 for field {}, got {:?}", + schema_field.name(), + dt + )) + } + }; + Arc::new( + PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf) + .with_precision_and_scale(precision, scale) + .map_err(|e| anyhow::anyhow!("Decimal precision/scale: {}", e))?, + ) + } + COLUMN_TYPE_UUID => { + // UUID: FixedSizeBinary(16) + Arc::new( + FixedSizeBinaryArray::try_new(16, buf, null_buf) + .map_err(|e| anyhow::anyhow!("UUID FixedSizeBinary: {}", e))?, + ) + } + COLUMN_TYPE_JULIA_DATE => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), + COLUMN_TYPE_JULIA_TIMESTAMP => Arc::new( + PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf), + ), + COLUMN_TYPE_JULIA_TIMESTAMPTZ => Arc::new( + PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf) + .with_timezone("UTC"), + ), + COLUMN_TYPE_JULIA_TIMESTAMP_NS => Arc::new( + PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf), + ), + COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => Arc::new( + PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf) + .with_timezone("UTC"), + ), + ct => return Err(anyhow::anyhow!("unsupported column type {} in finalize", ct)), + }) +} + +// --------------------------------------------------------------------------- +// Helpers + +/// Set bits [start, end) to 1 in `bits` (Arrow bit layout: bit i at byte i/8, shift i%8). +fn set_bits_range(bits: &mut [u8], start: usize, end: usize) { + if start >= end { + return; + } + let (fb, lb) = (start / 8, (end - 1) / 8); + let (fi, li) = (start % 8, (end - 1) % 8); + if fb == lb { + // All bits in the same byte: set bits [fi, li]. + bits[fb] |= ((1u16 << (li + 1)) - 1) as u8 & (0xFF_u8 << fi); + } else { + bits[fb] |= 0xFF_u8 << fi; // partial first byte: bits [fi, 7] + bits[(fb + 1)..lb].fill(0xFF); // full middle bytes + bits[lb] |= ((1u16 << (li + 1)) - 1) as u8; // partial last byte: bits [0, li] + } +} + +/// Reinterpret a typed slice as bytes. +#[inline(always)] +unsafe fn as_bytes(s: &[T]) -> &[u8] { + unsafe { + std::slice::from_raw_parts(s.as_ptr() as *const u8, s.len() * std::mem::size_of::()) + } +} + +// --------------------------------------------------------------------------- +// FFI entry points + +#[no_mangle] +pub extern "C" fn iceberg_batch_builder_new( + writer: *mut IcebergDataFileWriter, + col_types: *const i32, + num_columns: usize, +) -> *mut ColumnBatchBuilder { + if writer.is_null() || col_types.is_null() || num_columns == 0 { + return std::ptr::null_mut(); + } + let writer_ref = unsafe { &*writer }; + let col_types_slice = unsafe { std::slice::from_raw_parts(col_types, num_columns) }; + match ColumnBatchBuilder::new( + writer_ref.arrow_schema.clone(), + col_types_slice, + DEFAULT_COALESCE_ROWS, + ) { + Ok(b) => Box::into_raw(Box::new(b)), + Err(_) => std::ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn iceberg_batch_builder_append_slice( + builder: *mut ColumnBatchBuilder, + slices: *const SliceRef, + num_columns: usize, +) -> i32 { + if builder.is_null() || slices.is_null() || num_columns == 0 { + return -1; + } + let builder_ref = unsafe { &mut *builder }; + let slices_slice = unsafe { std::slice::from_raw_parts(slices, num_columns) }; + match unsafe { builder_ref.append_slice(slices_slice) } { + Ok(()) => 0, + Err(_) => -1, + } +} + +#[no_mangle] +pub extern "C" fn iceberg_batch_builder_write( + writer: *mut IcebergDataFileWriter, + builder: *mut ColumnBatchBuilder, +) -> i32 { + if writer.is_null() || builder.is_null() { + return -1; + } + let writer_ref = unsafe { &*writer }; + let builder_ref = unsafe { &mut *builder }; + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg] encode pool not initialized"); + return -1; + } + }; + match builder_ref.write_and_reset(writer_ref, pool) { + Ok(()) => 0, + Err(e) => { + crate::writer::store_writer_error_pub(writer_ref, e); + -1 + } + } +} + +#[no_mangle] +pub extern "C" fn iceberg_batch_builder_free(builder: *mut ColumnBatchBuilder) { + if !builder.is_null() { + unsafe { drop(Box::from_raw(builder)) } + } +} diff --git a/iceberg_rust_ffi/src/lib.rs b/iceberg_rust_ffi/src/lib.rs index 25cd7bd..45e1cfa 100644 --- a/iceberg_rust_ffi/src/lib.rs +++ b/iceberg_rust_ffi/src/lib.rs @@ -41,6 +41,9 @@ mod writer; // Column-based writer module (zero-copy from Julia) mod writer_columns; +// Incremental batch builder: per-slice copy into owned buffers, finalize to RecordBatch +mod batch_builder; + // Profiling stats for the file-parallel pipeline mod pipeline_stats; diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index c6d7795..9d0cb14 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -6,8 +6,7 @@ /// a given writer at a time, and the FIFO global queue ensures batches are submitted /// in order. use std::any::Any; -use std::cell::RefCell; -use std::ffi::{c_char, c_void, CString}; +use std::ffi::{c_char, c_void}; use std::io::Cursor; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -92,13 +91,12 @@ impl ParquetWriterPropertiesFFI { } } +use crate::batch_builder::ColumnBatchBuilder; use crate::response::IcebergBoxedResponse; use crate::table::IcebergTable; use crate::transaction::IcebergDataFiles; use crate::util::parse_c_string; -use crate::writer_columns::{ - build_arrow_array_gathered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef, -}; +use crate::writer_columns::{ColumnDescriptor, SliceRef}; use object_store_ffi::{ export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, }; @@ -108,7 +106,7 @@ type ConcreteDataFileWriter = DataFileWriter; /// Encode task submitted to the global worker pool. -struct EncodeTask { +pub(crate) struct EncodeTask { batch: RecordBatch, state: Arc, } @@ -136,37 +134,11 @@ unsafe impl Send for WriterState {} unsafe impl Sync for WriterState {} /// Global pool of N=available_parallelism encode worker threads shared across all writers. -struct GlobalWorkerPool { - task_tx: tokio::sync::mpsc::Sender, -} - -static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); - -// Thread-local storage for the most recent synchronous gather error. -// Set by iceberg_writer_write_gathered_columns on failure; consumed by iceberg_take_gather_error. -thread_local! { - static LAST_GATHER_ERROR: RefCell> = const { RefCell::new(None) }; +pub(crate) struct GlobalWorkerPool { + pub(crate) task_tx: tokio::sync::mpsc::Sender, } -fn store_gather_error(e: &anyhow::Error) { - let msg = format!("{:#}", e); - LAST_GATHER_ERROR.with(|cell| { - *cell.borrow_mut() = CString::new(msg).ok(); - }); -} - -/// Returns a heap-allocated C string with the most recent gather error on this thread, -/// or NULL if none. Must be called on the same thread as the failed write call, immediately -/// after it returns. The caller must free the returned string with `iceberg_destroy_cstring`. -#[no_mangle] -pub extern "C" fn iceberg_take_gather_error() -> *mut c_char { - LAST_GATHER_ERROR.with(|cell| { - cell.borrow_mut() - .take() - .map(|s| s.into_raw()) - .unwrap_or(std::ptr::null_mut()) - }) -} +pub(crate) static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); /// Formats a Rust panic payload into an anyhow error, preserving the message where possible. fn format_panic_error(panic: Box) -> anyhow::Error { @@ -312,32 +284,15 @@ fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { } } -/// Build a `RecordBatch` from a slice of `GatheredColumnDescriptor`s. -/// -/// # Safety -/// All pointers inside each `GatheredColumnDescriptor` must be valid for the duration of -/// this call (callers hold `GC.@preserve` or equivalent). -unsafe fn build_record_batch( - arrow_schema: ArrowSchemaRef, - col_descs: I, -) -> Result -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - let iter = col_descs.into_iter(); - let mut arrays = Vec::with_capacity(iter.len()); - for (i, desc) in iter.enumerate() { - arrays.push(unsafe { build_arrow_array_gathered(&desc, arrow_schema.field(i))? }); - } - RecordBatch::try_new(arrow_schema, arrays) - .map_err(|_| anyhow::anyhow!("failed to construct RecordBatch")) +/// Store an error in the writer state (public for batch_builder module). +pub(crate) fn store_writer_error_pub(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { + store_writer_error(writer_ref, e); } /// Submit a `RecordBatch` to the global encode pool. /// /// Increments the writer's pending count before sending and rolls it back on channel failure. -fn submit_batch( +pub(crate) fn submit_batch( writer_ref: &IcebergDataFileWriter, pool: &GlobalWorkerPool, batch: RecordBatch, @@ -365,35 +320,10 @@ fn submit_batch( } } -/// Validates column count, builds a `RecordBatch` from pre-built gathered descriptors, -/// and submits it to the encode pool. -unsafe fn write_gathered_inner( - writer_ref: &IcebergDataFileWriter, - pool: &GlobalWorkerPool, - arrow_schema: ArrowSchemaRef, - num_columns: usize, - col_descs: I, -) -> Result<(), anyhow::Error> -where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, -{ - if num_columns != arrow_schema.fields().len() { - return Err(anyhow::anyhow!( - "Column count mismatch: got {} but schema has {}", - num_columns, - arrow_schema.fields().len() - )); - } - let batch = unsafe { build_record_batch(arrow_schema, col_descs) }?; - submit_batch(writer_ref, pool, batch) -} - -/// Validates column count, builds a `RecordBatch` from flat `ColumnDescriptor`s (each -/// treated as a single sequential slice), and submits it to the encode pool. -/// -/// Each `SliceRef` is constructed on the stack and used within the same loop iteration, -/// so no heap allocation is needed for the descriptor conversion. +/// Validates column count, converts each `ColumnDescriptor` into a single-slice `SliceRef`, +/// routes through `ColumnBatchBuilder`, and submits the resulting `RecordBatch` to the +/// encode pool. Using the builder here keeps all type-conversion and null-bit logic in one +/// place (`batch_builder.rs`) instead of duplicating it. unsafe fn write_columns_inner( writer_ref: &IcebergDataFileWriter, pool: &GlobalWorkerPool, @@ -407,73 +337,21 @@ unsafe fn write_columns_inner( arrow_schema.fields().len() )); } - let mut arrays = Vec::with_capacity(col_descs.len()); - for (i, d) in col_descs.iter().enumerate() { - // SliceRef lives on the stack for exactly this iteration; the raw pointer - // is consumed by build_arrow_array_gathered before the next iteration begins. - let slice = SliceRef { + let num_rows = col_descs.iter().map(|d| d.num_rows).max().unwrap_or(0); + let col_types: Vec = col_descs.iter().map(|d| d.column_type).collect(); + let mut builder = ColumnBatchBuilder::new(arrow_schema.clone(), &col_types, num_rows.max(1))?; + let slices: Vec = col_descs + .iter() + .map(|d| SliceRef { data_ptr: d.data_ptr, lengths_ptr: d.lengths_ptr, validity_ptr: d.validity_ptr, sel_ptr: std::ptr::null(), len: d.num_rows, - }; - let desc = GatheredColumnDescriptor { - slices: &slice as *const SliceRef, - num_slices: 1, - total_rows: d.num_rows, - column_type: d.column_type, - is_nullable: d.is_nullable, - }; - arrays.push(unsafe { build_arrow_array_gathered(&desc, arrow_schema.field(i))? }); - } - let batch = RecordBatch::try_new(arrow_schema, arrays) - .map_err(|_| anyhow::anyhow!("failed to construct RecordBatch"))?; - submit_batch(writer_ref, pool, batch) -} - -/// Gather column data from Julia memory into Arrow arrays in the calling thread, then -/// submit the RecordBatch to the global encode pool asynchronously. -/// -/// Julia keeps source arrays alive via `GC.@preserve` for the duration of this call. -/// After this function returns, all Julia pointers have been consumed and Julia may safely -/// release the source data. Encode is still asynchronous in the global pool; call -/// `iceberg_writer_close` to wait for all pending encodes. -/// -/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). -#[no_mangle] -pub extern "C" fn iceberg_writer_write_gathered_columns( - writer: *mut IcebergDataFileWriter, - columns: *const GatheredColumnDescriptor, - num_columns: usize, -) -> i32 { - if writer.is_null() || columns.is_null() || num_columns == 0 { - return -1; - } - let writer_ref = unsafe { &*writer }; - let pool = match GLOBAL_ENCODE_POOL.get() { - Some(p) => p, - None => { - eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first"); - return -1; - } - }; - let arrow_schema = writer_ref.arrow_schema.clone(); - let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - if let Err(e) = unsafe { - write_gathered_inner( - writer_ref, - pool, - arrow_schema, - num_columns, - col_descs.iter().copied(), - ) - } { - store_gather_error(&e); - store_writer_error(writer_ref, e); - return -1; - } - 0 + }) + .collect(); + unsafe { builder.append_slice(&slices) }?; + builder.write_and_reset(writer_ref, pool) } /// Synchronous write of flat column data: copies each column from Julia memory into diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index 7e1665a..df48a5b 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -1,19 +1,10 @@ /// Column-based writer support for iceberg_rust_ffi /// -/// This module provides FFI bindings for writing raw column data directly to Parquet, -/// avoiding the overhead of Arrow IPC serialization. Julia passes raw column pointers -/// and metadata, and Rust builds Arrow arrays directly from them. +/// This module provides the FFI structs and column type constants shared between the +/// flat-column write path (`iceberg_writer_write_columns`) and the incremental batch +/// builder (`batch_builder.rs`). All Arrow array construction logic lives in +/// `batch_builder.rs`; this file is intentionally thin. use std::ffi::c_void; -use std::sync::Arc; - -use arrow_array::{ - types::{ - Date32Type, Decimal128Type, Float32Type, Float64Type, Int32Type, Int64Type, - TimestampMicrosecondType, - }, - ArrayRef, BooleanArray, PrimitiveArray, StringArray, -}; -use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; /// Column type codes (must match Julia's ColumnType enum) pub const COLUMN_TYPE_INT32: i32 = 0; @@ -32,6 +23,16 @@ pub const COLUMN_TYPE_DECIMAL_INT32: i32 = 10; pub const COLUMN_TYPE_DECIMAL_INT64: i32 = 11; /// Decimal backed by Int128 (precision > 18): data is i128[] scaled integers pub const COLUMN_TYPE_DECIMAL_INT128: i32 = 12; +/// Julia-epoch date: source data is i64[] of days since 0001-01-01; Rust subtracts 719163 and writes i32 Date32. +pub const COLUMN_TYPE_JULIA_DATE: i32 = 13; +/// Julia-epoch timestamp: source data is i64[] of ms since 0001-01-01; Rust converts to μs since Unix epoch. +pub const COLUMN_TYPE_JULIA_TIMESTAMP: i32 = 14; +/// Julia-epoch timestamp with UTC timezone: same conversion as JULIA_TIMESTAMP, UTC-tagged. +pub const COLUMN_TYPE_JULIA_TIMESTAMPTZ: i32 = 15; +/// Julia-epoch nanosecond timestamp: source data is i64[] of ms since 0001-01-01; Rust converts to ns since Unix epoch. +pub const COLUMN_TYPE_JULIA_TIMESTAMP_NS: i32 = 16; +/// Julia-epoch nanosecond timestamp with UTC timezone. +pub const COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS: i32 = 17; /// Descriptor for a single column passed from Julia #[repr(C)] @@ -58,12 +59,6 @@ pub struct ColumnDescriptor { unsafe impl Send for ColumnDescriptor {} unsafe impl Sync for ColumnDescriptor {} -// ============================================================================= -// Scattered-gather writer: pass raw source pointers + selection indices to Rust, -// which gathers the data directly into Arrow arrays — eliminating the Julia-side -// staging copy for non-converting numeric column types. -// ============================================================================= - /// A reference to one slice of source column data. /// `sel_ptr = null` → sequential (identity) access: read data[0..len]. /// `sel_ptr != null` → scattered access: read data[sel[i]-1] for i in 0..len (1-based Julia indices). @@ -82,294 +77,3 @@ pub struct SliceRef { unsafe impl Send for SliceRef {} unsafe impl Sync for SliceRef {} - -/// Gathered column descriptor: gather `num_slices` SliceRefs into one Arrow column. -/// `total_rows` must equal the sum of all `slice.len` values. -/// Fields ordered largest-to-smallest; 3 bytes trailing padding → 32 bytes total. -#[repr(C)] -#[derive(Clone, Copy)] -pub struct GatheredColumnDescriptor { - pub slices: *const SliceRef, - pub num_slices: usize, - pub total_rows: usize, - pub column_type: i32, - pub is_nullable: bool, -} - -unsafe impl Send for GatheredColumnDescriptor {} -unsafe impl Sync for GatheredColumnDescriptor {} - -/// Merges the per-slice validity bitmaps from all slices into a single output bitmap. -/// -/// Each slice contributes `slice.len` output rows. Slices with a null `validity_ptr` are -/// all-valid. Slices with a bitmap may be misaligned relative to the output (each slice -/// starts at a different `out` offset), so bits are copied one at a time with a shift. -/// The selection vector (`sel_ptr`) governs which *source data* elements to read; the -/// validity bitmap is always indexed by output row position, so sequential and scattered -/// slices are treated identically here. -/// -/// Returns `None` if every slice is all-valid (no null buffer needed). -unsafe fn build_null_buffer_gathered(slices: &[SliceRef], total_rows: usize) -> Option { - if !slices.iter().any(|s| !s.validity_ptr.is_null()) { - return None; - } - let mut bits = vec![0u8; (total_rows + 7) / 8]; - let mut out = 0usize; - for slice in slices { - if slice.validity_ptr.is_null() { - // All rows in this slice are valid — set one bit per output row. - for i in 0..slice.len { - bits[(out + i) / 8] |= 1u8 << ((out + i) % 8); - } - } else { - // Copy validity bits from the slice's bitmap into the output bitmap, - // re-aligning from source bit position i to output bit position (out + i). - for i in 0..slice.len { - let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1; - bits[(out + i) / 8] |= b << ((out + i) % 8); - } - } - out += slice.len; - } - Some(NullBuffer::new(BooleanBuffer::new( - Buffer::from(bits), - 0, - total_rows, - ))) -} - -/// Gather all slices for a column into an Arrow array. -pub(crate) unsafe fn build_arrow_array_gathered( - desc: &GatheredColumnDescriptor, - schema_field: &arrow_schema::Field, -) -> Result { - let slices = std::slice::from_raw_parts(desc.slices, desc.num_slices); - let total = desc.total_rows; - let null_buf = if desc.is_nullable { - build_null_buffer_gathered(slices, total) - } else { - None - }; - - // Macro gathers a primitive numeric type from all slices. - // sel_ptr=null → sequential copy; sel_ptr!=null → indirect gather (1-based indices). - macro_rules! gather_primitive { - ($T:ty, $ArrowType:ty) => {{ - let mut values = Vec::<$T>::with_capacity(total); - for slice in slices { - let src = slice.data_ptr as *const $T; - if slice.sel_ptr.is_null() { - values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - values.push(*src.add((idx - 1) as usize)); - } - } - } - Arc::new(PrimitiveArray::<$ArrowType>::new( - ScalarBuffer::from(values), - null_buf, - )) as ArrayRef - }}; - } - - let array: ArrayRef = match desc.column_type { - COLUMN_TYPE_INT32 => gather_primitive!(i32, Int32Type), - COLUMN_TYPE_INT64 => gather_primitive!(i64, Int64Type), - COLUMN_TYPE_FLOAT32 => gather_primitive!(f32, Float32Type), - COLUMN_TYPE_FLOAT64 => gather_primitive!(f64, Float64Type), - COLUMN_TYPE_DATE => gather_primitive!(i32, Date32Type), - COLUMN_TYPE_TIMESTAMP => { - let mut values = Vec::::with_capacity(total); - for slice in slices { - let src = slice.data_ptr as *const i64; - if slice.sel_ptr.is_null() { - values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - values.push(*src.add((idx - 1) as usize)); - } - } - } - Arc::new(PrimitiveArray::::new( - ScalarBuffer::from(values), - null_buf, - )) - } - COLUMN_TYPE_TIMESTAMPTZ => { - let mut values = Vec::::with_capacity(total); - for slice in slices { - let src = slice.data_ptr as *const i64; - if slice.sel_ptr.is_null() { - values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - values.push(*src.add((idx - 1) as usize)); - } - } - } - Arc::new( - PrimitiveArray::::new( - ScalarBuffer::from(values), - null_buf, - ) - .with_timezone("UTC"), - ) - } - COLUMN_TYPE_BOOLEAN => { - let mut bits = vec![0u8; (total + 7) / 8]; - let mut out = 0usize; - for slice in slices { - let src = slice.data_ptr as *const u8; - if slice.sel_ptr.is_null() { - let data = std::slice::from_raw_parts(src, slice.len); - for (i, &v) in data.iter().enumerate() { - if v != 0 { - bits[(out + i) / 8] |= 1 << ((out + i) % 8); - } - } - } else { - for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len) - .iter() - .enumerate() - { - if *src.add((idx - 1) as usize) != 0 { - bits[(out + i) / 8] |= 1 << ((out + i) % 8); - } - } - } - out += slice.len; - } - Arc::new(BooleanArray::new( - BooleanBuffer::new(Buffer::from(bits), 0, total), - null_buf, - )) - } - COLUMN_TYPE_STRING => { - // String columns do not support selection vectors. Julia strings are - // heap-allocated with non-contiguous addresses, so the caller must build - // str_ptrs/str_lens arrays up-front — any row selection is already applied - // before add_string_slice! is called. sel_ptr is therefore always null here. - // data_ptr = *const *const u8, lengths_ptr = *const i64. - // - // Build the Arrow StringArray directly: one pass copies string bytes into a - // contiguous values buffer and tracks cumulative offsets. This avoids the - // intermediate Vec> and skips UTF-8 validation — Julia strings - // are guaranteed valid UTF-8. - let null_buf = if desc.is_nullable { - build_null_buffer_gathered(slices, total) - } else { - None - }; - let mut offsets = Vec::::with_capacity(total + 1); - offsets.push(0i32); - let mut values = Vec::::new(); - for slice in slices { - if slice.lengths_ptr.is_null() { - return Err(anyhow::anyhow!("String column requires lengths_ptr")); - } - let ptrs = - std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len); - let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len); - for i in 0..slice.len { - let is_null = !slice.validity_ptr.is_null() - && ((*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1) == 0; - if !is_null { - values.extend_from_slice(std::slice::from_raw_parts( - ptrs[i], - lens[i] as usize, - )); - } - offsets.push(values.len() as i32); - } - } - // SAFETY: offsets are monotonically non-decreasing by construction; values - // bytes come from Julia String objects (valid UTF-8) kept alive in col.preserve. - Arc::new(unsafe { - StringArray::new_unchecked( - OffsetBuffer::new(ScalarBuffer::from(offsets)), - Buffer::from_vec(values), - null_buf, - ) - }) - } - COLUMN_TYPE_UUID => { - let mut data: Vec = Vec::with_capacity(total * 16); - for slice in slices { - let src = slice.data_ptr as *const u8; - if slice.sel_ptr.is_null() { - data.extend_from_slice(std::slice::from_raw_parts(src, slice.len * 16)); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - data.extend_from_slice(std::slice::from_raw_parts( - src.add((idx - 1) as usize * 16), - 16, - )); - } - } - } - let chunks: Vec<&[u8]> = data.chunks(16).collect(); - Arc::new( - arrow_array::FixedSizeBinaryArray::try_from_iter(chunks.into_iter()) - .map_err(|e| anyhow::anyhow!("Failed to build UUID array: {}", e))?, - ) - } - COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_DECIMAL_INT64 | COLUMN_TYPE_DECIMAL_INT128 => { - let (precision, scale) = match schema_field.data_type() { - arrow_schema::DataType::Decimal128(p, s) => (*p, *s), - dt => return Err(anyhow::anyhow!("Expected Decimal128, got {:?}", dt)), - }; - let mut values = Vec::::with_capacity(total); - for slice in slices { - match desc.column_type { - COLUMN_TYPE_DECIMAL_INT32 => { - let src = slice.data_ptr as *const i32; - if slice.sel_ptr.is_null() { - values.extend( - std::slice::from_raw_parts(src, slice.len) - .iter() - .map(|&v| v as i128), - ); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - values.push(*src.add((idx - 1) as usize) as i128); - } - } - } - COLUMN_TYPE_DECIMAL_INT64 => { - let src = slice.data_ptr as *const i64; - if slice.sel_ptr.is_null() { - values.extend( - std::slice::from_raw_parts(src, slice.len) - .iter() - .map(|&v| v as i128), - ); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - values.push(*src.add((idx - 1) as usize) as i128); - } - } - } - _ => { - // DECIMAL_INT128: i128 layout matches Julia Int128 - let src = slice.data_ptr as *const i128; - if slice.sel_ptr.is_null() { - values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); - } else { - for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - values.push(*src.add((idx - 1) as usize)); - } - } - } - } - } - Arc::new( - PrimitiveArray::::new(ScalarBuffer::from(values), null_buf) - .with_precision_and_scale(precision, scale) - .map_err(|e| anyhow::anyhow!("Decimal precision/scale: {}", e))?, - ) - } - _ => return Err(anyhow::anyhow!("Unknown column type: {}", desc.column_type)), - }; - Ok(array) -} diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index 5c93f2f..762d7b3 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -45,7 +45,7 @@ export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FL export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID export COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_DECIMAL_INT128 export julia_type_to_column_type -export GatheredColumn, GatheredBatch, add_slice!, add_string_slice! +export SliceRef, SliceBatch, ColumnBatchBuilder, append_slice!, free_builder! # Always use the JLL library - override via Preferences if needed for local development # To use a local build, set the preference: diff --git a/src/writer.jl b/src/writer.jl index 901144a..6a913df 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -511,26 +511,6 @@ struct SliceRef len::Csize_t end -""" - GatheredColumnDescriptor - -FFI descriptor for a column to be gathered from multiple SliceRefs. -Pass an array of these to `write_columns`. - -- `slices_ptr`: pointer to array of SliceRef structs -- `num_slices`: number of SliceRef entries -- `total_rows`: sum of all slice lengths -- `column_type`: ColumnType enum value -- `is_nullable`: whether the column may contain null values -""" -struct GatheredColumnDescriptor - slices_ptr::Ptr{SliceRef} - num_slices::Csize_t - total_rows::Csize_t - column_type::Int32 - is_nullable::Bool -end - """ ColumnType @@ -550,6 +530,13 @@ Enum for column data types, matching the Rust FFI constants. COLUMN_TYPE_DECIMAL_INT32 = 10 # Decimal backed by Int32 (precision ≤ 9) COLUMN_TYPE_DECIMAL_INT64 = 11 # Decimal backed by Int64 (precision ≤ 18) COLUMN_TYPE_DECIMAL_INT128 = 12 # Decimal backed by Int128 (precision > 18) + # Julia-epoch variants: source data uses Julia's internal epoch (0001-01-01); + # Rust applies the offset to produce Iceberg's Unix-epoch representation. + COLUMN_TYPE_JULIA_DATE = 13 # i64[] days since year 1 → i32 days since 1970-01-01 + COLUMN_TYPE_JULIA_TIMESTAMP = 14 # i64[] ms since year 1 → i64 μs since 1970-01-01 + COLUMN_TYPE_JULIA_TIMESTAMPTZ = 15 # same + UTC timezone + COLUMN_TYPE_JULIA_TIMESTAMP_NS = 16 # i64[] ms since year 1 → i64 ns since 1970-01-01 + COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS = 17 # same + UTC timezone end """ @@ -615,9 +602,11 @@ iceberg_column_type(::IcebergLong) = COLUMN_TYPE_INT64 iceberg_column_type(::IcebergFloat) = COLUMN_TYPE_FLOAT32 iceberg_column_type(::IcebergDouble) = COLUMN_TYPE_FLOAT64 iceberg_column_type(::IcebergString) = COLUMN_TYPE_STRING -iceberg_column_type(::IcebergDate) = COLUMN_TYPE_DATE -iceberg_column_type(::IcebergTimestamp) = COLUMN_TYPE_TIMESTAMP -iceberg_column_type(::IcebergTimestamptz) = COLUMN_TYPE_TIMESTAMPTZ +iceberg_column_type(::IcebergDate) = COLUMN_TYPE_JULIA_DATE +iceberg_column_type(::IcebergTimestamp) = COLUMN_TYPE_JULIA_TIMESTAMP +iceberg_column_type(::IcebergTimestamptz) = COLUMN_TYPE_JULIA_TIMESTAMPTZ +iceberg_column_type(::IcebergTimestampNs) = COLUMN_TYPE_JULIA_TIMESTAMP_NS +iceberg_column_type(::IcebergTimestamptzNs) = COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS iceberg_column_type(::IcebergBoolean) = COLUMN_TYPE_BOOLEAN iceberg_column_type(::IcebergUuid) = COLUMN_TYPE_UUID function iceberg_column_type(d::IcebergDecimal) @@ -884,108 +873,87 @@ function write_columns(writer::DataFileWriter, batch::ColumnBatch) end # ========================================================================================== -# High-level gathered-column API +# Incremental batch builder API # ========================================================================================== """ - GatheredColumn - -Accumulates one or more source slices for a single column. Rust gathers the data -directly from source buffers when the batch is written, avoiding a Julia-side staging -copy for numeric columns. + SliceBatch -Typical usage: +Accumulates one column slice descriptor per column for a single `append_slice!` call. +Handles `SliceRef` construction and GC preservation automatically, so callers work with +plain Julia arrays instead of raw pointers. ```julia -col = GatheredColumn(COLUMN_TYPE_INT64) -add_slice!(col, src_array) # sequential: all rows -add_slice!(col, src_array2; sel=sel_indices) # scattered: rows at sel_indices -add_slice!(col, src_array3; validity=valid_bv) # nullable slice - -str_col = GatheredColumn(COLUMN_TYPE_STRING; nullable=true) -add_string_slice!(str_col, ["a", "", "c"]; validity=BitVector([true, false, true])) +sb = SliceBatch() +push!(sb, ids) # non-nullable numeric, sequential +push!(sb, values; validity=valid_bv) # nullable numeric, sequential +push!(sb, scores; sel=sel_indices) # non-nullable scattered +push!(sb, tags; validity=valid_bv) # nullable strings +append_slice!(builder, sb) ``` -For string columns use `add_string_slice!` instead of `add_slice!`. Selection vectors -are not supported for strings: Julia strings are non-contiguous, so the caller must -build `str_ptrs`/`str_lens` arrays up-front — any row selection is applied on the Julia -side before calling `add_string_slice!`. +A `SliceBatch` is single-use: after `append_slice!` returns (Rust has copied all data), +the source arrays may be released and the batch discarded. """ -mutable struct GatheredColumn +mutable struct SliceBatch slices::Vector{SliceRef} - total_rows::Int - column_type::ColumnType - is_nullable::Bool - preserve::Vector{Any} # source arrays kept alive until write + preserve::Vector{Any} end -GatheredColumn(column_type::ColumnType; nullable::Bool=false) = - GatheredColumn(SliceRef[], 0, column_type, nullable, Any[]) +SliceBatch() = SliceBatch(SliceRef[], Any[]) """ - add_slice!(col::GatheredColumn, data::AbstractVector{T}; - sel=nothing, validity=nothing) + push!(sb::SliceBatch, data::AbstractVector{T}; + validity=nothing, sel=nothing) -Append a slice of `data` to `col`. +Add a non-string column slice to the batch. -- `sel`: optional `Vector{Int64}` of 1-based row indices into `data` to select. +- `validity`: optional `BitVector` where `true` = valid, `false` = null. +- `sel`: optional `Vector{Int64}` of 1-based indices into `data` for scattered access. If omitted, all rows of `data` are used sequentially. -- `validity`: optional `BitVector` (length = number of selected rows, `true` = valid). - Providing this marks the column as nullable. """ -function add_slice!( - col::GatheredColumn, +function Base.push!( + sb::SliceBatch, data::AbstractVector{T}; - sel::Union{Nothing, Vector{Int64}} = nothing, validity::Union{Nothing, BitVector} = nothing, + sel::Union{Nothing, Vector{Int64}} = nothing, ) where T len = sel === nothing ? length(data) : length(sel) sel_ptr = if sel !== nothing - push!(col.preserve, sel) + push!(sb.preserve, sel) pointer(sel) else Ptr{Int64}(C_NULL) end validity_ptr = if validity !== nothing - col.is_nullable = true - push!(col.preserve, validity) + push!(sb.preserve, validity) Ptr{UInt8}(pointer(validity.chunks)) else Ptr{UInt8}(C_NULL) end - push!(col.preserve, data) - push!(col.slices, SliceRef( + push!(sb.preserve, data) + push!(sb.slices, SliceRef( Ptr{Cvoid}(pointer(data)), - Ptr{Int64}(C_NULL), # lengths_ptr unused for non-string types + Ptr{Int64}(C_NULL), validity_ptr, sel_ptr, Csize_t(len), )) - col.total_rows += len - return col + return sb end """ - add_string_slice!(col::GatheredColumn, strings::Vector{String}; validity=nothing) - -Append a string slice to `col` from a plain `Vector{String}`. + push!(sb::SliceBatch, strings::Vector{String}; validity=nothing) -- `validity`: optional `BitVector` (`true` = valid, `false` = null). Marking a row null - does not require a placeholder in `strings`, but the vector must still be the same length. +Add a string column slice to the batch. -```julia -col = GatheredColumn(COLUMN_TYPE_STRING; nullable=true) -add_string_slice!(col, ["hello", "", "world"]; validity=BitVector([true, false, true])) -``` - -For performance-critical paths where pointer/length arrays are pre-allocated, use the -lower-level `add_string_slice!(col, str_ptrs, str_lens; validity)` overload directly. +- `validity`: optional `BitVector` where `true` = valid, `false` = null. """ -function add_string_slice!( - col::GatheredColumn, +function Base.push!( + sb::SliceBatch, strings::Vector{String}; validity::Union{Nothing, BitVector} = nothing, ) @@ -1002,160 +970,136 @@ function add_string_slice!( str_lens[i] = ncodeunits(strings[i]) end end - push!(col.preserve, strings) # keep String objects alive so pointers remain valid - return add_string_slice!(col, str_ptrs, str_lens; validity) -end - -""" - add_string_slice!(col::GatheredColumn, str_ptrs, str_lens; validity=nothing) - -Low-level overload: append a string slice from pre-built pointer/length arrays. -`str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to UTF-8 string data and `str_lens` -is a `Vector{Int64}` of corresponding byte lengths. The caller is responsible for keeping -the pointed-to string bytes alive until `write_columns` returns. -""" -function add_string_slice!( - col::GatheredColumn, - str_ptrs::Vector{Ptr{UInt8}}, - str_lens::Vector{Int64}; - validity::Union{Nothing, BitVector} = nothing, -) - len = length(str_ptrs) + push!(sb.preserve, strings, str_ptrs, str_lens) validity_ptr = if validity !== nothing - col.is_nullable = true - push!(col.preserve, validity) + push!(sb.preserve, validity) Ptr{UInt8}(pointer(validity.chunks)) else Ptr{UInt8}(C_NULL) end - push!(col.preserve, str_ptrs, str_lens) - push!(col.slices, SliceRef( + push!(sb.slices, SliceRef( Ptr{Cvoid}(pointer(str_ptrs)), pointer(str_lens), validity_ptr, Ptr{Int64}(C_NULL), - Csize_t(len), + Csize_t(n), )) - col.total_rows += len - return col + return sb end """ - GatheredBatch + ColumnBatchBuilder -Collects a `GatheredColumn` per output column, then writes all of them in one call. +Opaque handle to a Rust-side incremental batch builder. Julia appends one slice per +column per operator slice via `append_slice!`; Rust copies each slice's data immediately +into owned typed buffers. When a coalesce window is full, `write_columns` finalises all +columns into Arrow arrays, submits the `RecordBatch` to the async encode pool, and resets +the builder in-place for the next window. -```julia -batch = GatheredBatch() -push!(batch, col_int64) -push!(batch, col_float64) -write_columns(writer, batch) -``` +Create with `ColumnBatchBuilder(writer, col_types)`. The builder is freed automatically +by its finalizer, or explicitly with `free_builder!`. +""" +mutable struct ColumnBatchBuilder + ptr::Ptr{Cvoid} -You can also push a single-slice column inline without building a `GatheredColumn` -explicitly: + function ColumnBatchBuilder( + writer::DataFileWriter, + col_types::Vector{ColumnType}, + ) + writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) + isempty(col_types) && throw(ArgumentError("col_types must not be empty")) + + col_type_codes = Int32[Int32(ct) for ct in col_types] + ptr = GC.@preserve col_type_codes begin + @ccall rust_lib.iceberg_batch_builder_new( + writer.ptr::Ptr{Cvoid}, + pointer(col_type_codes)::Ptr{Int32}, + length(col_type_codes)::Csize_t, + )::Ptr{Cvoid} + end + ptr == C_NULL && throw(IcebergException("iceberg_batch_builder_new failed")) -```julia -batch = GatheredBatch() -push!(batch, src_ints, COLUMN_TYPE_INT64) -push!(batch, src_floats, COLUMN_TYPE_FLOAT64; sel=indices, validity=valid_bv) -write_columns(writer, batch) -``` -""" -mutable struct GatheredBatch - columns::Vector{GatheredColumn} + b = new(ptr) + finalizer(free_builder!, b) + return b + end end -GatheredBatch() = GatheredBatch(GatheredColumn[]) - """ - push!(batch::GatheredBatch, col::GatheredColumn) + free_builder!(builder::ColumnBatchBuilder) -Append an already-built `GatheredColumn` to the batch. +Free the builder without writing. Called automatically by the finalizer; also safe to +call explicitly on error paths. """ -Base.push!(batch::GatheredBatch, col::GatheredColumn) = (push!(batch.columns, col); batch) +function free_builder!(builder::ColumnBatchBuilder) + if builder.ptr != C_NULL + @ccall rust_lib.iceberg_batch_builder_free(builder.ptr::Ptr{Cvoid})::Cvoid + builder.ptr = C_NULL + end + return nothing +end """ - push!(batch::GatheredBatch, data::AbstractVector, column_type::ColumnType; - sel=nothing, validity=nothing, nullable=false) + append_slice!(builder::ColumnBatchBuilder, slices::Vector{SliceRef}, arrays_to_preserve) + +Append one slice per column to the builder. `slices[i]` describes column `i`'s data for +this slice. Rust copies all data synchronously — source arrays referenced by the SliceRefs +may be released (or overwritten) as soon as this call returns. -Convenience: create a single-slice `GatheredColumn` from `data` and append it. +`arrays_to_preserve` holds any Julia objects whose memory is pointed to by the SliceRefs +(e.g. NullableVector backing arrays, string ptr/len buffers). They are GC-pinned only for +the duration of this call. """ -function Base.push!( - batch::GatheredBatch, - data::AbstractVector, - column_type::ColumnType; - sel::Union{Nothing, Vector{Int64}} = nothing, - validity::Union{Nothing, BitVector} = nothing, - nullable::Bool = validity !== nothing, +function append_slice!( + builder::ColumnBatchBuilder, + slices::Vector{SliceRef}, + arrays_to_preserve, ) - col = GatheredColumn(column_type; nullable) - add_slice!(col, data; sel, validity) - push!(batch.columns, col) - return batch + builder.ptr == C_NULL && throw(IcebergException("ColumnBatchBuilder has been freed")) + ret = GC.@preserve slices arrays_to_preserve begin + @ccall rust_lib.iceberg_batch_builder_append_slice( + builder.ptr::Ptr{Cvoid}, + pointer(slices)::Ptr{SliceRef}, + length(slices)::Csize_t, + )::Int32 + end + ret == 0 || throw(IcebergException("append_slice! failed")) + return nothing end """ - write_columns(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve]) + append_slice!(builder::ColumnBatchBuilder, sb::SliceBatch) -Gather column data from Julia memory synchronously, then encode asynchronously. +High-level overload: append one slice per column from a `SliceBatch`. Builds the +`SliceRef` array and preserve list from the batch's accumulated column descriptors. -Gathers all column data from Julia memory in the calling thread using a plain blocking -`ccall`. Encode runs asynchronously in the global worker pool. +```julia +sb = SliceBatch() +push!(sb, ids) +push!(sb, values; validity=valid_bv) +append_slice!(builder, sb) +``` +""" +function append_slice!(builder::ColumnBatchBuilder, sb::SliceBatch) + append_slice!(builder, sb.slices, sb.preserve) +end -`extra_preserve` (optional) is an additional collection of objects whose memory must -stay alive during the gather (e.g. source string arrays for zero-copy string columns). +""" + write_columns(writer::DataFileWriter, builder::ColumnBatchBuilder) -The source data pointed to by the `GatheredBatch` slices and `extra_preserve` must be -valid for the duration of this call. After the call returns, all Julia pointers have -been consumed and the source data may be safely released. +Finalise the builder: assemble Arrow arrays from accumulated per-column buffers, build a +`RecordBatch`, submit it to the async encode pool, and reset all column buffers for the +next coalesce window. The builder is NOT freed and may be reused immediately. """ -function write_columns( - writer::DataFileWriter, - batch::GatheredBatch, - extra_preserve = nothing, -) - isempty(batch.columns) && throw(IcebergException("GatheredBatch has no columns")) +function write_columns(writer::DataFileWriter, builder::ColumnBatchBuilder) writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) - - all_slice_arrays = Vector{Vector{SliceRef}}(undef, length(batch.columns)) - descriptors = Vector{GatheredColumnDescriptor}(undef, length(batch.columns)) - preserve = Any[] - - for (i, col) in enumerate(batch.columns) - slices = col.slices - all_slice_arrays[i] = slices - append!(preserve, col.preserve) - push!(preserve, slices) - descriptors[i] = GatheredColumnDescriptor( - pointer(slices), - Csize_t(length(slices)), - Csize_t(col.total_rows), - Int32(col.column_type), - col.is_nullable, - ) - end - extra_preserve !== nothing && append!(preserve, extra_preserve) - - ret = GC.@preserve preserve all_slice_arrays descriptors begin - @ccall rust_lib.iceberg_writer_write_gathered_columns( - writer.ptr::Ptr{Cvoid}, - pointer(descriptors)::Ptr{GatheredColumnDescriptor}, - length(descriptors)::Csize_t, - )::Int32 - end - if ret != 0 - err_ptr = @ccall rust_lib.iceberg_take_gather_error()::Ptr{Cchar} - msg = if err_ptr != C_NULL - s = unsafe_string(err_ptr) - @ccall rust_lib.iceberg_destroy_cstring(err_ptr::Ptr{Cchar})::Cint - s - else - "gather failed (see writer close for details)" - end - throw(IcebergException("write_columns (gathered): $(msg)")) - end + builder.ptr == C_NULL && throw(IcebergException("ColumnBatchBuilder has been freed")) + ret = @ccall rust_lib.iceberg_batch_builder_write( + writer.ptr::Ptr{Cvoid}, + builder.ptr::Ptr{Cvoid}, + )::Int32 + ret == 0 || throw(IcebergException("write_columns (builder) failed")) return nothing end diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 1c15dd7..5dc2266 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1149,8 +1149,8 @@ end println("\n✅ write_columns decimal nullable tests completed!") end -@testset "Writer write_columns (GatheredBatch) API" begin - println("Testing write_columns with GatheredBatch (gathered-column) API...") +@testset "Writer ColumnBatchBuilder — multi-slice coalescing" begin + println("Testing ColumnBatchBuilder with multiple slices per column...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -1165,7 +1165,7 @@ end catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props) @test catalog !== nothing - test_namespace = ["test_gathered_$(round(Int, time() * 1000))"] + test_namespace = ["test_builder_multi_$(round(Int, time() * 1000))"] RustyIceberg.create_namespace(catalog, test_namespace) # Schema: id (non-nullable long), score (nullable double), tag (nullable string) @@ -1175,60 +1175,56 @@ end Field(Int32(3), "tag", IcebergString(); required=false), ]) - table_name = "gathered_test_$(round(Int, time() * 1000))" + table_name = "builder_multi_$(round(Int, time() * 1000))" table = RustyIceberg.create_table(catalog, test_namespace, table_name, schema) @test table != C_NULL println("✅ Table created") - # --- Data layout (4 rows) --- - # id: [1, 2, 3, 4] — non-nullable, single sequential slice - # score: [1.1, null, 3.3, null] — nullable; two sequential slices, each with validity - # tag: ["alpha", null, "gamma", null] — nullable string via add_string_slice! + # Write 4 rows across 3 separate append_slice! calls. + # id: [1, 2, 3, 4] — non-nullable; 3 slices of lengths 1, 2, 1 + # score: [1.1, null, 3.3, null] — nullable; 3 slices with validity + # tag: ["alpha", null, "gamma", null] — nullable string; 3 slices + # + # Slices are deliberately mis-aligned in terms of source array sizes to exercise + # the multi-slice accumulation path. Slice 2 for score uses a scattered sel_ptr. - data_files = RustyIceberg.with_data_file_writer(table) do writer - batch = RustyIceberg.GatheredBatch() - - # id: single sequential slice, no nulls - id_data = Int64[1, 2, 3, 4] - id_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_INT64) - RustyIceberg.add_slice!(id_col, id_data) - push!(batch, id_col) - println("✅ id column built (sequential, non-nullable)") - - # score: two sequential slices, each with validity masks - # Slice 1 — src = [1.1, 9.9], validity = [true, false] → rows 0 (1.1) and 1 (null) - # Slice 2 — src = [3.3, 8.8] via selection [1] + identity [8.8] (just sequential here) - # Use scattered access for slice 2: src=[99.9, 3.3, 88.8], sel=[2] → picks 3.3 - score_src1 = Float64[1.1, 9.9] - score_valid1 = BitVector([true, false]) - - score_src2 = Float64[99.9, 3.3, 88.8] - score_sel2 = Int64[2] # picks index 2 → 3.3 (1-based) - score_valid2 = BitVector([true]) - - score_src3 = Float64[7.7, 8.8] - score_valid3 = BitVector([false]) # null for row 3 - - score_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_FLOAT64; nullable=true) - RustyIceberg.add_slice!(score_col, score_src1; validity=score_valid1) - RustyIceberg.add_slice!(score_col, score_src2; sel=score_sel2, validity=score_valid2) - RustyIceberg.add_slice!(score_col, score_src3; sel=Int64[1], validity=score_valid3) - push!(batch, score_col) - println("✅ score column built (scattered + nullable)") - - # tag: string column via the high-level add_string_slice! overload - # Row 0: "alpha", row 1: null, row 2: "gamma", row 3: null - tag_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_STRING; nullable=true) - RustyIceberg.add_string_slice!( - tag_col, - ["alpha", "", "gamma", ""]; - validity=BitVector([true, false, true, false]) - ) - push!(batch, tag_col) - println("✅ tag column built (string via add_string_slice!)") + col_types = RustyIceberg.ColumnType[ + RustyIceberg.COLUMN_TYPE_INT64, + RustyIceberg.COLUMN_TYPE_FLOAT64, + RustyIceberg.COLUMN_TYPE_STRING, + ] - RustyIceberg.write_columns(writer, batch) - println("✅ Batch written via write_columns (GatheredBatch)") + data_files = RustyIceberg.with_data_file_writer(table) do writer + builder = RustyIceberg.ColumnBatchBuilder(writer, col_types) + + # --- Slice 1: row 0 --- + sb1 = RustyIceberg.SliceBatch() + push!(sb1, Int64[1]) + push!(sb1, Float64[1.1]; validity=BitVector([true])) + push!(sb1, ["alpha"]) + RustyIceberg.append_slice!(builder, sb1) + println("✅ Slice 1 appended") + + # --- Slice 2: rows 1-2 (score uses a scattered selection) --- + # score_src: [99.9, 3.3, 88.8], sel=[2,1] → values [3.3, 99.9], valid=[true,false] + sb2 = RustyIceberg.SliceBatch() + push!(sb2, Int64[2, 3]) + push!(sb2, Float64[99.9, 3.3, 88.8]; + sel=Int64[2, 1], validity=BitVector([true, false])) + push!(sb2, ["", "gamma"]; validity=BitVector([false, true])) + RustyIceberg.append_slice!(builder, sb2) + println("✅ Slice 2 appended (scattered score, nullable strings)") + + # --- Slice 3: row 3 --- + sb3 = RustyIceberg.SliceBatch() + push!(sb3, Int64[4]) + push!(sb3, Float64[0.0]; validity=BitVector([false])) + push!(sb3, [""]; validity=BitVector([false])) + RustyIceberg.append_slice!(builder, sb3) + println("✅ Slice 3 appended") + + RustyIceberg.write_columns(writer, builder) + println("✅ Builder flushed via write_columns") end @test data_files !== nothing && data_files.ptr != C_NULL println("✅ Writer closed") @@ -1247,22 +1243,19 @@ end println("✅ Read $(length(tbl.id)) rows") perm = sortperm(tbl.id) - sorted_ids = tbl.id[perm] + sorted_ids = tbl.id[perm] sorted_scores = tbl.score[perm] - sorted_tags = tbl.tag[perm] + sorted_tags = tbl.tag[perm] - # Verify id column (non-nullable, sequential) @test sorted_ids == Int64[1, 2, 3, 4] println("✅ id values correct") - # Verify score column (nullable, scattered slices) @test !ismissing(sorted_scores[1]) && sorted_scores[1] ≈ 1.1 @test ismissing(sorted_scores[2]) @test !ismissing(sorted_scores[3]) && sorted_scores[3] ≈ 3.3 @test ismissing(sorted_scores[4]) - println("✅ score values correct (including nulls)") + println("✅ score values correct (including nulls and scattered access)") - # Verify tag column (nullable string) @test !ismissing(sorted_tags[1]) && sorted_tags[1] == "alpha" @test ismissing(sorted_tags[2]) @test !ismissing(sorted_tags[3]) && sorted_tags[3] == "gamma" @@ -1289,7 +1282,201 @@ end end end - println("\n✅ write_columns (GatheredBatch) API tests completed!") + println("\n✅ ColumnBatchBuilder multi-slice tests completed!") +end + +@testset "Writer ColumnBatchBuilder — reuse across windows" begin + println("Testing ColumnBatchBuilder reuse: two write_columns calls on one builder...") + + catalog_uri = get_catalog_uri() + props = get_catalog_properties() + + catalog = nothing + table = C_NULL + data_files = nothing + test_namespace = nothing + table_name = nothing + + try + catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props) + @test catalog !== nothing + + test_namespace = ["test_builder_reuse_$(round(Int, time() * 1000))"] + RustyIceberg.create_namespace(catalog, test_namespace) + + schema = Schema([ + Field(Int32(1), "id", IcebergLong(); required=true), + Field(Int32(2), "value", IcebergDouble(); required=false), + ]) + + table_name = "builder_reuse_$(round(Int, time() * 1000))" + table = RustyIceberg.create_table(catalog, test_namespace, table_name, schema) + @test table != C_NULL + println("✅ Table created") + + col_types = RustyIceberg.ColumnType[ + RustyIceberg.COLUMN_TYPE_INT64, + RustyIceberg.COLUMN_TYPE_FLOAT64, + ] + + data_files = RustyIceberg.with_data_file_writer(table) do writer + builder = RustyIceberg.ColumnBatchBuilder(writer, col_types) + + # Window 1: rows [1, 2] + sb1 = RustyIceberg.SliceBatch() + push!(sb1, Int64[1, 2]) + push!(sb1, Float64[10.0, 20.0]) + RustyIceberg.append_slice!(builder, sb1) + RustyIceberg.write_columns(writer, builder) # flushes window 1, resets builder + println("✅ Window 1 written") + + # Window 2: rows [3, 4, 5] — builder reused in-place + sb2 = RustyIceberg.SliceBatch() + push!(sb2, Int64[3, 4, 5]) + push!(sb2, Float64[30.0, 40.0, 50.0]) + RustyIceberg.append_slice!(builder, sb2) + RustyIceberg.write_columns(writer, builder) # flushes window 2 + println("✅ Window 2 written") + end + @test data_files !== nothing && data_files.ptr != C_NULL + println("✅ Writer closed") + + updated_table = RustyIceberg.with_transaction(table, catalog) do tx + RustyIceberg.with_fast_append(tx) do action + RustyIceberg.add_data_files(action, data_files) + end + end + @test updated_table != C_NULL + println("✅ Transaction committed") + + tbl = read_table_data(updated_table) + @test tbl !== nothing + @test length(tbl.id) == 5 + println("✅ Read $(length(tbl.id)) rows") + + perm = sortperm(tbl.id) + @test tbl.id[perm] == Int64[1, 2, 3, 4, 5] + @test tbl.value[perm] == Float64[10.0, 20.0, 30.0, 40.0, 50.0] + println("✅ Data from both windows correct") + + RustyIceberg.free_table(updated_table) + + finally + if data_files !== nothing && data_files.ptr != C_NULL + RustyIceberg.free_data_files!(data_files) + end + if table != C_NULL + RustyIceberg.free_table(table) + end + if table_name !== nothing && test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_table(catalog, test_namespace, table_name) + end + if test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_namespace(catalog, test_namespace) + end + if catalog !== nothing + RustyIceberg.free_catalog!(catalog) + end + end + + println("\n✅ ColumnBatchBuilder reuse tests completed!") +end + +@testset "Writer ColumnBatchBuilder — date and timestamp epoch conversion" begin + println("Testing ColumnBatchBuilder date/timestamp epoch conversion...") + + catalog_uri = get_catalog_uri() + props = get_catalog_properties() + + catalog = nothing + table = C_NULL + data_files = nothing + test_namespace = nothing + table_name = nothing + + try + catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props) + @test catalog !== nothing + + test_namespace = ["test_builder_dates_$(round(Int, time() * 1000))"] + RustyIceberg.create_namespace(catalog, test_namespace) + + schema = Schema([ + Field(Int32(1), "id", IcebergLong(); required=true), + Field(Int32(2), "event_date", IcebergDate(); required=true), + Field(Int32(3), "event_ts", IcebergTimestamp(); required=true), + ]) + + table_name = "builder_dates_$(round(Int, time() * 1000))" + table = RustyIceberg.create_table(catalog, test_namespace, table_name, schema) + @test table != C_NULL + println("✅ Table created") + + # 2024-01-01 in Julia Date internal representation (Rata Die days, 1-based) + # Dates.value(Date(2024,1,1)) = 738886 + # Expected Iceberg Date32 (days since 1970-01-01) = 738886 - 719163 = 19723 + # Expected Iceberg timestamp (μs since 1970-01-01) = 19723 * 86400 * 1_000_000 = 1_704_067_200_000_000 + julia_date_val = Dates.value(Dates.Date(2024, 1, 1)) # 738886 + julia_ts_val = Dates.value(Dates.DateTime(2024, 1, 1, 0, 0, 0)) # ms since year 1 + + col_types = RustyIceberg.ColumnType[ + RustyIceberg.COLUMN_TYPE_INT64, + RustyIceberg.COLUMN_TYPE_JULIA_DATE, + RustyIceberg.COLUMN_TYPE_JULIA_TIMESTAMP, + ] + + data_files = RustyIceberg.with_data_file_writer(table) do writer + builder = RustyIceberg.ColumnBatchBuilder(writer, col_types) + + sb = RustyIceberg.SliceBatch() + push!(sb, Int64[1]) + push!(sb, Int64[julia_date_val]) + push!(sb, Int64[julia_ts_val]) + RustyIceberg.append_slice!(builder, sb) + RustyIceberg.write_columns(writer, builder) + println("✅ Date/timestamp slice written") + end + @test data_files !== nothing && data_files.ptr != C_NULL + + updated_table = RustyIceberg.with_transaction(table, catalog) do tx + RustyIceberg.with_fast_append(tx) do action + RustyIceberg.add_data_files(action, data_files) + end + end + @test updated_table != C_NULL + + tbl = read_table_data(updated_table) + @test tbl !== nothing + @test length(tbl.id) == 1 + + row = tbl[1] + @test Int64(row.event_date.x) == 19723 + println("✅ event_date = $(row.event_date.x) (expected 19723)") + + @test row.event_ts.x == 1_704_067_200_000_000 + println("✅ event_ts = $(row.event_ts.x) (expected 1_704_067_200_000_000 μs)") + + RustyIceberg.free_table(updated_table) + + finally + if data_files !== nothing && data_files.ptr != C_NULL + RustyIceberg.free_data_files!(data_files) + end + if table != C_NULL + RustyIceberg.free_table(table) + end + if table_name !== nothing && test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_table(catalog, test_namespace, table_name) + end + if test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_namespace(catalog, test_namespace) + end + if catalog !== nothing + RustyIceberg.free_catalog!(catalog) + end + end + + println("\n✅ ColumnBatchBuilder date/timestamp tests completed!") end @testset "Writer WriterConfig parquet properties" begin From 83a8eb51db81c4af9e0d5a55815cc69acc8da2be Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 10:47:49 +0200 Subject: [PATCH 02/27] Fix tests, format --- iceberg_rust_ffi/src/batch_builder.rs | 110 +++++++++++++++----------- test/writer_tests.jl | 15 ++-- 2 files changed, 72 insertions(+), 53 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index fcb225f..f46f6ab 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -17,8 +17,7 @@ use std::sync::Arc; use arrow_array::{ - types::*, - ArrayRef, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, StringArray, + types::*, ArrayRef, BooleanArray, FixedSizeBinaryArray, PrimitiveArray, StringArray, }; use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::SchemaRef as ArrowSchemaRef; @@ -26,10 +25,10 @@ use arrow_schema::SchemaRef as ArrowSchemaRef; use crate::writer::{submit_batch, IcebergDataFileWriter, GLOBAL_ENCODE_POOL}; use crate::writer_columns::{ SliceRef, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DATE, COLUMN_TYPE_DECIMAL_INT128, - COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, - COLUMN_TYPE_FLOAT64, COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_JULIA_DATE, - COLUMN_TYPE_JULIA_TIMESTAMP, COLUMN_TYPE_JULIA_TIMESTAMPTZ, COLUMN_TYPE_JULIA_TIMESTAMP_NS, - COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS, COLUMN_TYPE_STRING, COLUMN_TYPE_TIMESTAMP, + COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64, + COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_JULIA_DATE, COLUMN_TYPE_JULIA_TIMESTAMP, + COLUMN_TYPE_JULIA_TIMESTAMPTZ, COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS, + COLUMN_TYPE_JULIA_TIMESTAMP_NS, COLUMN_TYPE_STRING, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_UUID, }; @@ -45,12 +44,20 @@ const DEFAULT_COALESCE_ROWS: usize = 1_048_576; /// Bytes per row for numeric column types (0 for Bool/Str which are not Numeric). fn column_bytes_per_row(column_type: i32) -> usize { match column_type { - COLUMN_TYPE_INT32 | COLUMN_TYPE_DATE | COLUMN_TYPE_FLOAT32 - | COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_JULIA_DATE => 4, - COLUMN_TYPE_INT64 | COLUMN_TYPE_TIMESTAMP | COLUMN_TYPE_TIMESTAMPTZ - | COLUMN_TYPE_FLOAT64 | COLUMN_TYPE_DECIMAL_INT64 - | COLUMN_TYPE_JULIA_TIMESTAMP | COLUMN_TYPE_JULIA_TIMESTAMPTZ - | COLUMN_TYPE_JULIA_TIMESTAMP_NS | COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => 8, + COLUMN_TYPE_INT32 + | COLUMN_TYPE_DATE + | COLUMN_TYPE_FLOAT32 + | COLUMN_TYPE_DECIMAL_INT32 + | COLUMN_TYPE_JULIA_DATE => 4, + COLUMN_TYPE_INT64 + | COLUMN_TYPE_TIMESTAMP + | COLUMN_TYPE_TIMESTAMPTZ + | COLUMN_TYPE_FLOAT64 + | COLUMN_TYPE_DECIMAL_INT64 + | COLUMN_TYPE_JULIA_TIMESTAMP + | COLUMN_TYPE_JULIA_TIMESTAMPTZ + | COLUMN_TYPE_JULIA_TIMESTAMP_NS + | COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => 8, COLUMN_TYPE_DECIMAL_INT128 | COLUMN_TYPE_UUID => 16, _ => 0, } @@ -69,7 +76,7 @@ enum ColumnValues { Bool(Vec), // BOOLEAN — 1 byte per row; bit-packed at finalize Str { bytes: Vec, - offsets: Vec, // Arrow offset buffer; offsets[0] = 0 always + offsets: Vec, // Arrow offset buffer; offsets[0] = 0 always }, } @@ -146,11 +153,13 @@ impl ColumnBatchBuilder { let columns = col_types .iter() .zip(arrow_schema.fields().iter()) - .map(|(&ct, field)| { - ColumnBuilderState::new(ct, field.is_nullable(), coalesce_rows) - }) + .map(|(&ct, field)| ColumnBuilderState::new(ct, field.is_nullable(), coalesce_rows)) .collect(); - Ok(Self { columns, arrow_schema, coalesce_rows }) + Ok(Self { + columns, + arrow_schema, + coalesce_rows, + }) } pub(crate) unsafe fn append_slice(&mut self, slices: &[SliceRef]) -> Result<(), anyhow::Error> { @@ -247,18 +256,15 @@ unsafe fn append_to_state( if slice.lengths_ptr.is_null() { return Err(anyhow::anyhow!("String column: lengths_ptr is null")); } - let ptrs = unsafe { - std::slice::from_raw_parts(slice.data_ptr as *const *const u8, len) - }; + let ptrs = + unsafe { std::slice::from_raw_parts(slice.data_ptr as *const *const u8, len) }; let lens = unsafe { std::slice::from_raw_parts(slice.lengths_ptr, len) }; let out_start = state.rows; for i in 0..len { - let is_null = state.is_nullable - && state.has_nulls - && { - let pos = out_start + i; - (state.null_bits[pos / 8] >> (pos % 8)) & 1 == 0 - }; + let is_null = state.is_nullable && state.has_nulls && { + let pos = out_start + i; + (state.null_bits[pos / 8] >> (pos % 8)) & 1 == 0 + }; if !is_null && !ptrs[i].is_null() { bytes.extend_from_slice(unsafe { std::slice::from_raw_parts(ptrs[i], lens[i] as usize) @@ -354,8 +360,7 @@ unsafe fn append_numeric( } COLUMN_TYPE_DECIMAL_INT128 | COLUMN_TYPE_UUID => { // 16-byte elements - let src = - unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len * 16) }; + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len * 16) }; if slice.sel_ptr.is_null() { buf.extend_from_slice(src); } else { @@ -516,12 +521,10 @@ fn build_numeric_array( ScalarBuffer::new(buf, 0, rows), null_buf, )), - COLUMN_TYPE_TIMESTAMP => Arc::new( - PrimitiveArray::::new( - ScalarBuffer::new(buf, 0, rows), - null_buf, - ), - ), + COLUMN_TYPE_TIMESTAMP => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), COLUMN_TYPE_TIMESTAMPTZ => Arc::new( PrimitiveArray::::new( ScalarBuffer::new(buf, 0, rows), @@ -565,21 +568,34 @@ fn build_numeric_array( ScalarBuffer::new(buf, 0, rows), null_buf, )), - COLUMN_TYPE_JULIA_TIMESTAMP => Arc::new( - PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf), - ), + COLUMN_TYPE_JULIA_TIMESTAMP => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), COLUMN_TYPE_JULIA_TIMESTAMPTZ => Arc::new( - PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf) - .with_timezone("UTC"), - ), - COLUMN_TYPE_JULIA_TIMESTAMP_NS => Arc::new( - PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf), + PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + ) + .with_timezone("UTC"), ), + COLUMN_TYPE_JULIA_TIMESTAMP_NS => Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )), COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => Arc::new( - PrimitiveArray::::new(ScalarBuffer::new(buf, 0, rows), null_buf) - .with_timezone("UTC"), + PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + ) + .with_timezone("UTC"), ), - ct => return Err(anyhow::anyhow!("unsupported column type {} in finalize", ct)), + ct => { + return Err(anyhow::anyhow!( + "unsupported column type {} in finalize", + ct + )) + } }) } @@ -597,8 +613,8 @@ fn set_bits_range(bits: &mut [u8], start: usize, end: usize) { // All bits in the same byte: set bits [fi, li]. bits[fb] |= ((1u16 << (li + 1)) - 1) as u8 & (0xFF_u8 << fi); } else { - bits[fb] |= 0xFF_u8 << fi; // partial first byte: bits [fi, 7] - bits[(fb + 1)..lb].fill(0xFF); // full middle bytes + bits[fb] |= 0xFF_u8 << fi; // partial first byte: bits [fi, 7] + bits[(fb + 1)..lb].fill(0xFF); // full middle bytes bits[lb] |= ((1u16 << (li + 1)) - 1) as u8; // partial last byte: bits [0, li] } } diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 5dc2266..549a5eb 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1250,9 +1250,10 @@ end @test sorted_ids == Int64[1, 2, 3, 4] println("✅ id values correct") + # sel=[2,1] on [99.9, 3.3, 88.8] → row0=src[2]=3.3 (valid), row1=src[1]=99.9 (null) @test !ismissing(sorted_scores[1]) && sorted_scores[1] ≈ 1.1 - @test ismissing(sorted_scores[2]) - @test !ismissing(sorted_scores[3]) && sorted_scores[3] ≈ 3.3 + @test !ismissing(sorted_scores[2]) && sorted_scores[2] ≈ 3.3 + @test ismissing(sorted_scores[3]) @test ismissing(sorted_scores[4]) println("✅ score values correct (including nulls and scattered access)") @@ -1450,11 +1451,13 @@ end @test length(tbl.id) == 1 row = tbl[1] - @test Int64(row.event_date.x) == 19723 - println("✅ event_date = $(row.event_date.x) (expected 19723)") + # Arrow.jl collect() returns Dates.Date / Dates.DateTime for date/timestamp columns. + # If the epoch offset is correct, these should round-trip to the original calendar values. + @test row.event_date == Dates.Date(2024, 1, 1) + println("✅ event_date = $(row.event_date) (expected 2024-01-01)") - @test row.event_ts.x == 1_704_067_200_000_000 - println("✅ event_ts = $(row.event_ts.x) (expected 1_704_067_200_000_000 μs)") + @test row.event_ts == Dates.DateTime(2024, 1, 1, 0, 0, 0) + println("✅ event_ts = $(row.event_ts) (expected 2024-01-01T00:00:00)") RustyIceberg.free_table(updated_table) From 48f7e09964315ac3c65cd7079e5ce0b8c7bcef3e Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 11:24:53 +0200 Subject: [PATCH 03/27] Fix date test: access columns via tbl.col[1] not tbl[1].col Co-Authored-By: Claude Sonnet 4.6 --- test/writer_tests.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 549a5eb..b526768 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1450,14 +1450,13 @@ end @test tbl !== nothing @test length(tbl.id) == 1 - row = tbl[1] # Arrow.jl collect() returns Dates.Date / Dates.DateTime for date/timestamp columns. # If the epoch offset is correct, these should round-trip to the original calendar values. - @test row.event_date == Dates.Date(2024, 1, 1) - println("✅ event_date = $(row.event_date) (expected 2024-01-01)") + @test tbl.event_date[1] == Dates.Date(2024, 1, 1) + println("✅ event_date = $(tbl.event_date[1]) (expected 2024-01-01)") - @test row.event_ts == Dates.DateTime(2024, 1, 1, 0, 0, 0) - println("✅ event_ts = $(row.event_ts) (expected 2024-01-01T00:00:00)") + @test tbl.event_ts[1] == Dates.DateTime(2024, 1, 1, 0, 0, 0) + println("✅ event_ts = $(tbl.event_ts[1]) (expected 2024-01-01T00:00:00)") RustyIceberg.free_table(updated_table) From f554297f5e9aba5b459203f0dd1bd92b6206b290 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 11:36:54 +0200 Subject: [PATCH 04/27] Fix date/timestamp test: compare Arrow wrapper .x values directly Arrow.jl returns Arrow.Date/Arrow.Timestamp wrappers, not Dates.Date/DateTime. Co-Authored-By: Claude Sonnet 4.6 --- test/writer_tests.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/writer_tests.jl b/test/writer_tests.jl index b526768..0ed8432 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1450,13 +1450,13 @@ end @test tbl !== nothing @test length(tbl.id) == 1 - # Arrow.jl collect() returns Dates.Date / Dates.DateTime for date/timestamp columns. - # If the epoch offset is correct, these should round-trip to the original calendar values. - @test tbl.event_date[1] == Dates.Date(2024, 1, 1) - println("✅ event_date = $(tbl.event_date[1]) (expected 2024-01-01)") + # Arrow.jl returns Arrow.Date / Arrow.Timestamp wrappers with a raw integer in .x. + # Check the raw values directly to verify the Julia→Unix epoch offset is correct. + @test tbl.event_date[1].x == Int32(19723) # 2024-01-01 = day 19723 since 1970-01-01 + println("✅ event_date.x = $(tbl.event_date[1].x) (expected 19723)") - @test tbl.event_ts[1] == Dates.DateTime(2024, 1, 1, 0, 0, 0) - println("✅ event_ts = $(tbl.event_ts[1]) (expected 2024-01-01T00:00:00)") + @test tbl.event_ts[1].x == Int64(1_704_067_200_000_000) # 2024-01-01T00:00:00 in μs + println("✅ event_ts.x = $(tbl.event_ts[1].x) (expected 1_704_067_200_000_000)") RustyIceberg.free_table(updated_table) From 6477aa143c1579ba6af1fd8888d6f8474e97101f Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 12:02:57 +0200 Subject: [PATCH 05/27] batch_builder: extract append_primitive!/append_transform! macros, merge Julia finalize arms Removes ~130 lines of duplicated identity/scatter dispatch in append_numeric by introducing two declarative macros. Merges JULIA_DATE/TIMESTAMP/TIMESTAMPTZ finalize arms with their non-Julia counterparts since the Arrow output type is identical. Co-Authored-By: Claude Sonnet 4.6 --- iceberg_rust_ffi/src/batch_builder.rs | 197 +++++++++----------------- 1 file changed, 65 insertions(+), 132 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index f46f6ab..6cdcfdd 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -279,6 +279,40 @@ unsafe fn append_to_state( Ok(()) } +// Bulk-copy or 1-based-scattered-gather a slice of primitive T into buf (no transform). +// Identity selection → single memcpy; scattered selection → element-wise gather. +macro_rules! append_primitive { + ($buf:expr, $slice:expr, $len:expr, $T:ty) => {{ + let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $T, $len) }; + if $slice.sel_ptr.is_null() { + $buf.extend_from_slice(unsafe { as_bytes(src) }); + } else { + let sel = unsafe { std::slice::from_raw_parts($slice.sel_ptr, $len) }; + for &idx in sel { + $buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); + } + } + }}; +} + +// Element-wise transform from source type S with optional 1-based scattered gather. +// `$f` maps S → a value whose `.to_ne_bytes()` is written to buf. +macro_rules! append_transform { + ($buf:expr, $slice:expr, $len:expr, $S:ty, $f:expr) => {{ + let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $S, $len) }; + if $slice.sel_ptr.is_null() { + for &v in src { + $buf.extend_from_slice(&($f)(v).to_ne_bytes()); + } + } else { + let sel = unsafe { std::slice::from_raw_parts($slice.sel_ptr, $len) }; + for &idx in sel { + $buf.extend_from_slice(&($f)(src[(idx - 1) as usize]).to_ne_bytes()); + } + } + }}; +} + /// Append numeric slice data directly into a `MutableBuffer`. /// Identity (sequential) slices use a bulk byte copy; scattered slices loop element-wise. unsafe fn append_numeric( @@ -288,78 +322,20 @@ unsafe fn append_numeric( len: usize, ) -> Result<(), anyhow::Error> { match column_type { - COLUMN_TYPE_INT32 | COLUMN_TYPE_DATE => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i32, len) }; - if slice.sel_ptr.is_null() { - buf.extend_from_slice(as_bytes(src)); - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); - } - } - } + COLUMN_TYPE_INT32 | COLUMN_TYPE_DATE => append_primitive!(buf, slice, len, i32), COLUMN_TYPE_INT64 | COLUMN_TYPE_TIMESTAMP | COLUMN_TYPE_TIMESTAMPTZ => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; - if slice.sel_ptr.is_null() { - buf.extend_from_slice(as_bytes(src)); - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); - } - } - } - COLUMN_TYPE_FLOAT32 => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const f32, len) }; - if slice.sel_ptr.is_null() { - buf.extend_from_slice(as_bytes(src)); - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); - } - } - } - COLUMN_TYPE_FLOAT64 => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const f64, len) }; - if slice.sel_ptr.is_null() { - buf.extend_from_slice(as_bytes(src)); - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); - } - } + append_primitive!(buf, slice, len, i64) } + COLUMN_TYPE_FLOAT32 => append_primitive!(buf, slice, len, f32), + COLUMN_TYPE_FLOAT64 => append_primitive!(buf, slice, len, f64), COLUMN_TYPE_DECIMAL_INT32 => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i32, len) }; - if slice.sel_ptr.is_null() { - for &x in src { - buf.extend_from_slice(&(x as i128).to_ne_bytes()); - } - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - buf.extend_from_slice(&(src[(idx - 1) as usize] as i128).to_ne_bytes()); - } - } + append_transform!(buf, slice, len, i32, |x: i32| x as i128) } COLUMN_TYPE_DECIMAL_INT64 => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; - if slice.sel_ptr.is_null() { - for &x in src { - buf.extend_from_slice(&(x as i128).to_ne_bytes()); - } - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - buf.extend_from_slice(&(src[(idx - 1) as usize] as i128).to_ne_bytes()); - } - } + append_transform!(buf, slice, len, i64, |x: i64| x as i128) } COLUMN_TYPE_DECIMAL_INT128 | COLUMN_TYPE_UUID => { - // 16-byte elements + // 16-byte elements — no primitive type; copy as raw bytes. let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len * 16) }; if slice.sel_ptr.is_null() { buf.extend_from_slice(src); @@ -371,58 +347,25 @@ unsafe fn append_numeric( } } } + // Julia date/timestamp types carry a Julia-epoch offset that Rust removes here. + // Source: i64[] days since 0001-01-01 → i32 days since 1970-01-01 (Date32). COLUMN_TYPE_JULIA_DATE => { - // Source: i64[] of Julia days (since 0001-01-01). Destination: i32 Date32 (since Unix epoch). - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; - if slice.sel_ptr.is_null() { - for &v in src { - buf.extend_from_slice(&((v - JULIA_DATE_OFFSET) as i32).to_ne_bytes()); - } - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - let v = src[(idx - 1) as usize]; - buf.extend_from_slice(&((v - JULIA_DATE_OFFSET) as i32).to_ne_bytes()); - } - } + append_transform!(buf, slice, len, i64, |v: i64| (v - JULIA_DATE_OFFSET) + as i32) } + // Source: i64[] ms since 0001-01-01 → i64 μs since 1970-01-01. COLUMN_TYPE_JULIA_TIMESTAMP | COLUMN_TYPE_JULIA_TIMESTAMPTZ => { - // Source: i64[] of Julia ms (since 0001-01-01). Destination: i64 μs since Unix epoch. - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; - if slice.sel_ptr.is_null() { - for &v in src { - buf.extend_from_slice(&((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000).to_ne_bytes()); - } - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - let v = src[(idx - 1) as usize]; - buf.extend_from_slice(&((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000).to_ne_bytes()); - } - } + append_transform!(buf, slice, len, i64, |v: i64| (v + - JULIA_TIMESTAMP_OFFSET_MS) + * 1_000) } + // Source: i64[] ms since 0001-01-01 → i64 ns since 1970-01-01. COLUMN_TYPE_JULIA_TIMESTAMP_NS | COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS => { - // Source: i64[] of Julia ms (since 0001-01-01). Destination: i64 ns since Unix epoch. - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const i64, len) }; - if slice.sel_ptr.is_null() { - for &v in src { - buf.extend_from_slice( - &((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000_000).to_ne_bytes(), - ); - } - } else { - let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { - let v = src[(idx - 1) as usize]; - buf.extend_from_slice( - &((v - JULIA_TIMESTAMP_OFFSET_MS) * 1_000_000).to_ne_bytes(), - ); - } - } - } - _ => { - return Err(anyhow::anyhow!("unsupported column type {}", column_type)); + append_transform!(buf, slice, len, i64, |v: i64| (v + - JULIA_TIMESTAMP_OFFSET_MS) + * 1_000_000) } + _ => return Err(anyhow::anyhow!("unsupported column type {}", column_type)), } Ok(()) } @@ -513,7 +456,8 @@ fn build_numeric_array( ScalarBuffer::new(buf, 0, rows), null_buf, )), - COLUMN_TYPE_DATE => Arc::new(PrimitiveArray::::new( + // JULIA_DATE stores the epoch-adjusted i32 value; Arrow type is the same as DATE. + COLUMN_TYPE_DATE | COLUMN_TYPE_JULIA_DATE => Arc::new(PrimitiveArray::::new( ScalarBuffer::new(buf, 0, rows), null_buf, )), @@ -521,11 +465,15 @@ fn build_numeric_array( ScalarBuffer::new(buf, 0, rows), null_buf, )), - COLUMN_TYPE_TIMESTAMP => Arc::new(PrimitiveArray::::new( - ScalarBuffer::new(buf, 0, rows), - null_buf, - )), - COLUMN_TYPE_TIMESTAMPTZ => Arc::new( + // JULIA_TIMESTAMP stores epoch-adjusted μs; Arrow type is the same as TIMESTAMP. + COLUMN_TYPE_TIMESTAMP | COLUMN_TYPE_JULIA_TIMESTAMP => { + Arc::new(PrimitiveArray::::new( + ScalarBuffer::new(buf, 0, rows), + null_buf, + )) + } + // JULIA_TIMESTAMPTZ stores epoch-adjusted μs; Arrow type is the same as TIMESTAMPTZ. + COLUMN_TYPE_TIMESTAMPTZ | COLUMN_TYPE_JULIA_TIMESTAMPTZ => Arc::new( PrimitiveArray::::new( ScalarBuffer::new(buf, 0, rows), null_buf, @@ -564,21 +512,6 @@ fn build_numeric_array( .map_err(|e| anyhow::anyhow!("UUID FixedSizeBinary: {}", e))?, ) } - COLUMN_TYPE_JULIA_DATE => Arc::new(PrimitiveArray::::new( - ScalarBuffer::new(buf, 0, rows), - null_buf, - )), - COLUMN_TYPE_JULIA_TIMESTAMP => Arc::new(PrimitiveArray::::new( - ScalarBuffer::new(buf, 0, rows), - null_buf, - )), - COLUMN_TYPE_JULIA_TIMESTAMPTZ => Arc::new( - PrimitiveArray::::new( - ScalarBuffer::new(buf, 0, rows), - null_buf, - ) - .with_timezone("UTC"), - ), COLUMN_TYPE_JULIA_TIMESTAMP_NS => Arc::new(PrimitiveArray::::new( ScalarBuffer::new(buf, 0, rows), null_buf, From d0e3e56f90c58e3b738809f609a325efc7190b57 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 13:48:48 +0200 Subject: [PATCH 06/27] . --- iceberg_rust_ffi/Cargo.lock | 2 +- iceberg_rust_ffi/src/batch_builder.rs | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock index f82e19e..bbb53f9 100644 --- a/iceberg_rust_ffi/Cargo.lock +++ b/iceberg_rust_ffi/Cargo.lock @@ -1648,7 +1648,7 @@ dependencies = [ [[package]] name = "iceberg_rust_ffi" -version = "0.7.21" +version = "0.7.22" dependencies = [ "anyhow", "arrow-array", diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index 6cdcfdd..18a96de 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -253,22 +253,19 @@ unsafe fn append_to_state( } } ColumnValues::Str { bytes, offsets } => { + // Pointer-of-pointers protocol: data_ptr is *const *const u8 (array of pointers + // into Julia source strings), lengths_ptr is *const i64 (byte lengths per row). + // Null and empty rows have a null data pointer and length 0 — no bytes are copied. if slice.lengths_ptr.is_null() { return Err(anyhow::anyhow!("String column: lengths_ptr is null")); } let ptrs = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const *const u8, len) }; let lens = unsafe { std::slice::from_raw_parts(slice.lengths_ptr, len) }; - let out_start = state.rows; for i in 0..len { - let is_null = state.is_nullable && state.has_nulls && { - let pos = out_start + i; - (state.null_bits[pos / 8] >> (pos % 8)) & 1 == 0 - }; - if !is_null && !ptrs[i].is_null() { - bytes.extend_from_slice(unsafe { - std::slice::from_raw_parts(ptrs[i], lens[i] as usize) - }); + let nb = lens[i] as usize; + if nb > 0 && !ptrs[i].is_null() { + bytes.extend_from_slice(unsafe { std::slice::from_raw_parts(ptrs[i], nb) }); } offsets.push(bytes.len() as i32); } From a953ae909ef21bfdac700f4031d908daf52ba5ae Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Mon, 11 May 2026 15:32:58 +0200 Subject: [PATCH 07/27] . --- iceberg_rust_ffi/src/batch_builder.rs | 55 ++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index 18a96de..24132b4 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -86,8 +86,10 @@ impl ColumnValues { COLUMN_TYPE_BOOLEAN => ColumnValues::Bool(Vec::with_capacity(coalesce_rows)), COLUMN_TYPE_STRING => ColumnValues::Str { bytes: Vec::new(), + // Start empty; finalize_and_reset right-sizes to the actual slice length + // after the first flush, so we never hold a 4MB coalesce_rows-sized Vec. offsets: { - let mut v = Vec::with_capacity(coalesce_rows + 1); + let mut v = Vec::new(); v.push(0i32); v }, @@ -219,10 +221,31 @@ unsafe fn append_to_state( state.null_bits.resize(needed, 0u8); } } - for i in 0..len { - let b = unsafe { (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 }; - let pos = out_start + i; - state.null_bits[pos / 8] |= b << (pos % 8); + // Copy validity bits. When source and destination are byte-aligned (out_start + // is a multiple of 8 — always true for flush-per-slice), one copy_nonoverlapping + // replaces the 4096-iteration per-bit loop. + if out_start % 8 == 0 { + let dst = out_start / 8; + let n_bytes = (len + 7) / 8; + unsafe { + std::ptr::copy_nonoverlapping( + slice.validity_ptr, + state.null_bits.as_mut_ptr().add(dst), + n_bytes, + ); + } + // Mask off garbage bits beyond `len` in the last byte so they don't + // corrupt a subsequent coalesced slice that shares that byte. + if len % 8 != 0 { + let tail = state.null_bits.last_mut().unwrap(); + *tail &= (1u8 << (len % 8)) - 1; + } + } else { + for i in 0..len { + let b = unsafe { (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 }; + let pos = out_start + i; + state.null_bits[pos / 8] |= b << (pos % 8); + } } } else if state.has_nulls { // All-valid slice but nulls seen earlier — extend bitmap with 1s. @@ -255,19 +278,26 @@ unsafe fn append_to_state( ColumnValues::Str { bytes, offsets } => { // Pointer-of-pointers protocol: data_ptr is *const *const u8 (array of pointers // into Julia source strings), lengths_ptr is *const i64 (byte lengths per row). - // Null and empty rows have a null data pointer and length 0 — no bytes are copied. + // Null and empty rows have length 0; the nb>0 guard is sufficient since Julia + // always sets len=0 for null/empty rows (no null-pointer check needed). if slice.lengths_ptr.is_null() { return Err(anyhow::anyhow!("String column: lengths_ptr is null")); } let ptrs = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const *const u8, len) }; let lens = unsafe { std::slice::from_raw_parts(slice.lengths_ptr, len) }; + // Pre-reserve so bytes never reallocates mid-loop. + let total: usize = lens.iter().map(|&l| l as usize).sum(); + bytes.reserve(total); + // Track running offset locally instead of reading bytes.len() each iteration. + let mut cur_off = bytes.len(); for i in 0..len { let nb = lens[i] as usize; - if nb > 0 && !ptrs[i].is_null() { + if nb > 0 { bytes.extend_from_slice(unsafe { std::slice::from_raw_parts(ptrs[i], nb) }); + cur_off += nb; } - offsets.push(bytes.len() as i32); + offsets.push(cur_off as i32); } } } @@ -421,9 +451,14 @@ fn finalize_and_reset( )) } ColumnValues::Str { bytes, offsets } => { - let taken_bytes = std::mem::replace(bytes, Vec::with_capacity(bytes.capacity())); + // Capacity hints from the previous window so the reset never over-allocates. + // With has_strings flush-per-slice, offsets.len() == slice_len+1 (~4097), not + // coalesce_rows+1 (1M). Using len() here shrinks the reset from 4MB to ~16KB. + let bytes_cap = bytes.capacity(); + let offsets_hint = offsets.len(); // rows+1 from the window just taken + let taken_bytes = std::mem::replace(bytes, Vec::with_capacity(bytes_cap)); let taken_offsets = std::mem::replace(offsets, { - let mut v = Vec::with_capacity(coalesce_rows + 1); + let mut v = Vec::with_capacity(offsets_hint.max(1)); v.push(0i32); v }); From a4d91922e1051630a98708db62c1b3efce08285b Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 12 May 2026 09:18:18 +0200 Subject: [PATCH 08/27] prefetching --- iceberg_rust_ffi/src/batch_builder.rs | 38 ++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index 24132b4..4e0d9d8 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -286,12 +286,17 @@ unsafe fn append_to_state( let ptrs = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const *const u8, len) }; let lens = unsafe { std::slice::from_raw_parts(slice.lengths_ptr, len) }; - // Pre-reserve so bytes never reallocates mid-loop. + // Pre-reserve so bytes/offsets never reallocate mid-loop. let total: usize = lens.iter().map(|&l| l as usize).sum(); bytes.reserve(total); + offsets.reserve(len); // Track running offset locally instead of reading bytes.len() each iteration. let mut cur_off = bytes.len(); for i in 0..len { + // Prefetch the string data that will be read PREFETCH_DIST iterations ahead. + if i + PREFETCH_DIST < len { + unsafe { prefetch_read(ptrs[i + PREFETCH_DIST]) }; + } let nb = lens[i] as usize; if nb > 0 { bytes.extend_from_slice(unsafe { std::slice::from_raw_parts(ptrs[i], nb) }); @@ -306,8 +311,24 @@ unsafe fn append_to_state( Ok(()) } +/// Issue a read prefetch for the cache line at `ptr`. +/// Compiles to a real prefetch on x86_64 and aarch64; no-op elsewhere. +#[inline(always)] +unsafe fn prefetch_read(ptr: *const u8) { + #[cfg(target_arch = "x86_64")] + std::arch::x86_64::_mm_prefetch(ptr as *const i8, std::arch::x86_64::_MM_HINT_T1); + #[cfg(target_arch = "aarch64")] + core::arch::asm!("prfm pldl2keep, [{ptr}]", ptr = in(reg) ptr, options(nostack, readonly)); + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + let _ = ptr; +} + // Bulk-copy or 1-based-scattered-gather a slice of primitive T into buf (no transform). // Identity selection → single memcpy; scattered selection → element-wise gather. +// Prefetch distance for scatter-gather loops: enough to cover ~200-cycle cache miss +// latency at typical throughput of a few cycles per element. +const PREFETCH_DIST: usize = 16; + macro_rules! append_primitive { ($buf:expr, $slice:expr, $len:expr, $T:ty) => {{ let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $T, $len) }; @@ -315,7 +336,13 @@ macro_rules! append_primitive { $buf.extend_from_slice(unsafe { as_bytes(src) }); } else { let sel = unsafe { std::slice::from_raw_parts($slice.sel_ptr, $len) }; - for &idx in sel { + for (i, &idx) in sel.iter().enumerate() { + if i + PREFETCH_DIST < $len { + unsafe { + prefetch_read(src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize) + as *const u8) + }; + } $buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); } } @@ -368,7 +395,12 @@ unsafe fn append_numeric( buf.extend_from_slice(src); } else { let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for &idx in sel { + for (i, &idx) in sel.iter().enumerate() { + if i + PREFETCH_DIST < len { + unsafe { + prefetch_read(src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize * 16)) + }; + } let off = (idx - 1) as usize * 16; buf.extend_from_slice(&src[off..off + 16]); } From 407b8133bd27197084069d5033235630458f44f9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 12 May 2026 09:23:51 +0200 Subject: [PATCH 09/27] Format --- iceberg_rust_ffi/src/batch_builder.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index 4e0d9d8..f16c5a3 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -339,8 +339,9 @@ macro_rules! append_primitive { for (i, &idx) in sel.iter().enumerate() { if i + PREFETCH_DIST < $len { unsafe { - prefetch_read(src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize) - as *const u8) + prefetch_read( + src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize) as *const u8 + ) }; } $buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); @@ -398,7 +399,9 @@ unsafe fn append_numeric( for (i, &idx) in sel.iter().enumerate() { if i + PREFETCH_DIST < len { unsafe { - prefetch_read(src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize * 16)) + prefetch_read( + src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize * 16), + ) }; } let off = (idx - 1) as usize * 16; From 8db1a0b999bc96d9d642fbd1abe8339180b443f9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 12 May 2026 14:53:56 +0200 Subject: [PATCH 10/27] Use spawn_blocking instead of encode pool --- iceberg_rust_ffi/src/batch_builder.rs | 16 +- iceberg_rust_ffi/src/writer.rs | 264 ++++++++------------------ src/RustyIceberg.jl | 2 +- src/writer.jl | 17 -- 4 files changed, 83 insertions(+), 216 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index f16c5a3..5ff477c 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -22,7 +22,7 @@ use arrow_array::{ use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::SchemaRef as ArrowSchemaRef; -use crate::writer::{submit_batch, IcebergDataFileWriter, GLOBAL_ENCODE_POOL}; +use crate::writer::{submit_batch, IcebergDataFileWriter}; use crate::writer_columns::{ SliceRef, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DATE, COLUMN_TYPE_DECIMAL_INT128, COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64, @@ -181,7 +181,6 @@ impl ColumnBatchBuilder { pub(crate) fn write_and_reset( &mut self, writer_ref: &IcebergDataFileWriter, - pool: &crate::writer::GlobalWorkerPool, ) -> Result<(), anyhow::Error> { let mut arrays: Vec = Vec::with_capacity(self.columns.len()); for (i, state) in self.columns.iter_mut().enumerate() { @@ -190,7 +189,7 @@ impl ColumnBatchBuilder { } let batch = arrow_array::RecordBatch::try_new(self.arrow_schema.clone(), arrays) .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))?; - submit_batch(writer_ref, pool, batch) + submit_batch(writer_ref, batch) } } @@ -678,17 +677,10 @@ pub extern "C" fn iceberg_batch_builder_write( } let writer_ref = unsafe { &*writer }; let builder_ref = unsafe { &mut *builder }; - let pool = match GLOBAL_ENCODE_POOL.get() { - Some(p) => p, - None => { - eprintln!("[iceberg] encode pool not initialized"); - return -1; - } - }; - match builder_ref.write_and_reset(writer_ref, pool) { + match builder_ref.write_and_reset(writer_ref) { Ok(()) => 0, Err(e) => { - crate::writer::store_writer_error_pub(writer_ref, e); + crate::writer::store_writer_error(writer_ref, e); -1 } } diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 9d0cb14..097033e 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -1,14 +1,11 @@ /// Writer support for iceberg_rust_ffi /// -/// Encoding is handled by a global pool of N=available_parallelism OS threads shared -/// across all writers. Per-writer ordering is guaranteed by the per-writer -/// `Arc>` inside WriterState: only one pool thread encodes -/// a given writer at a time, and the FIFO global queue ensures batches are submitted -/// in order. -use std::any::Any; +/// Encoding uses Tokio's blocking thread pool via `spawn_blocking`. A global `Semaphore` +/// with N=available_parallelism permits bounds concurrent Parquet+ZSTD encodes. Per-writer +/// ordering is guaranteed by the per-writer `Arc>` inside +/// WriterState: only one blocking task encodes a given writer at a time. use std::ffi::{c_char, c_void}; use std::io::Cursor; -use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::thread; @@ -105,149 +102,41 @@ use object_store_ffi::{ type ConcreteDataFileWriter = DataFileWriter; -/// Encode task submitted to the global worker pool. -pub(crate) struct EncodeTask { - batch: RecordBatch, - state: Arc, -} - -// Safety: RecordBatch is Send; WriterState fields are Send. -unsafe impl Send for EncodeTask {} - /// Shared mutable state for one IcebergDataFileWriter. -/// Owned by the IcebergDataFileWriter and shared with pool workers via Arc. pub(crate) struct WriterState { - /// The underlying Parquet writer. Protected by a Mutex so pool workers can access it - /// concurrently (though at most one worker encodes a given writer at a time due to the - /// bounded per-writer channel). Set to None when the writer is closed or freed. + /// The underlying Parquet writer. Protected by a Mutex so at most one blocking task + /// encodes a given writer at a time. Set to None when the writer is closed or freed. writer: Mutex>, - /// Number of encode tasks submitted to the pool but not yet completed. + /// Number of encode tasks submitted but not yet completed. pending: AtomicUsize, - /// Notified when `pending` drops to zero, so iceberg_writer_close can wait efficiently. + /// Notified when `pending` drops to zero so iceberg_writer_close can wait efficiently. done_notify: tokio::sync::Notify, - /// First encode error encountered by a pool worker, if any. + /// First encode error encountered by any task for this writer, if any. error: Mutex>, } -// Safety: ConcreteDataFileWriter is Send (verified by its use in spawn_blocking previously). +// Safety: ConcreteDataFileWriter is Send. unsafe impl Send for WriterState {} unsafe impl Sync for WriterState {} -/// Global pool of N=available_parallelism encode worker threads shared across all writers. -pub(crate) struct GlobalWorkerPool { - pub(crate) task_tx: tokio::sync::mpsc::Sender, -} - -pub(crate) static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); - -/// Formats a Rust panic payload into an anyhow error, preserving the message where possible. -fn format_panic_error(panic: Box) -> anyhow::Error { - let msg = if let Some(s) = panic.downcast_ref::<&str>() { - format!("encode worker panicked: {}", s) - } else if let Some(s) = panic.downcast_ref::() { - format!("encode worker panicked: {}", s) - } else { - "encode worker panicked (no string payload)".to_string() - }; - anyhow::anyhow!(msg) -} - -/// Body of each encode worker thread: receives tasks from the shared channel and encodes them. -fn encode_worker_loop( - task_rx: Arc>>, +/// Global encode state: semaphore + Tokio handle. +/// Captured at first writer creation (inside the Tokio runtime) so submit_batch can +/// spawn tasks even when called from Julia's sync (non-Tokio) context. +struct GlobalEncodeState { + semaphore: Arc, handle: tokio::runtime::Handle, -) { - loop { - // Acquire the shared receiver lock, then wait for a task. - // The lock is released as soon as recv() returns, so workers are not serialized - // during encoding — only during task pickup. - let task = { - let mut rx = handle.block_on(task_rx.lock()); - match handle.block_on(rx.recv()) { - Some(t) => t, - None => break, // sender dropped → pool shutting down - } - }; - - // Clone state before moving task into the closure so we can always decrement - // pending even if the closure panics. - let state = task.state.clone(); - let handle_enc = handle.clone(); - let encode_result = catch_unwind(AssertUnwindSafe(move || { - let mut guard = task.state.writer.lock().unwrap_or_else(|e| e.into_inner()); - match guard.as_mut() { - Some(w) => handle_enc - .block_on(w.write(task.batch)) - .map_err(|e| anyhow::anyhow!("write batch: {}", e)), - None => Err(anyhow::anyhow!("writer already closed")), - } - })); - - let err = match encode_result { - Ok(Ok(())) => None, - Ok(Err(e)) => Some(e), - Err(panic) => Some(format_panic_error(panic)), - }; - if let Some(e) = err { - let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); - } - } - - // Always decrement pending; notify close() if this was the last task. - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); - } - } } -/// Desired encode worker count. 0 means "use available_parallelism". -/// Must be set before the first iceberg_writer_new call. -static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0); +static GLOBAL_ENCODE_STATE: OnceLock = OnceLock::new(); -/// Set the number of encode worker threads in the global pool. -/// Must be called before any writer is created. Returns 0 on success, 1 if the pool is -/// already initialized (call ignored). -#[no_mangle] -pub extern "C" fn iceberg_set_encode_workers(n: i32) -> i32 { - if GLOBAL_ENCODE_POOL.get().is_some() { - return 1; - } - if n > 0 { - ENCODE_WORKERS.store(n as usize, Ordering::Relaxed); - } - 0 -} - -/// Initialize the global encode pool on first call. /// Must be called from within a Tokio runtime (iceberg_writer_new satisfies this). -fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { - GLOBAL_ENCODE_POOL.get_or_init(|| { - let configured = ENCODE_WORKERS.load(Ordering::Relaxed); - let n = if configured > 0 { - configured - } else { - // available_parallelism() only fails on unusual platforms (embedded, some sandboxes). - // On Linux/macOS/Windows it always succeeds, so the unwrap never fires in practice. - thread::available_parallelism().unwrap().get() - }; - let handle = tokio::runtime::Handle::current(); - // Buffer 2× workers — drain tasks are rarely blocked on submit. - let (task_tx, task_rx) = tokio::sync::mpsc::channel::(n * 2); - let task_rx = Arc::new(tokio::sync::Mutex::new(task_rx)); - - for i in 0..n { - let task_rx = task_rx.clone(); - let handle = handle.clone(); - thread::Builder::new() - .name(format!("iceberg-encode-{}", i)) - .spawn(move || encode_worker_loop(task_rx, handle)) - .expect("failed to spawn iceberg encode worker"); +fn get_or_init_encode_state() -> &'static GlobalEncodeState { + GLOBAL_ENCODE_STATE.get_or_init(|| { + let n = thread::available_parallelism().unwrap().get(); + GlobalEncodeState { + semaphore: Arc::new(tokio::sync::Semaphore::new(n)), + handle: tokio::runtime::Handle::current(), } - - GlobalWorkerPool { task_tx } }) } @@ -273,7 +162,7 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse; /// Store an error in the writer state (first error wins). -fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { +pub(crate) fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { let mut slot = writer_ref .writer_state .error @@ -284,49 +173,68 @@ fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { } } -/// Store an error in the writer state (public for batch_builder module). -pub(crate) fn store_writer_error_pub(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { - store_writer_error(writer_ref, e); -} - -/// Submit a `RecordBatch` to the global encode pool. +/// Submit a `RecordBatch` for async Parquet encoding via `spawn_blocking`. /// -/// Increments the writer's pending count before sending and rolls it back on channel failure. +/// A semaphore permit is acquired before the blocking task runs, bounding concurrent +/// encodes to N. The caller returns immediately; errors are surfaced at close time. pub(crate) fn submit_batch( writer_ref: &IcebergDataFileWriter, - pool: &GlobalWorkerPool, batch: RecordBatch, ) -> Result<(), anyhow::Error> { - writer_ref - .writer_state - .pending - .fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { - batch, - state: writer_ref.writer_state.clone(), + let enc = match GLOBAL_ENCODE_STATE.get() { + Some(s) => s, + None => return Err(anyhow::anyhow!("encode state not initialized; call iceberg_writer_new first")), }; - match pool.task_tx.blocking_send(task) { - Ok(()) => Ok(()), - Err(_) => { - let prev = writer_ref - .writer_state - .pending - .fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - writer_ref.writer_state.done_notify.notify_one(); + let state = writer_ref.writer_state.clone(); + let semaphore = enc.semaphore.clone(); + let handle = enc.handle.clone(); + state.pending.fetch_add(1, Ordering::AcqRel); + + handle.clone().spawn(async move { + // Acquire a permit before spawning — this is the backpressure mechanism. + let permit = match semaphore.acquire_owned().await { + Ok(p) => p, + Err(_) => { + let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { *slot = Some(anyhow::anyhow!("encode semaphore closed unexpectedly")); } + drop(slot); + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { state.done_notify.notify_one(); } + return; } - Err(anyhow::anyhow!("encode pool channel closed unexpectedly")) + }; + + let state2 = state.clone(); + let result = tokio::task::spawn_blocking(move || { + let _permit = permit; // released when blocking work completes + let mut guard = state2.writer.lock().unwrap_or_else(|e| e.into_inner()); + match guard.as_mut() { + Some(w) => handle + .block_on(w.write(batch)) + .map_err(|e| anyhow::anyhow!("write batch: {}", e)), + None => Err(anyhow::anyhow!("writer already closed")), + } + }) + .await; + + let err = match result { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(join_err) => Some(anyhow::anyhow!("encode task failed: {}", join_err)), + }; + if let Some(e) = err { + let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { *slot = Some(e); } } - } + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { state.done_notify.notify_one(); } + }); + + Ok(()) } -/// Validates column count, converts each `ColumnDescriptor` into a single-slice `SliceRef`, -/// routes through `ColumnBatchBuilder`, and submits the resulting `RecordBatch` to the -/// encode pool. Using the builder here keeps all type-conversion and null-bit logic in one -/// place (`batch_builder.rs`) instead of duplicating it. unsafe fn write_columns_inner( writer_ref: &IcebergDataFileWriter, - pool: &GlobalWorkerPool, arrow_schema: ArrowSchemaRef, col_descs: &[ColumnDescriptor], ) -> Result<(), anyhow::Error> { @@ -351,7 +259,7 @@ unsafe fn write_columns_inner( }) .collect(); unsafe { builder.append_slice(&slices) }?; - builder.write_and_reset(writer_ref, pool) + builder.write_and_reset(writer_ref) } /// Synchronous write of flat column data: copies each column from Julia memory into @@ -370,16 +278,9 @@ pub extern "C" fn iceberg_writer_write_columns( return -1; } let writer_ref = unsafe { &*writer }; - let pool = match GLOBAL_ENCODE_POOL.get() { - Some(p) => p, - None => { - eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first"); - return -1; - } - }; let arrow_schema = writer_ref.arrow_schema.clone(); let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - if let Err(e) = unsafe { write_columns_inner(writer_ref, pool, arrow_schema, col_descs) } { + if let Err(e) = unsafe { write_columns_inner(writer_ref, arrow_schema, col_descs) } { store_writer_error(writer_ref, e); return -1; } @@ -470,8 +371,8 @@ export_runtime_op!( .map_err(|e| anyhow::anyhow!("Failed to convert schema to Arrow: {}", e))? ); - // Initialize global pool (no-op if already running). - get_or_init_encode_pool(); + // Initialize global encode state (no-op if already done). + get_or_init_encode_state(); let writer_state = Arc::new(WriterState { writer: Mutex::new(Some(concrete_writer)), @@ -506,15 +407,6 @@ pub extern "C" fn iceberg_writer_write( } let writer_ref = unsafe { &*writer }; - - let pool = match GLOBAL_ENCODE_POOL.get() { - Some(p) => p, - None => { - eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); - return -1; - } - }; - let ipc_bytes = unsafe { std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() }; let cursor = Cursor::new(ipc_bytes); @@ -534,7 +426,7 @@ pub extern "C" fn iceberg_writer_write( return -1; } }; - if let Err(e) = submit_batch(writer_ref, pool, batch) { + if let Err(e) = submit_batch(writer_ref, batch) { store_writer_error(writer_ref, e); return -1; } diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index 762d7b3..e1e1c1e 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -38,7 +38,7 @@ export IcebergTimestampNs, IcebergTimestamptzNs export IcebergString, IcebergUuid, IcebergBinary, IcebergDecimal export Transaction, DataFiles, free_transaction!, free_data_files!, commit, transaction export FastAppendAction, free_fast_append_action!, add_data_files, apply, with_fast_append -export DataFileWriter, free_writer!, close_writer, write_columns, set_encode_workers! +export DataFileWriter, free_writer!, close_writer, write_columns export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD export ColumnDescriptor, ColumnBatch, ColumnType export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64 diff --git a/src/writer.jl b/src/writer.jl index 6a913df..e063fe3 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -146,23 +146,6 @@ function get_column_metadata(table::Table)::Dict{Symbol, Vector{Pair{String, Str return colmeta end -""" - set_encode_workers!(n::Int) - -Set the number of threads in the global Parquet encode worker pool. - -Must be called before the first `DataFileWriter` is created (i.e. before the pool is -initialized). Throws if the pool is already running. Defaults to `Sys.CPU_THREADS` -if not set. -""" -function set_encode_workers!(n::Int) - n > 0 || throw(ArgumentError("n must be positive, got $n")) - ret = @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Int32 - ret == 0 || throw(IcebergException( - "set_encode_workers! must be called before creating any DataFileWriter" - )) - return nothing -end """ DataFileWriter(table::Table, config::WriterConfig) -> DataFileWriter From dfdb437322be9e487ddc9a038e6854f6b862f7f9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 12 May 2026 14:54:41 +0200 Subject: [PATCH 11/27] Format --- iceberg_rust_ffi/src/writer.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 097033e..aea7745 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -183,7 +183,11 @@ pub(crate) fn submit_batch( ) -> Result<(), anyhow::Error> { let enc = match GLOBAL_ENCODE_STATE.get() { Some(s) => s, - None => return Err(anyhow::anyhow!("encode state not initialized; call iceberg_writer_new first")), + None => { + return Err(anyhow::anyhow!( + "encode state not initialized; call iceberg_writer_new first" + )) + } }; let state = writer_ref.writer_state.clone(); let semaphore = enc.semaphore.clone(); @@ -196,10 +200,14 @@ pub(crate) fn submit_batch( Ok(p) => p, Err(_) => { let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(anyhow::anyhow!("encode semaphore closed unexpectedly")); } + if slot.is_none() { + *slot = Some(anyhow::anyhow!("encode semaphore closed unexpectedly")); + } drop(slot); let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { state.done_notify.notify_one(); } + if prev == 1 { + state.done_notify.notify_one(); + } return; } }; @@ -224,10 +232,14 @@ pub(crate) fn submit_batch( }; if let Some(e) = err { let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(e); } + if slot.is_none() { + *slot = Some(e); + } } let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { state.done_notify.notify_one(); } + if prev == 1 { + state.done_notify.notify_one(); + } }); Ok(()) From 2556bc991388beaf463d43cd431b806b29b9b322 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 12 May 2026 15:00:10 +0200 Subject: [PATCH 12/27] Revert "Format" This reverts commit dfdb437322be9e487ddc9a038e6854f6b862f7f9. --- iceberg_rust_ffi/src/writer.rs | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index aea7745..097033e 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -183,11 +183,7 @@ pub(crate) fn submit_batch( ) -> Result<(), anyhow::Error> { let enc = match GLOBAL_ENCODE_STATE.get() { Some(s) => s, - None => { - return Err(anyhow::anyhow!( - "encode state not initialized; call iceberg_writer_new first" - )) - } + None => return Err(anyhow::anyhow!("encode state not initialized; call iceberg_writer_new first")), }; let state = writer_ref.writer_state.clone(); let semaphore = enc.semaphore.clone(); @@ -200,14 +196,10 @@ pub(crate) fn submit_batch( Ok(p) => p, Err(_) => { let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(anyhow::anyhow!("encode semaphore closed unexpectedly")); - } + if slot.is_none() { *slot = Some(anyhow::anyhow!("encode semaphore closed unexpectedly")); } drop(slot); let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); - } + if prev == 1 { state.done_notify.notify_one(); } return; } }; @@ -232,14 +224,10 @@ pub(crate) fn submit_batch( }; if let Some(e) = err { let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); - } + if slot.is_none() { *slot = Some(e); } } let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); - } + if prev == 1 { state.done_notify.notify_one(); } }); Ok(()) From c4d0d2f32063258560ec72d996b42daeb973df66 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 12 May 2026 15:00:22 +0200 Subject: [PATCH 13/27] Revert "Use spawn_blocking instead of encode pool" This reverts commit 8db1a0b999bc96d9d642fbd1abe8339180b443f9. --- iceberg_rust_ffi/src/batch_builder.rs | 16 +- iceberg_rust_ffi/src/writer.rs | 264 ++++++++++++++++++-------- src/RustyIceberg.jl | 2 +- src/writer.jl | 17 ++ 4 files changed, 216 insertions(+), 83 deletions(-) diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/batch_builder.rs index 5ff477c..f16c5a3 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/batch_builder.rs @@ -22,7 +22,7 @@ use arrow_array::{ use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::SchemaRef as ArrowSchemaRef; -use crate::writer::{submit_batch, IcebergDataFileWriter}; +use crate::writer::{submit_batch, IcebergDataFileWriter, GLOBAL_ENCODE_POOL}; use crate::writer_columns::{ SliceRef, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DATE, COLUMN_TYPE_DECIMAL_INT128, COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64, @@ -181,6 +181,7 @@ impl ColumnBatchBuilder { pub(crate) fn write_and_reset( &mut self, writer_ref: &IcebergDataFileWriter, + pool: &crate::writer::GlobalWorkerPool, ) -> Result<(), anyhow::Error> { let mut arrays: Vec = Vec::with_capacity(self.columns.len()); for (i, state) in self.columns.iter_mut().enumerate() { @@ -189,7 +190,7 @@ impl ColumnBatchBuilder { } let batch = arrow_array::RecordBatch::try_new(self.arrow_schema.clone(), arrays) .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))?; - submit_batch(writer_ref, batch) + submit_batch(writer_ref, pool, batch) } } @@ -677,10 +678,17 @@ pub extern "C" fn iceberg_batch_builder_write( } let writer_ref = unsafe { &*writer }; let builder_ref = unsafe { &mut *builder }; - match builder_ref.write_and_reset(writer_ref) { + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg] encode pool not initialized"); + return -1; + } + }; + match builder_ref.write_and_reset(writer_ref, pool) { Ok(()) => 0, Err(e) => { - crate::writer::store_writer_error(writer_ref, e); + crate::writer::store_writer_error_pub(writer_ref, e); -1 } } diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 097033e..9d0cb14 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -1,11 +1,14 @@ /// Writer support for iceberg_rust_ffi /// -/// Encoding uses Tokio's blocking thread pool via `spawn_blocking`. A global `Semaphore` -/// with N=available_parallelism permits bounds concurrent Parquet+ZSTD encodes. Per-writer -/// ordering is guaranteed by the per-writer `Arc>` inside -/// WriterState: only one blocking task encodes a given writer at a time. +/// Encoding is handled by a global pool of N=available_parallelism OS threads shared +/// across all writers. Per-writer ordering is guaranteed by the per-writer +/// `Arc>` inside WriterState: only one pool thread encodes +/// a given writer at a time, and the FIFO global queue ensures batches are submitted +/// in order. +use std::any::Any; use std::ffi::{c_char, c_void}; use std::io::Cursor; +use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::thread; @@ -102,41 +105,149 @@ use object_store_ffi::{ type ConcreteDataFileWriter = DataFileWriter; +/// Encode task submitted to the global worker pool. +pub(crate) struct EncodeTask { + batch: RecordBatch, + state: Arc, +} + +// Safety: RecordBatch is Send; WriterState fields are Send. +unsafe impl Send for EncodeTask {} + /// Shared mutable state for one IcebergDataFileWriter. +/// Owned by the IcebergDataFileWriter and shared with pool workers via Arc. pub(crate) struct WriterState { - /// The underlying Parquet writer. Protected by a Mutex so at most one blocking task - /// encodes a given writer at a time. Set to None when the writer is closed or freed. + /// The underlying Parquet writer. Protected by a Mutex so pool workers can access it + /// concurrently (though at most one worker encodes a given writer at a time due to the + /// bounded per-writer channel). Set to None when the writer is closed or freed. writer: Mutex>, - /// Number of encode tasks submitted but not yet completed. + /// Number of encode tasks submitted to the pool but not yet completed. pending: AtomicUsize, - /// Notified when `pending` drops to zero so iceberg_writer_close can wait efficiently. + /// Notified when `pending` drops to zero, so iceberg_writer_close can wait efficiently. done_notify: tokio::sync::Notify, - /// First encode error encountered by any task for this writer, if any. + /// First encode error encountered by a pool worker, if any. error: Mutex>, } -// Safety: ConcreteDataFileWriter is Send. +// Safety: ConcreteDataFileWriter is Send (verified by its use in spawn_blocking previously). unsafe impl Send for WriterState {} unsafe impl Sync for WriterState {} -/// Global encode state: semaphore + Tokio handle. -/// Captured at first writer creation (inside the Tokio runtime) so submit_batch can -/// spawn tasks even when called from Julia's sync (non-Tokio) context. -struct GlobalEncodeState { - semaphore: Arc, +/// Global pool of N=available_parallelism encode worker threads shared across all writers. +pub(crate) struct GlobalWorkerPool { + pub(crate) task_tx: tokio::sync::mpsc::Sender, +} + +pub(crate) static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); + +/// Formats a Rust panic payload into an anyhow error, preserving the message where possible. +fn format_panic_error(panic: Box) -> anyhow::Error { + let msg = if let Some(s) = panic.downcast_ref::<&str>() { + format!("encode worker panicked: {}", s) + } else if let Some(s) = panic.downcast_ref::() { + format!("encode worker panicked: {}", s) + } else { + "encode worker panicked (no string payload)".to_string() + }; + anyhow::anyhow!(msg) +} + +/// Body of each encode worker thread: receives tasks from the shared channel and encodes them. +fn encode_worker_loop( + task_rx: Arc>>, handle: tokio::runtime::Handle, +) { + loop { + // Acquire the shared receiver lock, then wait for a task. + // The lock is released as soon as recv() returns, so workers are not serialized + // during encoding — only during task pickup. + let task = { + let mut rx = handle.block_on(task_rx.lock()); + match handle.block_on(rx.recv()) { + Some(t) => t, + None => break, // sender dropped → pool shutting down + } + }; + + // Clone state before moving task into the closure so we can always decrement + // pending even if the closure panics. + let state = task.state.clone(); + let handle_enc = handle.clone(); + let encode_result = catch_unwind(AssertUnwindSafe(move || { + let mut guard = task.state.writer.lock().unwrap_or_else(|e| e.into_inner()); + match guard.as_mut() { + Some(w) => handle_enc + .block_on(w.write(task.batch)) + .map_err(|e| anyhow::anyhow!("write batch: {}", e)), + None => Err(anyhow::anyhow!("writer already closed")), + } + })); + + let err = match encode_result { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(format_panic_error(panic)), + }; + if let Some(e) = err { + let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } + } + + // Always decrement pending; notify close() if this was the last task. + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + state.done_notify.notify_one(); + } + } } -static GLOBAL_ENCODE_STATE: OnceLock = OnceLock::new(); +/// Desired encode worker count. 0 means "use available_parallelism". +/// Must be set before the first iceberg_writer_new call. +static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0); +/// Set the number of encode worker threads in the global pool. +/// Must be called before any writer is created. Returns 0 on success, 1 if the pool is +/// already initialized (call ignored). +#[no_mangle] +pub extern "C" fn iceberg_set_encode_workers(n: i32) -> i32 { + if GLOBAL_ENCODE_POOL.get().is_some() { + return 1; + } + if n > 0 { + ENCODE_WORKERS.store(n as usize, Ordering::Relaxed); + } + 0 +} + +/// Initialize the global encode pool on first call. /// Must be called from within a Tokio runtime (iceberg_writer_new satisfies this). -fn get_or_init_encode_state() -> &'static GlobalEncodeState { - GLOBAL_ENCODE_STATE.get_or_init(|| { - let n = thread::available_parallelism().unwrap().get(); - GlobalEncodeState { - semaphore: Arc::new(tokio::sync::Semaphore::new(n)), - handle: tokio::runtime::Handle::current(), +fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { + GLOBAL_ENCODE_POOL.get_or_init(|| { + let configured = ENCODE_WORKERS.load(Ordering::Relaxed); + let n = if configured > 0 { + configured + } else { + // available_parallelism() only fails on unusual platforms (embedded, some sandboxes). + // On Linux/macOS/Windows it always succeeds, so the unwrap never fires in practice. + thread::available_parallelism().unwrap().get() + }; + let handle = tokio::runtime::Handle::current(); + // Buffer 2× workers — drain tasks are rarely blocked on submit. + let (task_tx, task_rx) = tokio::sync::mpsc::channel::(n * 2); + let task_rx = Arc::new(tokio::sync::Mutex::new(task_rx)); + + for i in 0..n { + let task_rx = task_rx.clone(); + let handle = handle.clone(); + thread::Builder::new() + .name(format!("iceberg-encode-{}", i)) + .spawn(move || encode_worker_loop(task_rx, handle)) + .expect("failed to spawn iceberg encode worker"); } + + GlobalWorkerPool { task_tx } }) } @@ -162,7 +273,7 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse; /// Store an error in the writer state (first error wins). -pub(crate) fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { +fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { let mut slot = writer_ref .writer_state .error @@ -173,68 +284,49 @@ pub(crate) fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow:: } } -/// Submit a `RecordBatch` for async Parquet encoding via `spawn_blocking`. +/// Store an error in the writer state (public for batch_builder module). +pub(crate) fn store_writer_error_pub(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { + store_writer_error(writer_ref, e); +} + +/// Submit a `RecordBatch` to the global encode pool. /// -/// A semaphore permit is acquired before the blocking task runs, bounding concurrent -/// encodes to N. The caller returns immediately; errors are surfaced at close time. +/// Increments the writer's pending count before sending and rolls it back on channel failure. pub(crate) fn submit_batch( writer_ref: &IcebergDataFileWriter, + pool: &GlobalWorkerPool, batch: RecordBatch, ) -> Result<(), anyhow::Error> { - let enc = match GLOBAL_ENCODE_STATE.get() { - Some(s) => s, - None => return Err(anyhow::anyhow!("encode state not initialized; call iceberg_writer_new first")), + writer_ref + .writer_state + .pending + .fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { + batch, + state: writer_ref.writer_state.clone(), }; - let state = writer_ref.writer_state.clone(); - let semaphore = enc.semaphore.clone(); - let handle = enc.handle.clone(); - state.pending.fetch_add(1, Ordering::AcqRel); - - handle.clone().spawn(async move { - // Acquire a permit before spawning — this is the backpressure mechanism. - let permit = match semaphore.acquire_owned().await { - Ok(p) => p, - Err(_) => { - let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(anyhow::anyhow!("encode semaphore closed unexpectedly")); } - drop(slot); - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { state.done_notify.notify_one(); } - return; + match pool.task_tx.blocking_send(task) { + Ok(()) => Ok(()), + Err(_) => { + let prev = writer_ref + .writer_state + .pending + .fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + writer_ref.writer_state.done_notify.notify_one(); } - }; - - let state2 = state.clone(); - let result = tokio::task::spawn_blocking(move || { - let _permit = permit; // released when blocking work completes - let mut guard = state2.writer.lock().unwrap_or_else(|e| e.into_inner()); - match guard.as_mut() { - Some(w) => handle - .block_on(w.write(batch)) - .map_err(|e| anyhow::anyhow!("write batch: {}", e)), - None => Err(anyhow::anyhow!("writer already closed")), - } - }) - .await; - - let err = match result { - Ok(Ok(())) => None, - Ok(Err(e)) => Some(e), - Err(join_err) => Some(anyhow::anyhow!("encode task failed: {}", join_err)), - }; - if let Some(e) = err { - let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(e); } + Err(anyhow::anyhow!("encode pool channel closed unexpectedly")) } - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { state.done_notify.notify_one(); } - }); - - Ok(()) + } } +/// Validates column count, converts each `ColumnDescriptor` into a single-slice `SliceRef`, +/// routes through `ColumnBatchBuilder`, and submits the resulting `RecordBatch` to the +/// encode pool. Using the builder here keeps all type-conversion and null-bit logic in one +/// place (`batch_builder.rs`) instead of duplicating it. unsafe fn write_columns_inner( writer_ref: &IcebergDataFileWriter, + pool: &GlobalWorkerPool, arrow_schema: ArrowSchemaRef, col_descs: &[ColumnDescriptor], ) -> Result<(), anyhow::Error> { @@ -259,7 +351,7 @@ unsafe fn write_columns_inner( }) .collect(); unsafe { builder.append_slice(&slices) }?; - builder.write_and_reset(writer_ref) + builder.write_and_reset(writer_ref, pool) } /// Synchronous write of flat column data: copies each column from Julia memory into @@ -278,9 +370,16 @@ pub extern "C" fn iceberg_writer_write_columns( return -1; } let writer_ref = unsafe { &*writer }; + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first"); + return -1; + } + }; let arrow_schema = writer_ref.arrow_schema.clone(); let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - if let Err(e) = unsafe { write_columns_inner(writer_ref, arrow_schema, col_descs) } { + if let Err(e) = unsafe { write_columns_inner(writer_ref, pool, arrow_schema, col_descs) } { store_writer_error(writer_ref, e); return -1; } @@ -371,8 +470,8 @@ export_runtime_op!( .map_err(|e| anyhow::anyhow!("Failed to convert schema to Arrow: {}", e))? ); - // Initialize global encode state (no-op if already done). - get_or_init_encode_state(); + // Initialize global pool (no-op if already running). + get_or_init_encode_pool(); let writer_state = Arc::new(WriterState { writer: Mutex::new(Some(concrete_writer)), @@ -407,6 +506,15 @@ pub extern "C" fn iceberg_writer_write( } let writer_ref = unsafe { &*writer }; + + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); + return -1; + } + }; + let ipc_bytes = unsafe { std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() }; let cursor = Cursor::new(ipc_bytes); @@ -426,7 +534,7 @@ pub extern "C" fn iceberg_writer_write( return -1; } }; - if let Err(e) = submit_batch(writer_ref, batch) { + if let Err(e) = submit_batch(writer_ref, pool, batch) { store_writer_error(writer_ref, e); return -1; } diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index e1e1c1e..762d7b3 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -38,7 +38,7 @@ export IcebergTimestampNs, IcebergTimestamptzNs export IcebergString, IcebergUuid, IcebergBinary, IcebergDecimal export Transaction, DataFiles, free_transaction!, free_data_files!, commit, transaction export FastAppendAction, free_fast_append_action!, add_data_files, apply, with_fast_append -export DataFileWriter, free_writer!, close_writer, write_columns +export DataFileWriter, free_writer!, close_writer, write_columns, set_encode_workers! export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD export ColumnDescriptor, ColumnBatch, ColumnType export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64 diff --git a/src/writer.jl b/src/writer.jl index e063fe3..6a913df 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -146,6 +146,23 @@ function get_column_metadata(table::Table)::Dict{Symbol, Vector{Pair{String, Str return colmeta end +""" + set_encode_workers!(n::Int) + +Set the number of threads in the global Parquet encode worker pool. + +Must be called before the first `DataFileWriter` is created (i.e. before the pool is +initialized). Throws if the pool is already running. Defaults to `Sys.CPU_THREADS` +if not set. +""" +function set_encode_workers!(n::Int) + n > 0 || throw(ArgumentError("n must be positive, got $n")) + ret = @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Int32 + ret == 0 || throw(IcebergException( + "set_encode_workers! must be called before creating any DataFileWriter" + )) + return nothing +end """ DataFileWriter(table::Table, config::WriterConfig) -> DataFileWriter From 5d89e00eca380ac70e43f5ae5a3292ec2e6118fd Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 20 May 2026 13:05:46 +0200 Subject: [PATCH 14/27] writer: per-writer queues + work-stealing encode pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the global MPMC channel + per-writer mutex with per-writer FIFO queues and a work-stealing worker pool. Workers scan the active-writer set, CAS a `busy` flag to claim a writer, then drain its queue. This removes the head-of-line blocking observed when multiple workers pulled tasks for the same writer and serialized on its mutex. Wakeup discipline: a single shared `Notify` plus a cascade — the worker that wins a claim notifies a peer before draining, so concurrent producer notifications that collapse to one stored permit still wake enough workers. After releasing `busy`, the worker re-checks `queue_len` and notifies again if a producer pushed during the release window. Add a `#[cfg(test)]` encode hook plus two unit tests: - fairness: 4 writers × 8 batches drain in round-robin, preserving per-writer FIFO. - stranded-task stress: 1.6k submits across 8 writers from 4 producer threads all reach pending=0. Co-Authored-By: Claude Opus 4.7 --- iceberg_rust_ffi/src/writer.rs | 689 ++++++++++++++++++++++++++++----- 1 file changed, 593 insertions(+), 96 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 9d0cb14..07a3cc1 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -1,15 +1,16 @@ /// Writer support for iceberg_rust_ffi /// /// Encoding is handled by a global pool of N=available_parallelism OS threads shared -/// across all writers. Per-writer ordering is guaranteed by the per-writer -/// `Arc>` inside WriterState: only one pool thread encodes -/// a given writer at a time, and the FIFO global queue ensures batches are submitted -/// in order. +/// across all writers. Each writer owns its own FIFO queue of pending batches; workers +/// scan the set of active writers and claim one (via the per-writer `busy` flag) before +/// draining its queue. This avoids the head-of-line blocking that the old single-MPMC +/// design suffered when many workers happened to pull tasks for the same writer. use std::any::Any; +use std::collections::VecDeque; use std::ffi::{c_char, c_void}; use std::io::Cursor; use std::panic::{catch_unwind, AssertUnwindSafe}; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::thread; @@ -105,23 +106,44 @@ use object_store_ffi::{ type ConcreteDataFileWriter = DataFileWriter; -/// Encode task submitted to the global worker pool. -pub(crate) struct EncodeTask { - batch: RecordBatch, - state: Arc, -} - -// Safety: RecordBatch is Send; WriterState fields are Send. -unsafe impl Send for EncodeTask {} - /// Shared mutable state for one IcebergDataFileWriter. /// Owned by the IcebergDataFileWriter and shared with pool workers via Arc. +/// +/// # Invariants +/// +/// 1. **Per-writer FIFO ordering.** Batches for a given writer are encoded in submission +/// order. Pushes go to the back of `pending_queue`; pops come from the front; and +/// `busy` ensures at most one worker drains the queue at a time — so the encode order +/// is identical to the push order. +/// 2. **Single-claim.** A worker may only encode for this writer while it has won the +/// `busy.compare_exchange(false → true)`. The claimed worker drains the queue to +/// empty (or until the writer reports an error) before releasing `busy`. +/// 3. **Stranded-task mitigation.** After releasing `busy`, the worker re-checks +/// `queue_len`; if non-zero (a producer pushed a batch between the worker's last +/// pop and the release), it notifies the global pool again. This prevents the +/// classic missed-notification race where a notification is consumed by a worker +/// that arrived between the producer's push and the queue's becoming non-empty. pub(crate) struct WriterState { - /// The underlying Parquet writer. Protected by a Mutex so pool workers can access it - /// concurrently (though at most one worker encodes a given writer at a time due to the - /// bounded per-writer channel). Set to None when the writer is closed or freed. + /// The underlying Parquet writer. Protected by a Mutex so pool workers can access it. + /// Only the worker that holds the `busy` claim ever locks this — so there's no real + /// contention here; the Mutex is preserved purely to coordinate with + /// `iceberg_writer_free`, which may take the writer out from under in-flight work. + /// Set to None when the writer is closed or freed. writer: Mutex>, - /// Number of encode tasks submitted to the pool but not yet completed. + /// FIFO queue of batches awaiting encode for this writer. + pending_queue: Mutex>, + /// Snapshot of `pending_queue.len()` exposed as an atomic so workers can skip writers + /// with no work without taking the queue lock. Kept in sync with the queue under the + /// queue lock by `submit_batch` (increments before notifying) and by workers (decrement + /// after popping). + queue_len: AtomicUsize, + /// Set to true by the worker currently encoding for this writer. Other workers skip + /// this writer while `busy` is true, even if `queue_len > 0`. + busy: AtomicBool, + /// True once this writer has been registered in `GlobalWorkerPool::active_writers`. + /// First submitter wins the CAS and performs the registration. + registered: AtomicBool, + /// Number of encode tasks submitted but not yet completed. Includes queued + in-flight. pending: AtomicUsize, /// Notified when `pending` drops to zero, so iceberg_writer_close can wait efficiently. done_notify: tokio::sync::Notify, @@ -133,13 +155,81 @@ pub(crate) struct WriterState { unsafe impl Send for WriterState {} unsafe impl Sync for WriterState {} +impl WriterState { + fn new(writer: ConcreteDataFileWriter) -> Self { + WriterState { + writer: Mutex::new(Some(writer)), + pending_queue: Mutex::new(VecDeque::new()), + queue_len: AtomicUsize::new(0), + busy: AtomicBool::new(false), + registered: AtomicBool::new(false), + pending: AtomicUsize::new(0), + done_notify: tokio::sync::Notify::new(), + error: Mutex::new(None), + } + } +} + /// Global pool of N=available_parallelism encode worker threads shared across all writers. +/// +/// Replaces the previous single-MPMC channel design. Each writer owns its own queue; +/// workers scan the active-writer list looking for a writer that (a) has queued work and +/// (b) is not currently claimed by another worker. The first such writer is claimed +/// (`busy = true`), drained, then released. +/// +/// # Wakeup discipline +/// +/// `wake` is a single shared `Notify` for the whole pool. Both producers (`submit_batch`) +/// and workers (after releasing a writer that still has work) call `wake.notify_one()`. +/// To avoid stranded tasks when multiple producers fire concurrently and only one permit +/// can be stored, a worker that successfully claims a writer cascades the wakeup by +/// calling `wake.notify_one()` before draining — so if more writers have work, another +/// worker is roused to look. pub(crate) struct GlobalWorkerPool { - pub(crate) task_tx: tokio::sync::mpsc::Sender, + /// Currently-registered writers. Workers iterate this on each pass looking for work. + /// Locked only briefly to snapshot the list (Arc clones); never held during encode. + active_writers: Mutex>>, + /// Wakeup channel for idle workers. Producers and finishing workers notify; idle + /// workers wait. See struct doc for the cascade discipline that prevents lost wakeups. + wake: tokio::sync::Notify, + /// Rotating start offset for the per-pass scan, so workers don't all collide on + /// writer 0 when several writers have work. + scan_offset: AtomicUsize, } pub(crate) static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); +impl GlobalWorkerPool { + /// Add a writer to the active set. Idempotent via the `registered` CAS on WriterState. + fn register(&self, state: &Arc) { + if state + .registered + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + let mut guard = self.active_writers.lock().unwrap_or_else(|e| e.into_inner()); + guard.push(state.clone()); + } + } + + /// Remove a writer from the active set. Called on free; idempotent. + fn unregister(&self, state: &WriterState) { + if !state.registered.swap(false, Ordering::AcqRel) { + return; + } + let mut guard = self.active_writers.lock().unwrap_or_else(|e| e.into_inner()); + let target = state as *const WriterState; + guard.retain(|s| Arc::as_ptr(s) != target); + } + + /// Snapshot the active-writers list. Returns Arc clones so subsequent encoding does + /// not hold the list lock. + fn snapshot(&self) -> Vec> { + let guard = self.active_writers.lock().unwrap_or_else(|e| e.into_inner()); + guard.clone() + } +} + /// Formats a Rust panic payload into an anyhow error, preserving the message where possible. fn format_panic_error(panic: Box) -> anyhow::Error { let msg = if let Some(s) = panic.downcast_ref::<&str>() { @@ -152,54 +242,168 @@ fn format_panic_error(panic: Box) -> anyhow::Error { anyhow::anyhow!(msg) } -/// Body of each encode worker thread: receives tasks from the shared channel and encodes them. -fn encode_worker_loop( - task_rx: Arc>>, - handle: tokio::runtime::Handle, +/// Try to claim a writer with pending work. Returns the claimed writer (busy=true) or +/// None if no writer has work available right now. +/// +/// Scans the active-writers snapshot starting at a rotating offset so workers don't all +/// race for writer 0. +fn try_claim_writer(pool: &GlobalWorkerPool) -> Option> { + let writers = pool.snapshot(); + if writers.is_empty() { + return None; + } + let n = writers.len(); + let start = pool.scan_offset.fetch_add(1, Ordering::Relaxed) % n; + for i in 0..n { + let w = &writers[(start + i) % n]; + if w.queue_len.load(Ordering::Acquire) == 0 { + continue; + } + if w + .busy + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + // Re-check after winning the claim: another worker may have drained the + // queue between our queue_len load and our CAS. If so, release and skip. + if w.queue_len.load(Ordering::Acquire) == 0 { + w.busy.store(false, Ordering::Release); + continue; + } + return Some(w.clone()); + } + } + None +} + +/// Encode a single batch for the given (already-claimed) writer. Stores any encode error +/// in `state.error` (first-writer-wins) and always decrements `pending` exactly once. +fn encode_one_batch( + state: &Arc, + batch: RecordBatch, + handle: &tokio::runtime::Handle, ) { + // Test hook: bypass the real Parquet write so we can exercise the dispatch logic in + // isolation. Enabled only when a test installs a positive delay via `test_hooks`. + #[cfg(test)] + { + let delay_ms = test_hooks::DELAY_MS.load(Ordering::Relaxed); + if delay_ms > 0 { + test_hooks::run_hook(state, &batch); + std::thread::sleep(std::time::Duration::from_millis(delay_ms)); + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + state.done_notify.notify_one(); + } + return; + } + } + + let state_for_panic = state.clone(); + let handle_enc = handle.clone(); + let state_for_encode = state.clone(); + let encode_result = catch_unwind(AssertUnwindSafe(move || { + let mut guard = state_for_encode + .writer + .lock() + .unwrap_or_else(|e| e.into_inner()); + match guard.as_mut() { + Some(w) => handle_enc + .block_on(w.write(batch)) + .map_err(|e| anyhow::anyhow!("write batch: {}", e)), + None => Err(anyhow::anyhow!("writer already closed")), + } + })); + + let err = match encode_result { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(format_panic_error(panic)), + }; + if let Some(e) = err { + let mut slot = state_for_panic + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } + } + + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + state.done_notify.notify_one(); + } +} + +/// Drain the claimed writer's queue while we hold `busy`. Pops one batch at a time and +/// encodes it. The `busy` flag ensures FIFO per-writer ordering: while we hold it, no +/// other worker can interleave a pop on this writer's queue. +fn drain_claimed_writer(state: &Arc, handle: &tokio::runtime::Handle) { loop { - // Acquire the shared receiver lock, then wait for a task. - // The lock is released as soon as recv() returns, so workers are not serialized - // during encoding — only during task pickup. - let task = { - let mut rx = handle.block_on(task_rx.lock()); - match handle.block_on(rx.recv()) { - Some(t) => t, - None => break, // sender dropped → pool shutting down + let batch = { + let mut q = state + .pending_queue + .lock() + .unwrap_or_else(|e| e.into_inner()); + match q.pop_front() { + Some(b) => { + state.queue_len.fetch_sub(1, Ordering::AcqRel); + b + } + None => break, } }; + encode_one_batch(state, batch, handle); + } +} - // Clone state before moving task into the closure so we can always decrement - // pending even if the closure panics. - let state = task.state.clone(); - let handle_enc = handle.clone(); - let encode_result = catch_unwind(AssertUnwindSafe(move || { - let mut guard = task.state.writer.lock().unwrap_or_else(|e| e.into_inner()); - match guard.as_mut() { - Some(w) => handle_enc - .block_on(w.write(task.batch)) - .map_err(|e| anyhow::anyhow!("write batch: {}", e)), - None => Err(anyhow::anyhow!("writer already closed")), - } - })); +/// Worker thread body: scan for a writer with work, claim it, drain its queue, release, +/// re-check (stranded-task mitigation), and either continue or wait for a wake-up. +/// +/// The wake-up protocol uses a single shared `Notify` with a cascade discipline. See the +/// docs on `GlobalWorkerPool` for the full picture; the key races are: +/// +/// - **Producer notification lost.** Tokio's `Notify` stores at most one permit, so if +/// two producers fire `notify_one()` while all workers are sleeping, only one worker +/// wakes. To prevent the second producer's work from being stranded, the woken worker +/// calls `wake.notify_one()` *before* it starts draining — cascading the wakeup so +/// another worker checks the remaining writers. +/// - **Push between last-pop and busy-release.** A producer pushes a batch after the +/// drain loop sees an empty queue but before this worker clears `busy`. The producer's +/// `notify_one()` may have been consumed by some other worker that ran an empty scan +/// and went back to sleep. Mitigation: after clearing `busy`, this worker re-reads +/// `queue_len`; if non-zero, it notifies again so someone re-claims the writer. +fn encode_worker_loop(pool: &'static GlobalWorkerPool, handle: tokio::runtime::Handle) { + loop { + // Pre-register interest in the next wake-up. `enable()` guarantees that any + // `notify_one()` issued from this point on will wake the future even if it + // hasn't been polled yet — so the check-then-wait below is race-free. + let notified = pool.wake.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); - let err = match encode_result { - Ok(Ok(())) => None, - Ok(Err(e)) => Some(e), - Err(panic) => Some(format_panic_error(panic)), - }; - if let Some(e) = err { - let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); + if let Some(state) = try_claim_writer(pool) { + // Cascade: another writer may also have work. Wake a peer to look in + // parallel before we commit to draining this one. + pool.wake.notify_one(); + + drain_claimed_writer(&state, &handle); + + // Release the claim. After this point another worker is free to claim + // the writer. + state.busy.store(false, Ordering::Release); + + // Stranded-task mitigation: a producer may have pushed between our last + // pop and our release. If so, ensure a worker is woken to handle it. + if state.queue_len.load(Ordering::Acquire) > 0 { + pool.wake.notify_one(); } + continue; } - // Always decrement pending; notify close() if this was the last task. - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); - } + // Nothing to claim — go to sleep until notified. + handle.block_on(notified); } } @@ -224,7 +428,8 @@ pub extern "C" fn iceberg_set_encode_workers(n: i32) -> i32 { /// Initialize the global encode pool on first call. /// Must be called from within a Tokio runtime (iceberg_writer_new satisfies this). fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { - GLOBAL_ENCODE_POOL.get_or_init(|| { + static INIT: std::sync::Once = std::sync::Once::new(); + INIT.call_once(|| { let configured = ENCODE_WORKERS.load(Ordering::Relaxed); let n = if configured > 0 { configured @@ -234,21 +439,31 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { thread::available_parallelism().unwrap().get() }; let handle = tokio::runtime::Handle::current(); - // Buffer 2× workers — drain tasks are rarely blocked on submit. - let (task_tx, task_rx) = tokio::sync::mpsc::channel::(n * 2); - let task_rx = Arc::new(tokio::sync::Mutex::new(task_rx)); + + // Install the pool first so workers can reference it as `&'static`. + GLOBAL_ENCODE_POOL + .set(GlobalWorkerPool { + active_writers: Mutex::new(Vec::new()), + wake: tokio::sync::Notify::new(), + scan_offset: AtomicUsize::new(0), + }) + .ok() + .expect("encode pool initialized twice"); + let pool_ref: &'static GlobalWorkerPool = GLOBAL_ENCODE_POOL + .get() + .expect("pool was just installed"); for i in 0..n { - let task_rx = task_rx.clone(); let handle = handle.clone(); thread::Builder::new() .name(format!("iceberg-encode-{}", i)) - .spawn(move || encode_worker_loop(task_rx, handle)) + .spawn(move || encode_worker_loop(pool_ref, handle)) .expect("failed to spawn iceberg encode worker"); } - - GlobalWorkerPool { task_tx } - }) + }); + GLOBAL_ENCODE_POOL + .get() + .expect("encode pool not installed by INIT") } /// Opaque writer handle for FFI. @@ -289,35 +504,40 @@ pub(crate) fn store_writer_error_pub(writer_ref: &IcebergDataFileWriter, e: anyh store_writer_error(writer_ref, e); } -/// Submit a `RecordBatch` to the global encode pool. +/// Submit a `RecordBatch` to the writer's queue. Lazily registers the writer with the +/// global pool on first submit, then pushes onto the per-writer FIFO queue and notifies +/// the pool that there is work available somewhere. /// -/// Increments the writer's pending count before sending and rolls it back on channel failure. +/// `pending` (queued + in-flight) is incremented under the queue lock so that +/// `iceberg_writer_close` sees a consistent count. pub(crate) fn submit_batch( writer_ref: &IcebergDataFileWriter, pool: &GlobalWorkerPool, batch: RecordBatch, ) -> Result<(), anyhow::Error> { - writer_ref - .writer_state - .pending - .fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { - batch, - state: writer_ref.writer_state.clone(), - }; - match pool.task_tx.blocking_send(task) { - Ok(()) => Ok(()), - Err(_) => { - let prev = writer_ref - .writer_state - .pending - .fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - writer_ref.writer_state.done_notify.notify_one(); - } - Err(anyhow::anyhow!("encode pool channel closed unexpectedly")) - } + // Idempotent — only the first submit pays the lock to push into active_writers. + pool.register(&writer_ref.writer_state); + + { + let mut q = writer_ref + .writer_state + .pending_queue + .lock() + .unwrap_or_else(|e| e.into_inner()); + q.push_back(batch); + // Increment counters under the lock so queue and counters stay consistent. + writer_ref + .writer_state + .queue_len + .fetch_add(1, Ordering::AcqRel); + writer_ref + .writer_state + .pending + .fetch_add(1, Ordering::AcqRel); } + + pool.wake.notify_one(); + Ok(()) } /// Validates column count, converts each `ColumnDescriptor` into a single-slice `SliceRef`, @@ -386,12 +606,17 @@ pub extern "C" fn iceberg_writer_write_columns( 0 } -/// Free a writer. Poisons the writer state so any in-flight pool tasks fail gracefully. +/// Free a writer. Poisons the writer state so any in-flight pool tasks fail gracefully, +/// and unregisters the writer from the global pool's active-writers list so workers stop +/// scanning it. #[no_mangle] pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { if !writer.is_null() { unsafe { let boxed = Box::from_raw(writer); + if let Some(pool) = GLOBAL_ENCODE_POOL.get() { + pool.unregister(&boxed.writer_state); + } // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error // rather than writing to a partially-freed writer. let _ = boxed.writer_state.writer.lock().unwrap().take(); @@ -473,12 +698,7 @@ export_runtime_op!( // Initialize global pool (no-op if already running). get_or_init_encode_pool(); - let writer_state = Arc::new(WriterState { - writer: Mutex::new(Some(concrete_writer)), - pending: AtomicUsize::new(0), - done_notify: tokio::sync::Notify::new(), - error: Mutex::new(None), - }); + let writer_state = Arc::new(WriterState::new(concrete_writer)); Ok::(IcebergDataFileWriter { arrow_schema, @@ -606,3 +826,280 @@ export_runtime_op!( }, writer: *mut IcebergDataFileWriter ); + +// ───────────────────────────────────────────────────────────────────────────── +// Test hooks + dispatch tests +// ───────────────────────────────────────────────────────────────────────────── + +#[cfg(test)] +pub(crate) mod test_hooks { + use std::sync::atomic::AtomicU64; + use std::sync::Mutex; + + use arrow_array::{Array, Int64Array, RecordBatch}; + + use super::WriterState; + use std::sync::Arc; + + /// When non-zero, `encode_one_batch` skips the real Parquet write, sleeps this many + /// milliseconds, and records the completion. Used by dispatch-logic tests. + pub(crate) static DELAY_MS: AtomicU64 = AtomicU64::new(0); + + /// Recorded `(writer_id, batch_id)` for each completed encode while `DELAY_MS > 0`. + /// `writer_id` is the `Arc` pointer cast to usize. `batch_id` is read + /// from the batch's first column (assumed to be an Int64Array of length 1). + pub(crate) static COMPLETIONS: Mutex> = Mutex::new(Vec::new()); + + pub(crate) fn run_hook(state: &Arc, batch: &RecordBatch) { + let id = batch + .column(0) + .as_any() + .downcast_ref::() + .map(|a| a.value(0)) + .unwrap_or(-1); + let writer_id = Arc::as_ptr(state) as usize; + COMPLETIONS.lock().unwrap().push((writer_id, id)); + } + + pub(crate) fn reset() { + DELAY_MS.store(0, std::sync::atomic::Ordering::Relaxed); + COMPLETIONS.lock().unwrap().clear(); + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::atomic::Ordering; + use std::sync::{Arc, Mutex}; + use std::time::Duration; + + use arrow_array::{Int64Array, RecordBatch}; + use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; + + use super::*; + + /// Serializes all dispatch tests so they don't trample the shared global pool + /// state (`DELAY_MS`, `COMPLETIONS`, `active_writers`). + static TEST_SERIAL: Mutex<()> = Mutex::new(()); + + /// A long-lived multi-threaded runtime that the global encode pool can pin its + /// `Handle` to across tests. `#[tokio::test]` builds a fresh runtime per test and + /// drops it at end, which would invalidate the workers' handles. + fn pinned_runtime() -> &'static tokio::runtime::Runtime { + static RT: std::sync::OnceLock = std::sync::OnceLock::new(); + RT.get_or_init(|| { + tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_all() + .build() + .unwrap() + }) + } + + fn batch_with_id(id: i64) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int64, + false, + )])); + RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![id]))]).unwrap() + } + + /// Constructs a WriterState with no underlying Parquet writer. Safe because the + /// test hook bypasses the real `w.write(batch)` path. + fn mock_writer_state() -> Arc { + Arc::new(WriterState { + writer: Mutex::new(None), + pending_queue: Mutex::new(std::collections::VecDeque::new()), + queue_len: std::sync::atomic::AtomicUsize::new(0), + busy: std::sync::atomic::AtomicBool::new(false), + registered: std::sync::atomic::AtomicBool::new(false), + pending: std::sync::atomic::AtomicUsize::new(0), + done_notify: tokio::sync::Notify::new(), + error: Mutex::new(None), + }) + } + + /// Push a batch onto a WriterState's queue, registering with the pool and waking it. + /// Mirrors `submit_batch` but takes a bare WriterState (so tests can use mock states + /// without going through IcebergDataFileWriter). + fn push(pool: &GlobalWorkerPool, state: &Arc, batch: RecordBatch) { + pool.register(state); + { + let mut q = state.pending_queue.lock().unwrap(); + q.push_back(batch); + state.queue_len.fetch_add(1, Ordering::AcqRel); + state.pending.fetch_add(1, Ordering::AcqRel); + } + pool.wake.notify_one(); + } + + fn wait_for_pending_zero(state: &WriterState, timeout: Duration) -> bool { + let start = std::time::Instant::now(); + while state.pending.load(Ordering::Acquire) > 0 { + if start.elapsed() > timeout { + return false; + } + std::thread::sleep(Duration::from_millis(5)); + } + true + } + + /// Initializes the global encode pool from inside the pinned runtime. Safe to call + /// many times — only the first call does any work. + fn ensure_pool() -> &'static GlobalWorkerPool { + let _g = pinned_runtime().enter(); + get_or_init_encode_pool() + } + + /// Detach any mock writers we registered with the pool so a subsequent test starts + /// from a clean active-writer list. Doesn't shut down the workers (they're shared). + fn cleanup_writers(pool: &GlobalWorkerPool, states: &[Arc]) { + for s in states { + pool.unregister(s); + } + } + + /// Fairness: with 4 writers each holding 8 queued batches and N>=4 workers, the new + /// dispatch should drain all 4 writers in parallel rather than serializing one + /// writer at a time. + /// + /// We assert two properties: + /// 1. Per-writer FIFO: each writer's batches complete in submission order. + /// 2. Parallelism: within any group of 4 consecutive completions, all 4 writers + /// appear — i.e., a round-robin pattern emerges naturally because each writer + /// is being drained by its own worker, all sleeping for the same delay. + #[test] + fn fairness_drains_writers_in_parallel() { + let _serial = TEST_SERIAL.lock().unwrap(); + let pool = ensure_pool(); + + test_hooks::reset(); + test_hooks::DELAY_MS.store(20, Ordering::Relaxed); + + let writers: Vec> = (0..4).map(|_| mock_writer_state()).collect(); + let writer_ids: HashMap = writers + .iter() + .enumerate() + .map(|(i, s)| (Arc::as_ptr(s) as usize, i)) + .collect(); + + // Submit interleaved: round 0 of every writer, then round 1, etc. + for round in 0..8i64 { + for (i, w) in writers.iter().enumerate() { + let batch_id = (i as i64) * 100 + round; + push(pool, w, batch_with_id(batch_id)); + } + } + + for w in &writers { + assert!( + wait_for_pending_zero(w, Duration::from_secs(10)), + "writer did not drain in time" + ); + } + + let completions = test_hooks::COMPLETIONS.lock().unwrap().clone(); + // 4 writers × 8 batches = 32 completions. + assert_eq!(completions.len(), 32); + + // (1) FIFO per writer: filter completions by writer and check batch IDs ascend. + for (i, w) in writers.iter().enumerate() { + let id = Arc::as_ptr(w) as usize; + let ids: Vec = completions + .iter() + .filter(|(wid, _)| *wid == id) + .map(|(_, bid)| *bid) + .collect(); + assert_eq!(ids.len(), 8, "writer {} missing batches", i); + for j in 0..8 { + assert_eq!( + ids[j], + (i as i64) * 100 + j as i64, + "writer {} batch {} out of order: {:?}", + i, + j, + ids + ); + } + } + + // (2) Parallelism: each group of 4 consecutive completions should contain 4 + // distinct writers. With <4 workers in the pool this would fail; on any modern + // dev machine `available_parallelism() >= 4`. + for chunk in completions.chunks(4) { + let distinct: std::collections::HashSet = chunk + .iter() + .map(|(wid, _)| writer_ids[wid]) + .collect(); + assert_eq!( + distinct.len(), + 4, + "expected 4 distinct writers per round, got {:?}", + chunk + ); + } + + cleanup_writers(pool, &writers); + test_hooks::reset(); + } + + /// Stranded-task race: hammer the pool with many submits across many writers and + /// verify that every submitted batch is eventually drained — i.e., `pending` always + /// converges to zero, no batch sits forever in a per-writer queue because of a + /// missed wake-up. + #[test] + fn no_stranded_tasks_under_load() { + let _serial = TEST_SERIAL.lock().unwrap(); + let pool = ensure_pool(); + + test_hooks::reset(); + // Tiny delay (1ms) so a) the test runs fast, b) producers and drains + // genuinely race rather than one always preceding the other. + test_hooks::DELAY_MS.store(1, Ordering::Relaxed); + + const WRITERS: usize = 8; + const BATCHES_PER_WRITER: usize = 200; + let writers: Vec> = (0..WRITERS).map(|_| mock_writer_state()).collect(); + + // Drive submissions from several threads to maximize interleaving. + let mut handles = Vec::new(); + for tid in 0..4 { + let writers = writers.clone(); + let pool: &'static GlobalWorkerPool = pool; + handles.push(std::thread::spawn(move || { + for batch_idx in 0..(BATCHES_PER_WRITER / 4) { + for (wi, w) in writers.iter().enumerate() { + let id = (tid as i64) * 1_000_000 + + (wi as i64) * 10_000 + + batch_idx as i64; + push(pool, w, batch_with_id(id)); + } + } + })); + } + for h in handles { + h.join().unwrap(); + } + + // Wait for every writer's pending to drop to zero. If any single writer's queue + // is stranded, this would time out. + for (i, w) in writers.iter().enumerate() { + assert!( + wait_for_pending_zero(w, Duration::from_secs(30)), + "writer {} did not drain; pending={} queue_len={}", + i, + w.pending.load(Ordering::Acquire), + w.queue_len.load(Ordering::Acquire), + ); + assert_eq!(w.queue_len.load(Ordering::Acquire), 0); + } + + let total = test_hooks::COMPLETIONS.lock().unwrap().len(); + assert_eq!(total, WRITERS * BATCHES_PER_WRITER); + + cleanup_writers(pool, &writers); + test_hooks::reset(); + } +} From e75ad287208629360d7dd60cc4468bf0e0a62b5c Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 20 May 2026 14:56:18 +0200 Subject: [PATCH 15/27] writer: add LZ4_RAW compression codec The existing LZ4 codec maps to Compression::LZ4, the deprecated Hadoop-framed variant whose per-page framing overhead is the usual reason LZ4 underperforms Snappy in parquet benchmarks. Add LZ4_RAW = 5 as a separate FFI enum value mapping to Compression::LZ4_RAW (modern raw blocks, parquet spec codec ID 7). LZ4 = 3 keeps its current behavior for backward compatibility. Co-Authored-By: Claude Opus 4.7 --- iceberg_rust_ffi/src/writer.rs | 2 ++ src/RustyIceberg.jl | 2 +- src/writer.jl | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 07a3cc1..6a74315 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -35,6 +35,7 @@ const COMPRESSION_SNAPPY: i32 = 1; const COMPRESSION_GZIP: i32 = 2; const COMPRESSION_LZ4: i32 = 3; const COMPRESSION_ZSTD: i32 = 4; +const COMPRESSION_LZ4_RAW: i32 = 5; fn compression_from_code(code: i32) -> Compression { match code { @@ -43,6 +44,7 @@ fn compression_from_code(code: i32) -> Compression { COMPRESSION_GZIP => Compression::GZIP(Default::default()), COMPRESSION_LZ4 => Compression::LZ4, COMPRESSION_ZSTD => Compression::ZSTD(Default::default()), + COMPRESSION_LZ4_RAW => Compression::LZ4_RAW, _ => Compression::SNAPPY, } } diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index 762d7b3..979b35b 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -39,7 +39,7 @@ export IcebergString, IcebergUuid, IcebergBinary, IcebergDecimal export Transaction, DataFiles, free_transaction!, free_data_files!, commit, transaction export FastAppendAction, free_fast_append_action!, add_data_files, apply, with_fast_append export DataFileWriter, free_writer!, close_writer, write_columns, set_encode_workers! -export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD +export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD, LZ4_RAW export ColumnDescriptor, ColumnBatch, ColumnType export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64 export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID diff --git a/src/writer.jl b/src/writer.jl index 6a913df..18fd94b 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -18,8 +18,11 @@ Compression codec for Parquet files. - `UNCOMPRESSED`: No compression - `SNAPPY`: Snappy compression (fast, moderate compression) - `GZIP`: Gzip compression (slower, better compression) -- `LZ4`: LZ4 compression (very fast, lower compression) +- `LZ4`: LZ4 compression, legacy Hadoop-framed variant (deprecated in the parquet spec; kept for + backward compatibility — prefer `LZ4_RAW`) - `ZSTD`: Zstandard compression (good balance of speed and compression) +- `LZ4_RAW`: LZ4 compression, raw blocks with no framing overhead (modern parquet variant; faster + than `LZ4`) """ @enum CompressionCodec begin UNCOMPRESSED = 0 @@ -27,6 +30,7 @@ Compression codec for Parquet files. GZIP = 2 LZ4 = 3 ZSTD = 4 + LZ4_RAW = 5 end """ From dd04750e2d94bde1dc49c11c006bbea1f28cf50d Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 20 May 2026 14:57:19 +0200 Subject: [PATCH 16/27] gitignore: ignore .vscode/settings.json Co-Authored-By: Claude Opus 4.7 --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9080477..9fc7ff4 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ iceberg_rust_ffi/integration_test **/.claude **/.DS_Store Manifest.toml -LocalPreferences.toml \ No newline at end of file +LocalPreferences.toml +.vscode/settings.json \ No newline at end of file From 67723b632920f9abd508cd00df2e563f2d9baed2 Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Fri, 22 May 2026 08:57:27 +0200 Subject: [PATCH 17/27] writer: run encode workers as async tokio tasks instead of OS threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the OS encode threads each calling handle.block_on(w.write(batch)) with N async tasks spawned on the existing runtime. The runtime thread is freed during the I/O await inside w.write() and can pick up another writer's encode task in the meantime — hiding S3 PUT latency across writers without changing per-writer FIFO ordering. Adds WriterState::poisoned so iceberg_writer_free can signal in-flight encodes (which now release the writer mutex during .await) to drop rather than restore the writer. Replaces the old catch_unwind bookkeeping with a PendingGuard Drop impl that decrements pending exactly once on normal return or panic. Co-Authored-By: Claude Opus 4.7 --- iceberg_rust_ffi/src/writer.rs | 151 ++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 67 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 6a74315..4032cf4 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -5,11 +5,9 @@ /// scan the set of active writers and claim one (via the per-writer `busy` flag) before /// draining its queue. This avoids the head-of-line blocking that the old single-MPMC /// design suffered when many workers happened to pull tasks for the same writer. -use std::any::Any; use std::collections::VecDeque; use std::ffi::{c_char, c_void}; use std::io::Cursor; -use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::thread; @@ -151,6 +149,11 @@ pub(crate) struct WriterState { done_notify: tokio::sync::Notify, /// First encode error encountered by a pool worker, if any. error: Mutex>, + /// Set by iceberg_writer_free to tell in-flight async encodes to drop the writer + /// instead of putting it back. Needed because async encode takes the writer out of + /// the Option for the duration of `w.write(batch).await`, releasing the Mutex so the + /// runtime thread can drive other tasks while parked on I/O. + poisoned: AtomicBool, } // Safety: ConcreteDataFileWriter is Send (verified by its use in spawn_blocking previously). @@ -168,6 +171,7 @@ impl WriterState { pending: AtomicUsize::new(0), done_notify: tokio::sync::Notify::new(), error: Mutex::new(None), + poisoned: AtomicBool::new(false), } } } @@ -232,18 +236,6 @@ impl GlobalWorkerPool { } } -/// Formats a Rust panic payload into an anyhow error, preserving the message where possible. -fn format_panic_error(panic: Box) -> anyhow::Error { - let msg = if let Some(s) = panic.downcast_ref::<&str>() { - format!("encode worker panicked: {}", s) - } else if let Some(s) = panic.downcast_ref::() { - format!("encode worker panicked: {}", s) - } else { - "encode worker panicked (no string payload)".to_string() - }; - anyhow::anyhow!(msg) -} - /// Try to claim a writer with pending work. Returns the claimed writer (busy=true) or /// None if no writer has work available right now. /// @@ -278,70 +270,86 @@ fn try_claim_writer(pool: &GlobalWorkerPool) -> Option> { None } -/// Encode a single batch for the given (already-claimed) writer. Stores any encode error -/// in `state.error` (first-writer-wins) and always decrements `pending` exactly once. -fn encode_one_batch( - state: &Arc, - batch: RecordBatch, - handle: &tokio::runtime::Handle, -) { +/// Drop guard that decrements `pending` exactly once. Used by the async encode path to +/// guarantee the counter still falls to zero (so `iceberg_writer_close` can return) even +/// if `w.write(...).await` panics and unwinds the task. +struct PendingGuard(Arc); + +impl Drop for PendingGuard { + fn drop(&mut self) { + let prev = self.0.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + self.0.done_notify.notify_one(); + } + } +} + +/// Encode a single batch for the given (already-claimed) writer. +/// +/// **Async**: takes the underlying writer out of `state.writer` under the std Mutex +/// (briefly), awaits `w.write(batch)` without holding any sync lock, then puts the writer +/// back — unless `iceberg_writer_free` has set `poisoned`, in which case the writer is +/// dropped. The take-put pattern is what lets the runtime thread go drive other tasks +/// while this one is parked in an S3 PUT inside `w.write()`. +/// +/// Stores any encode error in `state.error` (first-writer-wins). `pending` is decremented +/// exactly once via the `PendingGuard` Drop impl, so close() never hangs even on panic. +async fn encode_one_batch(state: Arc, batch: RecordBatch) { + let _pending = PendingGuard(state.clone()); + // Test hook: bypass the real Parquet write so we can exercise the dispatch logic in // isolation. Enabled only when a test installs a positive delay via `test_hooks`. #[cfg(test)] { let delay_ms = test_hooks::DELAY_MS.load(Ordering::Relaxed); if delay_ms > 0 { - test_hooks::run_hook(state, &batch); - std::thread::sleep(std::time::Duration::from_millis(delay_ms)); - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); - } + test_hooks::run_hook(&state, &batch); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; return; } } - let state_for_panic = state.clone(); - let handle_enc = handle.clone(); - let state_for_encode = state.clone(); - let encode_result = catch_unwind(AssertUnwindSafe(move || { - let mut guard = state_for_encode - .writer - .lock() - .unwrap_or_else(|e| e.into_inner()); - match guard.as_mut() { - Some(w) => handle_enc - .block_on(w.write(batch)) - .map_err(|e| anyhow::anyhow!("write batch: {}", e)), - None => Err(anyhow::anyhow!("writer already closed")), - } - })); + // Take the writer out under the std Mutex. If the writer was already poisoned by + // a prior free(), we want to drop any writer that's still in the slot — but the + // Some/None of the slot itself tells us that. + let writer_opt = { + let mut guard = state.writer.lock().unwrap_or_else(|e| e.into_inner()); + guard.take() + }; - let err = match encode_result { - Ok(Ok(())) => None, - Ok(Err(e)) => Some(e), - Err(panic) => Some(format_panic_error(panic)), + let (mut writer_opt, result) = match writer_opt { + Some(mut w) => { + let r = w + .write(batch) + .await + .map_err(|e| anyhow::anyhow!("write batch: {}", e)); + (Some(w), r) + } + None => (None, Err(anyhow::anyhow!("writer already closed"))), }; - if let Some(e) = err { - let mut slot = state_for_panic - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); + + // Put the writer back unless free() ran during our .await — in which case + // `poisoned` is set and we drop the writer to honor the poison semantic. + if let Some(w) = writer_opt.take() { + if state.poisoned.load(Ordering::Acquire) { + drop(w); + } else { + *state.writer.lock().unwrap_or_else(|e| e.into_inner()) = Some(w); } } - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); + if let Err(e) = result { + let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } } } /// Drain the claimed writer's queue while we hold `busy`. Pops one batch at a time and /// encodes it. The `busy` flag ensures FIFO per-writer ordering: while we hold it, no /// other worker can interleave a pop on this writer's queue. -fn drain_claimed_writer(state: &Arc, handle: &tokio::runtime::Handle) { +async fn drain_claimed_writer(state: Arc) { loop { let batch = { let mut q = state @@ -356,7 +364,7 @@ fn drain_claimed_writer(state: &Arc, handle: &tokio::runtime::Handl None => break, } }; - encode_one_batch(state, batch, handle); + encode_one_batch(state.clone(), batch).await; } } @@ -376,7 +384,7 @@ fn drain_claimed_writer(state: &Arc, handle: &tokio::runtime::Handl /// `notify_one()` may have been consumed by some other worker that ran an empty scan /// and went back to sleep. Mitigation: after clearing `busy`, this worker re-reads /// `queue_len`; if non-zero, it notifies again so someone re-claims the writer. -fn encode_worker_loop(pool: &'static GlobalWorkerPool, handle: tokio::runtime::Handle) { +async fn encode_worker_loop(pool: &'static GlobalWorkerPool) { loop { // Pre-register interest in the next wake-up. `enable()` guarantees that any // `notify_one()` issued from this point on will wake the future even if it @@ -390,7 +398,7 @@ fn encode_worker_loop(pool: &'static GlobalWorkerPool, handle: tokio::runtime::H // parallel before we commit to draining this one. pool.wake.notify_one(); - drain_claimed_writer(&state, &handle); + drain_claimed_writer(state.clone()).await; // Release the claim. After this point another worker is free to claim // the writer. @@ -405,7 +413,7 @@ fn encode_worker_loop(pool: &'static GlobalWorkerPool, handle: tokio::runtime::H } // Nothing to claim — go to sleep until notified. - handle.block_on(notified); + notified.await; } } @@ -455,12 +463,13 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { .get() .expect("pool was just installed"); - for i in 0..n { - let handle = handle.clone(); - thread::Builder::new() - .name(format!("iceberg-encode-{}", i)) - .spawn(move || encode_worker_loop(pool_ref, handle)) - .expect("failed to spawn iceberg encode worker"); + // Spawn N async worker tasks on the tokio runtime. Each task runs + // `encode_worker_loop`, which awaits at I/O boundaries inside `w.write()` — + // freeing the runtime thread to drive other tasks (other writers' encodes) + // during S3 PUTs. Number of in-flight encodes is no longer bounded by OS + // thread count, only by core count for actual CPU work. + for _ in 0..n { + handle.spawn(encode_worker_loop(pool_ref)); } }); GLOBAL_ENCODE_POOL @@ -619,6 +628,13 @@ pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { if let Some(pool) = GLOBAL_ENCODE_POOL.get() { pool.unregister(&boxed.writer_state); } + // Set the poison flag BEFORE taking the writer out, so any encode task that + // currently holds the writer outside the Mutex (across its `.await`) will see + // `poisoned == true` when it goes to put the writer back, and drop it instead. + boxed + .writer_state + .poisoned + .store(true, Ordering::Release); // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error // rather than writing to a partially-freed writer. let _ = boxed.writer_state.writer.lock().unwrap().take(); @@ -920,6 +936,7 @@ mod tests { pending: std::sync::atomic::AtomicUsize::new(0), done_notify: tokio::sync::Notify::new(), error: Mutex::new(None), + poisoned: std::sync::atomic::AtomicBool::new(false), }) } From e2508058d0801fdbb625cd1b090b6260be5f21ae Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Fri, 22 May 2026 11:48:27 +0200 Subject: [PATCH 18/27] writer: default encode worker count to 2 * available_parallelism MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the conversion to async worker tasks, workers parked on S3 PUTs no longer consume runtime threads. Defaulting to one task per core left roughly half the cores idle while tasks awaited I/O. Oversubscribing by 2x lets the runtime keep cores busy with encode CPU from one task while others are parked on I/O — measured ~7% throughput improvement on an EBS-capped local benchmark; expected to be larger when I/O latency is higher (real S3). Also refresh the now-stale "OS threads" / "N = available_parallelism" phrasing in the module/struct/FFI doc comments to reflect the async-task implementation. Co-Authored-By: Claude Opus 4.7 --- iceberg_rust_ffi/src/writer.rs | 35 ++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 4032cf4..2ebf129 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -1,10 +1,13 @@ /// Writer support for iceberg_rust_ffi /// -/// Encoding is handled by a global pool of N=available_parallelism OS threads shared -/// across all writers. Each writer owns its own FIFO queue of pending batches; workers -/// scan the set of active writers and claim one (via the per-writer `busy` flag) before -/// draining its queue. This avoids the head-of-line blocking that the old single-MPMC -/// design suffered when many workers happened to pull tasks for the same writer. +/// Encoding is handled by a global pool of N async worker tasks (default +/// 2 * available_parallelism, configurable via `iceberg_set_encode_workers`) running +/// on the tokio runtime, shared across all writers. Each writer owns its own FIFO queue +/// of pending batches; workers scan the set of active writers and claim one (via the +/// per-writer `busy` flag) before draining its queue. This avoids the head-of-line +/// blocking that the old single-MPMC design suffered when many workers happened to pull +/// tasks for the same writer. Workers `.await` the I/O inside `w.write()`, so a runtime +/// thread parked on an S3 PUT is free to drive another writer's encode in the meantime. use std::collections::VecDeque; use std::ffi::{c_char, c_void}; use std::io::Cursor; @@ -176,7 +179,9 @@ impl WriterState { } } -/// Global pool of N=available_parallelism encode worker threads shared across all writers. +/// Global pool of N encode worker tasks shared across all writers (N defaults to +/// 2 * available_parallelism so async workers parked on I/O don't starve cores of +/// encode CPU work; tune via `iceberg_set_encode_workers`). /// /// Replaces the previous single-MPMC channel design. Each writer owns its own queue; /// workers scan the active-writer list looking for a writer that (a) has queued work and @@ -417,11 +422,14 @@ async fn encode_worker_loop(pool: &'static GlobalWorkerPool) { } } -/// Desired encode worker count. 0 means "use available_parallelism". +/// Desired encode worker count. 0 means "use 2 * available_parallelism", which +/// oversubscribes the core count on purpose: encode worker tasks are async, so workers +/// parked on S3 PUTs don't cost CPU and we want enough total tasks for the runtime to +/// keep cores busy with CPU encode while others wait on I/O. /// Must be set before the first iceberg_writer_new call. static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0); -/// Set the number of encode worker threads in the global pool. +/// Set the number of encode worker tasks in the global pool. /// Must be called before any writer is created. Returns 0 on success, 1 if the pool is /// already initialized (call ignored). #[no_mangle] @@ -446,7 +454,9 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { } else { // available_parallelism() only fails on unusual platforms (embedded, some sandboxes). // On Linux/macOS/Windows it always succeeds, so the unwrap never fires in practice. - thread::available_parallelism().unwrap().get() + // 2x: oversubscribe so async workers parked on I/O leave room for other workers + // to do encode CPU on the freed runtime threads. + thread::available_parallelism().unwrap().get() * 2 }; let handle = tokio::runtime::Handle::current(); @@ -480,8 +490,8 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { /// Opaque writer handle for FFI. /// /// Writing is pipelined: Julia gathers a RecordBatch and submits it directly to the -/// global encode pool, then returns immediately. Pool workers (N = available_parallelism) -/// encode Parquet concurrently across all active writers. +/// global encode pool, then returns immediately. Pool workers (async tasks; default +/// N = 2 * available_parallelism) encode Parquet concurrently across all active writers. pub struct IcebergDataFileWriter { /// Arrow schema for this table, used by write_columns to create RecordBatches. pub(crate) arrow_schema: ArrowSchemaRef, @@ -644,7 +654,8 @@ pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { // Create a new DataFileWriter from a table with configuration options. // -// The global encode pool (N = available_parallelism threads) is initialized on the first call. +// The global encode pool (N async worker tasks, default 2 * available_parallelism) is +// initialized on the first call. export_runtime_op!( iceberg_writer_new, IcebergDataFileWriterResponse, From 171ff872da75a5546ec9988238bb933d951c9db3 Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Fri, 22 May 2026 14:11:31 +0200 Subject: [PATCH 19/27] writer: collapse Batch/Builder API to RowChunk + append! + flush! Rename the Rust module to record_batch_builder and embed the builder inside IcebergDataFileWriter (UnsafeCell>, lazily initialized). Column types are derived from the table's Arrow schema at writer construction; callers no longer pass col_types. Auto-flush when the coalesce window fills; explicit flush via the new iceberg_writer_flush. close also flushes any partial-window remainder. The Julia surface collapses to one user-facing type (RowChunk) and two methods (append!, flush!). ColumnBatch, SliceBatch, ColumnBatchBuilder, ColumnDescriptor, append_slice!, free_builder!, and both write_columns overloads are removed. SliceRef is renamed to ColumnSlice (internal). Co-Authored-By: Claude Opus 4.7 --- iceberg_rust_ffi/src/lib.rs | 9 +- ...tch_builder.rs => record_batch_builder.rs} | 160 ++---- iceberg_rust_ffi/src/writer.rs | 231 ++++++-- iceberg_rust_ffi/src/writer_columns.rs | 56 +- src/RustyIceberg.jl | 6 +- src/writer.jl | 494 +++--------------- test/writer_tests.jl | 230 ++++---- 7 files changed, 422 insertions(+), 764 deletions(-) rename iceberg_rust_ffi/src/{batch_builder.rs => record_batch_builder.rs} (85%) diff --git a/iceberg_rust_ffi/src/lib.rs b/iceberg_rust_ffi/src/lib.rs index 45e1cfa..403263b 100644 --- a/iceberg_rust_ffi/src/lib.rs +++ b/iceberg_rust_ffi/src/lib.rs @@ -38,11 +38,12 @@ mod transaction; // Writer module mod writer; -// Column-based writer module (zero-copy from Julia) +// Shared FFI structs/constants for the column-based write path mod writer_columns; -// Incremental batch builder: per-slice copy into owned buffers, finalize to RecordBatch -mod batch_builder; +// Incremental RecordBatch builder: per-slice copy into owned buffers, finalize to RecordBatch. +// Embedded inside `IcebergDataFileWriter`; not directly exposed across the FFI. +mod record_batch_builder; // Profiling stats for the file-parallel pipeline mod pipeline_stats; @@ -76,7 +77,7 @@ pub use transaction::{IcebergDataFiles, IcebergTransaction, IcebergTransactionRe pub use writer::{ IcebergDataFileWriter, IcebergDataFileWriterResponse, IcebergWriterCloseResponse, }; -pub use writer_columns::ColumnDescriptor; +pub use writer_columns::ColumnSlice; // We use `jl_adopt_thread` to ensure Rust can call into Julia when notifying // the Base.Event that is waiting for the Rust result. diff --git a/iceberg_rust_ffi/src/batch_builder.rs b/iceberg_rust_ffi/src/record_batch_builder.rs similarity index 85% rename from iceberg_rust_ffi/src/batch_builder.rs rename to iceberg_rust_ffi/src/record_batch_builder.rs index f16c5a3..94c72a9 100644 --- a/iceberg_rust_ffi/src/batch_builder.rs +++ b/iceberg_rust_ffi/src/record_batch_builder.rs @@ -1,19 +1,19 @@ -/// Incremental column batch builder for zero-copy-from-Julia writes. +/// Incremental Arrow `RecordBatch` builder for zero-copy-from-Julia writes. /// -/// Julia calls `iceberg_batch_builder_append_slice` once per operator slice, passing -/// one `SliceRef` per column. Each slice's data is appended directly into a pre-allocated -/// `MutableBuffer` that becomes the Arrow column buffer at finalize time — no intermediate -/// Vec or second copy is needed. Julia can reuse source memory as soon as the call returns. +/// Lives as an internal field of `IcebergDataFileWriter`. Julia drives it through +/// `iceberg_writer_append` (per upstream slice) and `iceberg_writer_flush` (explicit +/// boundary). Auto-flush at `coalesce_rows` happens inside the writer's append entry +/// point; the builder itself is mechanical. /// -/// Null bits are populated lazily: all-valid slices are skipped entirely. The first null -/// slice triggers a one-time backfill of all prior rows as valid, then subsequent slices -/// are processed normally. If no null slice ever arrives, no NullBuffer is emitted. +/// Each `append_slice` call copies one slice's data per column directly into per-column +/// `MutableBuffer`s that already match Arrow's physical layout. At finalize time those +/// buffers become Arrow `Buffer`s via a zero-copy `.into()` move, get wrapped in typed +/// arrays, and assemble into a `RecordBatch` — no further copy. Fresh same-capacity +/// buffers swap in for the next window, so steady-state reallocation is zero. /// -/// When a coalesce window is full, Julia calls `iceberg_batch_builder_write` which -/// finalises all per-column buffers into Arrow arrays, assembles a `RecordBatch`, and -/// submits it to the async encode pool — then resets the builder in-place for reuse. -/// Reset swaps in a fresh pre-allocated `MutableBuffer` (same capacity) so the next -/// window never reallocates. +/// Null bits are populated lazily: all-valid slices skip the bitmap entirely. The first +/// null slice triggers a one-time backfill of all prior rows as valid, then subsequent +/// slices proceed normally. If no null slice ever arrives, no `NullBuffer` is emitted. use std::sync::Arc; use arrow_array::{ @@ -22,9 +22,8 @@ use arrow_array::{ use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::SchemaRef as ArrowSchemaRef; -use crate::writer::{submit_batch, IcebergDataFileWriter, GLOBAL_ENCODE_POOL}; use crate::writer_columns::{ - SliceRef, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DATE, COLUMN_TYPE_DECIMAL_INT128, + ColumnSlice, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DATE, COLUMN_TYPE_DECIMAL_INT128, COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64, COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_JULIA_DATE, COLUMN_TYPE_JULIA_TIMESTAMP, COLUMN_TYPE_JULIA_TIMESTAMPTZ, COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS, @@ -38,8 +37,8 @@ const JULIA_DATE_OFFSET: i64 = 719_163; /// Milliseconds from Julia DateTime epoch (0001-01-01) to Unix epoch (1970-01-01). const JULIA_TIMESTAMP_OFFSET_MS: i64 = 719_163 * 86_400_000; -// Default coalesce_rows — must match Julia's DEFAULT_COALESCE_ROWS. -const DEFAULT_COALESCE_ROWS: usize = 1_048_576; +/// Default coalesce-window size for the embedded builder. +pub(crate) const DEFAULT_COALESCE_ROWS: usize = 1_048_576; /// Bytes per row for numeric column types (0 for Bool/Str which are not Numeric). fn column_bytes_per_row(column_type: i32) -> usize { @@ -131,15 +130,15 @@ impl ColumnBuilderState { } // --------------------------------------------------------------------------- -// Public builder type +// Builder type -pub struct ColumnBatchBuilder { +pub(crate) struct RecordBatchBuilder { columns: Vec, arrow_schema: ArrowSchemaRef, coalesce_rows: usize, } -impl ColumnBatchBuilder { +impl RecordBatchBuilder { pub(crate) fn new( arrow_schema: ArrowSchemaRef, col_types: &[i32], @@ -164,7 +163,27 @@ impl ColumnBatchBuilder { }) } - pub(crate) unsafe fn append_slice(&mut self, slices: &[SliceRef]) -> Result<(), anyhow::Error> { + /// Rows accumulated in the current window (across all columns; they stay in sync). + pub(crate) fn rows(&self) -> usize { + self.columns.first().map(|c| c.rows).unwrap_or(0) + } + + /// True when the current window has reached or passed `coalesce_rows` — the writer + /// should finalize and reset before continuing. + pub(crate) fn should_flush(&self) -> bool { + self.rows() >= self.coalesce_rows + } + + /// Append one slice per column. Rust copies all data synchronously; source memory + /// may be released the moment this call returns. + /// + /// # Safety + /// All pointers inside the `ColumnSlice`s must be valid for `len` elements for the + /// duration of this call. + pub(crate) unsafe fn append_slice( + &mut self, + slices: &[ColumnSlice], + ) -> Result<(), anyhow::Error> { if slices.len() != self.columns.len() { return Err(anyhow::anyhow!( "slice count {} != column count {}", @@ -178,19 +197,18 @@ impl ColumnBatchBuilder { Ok(()) } - pub(crate) fn write_and_reset( - &mut self, - writer_ref: &IcebergDataFileWriter, - pool: &crate::writer::GlobalWorkerPool, - ) -> Result<(), anyhow::Error> { + /// Finalize the accumulated columns into a `RecordBatch` and reset all column + /// buffers in-place for the next window. The buffers are swapped with fresh + /// pre-allocated `MutableBuffer`s of the same capacity, so the next window never + /// reallocates. + pub(crate) fn take_record_batch(&mut self) -> Result { let mut arrays: Vec = Vec::with_capacity(self.columns.len()); for (i, state) in self.columns.iter_mut().enumerate() { let field = self.arrow_schema.field(i); arrays.push(finalize_and_reset(state, field, self.coalesce_rows)?); } - let batch = arrow_array::RecordBatch::try_new(self.arrow_schema.clone(), arrays) - .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))?; - submit_batch(writer_ref, pool, batch) + arrow_array::RecordBatch::try_new(self.arrow_schema.clone(), arrays) + .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) } } @@ -199,7 +217,7 @@ impl ColumnBatchBuilder { unsafe fn append_to_state( state: &mut ColumnBuilderState, - slice: &SliceRef, + slice: &ColumnSlice, ) -> Result<(), anyhow::Error> { let len = slice.len; @@ -216,10 +234,8 @@ unsafe fn append_to_state( state.null_bits.resize(needed, 0u8); set_bits_range(&mut state.null_bits, 0, out_start); state.has_nulls = true; - } else { - if state.null_bits.len() < needed { - state.null_bits.resize(needed, 0u8); - } + } else if state.null_bits.len() < needed { + state.null_bits.resize(needed, 0u8); } // Copy validity bits. When source and destination are byte-aligned (out_start // is a multiple of 8 — always true for flush-per-slice), one copy_nonoverlapping @@ -372,7 +388,7 @@ macro_rules! append_transform { /// Identity (sequential) slices use a bulk byte copy; scattered slices loop element-wise. unsafe fn append_numeric( buf: &mut MutableBuffer, - slice: &SliceRef, + slice: &ColumnSlice, column_type: i32, len: usize, ) -> Result<(), anyhow::Error> { @@ -626,77 +642,3 @@ unsafe fn as_bytes(s: &[T]) -> &[u8] { std::slice::from_raw_parts(s.as_ptr() as *const u8, s.len() * std::mem::size_of::()) } } - -// --------------------------------------------------------------------------- -// FFI entry points - -#[no_mangle] -pub extern "C" fn iceberg_batch_builder_new( - writer: *mut IcebergDataFileWriter, - col_types: *const i32, - num_columns: usize, -) -> *mut ColumnBatchBuilder { - if writer.is_null() || col_types.is_null() || num_columns == 0 { - return std::ptr::null_mut(); - } - let writer_ref = unsafe { &*writer }; - let col_types_slice = unsafe { std::slice::from_raw_parts(col_types, num_columns) }; - match ColumnBatchBuilder::new( - writer_ref.arrow_schema.clone(), - col_types_slice, - DEFAULT_COALESCE_ROWS, - ) { - Ok(b) => Box::into_raw(Box::new(b)), - Err(_) => std::ptr::null_mut(), - } -} - -#[no_mangle] -pub extern "C" fn iceberg_batch_builder_append_slice( - builder: *mut ColumnBatchBuilder, - slices: *const SliceRef, - num_columns: usize, -) -> i32 { - if builder.is_null() || slices.is_null() || num_columns == 0 { - return -1; - } - let builder_ref = unsafe { &mut *builder }; - let slices_slice = unsafe { std::slice::from_raw_parts(slices, num_columns) }; - match unsafe { builder_ref.append_slice(slices_slice) } { - Ok(()) => 0, - Err(_) => -1, - } -} - -#[no_mangle] -pub extern "C" fn iceberg_batch_builder_write( - writer: *mut IcebergDataFileWriter, - builder: *mut ColumnBatchBuilder, -) -> i32 { - if writer.is_null() || builder.is_null() { - return -1; - } - let writer_ref = unsafe { &*writer }; - let builder_ref = unsafe { &mut *builder }; - let pool = match GLOBAL_ENCODE_POOL.get() { - Some(p) => p, - None => { - eprintln!("[iceberg] encode pool not initialized"); - return -1; - } - }; - match builder_ref.write_and_reset(writer_ref, pool) { - Ok(()) => 0, - Err(e) => { - crate::writer::store_writer_error_pub(writer_ref, e); - -1 - } - } -} - -#[no_mangle] -pub extern "C" fn iceberg_batch_builder_free(builder: *mut ColumnBatchBuilder) { - if !builder.is_null() { - unsafe { drop(Box::from_raw(builder)) } - } -} diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 2ebf129..8a70f8e 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -8,6 +8,7 @@ /// blocking that the old single-MPMC design suffered when many workers happened to pull /// tasks for the same writer. Workers `.await` the I/O inside `w.write()`, so a runtime /// thread parked on an S3 PUT is free to drive another writer's encode in the meantime. +use std::cell::UnsafeCell; use std::collections::VecDeque; use std::ffi::{c_char, c_void}; use std::io::Cursor; @@ -17,7 +18,7 @@ use std::thread; use arrow_array::RecordBatch; use arrow_ipc::reader::StreamReader; -use arrow_schema::SchemaRef as ArrowSchemaRef; +use arrow_schema::{DataType, SchemaRef as ArrowSchemaRef, TimeUnit}; use iceberg::arrow::schema_to_arrow_schema; use iceberg::spec::DataFileFormat; use iceberg::writer::base_writer::data_file_writer::{DataFileWriter, DataFileWriterBuilder}; @@ -95,12 +96,18 @@ impl ParquetWriterPropertiesFFI { } } -use crate::batch_builder::ColumnBatchBuilder; +use crate::record_batch_builder::{RecordBatchBuilder, DEFAULT_COALESCE_ROWS}; use crate::response::IcebergBoxedResponse; use crate::table::IcebergTable; use crate::transaction::IcebergDataFiles; use crate::util::parse_c_string; -use crate::writer_columns::{ColumnDescriptor, SliceRef}; +use crate::writer_columns::{ + ColumnSlice, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_DECIMAL_INT128, COLUMN_TYPE_DECIMAL_INT32, + COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64, COLUMN_TYPE_INT32, + COLUMN_TYPE_INT64, COLUMN_TYPE_JULIA_DATE, COLUMN_TYPE_JULIA_TIMESTAMP, + COLUMN_TYPE_JULIA_TIMESTAMPTZ, COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS, + COLUMN_TYPE_JULIA_TIMESTAMP_NS, COLUMN_TYPE_STRING, COLUMN_TYPE_UUID, +}; use object_store_ffi::{ export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, }; @@ -489,17 +496,29 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { /// Opaque writer handle for FFI. /// -/// Writing is pipelined: Julia gathers a RecordBatch and submits it directly to the -/// global encode pool, then returns immediately. Pool workers (async tasks; default -/// N = 2 * available_parallelism) encode Parquet concurrently across all active writers. +/// Writing is pipelined: Julia hands one upstream slice at a time to `iceberg_writer_append`, +/// which copies it into the embedded `RecordBatchBuilder`'s per-column buffers. When the +/// builder hits the coalesce window, the writer finalizes a `RecordBatch` and submits it to +/// the global encode pool. Pool workers (async tasks; default N = 2 * available_parallelism) +/// encode Parquet concurrently across all active writers. pub struct IcebergDataFileWriter { - /// Arrow schema for this table, used by write_columns to create RecordBatches. + /// Arrow schema for this table; used to set up the embedded builder and to assemble + /// RecordBatches from IPC writes. pub(crate) arrow_schema: ArrowSchemaRef, + /// Per-column type codes derived from `arrow_schema` at construction time. Drives the + /// builder's copy/conversion logic. + pub(crate) col_types: Vec, + /// Lazily-constructed RecordBatch builder. `UnsafeCell` because the FFI dereferences + /// the writer as `&IcebergDataFileWriter` (one writer is only ever accessed from one + /// Julia thread, so interior mutability without locking is sound). + pub(crate) builder: UnsafeCell>, /// Shared state: owns the ConcreteDataFileWriter, tracks pending count and errors. pub(crate) writer_state: Arc, } unsafe impl Send for IcebergDataFileWriter {} +// Safety: callers must ensure each writer is accessed from one Julia thread at a time — +// the FFI contract. `builder` is the only `!Sync` field; everything else is Sync via Arc. unsafe impl Sync for IcebergDataFileWriter {} /// Type alias for writer response @@ -520,9 +539,79 @@ fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { } } -/// Store an error in the writer state (public for batch_builder module). -pub(crate) fn store_writer_error_pub(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { - store_writer_error(writer_ref, e); +/// Map an Arrow `DataType` to the corresponding `COLUMN_TYPE_*` code the builder uses. +/// +/// Date and Timestamp variants default to the Julia-epoch codes because that's the natural +/// shape of incoming Julia data (`Dates.value(d)` of a Julia `Date` returns days since +/// 0001-01-01). Users sending pre-converted Unix-epoch integers are an edge case the new +/// schema-driven API doesn't address — they'd need to pre-convert and write integers +/// against a plain integer column. +fn arrow_type_to_column_type(dt: &DataType) -> Result { + Ok(match dt { + DataType::Int32 => COLUMN_TYPE_INT32, + DataType::Int64 => COLUMN_TYPE_INT64, + DataType::Float32 => COLUMN_TYPE_FLOAT32, + DataType::Float64 => COLUMN_TYPE_FLOAT64, + DataType::Utf8 => COLUMN_TYPE_STRING, + DataType::Boolean => COLUMN_TYPE_BOOLEAN, + DataType::Date32 => COLUMN_TYPE_JULIA_DATE, + DataType::Timestamp(TimeUnit::Microsecond, None) => COLUMN_TYPE_JULIA_TIMESTAMP, + DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => COLUMN_TYPE_JULIA_TIMESTAMPTZ, + DataType::Timestamp(TimeUnit::Nanosecond, None) => COLUMN_TYPE_JULIA_TIMESTAMP_NS, + DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS, + DataType::FixedSizeBinary(16) => COLUMN_TYPE_UUID, + DataType::Decimal128(p, _) => { + if *p <= 9 { + COLUMN_TYPE_DECIMAL_INT32 + } else if *p <= 18 { + COLUMN_TYPE_DECIMAL_INT64 + } else { + COLUMN_TYPE_DECIMAL_INT128 + } + } + other => { + return Err(anyhow::anyhow!( + "Unsupported Arrow type for column writer: {:?}", + other + )) + } + }) +} + +/// Get the embedded builder, constructing it on first access. +/// +/// # Safety +/// Caller must ensure no other thread is accessing the writer at the same time. The FFI +/// contract is one Julia thread per writer. +unsafe fn get_or_init_builder( + writer_ref: &IcebergDataFileWriter, +) -> Result<&mut RecordBatchBuilder, anyhow::Error> { + let slot = unsafe { &mut *writer_ref.builder.get() }; + if slot.is_none() { + *slot = Some(RecordBatchBuilder::new( + writer_ref.arrow_schema.clone(), + &writer_ref.col_types, + DEFAULT_COALESCE_ROWS, + )?); + } + Ok(slot.as_mut().unwrap()) +} + +/// Finalize the current window of the builder (if non-empty) and submit the resulting +/// `RecordBatch` to the encode pool. Resets the builder in-place for the next window. +fn flush_builder( + writer_ref: &IcebergDataFileWriter, + pool: &GlobalWorkerPool, +) -> Result<(), anyhow::Error> { + let slot = unsafe { &mut *writer_ref.builder.get() }; + let Some(builder) = slot.as_mut() else { + return Ok(()); + }; + if builder.rows() == 0 { + return Ok(()); + } + let batch = builder.take_record_batch()?; + submit_batch(writer_ref, pool, batch) } /// Submit a `RecordBatch` to the writer's queue. Lazily registers the writer with the @@ -561,53 +650,62 @@ pub(crate) fn submit_batch( Ok(()) } -/// Validates column count, converts each `ColumnDescriptor` into a single-slice `SliceRef`, -/// routes through `ColumnBatchBuilder`, and submits the resulting `RecordBatch` to the -/// encode pool. Using the builder here keeps all type-conversion and null-bit logic in one -/// place (`batch_builder.rs`) instead of duplicating it. -unsafe fn write_columns_inner( - writer_ref: &IcebergDataFileWriter, - pool: &GlobalWorkerPool, - arrow_schema: ArrowSchemaRef, - col_descs: &[ColumnDescriptor], -) -> Result<(), anyhow::Error> { - if col_descs.len() != arrow_schema.fields().len() { - return Err(anyhow::anyhow!( - "Column count mismatch: got {} but schema has {}", - col_descs.len(), - arrow_schema.fields().len() - )); - } - let num_rows = col_descs.iter().map(|d| d.num_rows).max().unwrap_or(0); - let col_types: Vec = col_descs.iter().map(|d| d.column_type).collect(); - let mut builder = ColumnBatchBuilder::new(arrow_schema.clone(), &col_types, num_rows.max(1))?; - let slices: Vec = col_descs - .iter() - .map(|d| SliceRef { - data_ptr: d.data_ptr, - lengths_ptr: d.lengths_ptr, - validity_ptr: d.validity_ptr, - sel_ptr: std::ptr::null(), - len: d.num_rows, - }) - .collect(); - unsafe { builder.append_slice(&slices) }?; - builder.write_and_reset(writer_ref, pool) -} - -/// Synchronous write of flat column data: copies each column from Julia memory into -/// Rust-owned Arrow arrays in the calling thread, then submits to the global encode -/// pool asynchronously. +/// Append one `RowChunk` (one `ColumnSlice` per output column) to the embedded builder. /// -/// Each `ColumnDescriptor` is treated as a single sequential slice (no scatter/gather). -/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). +/// Rust copies all slice data synchronously into per-column buffers; source memory may be +/// released the moment this call returns. If the post-append row count reaches +/// `coalesce_rows`, the builder is finalized into a `RecordBatch` and submitted to the +/// encode pool (auto-flush). The window may end up slightly over `coalesce_rows` — we +/// never split a slice mid-append, preserving the byte-aligned fast paths. +/// +/// Returns 0 on success, -1 on error (error stored in writer state, surfaced on close). #[no_mangle] -pub extern "C" fn iceberg_writer_write_columns( +pub extern "C" fn iceberg_writer_append( writer: *mut IcebergDataFileWriter, - columns: *const ColumnDescriptor, + slices: *const ColumnSlice, num_columns: usize, ) -> i32 { - if writer.is_null() || columns.is_null() || num_columns == 0 { + if writer.is_null() || slices.is_null() || num_columns == 0 { + return -1; + } + let writer_ref = unsafe { &*writer }; + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first"); + return -1; + } + }; + let slices_slice = unsafe { std::slice::from_raw_parts(slices, num_columns) }; + + let result = (|| -> Result<(), anyhow::Error> { + let builder = unsafe { get_or_init_builder(writer_ref) }?; + unsafe { builder.append_slice(slices_slice) }?; + if builder.should_flush() { + // Take the batch and submit. Do this after the borrow ends. + let batch = builder.take_record_batch()?; + submit_batch(writer_ref, pool, batch)?; + } + Ok(()) + })(); + + if let Err(e) = result { + store_writer_error(writer_ref, e); + return -1; + } + 0 +} + +/// Force the builder to flush its current (partial) window to the encode pool. +/// +/// Use this on logical boundaries (end of transaction, time tick) when you want a Parquet +/// row group break that doesn't naturally fall at `coalesce_rows`. No-op if the builder +/// is empty or hasn't been initialized. +/// +/// Returns 0 on success, -1 on error (error stored in writer state, surfaced on close). +#[no_mangle] +pub extern "C" fn iceberg_writer_flush(writer: *mut IcebergDataFileWriter) -> i32 { + if writer.is_null() { return -1; } let writer_ref = unsafe { &*writer }; @@ -618,9 +716,7 @@ pub extern "C" fn iceberg_writer_write_columns( return -1; } }; - let arrow_schema = writer_ref.arrow_schema.clone(); - let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - if let Err(e) = unsafe { write_columns_inner(writer_ref, pool, arrow_schema, col_descs) } { + if let Err(e) = flush_builder(writer_ref, pool) { store_writer_error(writer_ref, e); return -1; } @@ -718,12 +814,20 @@ export_runtime_op!( .await .map_err(|e| anyhow::anyhow!("Failed to build data file writer: {}", e))?; - // Convert Iceberg schema to Arrow schema for use in write_columns + // Convert Iceberg schema to Arrow schema for use by both the IPC and append paths. let arrow_schema = Arc::new( schema_to_arrow_schema(table.metadata().current_schema().as_ref()) .map_err(|e| anyhow::anyhow!("Failed to convert schema to Arrow: {}", e))? ); + // Derive the per-column type codes from the Arrow schema; this is what the + // embedded builder uses to drive copy/conversion decisions. + let col_types: Vec = arrow_schema + .fields() + .iter() + .map(|f| arrow_type_to_column_type(f.data_type())) + .collect::>()?; + // Initialize global pool (no-op if already running). get_or_init_encode_pool(); @@ -731,6 +835,8 @@ export_runtime_op!( Ok::(IcebergDataFileWriter { arrow_schema, + col_types, + builder: UnsafeCell::new(None), writer_state, }) }, @@ -764,6 +870,12 @@ pub extern "C" fn iceberg_writer_write( } }; + // Flush any pending builder window first so IPC batches don't reorder around append!. + if let Err(e) = flush_builder(writer_ref, pool) { + store_writer_error(writer_ref, e); + return -1; + } + let ipc_bytes = unsafe { std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() }; let cursor = Cursor::new(ipc_bytes); @@ -808,6 +920,13 @@ export_runtime_op!( }, writer_ref, async { + // Flush any partial-window remainder in the embedded builder before we wait. + if let Some(pool) = GLOBAL_ENCODE_POOL.get() { + if let Err(e) = flush_builder(writer_ref, pool) { + store_writer_error(writer_ref, e); + } + } + // Wait for all pending pool encodes to complete. // Uses a timeout to guard against a dead worker thread (e.g. panic outside // catch_unwind) that would otherwise leave pending > 0 forever. diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index df48a5b..3c93c2b 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -1,9 +1,8 @@ -/// Column-based writer support for iceberg_rust_ffi +/// Shared FFI types and column-type constants for the column-based write path. /// -/// This module provides the FFI structs and column type constants shared between the -/// flat-column write path (`iceberg_writer_write_columns`) and the incremental batch -/// builder (`batch_builder.rs`). All Arrow array construction logic lives in -/// `batch_builder.rs`; this file is intentionally thin. +/// The only consumer of `ColumnSlice` outside this module is `record_batch_builder.rs`, +/// which owns all Arrow array construction. The struct stays here because it is part of +/// the FFI ABI surface that Julia constructs. use std::ffi::c_void; /// Column type codes (must match Julia's ColumnType enum) @@ -34,40 +33,19 @@ pub const COLUMN_TYPE_JULIA_TIMESTAMP_NS: i32 = 16; /// Julia-epoch nanosecond timestamp with UTC timezone. pub const COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS: i32 = 17; -/// Descriptor for a single column passed from Julia -#[repr(C)] -#[derive(Clone, Copy)] -pub struct ColumnDescriptor { - /// Pointer to the raw data (interpretation depends on column_type) - /// For strings: pointer to array of string pointers (Ptr{UInt8}[]) - pub data_ptr: *const c_void, - /// For string columns: pointer to lengths array (Int64[]) - /// For other types: unused (C_NULL) - pub lengths_ptr: *const i64, - /// Pointer to validity bitmap (only if is_nullable is true) - /// Points to bit-packed data from Julia's BitVector.chunks (UInt64 array) - /// Bit i is 1 if row i is valid, 0 if null - pub validity_ptr: *const u8, - /// Number of rows - pub num_rows: usize, - /// Column type (see COLUMN_TYPE_* constants) - pub column_type: i32, - /// Whether this column is nullable - pub is_nullable: bool, -} - -unsafe impl Send for ColumnDescriptor {} -unsafe impl Sync for ColumnDescriptor {} - -/// A reference to one slice of source column data. -/// `sel_ptr = null` → sequential (identity) access: read data[0..len]. -/// `sel_ptr != null` → scattered access: read data[sel[i]-1] for i in 0..len (1-based Julia indices). -/// `validity_ptr = null` → all rows valid (non-nullable or known all-valid slice). -/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[] of byte lengths per string. -/// Fields are all 8 bytes — no padding, total 40 bytes. +/// One column's contribution to a single `RowChunk` — a reference to source data the +/// builder will copy on `append`. All fields are 8 bytes; total struct size is 40 bytes +/// with no padding. +/// +/// - `sel_ptr = null` → sequential (identity) access: read `data[0..len]`. +/// - `sel_ptr != null` → scattered access: read `data[sel[i] - 1]` for `i in 0..len` +/// (1-based Julia indices). +/// - `validity_ptr = null` → all rows in this slice are valid. +/// - `lengths_ptr != null` → string column: `data_ptr` is `*const *const u8`, +/// `lengths_ptr` is `*const i64` of byte lengths per string. #[repr(C)] #[derive(Clone, Copy)] -pub struct SliceRef { +pub struct ColumnSlice { pub data_ptr: *const c_void, pub lengths_ptr: *const i64, pub validity_ptr: *const u8, @@ -75,5 +53,5 @@ pub struct SliceRef { pub len: usize, } -unsafe impl Send for SliceRef {} -unsafe impl Sync for SliceRef {} +unsafe impl Send for ColumnSlice {} +unsafe impl Sync for ColumnSlice {} diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index 979b35b..b2c537c 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -38,14 +38,14 @@ export IcebergTimestampNs, IcebergTimestamptzNs export IcebergString, IcebergUuid, IcebergBinary, IcebergDecimal export Transaction, DataFiles, free_transaction!, free_data_files!, commit, transaction export FastAppendAction, free_fast_append_action!, add_data_files, apply, with_fast_append -export DataFileWriter, free_writer!, close_writer, write_columns, set_encode_workers! +export DataFileWriter, free_writer!, close_writer, set_encode_workers! export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD, LZ4_RAW -export ColumnDescriptor, ColumnBatch, ColumnType +export RowChunk, flush! +export ColumnType export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64 export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID export COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_DECIMAL_INT128 export julia_type_to_column_type -export SliceRef, SliceBatch, ColumnBatchBuilder, append_slice!, free_builder! # Always use the JLL library - override via Preferences if needed for local development # To use a local build, set the preference: diff --git a/src/writer.jl b/src/writer.jl index 18fd94b..b6b25af 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -492,22 +492,25 @@ end # ========================================================================================== # Column-based writing (zero-copy from Julia) +# +# A user produces one `RowChunk` per upstream slice (a horizontal stripe of rows across all +# output columns), then calls `append!(writer, chunk)`. The writer copies the data eagerly +# into Rust-owned per-column buffers; the source Julia arrays may be released the moment +# `append!` returns. When the accumulated row count reaches the coalesce window, the writer +# finalizes a `RecordBatch` and ships it to the async encode pool automatically. Callers +# that need flush control on logical boundaries can call `flush!(writer)`. # ========================================================================================== """ - SliceRef - -FFI reference to a single slice of source column data for the scattered-gather writer. + ColumnSlice -- `data_ptr`: pointer to source data array (T[]) or string pointers (Ptr{UInt8}[]) -- `lengths_ptr`: for string columns, pointer to lengths array; null for other types -- `validity_ptr`: pointer to validity bitmap (BitVector.chunks); null if all rows valid -- `sel_ptr`: pointer to selection index array (1-based Julia indices); null for sequential access -- `len`: number of rows in this slice +FFI struct describing one column's contribution to a single `RowChunk`. Internal — +users build `RowChunk`s via `push!` instead of constructing `ColumnSlice` directly. -All fields are 8 bytes — total struct size is 40 bytes with no padding. +All fields are 8 bytes; total struct size is 40 bytes with no padding (matches Rust's +`ColumnSlice` layout). """ -struct SliceRef +struct ColumnSlice data_ptr::Ptr{Cvoid} lengths_ptr::Ptr{Int64} validity_ptr::Ptr{UInt8} @@ -543,34 +546,6 @@ Enum for column data types, matching the Rust FFI constants. COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS = 17 # same + UTC timezone end -""" - ColumnDescriptor - -FFI structure describing a single column for direct column writing. -This struct must match the Rust `ColumnDescriptor` layout exactly. - -# Fields -- `data_ptr::Ptr{Cvoid}`: Pointer to the raw column data. For strings, this is a - pointer to an array of string pointers (Ptr{UInt8}[]). -- `lengths_ptr::Ptr{Int64}`: For string columns, pointer to lengths array (Int64[]). - For other types, this is C_NULL. -- `validity_ptr::Ptr{UInt8}`: Pointer to validity bitmap (BitVector.chunks, bit-packed) -- `num_rows::Csize_t`: Number of rows in the column -- `column_type::Int32`: Type of the column (see `ColumnType` enum) -- `is_nullable::Bool`: Whether this column can contain null values - -Note: Fields are ordered to avoid padding (8-byte fields first, then 4-byte, then 1-byte). -""" -struct ColumnDescriptor - data_ptr::Ptr{Cvoid} # 8 bytes, offset 0 - lengths_ptr::Ptr{Int64} # 8 bytes, offset 8 - validity_ptr::Ptr{UInt8} # 8 bytes, offset 16 - num_rows::Csize_t # 8 bytes, offset 24 - column_type::Int32 # 4 bytes, offset 32 - is_nullable::Bool # 1 byte, offset 36 - # (3 bytes trailing padding added by compiler, total 40 bytes) -end - """ julia_type_to_column_type(::Type{T}) -> ColumnType @@ -624,300 +599,43 @@ function iceberg_column_type(d::IcebergDecimal) end """ - ColumnBatch - -A builder for collecting column descriptors and their underlying arrays. -Automatically tracks arrays that need to be preserved during FFI calls. - -# Example -```julia -batch = ColumnBatch() -push!(batch, ids) # non-nullable column -push!(batch, values; validity=validity_vec) # nullable column -write_columns(writer, batch) -``` -""" -mutable struct ColumnBatch - descriptors::Vector{ColumnDescriptor} - arrays_to_preserve::Vector{Any} - - ColumnBatch() = new(ColumnDescriptor[], Any[]) -end - -""" - push!(batch::ColumnBatch, data::Vector{String}; validity=nothing, length=nothing, column_type=nothing) - -Add a string column to the batch. Strings are passed as an array of pointers with lengths. -Note: While this avoids copying on the Julia side, Arrow still copies the string data -into its internal buffer on the Rust side. - -# Arguments -- `data`: The string column data array -- `validity`: Optional validity mask (BitVector where false=null, true=valid) -- `length`: Optional number of rows to use from the array. If not specified, - uses the full array length. -- `column_type`: Optional explicit column type (defaults to COLUMN_TYPE_STRING) -""" -function Base.push!( - batch::ColumnBatch, - data::Vector{String}; - validity::Union{Nothing, BitVector}=nothing, - length::Union{Nothing, Int}=nothing, - column_type::Union{Nothing, ColumnType}=nothing -) - num_rows = length === nothing ? Base.length(data) : length - is_nullable = validity !== nothing - col_type = column_type === nothing ? COLUMN_TYPE_STRING : column_type - - # Build arrays of string pointers and lengths (no copy on Julia side) - # Each String in Julia is a pointer to contiguous UTF-8 bytes - # For null values, we use null pointer and zero length - Rust will check validity mask - str_ptrs = Vector{Ptr{UInt8}}(undef, num_rows) - str_lens = Vector{Int64}(undef, num_rows) - for i in 1:num_rows - if is_nullable && !validity[i] - # Null value - use null pointer and zero length - str_ptrs[i] = Ptr{UInt8}(C_NULL) - str_lens[i] = 0 - else - str_ptrs[i] = pointer(data[i]) - str_lens[i] = sizeof(data[i]) - end - end - - # Preserve all arrays (original strings + metadata arrays) - push!(batch.arrays_to_preserve, data) - push!(batch.arrays_to_preserve, str_ptrs) - push!(batch.arrays_to_preserve, str_lens) - - validity_ptr = if is_nullable - push!(batch.arrays_to_preserve, validity) - Ptr{UInt8}(pointer(validity.chunks)) - else - Ptr{UInt8}(C_NULL) - end - - # For strings: data_ptr = pointer to string pointers, offsets_ptr = pointer to lengths - desc = ColumnDescriptor( - Ptr{Cvoid}(pointer(str_ptrs)), - pointer(str_lens), # Reuse offsets_ptr for lengths array - validity_ptr, - Csize_t(num_rows), - Int32(col_type), - is_nullable - ) - push!(batch.descriptors, desc) - return batch -end - -""" - push!(batch::ColumnBatch, data::Vector{String}, str_ptrs::Vector{Ptr{UInt8}}, str_lens::Vector{Int64}; validity=nothing, length=nothing, column_type=nothing) - -Add a string column to the batch using pre-allocated pointer/length buffers. -The caller is responsible for filling `str_ptrs` and `str_lens` before calling this. -Avoids allocating new pointer/length arrays on every write. -""" -function Base.push!( - batch::ColumnBatch, - data::Vector{String}, - str_ptrs::Vector{Ptr{UInt8}}, - str_lens::Vector{Int64}; - validity::Union{Nothing, BitVector}=nothing, - length::Union{Nothing, Int}=nothing, - column_type::Union{Nothing, ColumnType}=nothing, -) - num_rows = length === nothing ? Base.length(str_ptrs) : length - is_nullable = validity !== nothing - col_type = column_type === nothing ? COLUMN_TYPE_STRING : column_type - - push!(batch.arrays_to_preserve, data, str_ptrs, str_lens) - - validity_ptr = if is_nullable - push!(batch.arrays_to_preserve, validity) - Ptr{UInt8}(pointer(validity.chunks)) - else - Ptr{UInt8}(C_NULL) - end - - desc = ColumnDescriptor( - Ptr{Cvoid}(pointer(str_ptrs)), - pointer(str_lens), - validity_ptr, - Csize_t(num_rows), - Int32(col_type), - is_nullable - ) - push!(batch.descriptors, desc) - return batch -end - -""" - push!(batch::ColumnBatch, data::Vector{T}; validity=nothing, length=nothing, column_type=nothing) where T - -Add a column to the batch. The column type is inferred from the element type unless -explicitly specified. - -# Arguments -- `data`: The column data array -- `validity`: Optional validity mask (BitVector where false=null, true=valid) -- `length`: Optional number of rows to use from the array. If not specified, - uses the full array length. This allows writing only a prefix of the array. -- `column_type`: Optional explicit column type (ColumnType enum). If not specified, - inferred from the element type T. Use this when the physical storage type differs - from the logical type (e.g., Int32 data that represents Date32). -""" -function Base.push!( - batch::ColumnBatch, - data::Vector{T}; - validity::Union{Nothing, BitVector}=nothing, - length::Union{Nothing, Int}=nothing, - column_type::Union{Nothing, ColumnType}=nothing -) where T - push!(batch.arrays_to_preserve, data) - - col_type = column_type === nothing ? julia_type_to_column_type(T) : column_type - num_rows = length === nothing ? Base.length(data) : length - is_nullable = validity !== nothing - - validity_ptr = if is_nullable - # BitVector stores bits in UInt64 chunks - pass pointer to chunks directly - push!(batch.arrays_to_preserve, validity) - Ptr{UInt8}(pointer(validity.chunks)) - else - Ptr{UInt8}(C_NULL) - end - - desc = ColumnDescriptor( - Ptr{Cvoid}(pointer(data)), - Ptr{Int64}(C_NULL), # lengths_ptr not used for non-string types - validity_ptr, - Csize_t(num_rows), - Int32(col_type), - is_nullable - ) - push!(batch.descriptors, desc) - return batch -end - -""" - write_columns(writer::DataFileWriter, columns::Vector{ColumnDescriptor}, arrays_to_preserve) - -Write raw column data directly to the Parquet writer, bypassing Arrow IPC serialization. - -This is a low-level function that passes raw column pointers to Rust, which builds -Arrow arrays directly from them. This avoids one serialization step compared to -the standard `write` function. - -# Arguments -- `writer::DataFileWriter`: The writer to write to -- `columns::Vector{ColumnDescriptor}`: Array of column descriptors -- `arrays_to_preserve`: A tuple/collection of arrays whose memory is referenced by the - ColumnDescriptors. These will be GC-preserved during the FFI call. - -# Safety -The ColumnDescriptors contain raw pointers that must point to valid data. -Pass all source arrays in `arrays_to_preserve` to ensure they are not garbage -collected during the FFI call. - -# Throws -- `IcebergException` if the write fails - -# Example -```julia -data = Int64[1, 2, 3] -validity = UInt8[1, 1, 1] -desc = ColumnDescriptor(pointer(data), ...) -write_columns(writer, [desc], (data, validity)) # Arrays preserved during call -``` -""" -function write_columns(writer::DataFileWriter, columns::Vector{ColumnDescriptor}, arrays_to_preserve) - writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) - isempty(columns) && throw(IcebergException("No columns provided")) - - ret = GC.@preserve columns arrays_to_preserve begin - @ccall rust_lib.iceberg_writer_write_columns( - writer.ptr::Ptr{Cvoid}, - pointer(columns)::Ptr{ColumnDescriptor}, - length(columns)::Csize_t, - )::Int32 - end - - ret == 0 || throw(IcebergException("write_columns failed (see writer close for details)")) - return nothing -end - -""" - write_columns(writer::DataFileWriter, batch::ColumnBatch) - -Write columns from a ColumnBatch to the Parquet writer. - -This is the recommended way to use write_columns - the ColumnBatch automatically -tracks all arrays that need to be preserved during the FFI call. - -# Arguments -- `writer::DataFileWriter`: The writer to write to -- `batch::ColumnBatch`: The column batch to write - -# Example -```julia -batch = ColumnBatch() -push!(batch, ids) -push!(batch, values; validity=validity_vec) -write_columns(writer, batch) - -# To write only first 100 rows, use the length parameter on push!: -batch = ColumnBatch() -push!(batch, ids; length=100) -push!(batch, values; validity=validity_vec, length=100) -write_columns(writer, batch) -``` -""" -function write_columns(writer::DataFileWriter, batch::ColumnBatch) - write_columns(writer, batch.descriptors, batch.arrays_to_preserve) -end - -# ========================================================================================== -# Incremental batch builder API -# ========================================================================================== - -""" - SliceBatch + RowChunk -Accumulates one column slice descriptor per column for a single `append_slice!` call. -Handles `SliceRef` construction and GC preservation automatically, so callers work with -plain Julia arrays instead of raw pointers. +A horizontal stripe of rows across all output columns — what a streaming producer hands +to the writer at each step. Build one by pushing the column data (in schema order); the +chunk handles `ColumnSlice` construction and GC preservation automatically. ```julia -sb = SliceBatch() -push!(sb, ids) # non-nullable numeric, sequential -push!(sb, values; validity=valid_bv) # nullable numeric, sequential -push!(sb, scores; sel=sel_indices) # non-nullable scattered -push!(sb, tags; validity=valid_bv) # nullable strings -append_slice!(builder, sb) +chunk = RowChunk() +push!(chunk, ids) # non-nullable numeric, sequential +push!(chunk, values; validity=valid_bv) # nullable numeric, sequential +push!(chunk, scores; sel=sel_indices) # non-nullable scattered +push!(chunk, tags; validity=valid_bv) # nullable strings +append!(writer, chunk) ``` -A `SliceBatch` is single-use: after `append_slice!` returns (Rust has copied all data), -the source arrays may be released and the batch discarded. +A `RowChunk` is single-use: after `append!` returns Rust has copied all data, so the +source arrays may be released and the chunk discarded. """ -mutable struct SliceBatch - slices::Vector{SliceRef} +mutable struct RowChunk + slices::Vector{ColumnSlice} preserve::Vector{Any} end -SliceBatch() = SliceBatch(SliceRef[], Any[]) +RowChunk() = RowChunk(ColumnSlice[], Any[]) """ - push!(sb::SliceBatch, data::AbstractVector{T}; + push!(chunk::RowChunk, data::AbstractVector{T}; validity=nothing, sel=nothing) -Add a non-string column slice to the batch. +Add a non-string column slice to the chunk. - `validity`: optional `BitVector` where `true` = valid, `false` = null. - `sel`: optional `Vector{Int64}` of 1-based indices into `data` for scattered access. If omitted, all rows of `data` are used sequentially. """ function Base.push!( - sb::SliceBatch, + chunk::RowChunk, data::AbstractVector{T}; validity::Union{Nothing, BitVector} = nothing, sel::Union{Nothing, Vector{Int64}} = nothing, @@ -925,39 +643,39 @@ function Base.push!( len = sel === nothing ? length(data) : length(sel) sel_ptr = if sel !== nothing - push!(sb.preserve, sel) + push!(chunk.preserve, sel) pointer(sel) else Ptr{Int64}(C_NULL) end validity_ptr = if validity !== nothing - push!(sb.preserve, validity) + push!(chunk.preserve, validity) Ptr{UInt8}(pointer(validity.chunks)) else Ptr{UInt8}(C_NULL) end - push!(sb.preserve, data) - push!(sb.slices, SliceRef( + push!(chunk.preserve, data) + push!(chunk.slices, ColumnSlice( Ptr{Cvoid}(pointer(data)), Ptr{Int64}(C_NULL), validity_ptr, sel_ptr, Csize_t(len), )) - return sb + return chunk end """ - push!(sb::SliceBatch, strings::Vector{String}; validity=nothing) + push!(chunk::RowChunk, strings::Vector{String}; validity=nothing) -Add a string column slice to the batch. +Add a string column slice to the chunk. - `validity`: optional `BitVector` where `true` = valid, `false` = null. """ function Base.push!( - sb::SliceBatch, + chunk::RowChunk, strings::Vector{String}; validity::Union{Nothing, BitVector} = nothing, ) @@ -974,136 +692,60 @@ function Base.push!( str_lens[i] = ncodeunits(strings[i]) end end - push!(sb.preserve, strings, str_ptrs, str_lens) + push!(chunk.preserve, strings, str_ptrs, str_lens) validity_ptr = if validity !== nothing - push!(sb.preserve, validity) + push!(chunk.preserve, validity) Ptr{UInt8}(pointer(validity.chunks)) else Ptr{UInt8}(C_NULL) end - push!(sb.slices, SliceRef( + push!(chunk.slices, ColumnSlice( Ptr{Cvoid}(pointer(str_ptrs)), pointer(str_lens), validity_ptr, Ptr{Int64}(C_NULL), Csize_t(n), )) - return sb -end - -""" - ColumnBatchBuilder - -Opaque handle to a Rust-side incremental batch builder. Julia appends one slice per -column per operator slice via `append_slice!`; Rust copies each slice's data immediately -into owned typed buffers. When a coalesce window is full, `write_columns` finalises all -columns into Arrow arrays, submits the `RecordBatch` to the async encode pool, and resets -the builder in-place for the next window. - -Create with `ColumnBatchBuilder(writer, col_types)`. The builder is freed automatically -by its finalizer, or explicitly with `free_builder!`. -""" -mutable struct ColumnBatchBuilder - ptr::Ptr{Cvoid} - - function ColumnBatchBuilder( - writer::DataFileWriter, - col_types::Vector{ColumnType}, - ) - writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) - isempty(col_types) && throw(ArgumentError("col_types must not be empty")) - - col_type_codes = Int32[Int32(ct) for ct in col_types] - ptr = GC.@preserve col_type_codes begin - @ccall rust_lib.iceberg_batch_builder_new( - writer.ptr::Ptr{Cvoid}, - pointer(col_type_codes)::Ptr{Int32}, - length(col_type_codes)::Csize_t, - )::Ptr{Cvoid} - end - ptr == C_NULL && throw(IcebergException("iceberg_batch_builder_new failed")) - - b = new(ptr) - finalizer(free_builder!, b) - return b - end -end - -""" - free_builder!(builder::ColumnBatchBuilder) - -Free the builder without writing. Called automatically by the finalizer; also safe to -call explicitly on error paths. -""" -function free_builder!(builder::ColumnBatchBuilder) - if builder.ptr != C_NULL - @ccall rust_lib.iceberg_batch_builder_free(builder.ptr::Ptr{Cvoid})::Cvoid - builder.ptr = C_NULL - end - return nothing + return chunk end """ - append_slice!(builder::ColumnBatchBuilder, slices::Vector{SliceRef}, arrays_to_preserve) - -Append one slice per column to the builder. `slices[i]` describes column `i`'s data for -this slice. Rust copies all data synchronously — source arrays referenced by the SliceRefs -may be released (or overwritten) as soon as this call returns. + append!(writer::DataFileWriter, chunk::RowChunk) -`arrays_to_preserve` holds any Julia objects whose memory is pointed to by the SliceRefs -(e.g. NullableVector backing arrays, string ptr/len buffers). They are GC-pinned only for -the duration of this call. +Hand one `RowChunk` to the writer. Rust copies all slice data synchronously into per-column +buffers; the source arrays may be released the moment this call returns. When the +accumulated window reaches the coalesce size the writer auto-flushes a `RecordBatch` to +the encode pool. No reordering happens — `append!` calls are appended in order. """ -function append_slice!( - builder::ColumnBatchBuilder, - slices::Vector{SliceRef}, - arrays_to_preserve, -) - builder.ptr == C_NULL && throw(IcebergException("ColumnBatchBuilder has been freed")) - ret = GC.@preserve slices arrays_to_preserve begin - @ccall rust_lib.iceberg_batch_builder_append_slice( - builder.ptr::Ptr{Cvoid}, - pointer(slices)::Ptr{SliceRef}, - length(slices)::Csize_t, +function Base.append!(writer::DataFileWriter, chunk::RowChunk) + writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) + isempty(chunk.slices) && throw(IcebergException("RowChunk has no columns")) + ret = GC.@preserve chunk begin + @ccall rust_lib.iceberg_writer_append( + writer.ptr::Ptr{Cvoid}, + pointer(chunk.slices)::Ptr{ColumnSlice}, + length(chunk.slices)::Csize_t, )::Int32 end - ret == 0 || throw(IcebergException("append_slice! failed")) - return nothing + ret == 0 || throw(IcebergException("append! failed (see close_writer for details)")) + return writer end """ - append_slice!(builder::ColumnBatchBuilder, sb::SliceBatch) + flush!(writer::DataFileWriter) -High-level overload: append one slice per column from a `SliceBatch`. Builds the -`SliceRef` array and preserve list from the batch's accumulated column descriptors. +Force the writer to flush its current partial window to the encode pool. Useful on logical +boundaries (end of transaction, time tick) where a Parquet row-group break is desired +without waiting for the natural coalesce-window boundary. No-op if the buffer is empty. -```julia -sb = SliceBatch() -push!(sb, ids) -push!(sb, values; validity=valid_bv) -append_slice!(builder, sb) -``` +`close_writer` flushes any remainder automatically, so explicit `flush!` is only needed +when the caller wants control over flush timing. """ -function append_slice!(builder::ColumnBatchBuilder, sb::SliceBatch) - append_slice!(builder, sb.slices, sb.preserve) -end - -""" - write_columns(writer::DataFileWriter, builder::ColumnBatchBuilder) - -Finalise the builder: assemble Arrow arrays from accumulated per-column buffers, build a -`RecordBatch`, submit it to the async encode pool, and reset all column buffers for the -next coalesce window. The builder is NOT freed and may be reused immediately. -""" -function write_columns(writer::DataFileWriter, builder::ColumnBatchBuilder) +function flush!(writer::DataFileWriter) writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) - builder.ptr == C_NULL && throw(IcebergException("ColumnBatchBuilder has been freed")) - ret = @ccall rust_lib.iceberg_batch_builder_write( - writer.ptr::Ptr{Cvoid}, - builder.ptr::Ptr{Cvoid}, - )::Int32 - ret == 0 || throw(IcebergException("write_columns (builder) failed")) - return nothing + ret = @ccall rust_lib.iceberg_writer_flush(writer.ptr::Ptr{Cvoid})::Int32 + ret == 0 || throw(IcebergException("flush! failed (see close_writer for details)")) + return writer end diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 0ed8432..e7b5ffd 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -678,8 +678,8 @@ end println("\n✅ Writer with vended credentials tests completed!") end -@testset "Writer write_columns API" begin - println("Testing write_columns (raw column) API...") +@testset "Writer append! / RowChunk API" begin + println("Testing append!(writer, RowChunk) API...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -720,10 +720,9 @@ end @test table != C_NULL println("✅ Test table created: $table_name") - # Test: Write raw column data using write_columns - println("\nTest: Writing data via write_columns...") + # Test: Write raw column data via the streaming RowChunk path + println("\nTest: Writing data via append!(writer, RowChunk)...") - # Prepare raw column data col_ids = Int64[1, 2, 3, 4, 5] col_counts = Int32[10, 20, 30, 40, 50] col_values = Float64[1.1, 2.2, 3.3, 4.4, 5.5] @@ -739,15 +738,14 @@ end @test writer.ptr != C_NULL println("✅ Writer created successfully") - # Build column batch using the helper - batch = RustyIceberg.ColumnBatch() - push!(batch, col_ids) - push!(batch, col_counts; validity=validity_counts) - push!(batch, col_values; validity=validity_values) - push!(batch, col_flags; validity=validity_flags) + chunk = RustyIceberg.RowChunk() + push!(chunk, col_ids) + push!(chunk, col_counts; validity=validity_counts) + push!(chunk, col_values; validity=validity_values) + push!(chunk, col_flags; validity=validity_flags) - RustyIceberg.write_columns(writer, batch) - println("✅ Data written via write_columns") + append!(writer, chunk) + println("✅ Data appended; close_writer will flush the remainder") end @test data_files !== nothing @test data_files.ptr != C_NULL @@ -784,7 +782,7 @@ end @test sorted_counts == Int32[10, 20, 30, 40, 50] @test sorted_values == Float64[1.1, 2.2, 3.3, 4.4, 5.5] @test sorted_flags == Bool[true, false, true, false, true] - println("✅ Verified write_columns data content matches exactly") + println("✅ Verified append!/RowChunk data content matches exactly") # Clean up updated table RustyIceberg.free_table(updated_table) @@ -812,11 +810,11 @@ end end end - println("\n✅ write_columns API tests completed!") + println("\n✅ append!/RowChunk API tests completed!") end -@testset "Writer write_columns with nulls" begin - println("Testing write_columns with null values...") +@testset "Writer RowChunk with nulls" begin + println("Testing RowChunk with null values...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -861,12 +859,11 @@ end validity_values = BitVector([true, false, true, false, true]) # positions 2 and 4 are null data_files = RustyIceberg.with_data_file_writer(table) do writer - batch = RustyIceberg.ColumnBatch() - push!(batch, col_ids) - push!(batch, col_values; validity=validity_values) - - RustyIceberg.write_columns(writer, batch) - println("✅ Data with nulls written via write_columns") + chunk = RustyIceberg.RowChunk() + push!(chunk, col_ids) + push!(chunk, col_values; validity=validity_values) + append!(writer, chunk) + println("✅ Data with nulls appended") end @test data_files !== nothing println("✅ Writer closed successfully") @@ -923,11 +920,11 @@ end end end - println("\n✅ write_columns with nulls tests completed!") + println("\n✅ RowChunk with nulls tests completed!") end -@testset "Writer write_columns decimal types" begin - println("Testing write_columns with decimal types (Int32/Int64/bytes backing)...") +@testset "Writer RowChunk decimal types" begin + println("Testing RowChunk with decimal types (Int32/Int64/Int128 backing)...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -977,13 +974,16 @@ end col_balances = Int128[12345678901234567890, -999999999999, 1] data_files = RustyIceberg.with_data_file_writer(table) do writer - batch = RustyIceberg.ColumnBatch() - push!(batch, col_ids) - push!(batch, col_prices; column_type=RustyIceberg.COLUMN_TYPE_DECIMAL_INT32) - push!(batch, col_volumes; column_type=RustyIceberg.COLUMN_TYPE_DECIMAL_INT64) - push!(batch, col_balances; column_type=RustyIceberg.COLUMN_TYPE_DECIMAL_INT128) - RustyIceberg.write_columns(writer, batch) - println("✅ Decimal data written via write_columns") + # The writer infers DECIMAL_INT32 / INT64 / INT128 column types from the + # schema's Decimal128(precision, scale). Callers just push raw Int32 / Int64 + # / Int128 scaled-integer columns matching that precision. + chunk = RustyIceberg.RowChunk() + push!(chunk, col_ids) + push!(chunk, col_prices) + push!(chunk, col_volumes) + push!(chunk, col_balances) + append!(writer, chunk) + println("✅ Decimal data appended") end @test data_files !== nothing && data_files.ptr != C_NULL println("✅ Writer closed, got DataFiles handle") @@ -1049,11 +1049,11 @@ end end end - println("\n✅ write_columns decimal types tests completed!") + println("\n✅ RowChunk decimal types tests completed!") end -@testset "Writer write_columns decimal nullable" begin - println("Testing write_columns with nullable decimal column...") +@testset "Writer RowChunk decimal nullable" begin + println("Testing RowChunk with nullable decimal column...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -1089,11 +1089,11 @@ end validity = BitVector([true, false, true, false, true]) data_files = RustyIceberg.with_data_file_writer(table) do writer - batch = RustyIceberg.ColumnBatch() - push!(batch, col_ids) - push!(batch, col_prices; validity=validity, column_type=RustyIceberg.COLUMN_TYPE_DECIMAL_INT64) - RustyIceberg.write_columns(writer, batch) - println("✅ Nullable decimal data written") + chunk = RustyIceberg.RowChunk() + push!(chunk, col_ids) + push!(chunk, col_prices; validity=validity) + append!(writer, chunk) + println("✅ Nullable decimal data appended") end @test data_files !== nothing @@ -1146,11 +1146,11 @@ end end end - println("\n✅ write_columns decimal nullable tests completed!") + println("\n✅ RowChunk decimal nullable tests completed!") end -@testset "Writer ColumnBatchBuilder — multi-slice coalescing" begin - println("Testing ColumnBatchBuilder with multiple slices per column...") +@testset "Writer streaming — multi-chunk coalescing" begin + println("Testing streaming append! with multiple RowChunks per writer...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -1180,51 +1180,40 @@ end @test table != C_NULL println("✅ Table created") - # Write 4 rows across 3 separate append_slice! calls. - # id: [1, 2, 3, 4] — non-nullable; 3 slices of lengths 1, 2, 1 - # score: [1.1, null, 3.3, null] — nullable; 3 slices with validity - # tag: ["alpha", null, "gamma", null] — nullable string; 3 slices + # Write 4 rows across 3 separate append! calls. + # id: [1, 2, 3, 4] — non-nullable; 3 chunks of lengths 1, 2, 1 + # score: [1.1, null, 3.3, null] — nullable; 3 chunks with validity + # tag: ["alpha", null, "gamma", null] — nullable string; 3 chunks # - # Slices are deliberately mis-aligned in terms of source array sizes to exercise - # the multi-slice accumulation path. Slice 2 for score uses a scattered sel_ptr. - - col_types = RustyIceberg.ColumnType[ - RustyIceberg.COLUMN_TYPE_INT64, - RustyIceberg.COLUMN_TYPE_FLOAT64, - RustyIceberg.COLUMN_TYPE_STRING, - ] + # Chunks are deliberately mis-aligned in terms of source array sizes to exercise + # the multi-chunk accumulation path. Chunk 2 for score uses a scattered sel. data_files = RustyIceberg.with_data_file_writer(table) do writer - builder = RustyIceberg.ColumnBatchBuilder(writer, col_types) - - # --- Slice 1: row 0 --- - sb1 = RustyIceberg.SliceBatch() - push!(sb1, Int64[1]) - push!(sb1, Float64[1.1]; validity=BitVector([true])) - push!(sb1, ["alpha"]) - RustyIceberg.append_slice!(builder, sb1) - println("✅ Slice 1 appended") - - # --- Slice 2: rows 1-2 (score uses a scattered selection) --- + # --- Chunk 1: row 0 --- + c1 = RustyIceberg.RowChunk() + push!(c1, Int64[1]) + push!(c1, Float64[1.1]; validity=BitVector([true])) + push!(c1, ["alpha"]) + append!(writer, c1) + println("✅ Chunk 1 appended") + + # --- Chunk 2: rows 1-2 (score uses a scattered selection) --- # score_src: [99.9, 3.3, 88.8], sel=[2,1] → values [3.3, 99.9], valid=[true,false] - sb2 = RustyIceberg.SliceBatch() - push!(sb2, Int64[2, 3]) - push!(sb2, Float64[99.9, 3.3, 88.8]; + c2 = RustyIceberg.RowChunk() + push!(c2, Int64[2, 3]) + push!(c2, Float64[99.9, 3.3, 88.8]; sel=Int64[2, 1], validity=BitVector([true, false])) - push!(sb2, ["", "gamma"]; validity=BitVector([false, true])) - RustyIceberg.append_slice!(builder, sb2) - println("✅ Slice 2 appended (scattered score, nullable strings)") - - # --- Slice 3: row 3 --- - sb3 = RustyIceberg.SliceBatch() - push!(sb3, Int64[4]) - push!(sb3, Float64[0.0]; validity=BitVector([false])) - push!(sb3, [""]; validity=BitVector([false])) - RustyIceberg.append_slice!(builder, sb3) - println("✅ Slice 3 appended") - - RustyIceberg.write_columns(writer, builder) - println("✅ Builder flushed via write_columns") + push!(c2, ["", "gamma"]; validity=BitVector([false, true])) + append!(writer, c2) + println("✅ Chunk 2 appended (scattered score, nullable strings)") + + # --- Chunk 3: row 3 --- + c3 = RustyIceberg.RowChunk() + push!(c3, Int64[4]) + push!(c3, Float64[0.0]; validity=BitVector([false])) + push!(c3, [""]; validity=BitVector([false])) + append!(writer, c3) + println("✅ Chunk 3 appended; close_writer will flush remainder") end @test data_files !== nothing && data_files.ptr != C_NULL println("✅ Writer closed") @@ -1283,11 +1272,11 @@ end end end - println("\n✅ ColumnBatchBuilder multi-slice tests completed!") + println("\n✅ Streaming multi-chunk tests completed!") end -@testset "Writer ColumnBatchBuilder — reuse across windows" begin - println("Testing ColumnBatchBuilder reuse: two write_columns calls on one builder...") +@testset "Writer streaming — explicit flush! between windows" begin + println("Testing explicit flush!(writer) between windows...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -1315,29 +1304,22 @@ end @test table != C_NULL println("✅ Table created") - col_types = RustyIceberg.ColumnType[ - RustyIceberg.COLUMN_TYPE_INT64, - RustyIceberg.COLUMN_TYPE_FLOAT64, - ] - data_files = RustyIceberg.with_data_file_writer(table) do writer - builder = RustyIceberg.ColumnBatchBuilder(writer, col_types) - # Window 1: rows [1, 2] - sb1 = RustyIceberg.SliceBatch() - push!(sb1, Int64[1, 2]) - push!(sb1, Float64[10.0, 20.0]) - RustyIceberg.append_slice!(builder, sb1) - RustyIceberg.write_columns(writer, builder) # flushes window 1, resets builder - println("✅ Window 1 written") - - # Window 2: rows [3, 4, 5] — builder reused in-place - sb2 = RustyIceberg.SliceBatch() - push!(sb2, Int64[3, 4, 5]) - push!(sb2, Float64[30.0, 40.0, 50.0]) - RustyIceberg.append_slice!(builder, sb2) - RustyIceberg.write_columns(writer, builder) # flushes window 2 - println("✅ Window 2 written") + c1 = RustyIceberg.RowChunk() + push!(c1, Int64[1, 2]) + push!(c1, Float64[10.0, 20.0]) + append!(writer, c1) + RustyIceberg.flush!(writer) # force a flush boundary + println("✅ Window 1 flushed") + + # Window 2: rows [3, 4, 5] — fresh window after flush! + c2 = RustyIceberg.RowChunk() + push!(c2, Int64[3, 4, 5]) + push!(c2, Float64[30.0, 40.0, 50.0]) + append!(writer, c2) + # close_writer will flush window 2's remainder + println("✅ Window 2 appended (close flushes remainder)") end @test data_files !== nothing && data_files.ptr != C_NULL println("✅ Writer closed") @@ -1380,11 +1362,11 @@ end end end - println("\n✅ ColumnBatchBuilder reuse tests completed!") + println("\n✅ Streaming explicit-flush tests completed!") end -@testset "Writer ColumnBatchBuilder — date and timestamp epoch conversion" begin - println("Testing ColumnBatchBuilder date/timestamp epoch conversion...") +@testset "Writer streaming — date and timestamp epoch conversion" begin + println("Testing date/timestamp epoch conversion through the streaming path...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -1420,22 +1402,16 @@ end julia_date_val = Dates.value(Dates.Date(2024, 1, 1)) # 738886 julia_ts_val = Dates.value(Dates.DateTime(2024, 1, 1, 0, 0, 0)) # ms since year 1 - col_types = RustyIceberg.ColumnType[ - RustyIceberg.COLUMN_TYPE_INT64, - RustyIceberg.COLUMN_TYPE_JULIA_DATE, - RustyIceberg.COLUMN_TYPE_JULIA_TIMESTAMP, - ] - + # Writer infers JULIA_DATE / JULIA_TIMESTAMP from the schema (IcebergDate / + # IcebergTimestamp). User just pushes raw Int64 values (Dates.value of the Julia + # Date/DateTime) and Rust handles the epoch conversion at copy time. data_files = RustyIceberg.with_data_file_writer(table) do writer - builder = RustyIceberg.ColumnBatchBuilder(writer, col_types) - - sb = RustyIceberg.SliceBatch() - push!(sb, Int64[1]) - push!(sb, Int64[julia_date_val]) - push!(sb, Int64[julia_ts_val]) - RustyIceberg.append_slice!(builder, sb) - RustyIceberg.write_columns(writer, builder) - println("✅ Date/timestamp slice written") + chunk = RustyIceberg.RowChunk() + push!(chunk, Int64[1]) + push!(chunk, Int64[julia_date_val]) + push!(chunk, Int64[julia_ts_val]) + append!(writer, chunk) + println("✅ Date/timestamp chunk appended") end @test data_files !== nothing && data_files.ptr != C_NULL @@ -1478,7 +1454,7 @@ end end end - println("\n✅ ColumnBatchBuilder date/timestamp tests completed!") + println("\n✅ Streaming date/timestamp tests completed!") end @testset "Writer WriterConfig parquet properties" begin From 8ded709136078c15161a6b99049460d90d38ff6a Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Tue, 26 May 2026 11:25:22 +0200 Subject: [PATCH 20/27] writer: defer FFI prep into append!; reuse scratch in writer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit push!(chunk, ...) now just records a reference to the column data plus optional validity/sel — no pointer-taking, no allocation of ptr/len arrays. All FFI prep (pointer-taking, string ptr/len gather) happens in append! using scratch buffers owned by DataFileWriter. Scratch is grown lazily on first use and reused across every subsequent append!, so a streaming pipeline pays zero per-call allocation for the FFI argument arrays. For string columns this kills the per-chunk Vector{Ptr{UInt8}} and Vector{Int64} allocation that previously scaled with chunk size — the benchmark on 100M rows × 4 string columns allocated 9 GiB more than the pre-refactor builder; this commit restores parity. push!(chunk, ::AbstractVector{<:AbstractString}) replaces the narrow ::Vector{String} signature, so callers with Vector{VariableSizeString}, Vector{SubString{String}}, etc. don't have to materialize through String. ColumnSlice moves up next to DataFileWriter so the writer can carry a Vector{ColumnSlice} scratch field. FFI surface is unchanged. Co-Authored-By: Claude Opus 4.7 --- src/writer.jl | 229 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 140 insertions(+), 89 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index b6b25af..984c7ac 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -86,6 +86,24 @@ struct ParquetWriterPropertiesFFI statistics_enabled::Bool end +""" + ColumnSlice + +FFI struct describing one column's contribution to a single `RowChunk`. Internal — +users build `RowChunk`s via `push!` instead of constructing `ColumnSlice` directly. + +All fields are 8 bytes; total struct size is 40 bytes with no padding (matches Rust's +`ColumnSlice` layout). Defined here (rather than next to `RowChunk`) so the +`DataFileWriter` struct can carry a `Vector{ColumnSlice}` scratch buffer. +""" +struct ColumnSlice + data_ptr::Ptr{Cvoid} + lengths_ptr::Ptr{Int64} + validity_ptr::Ptr{UInt8} + sel_ptr::Ptr{Int64} + len::Csize_t +end + """ DataFileWriter @@ -98,12 +116,21 @@ data files. The writer tracks any `DataFiles` produced by `close_writer` and automatically frees them when `free_writer!` is called, unless they have already been freed or consumed. + +The writer also owns per-column scratch buffers (`_slices`, `_str_ptrs`, +`_str_lens`) that are reused across `append!` calls. They grow lazily on first +use and stay sized for the steady state, so a typical streaming pipeline pays +no per-`append!` allocation for these. """ mutable struct DataFileWriter ptr::Ptr{Cvoid} table::Table # Keep reference to table to prevent GC colmeta::Dict{Symbol, Vector{Pair{String, String}}} # Column metadata with Iceberg field IDs data_files::Union{DataFiles, Nothing} # Track DataFiles for automatic cleanup + # Scratch buffers reused across append! calls. Sized lazily by `append!`. + _slices::Vector{ColumnSlice} # per-call FFI argument + _str_ptrs::Vector{Vector{Ptr{UInt8}}} # one buffer per column position + _str_lens::Vector{Vector{Int64}} # one buffer per column position end # Response type for writer creation @@ -232,7 +259,10 @@ function DataFileWriter(table::Table, config::WriterConfig) # This metadata is added to Arrow IPC data so iceberg-rust can match fields colmeta = get_column_metadata(table) - return DataFileWriter(response.value, table, colmeta, nothing) + return DataFileWriter( + response.value, table, colmeta, nothing, + ColumnSlice[], Vector{Ptr{UInt8}}[], Vector{Int64}[], + ) end # Convenience constructor with keyword arguments @@ -501,23 +531,6 @@ end # that need flush control on logical boundaries can call `flush!(writer)`. # ========================================================================================== -""" - ColumnSlice - -FFI struct describing one column's contribution to a single `RowChunk`. Internal — -users build `RowChunk`s via `push!` instead of constructing `ColumnSlice` directly. - -All fields are 8 bytes; total struct size is 40 bytes with no padding (matches Rust's -`ColumnSlice` layout). -""" -struct ColumnSlice - data_ptr::Ptr{Cvoid} - lengths_ptr::Ptr{Int64} - validity_ptr::Ptr{UInt8} - sel_ptr::Ptr{Int64} - len::Csize_t -end - """ ColumnType @@ -598,12 +611,32 @@ function iceberg_column_type(d::IcebergDecimal) end end +# ------------------------------------------------------------------------------------------ +# Internal column-entry types stored inside a `RowChunk`. +# +# `push!` only records a reference to the user's data plus optional validity/sel; no +# `pointer(...)`-taking and no allocation of ptr/len arrays happens at `push!` time. +# The full `ColumnSlice` (the FFI argument) is materialized in `append!` using writer-owned +# scratch — so per-`append!` allocations stay flat regardless of chunk size. +# ------------------------------------------------------------------------------------------ + +struct _NumericCol + data::Any # AbstractVector{T} for some T + validity::Union{Nothing, BitVector} + sel::Union{Nothing, Vector{Int64}} +end + +struct _StringCol + strings::Any # AbstractVector{<:AbstractString} + validity::Union{Nothing, BitVector} +end + """ RowChunk A horizontal stripe of rows across all output columns — what a streaming producer hands -to the writer at each step. Build one by pushing the column data (in schema order); the -chunk handles `ColumnSlice` construction and GC preservation automatically. +to the writer at each step. Build one by pushing column data in schema order; `push!` +just records the reference, so it allocates O(1) per column regardless of chunk size. ```julia chunk = RowChunk() @@ -615,20 +648,21 @@ append!(writer, chunk) ``` A `RowChunk` is single-use: after `append!` returns Rust has copied all data, so the -source arrays may be released and the chunk discarded. +source arrays may be released and the chunk discarded. The writer owns the scratch +buffers used to materialize the FFI arguments, so they are reused across `append!` +calls without per-call allocation. """ mutable struct RowChunk - slices::Vector{ColumnSlice} - preserve::Vector{Any} + columns::Vector{Any} # holds _NumericCol or _StringCol instances end -RowChunk() = RowChunk(ColumnSlice[], Any[]) +RowChunk() = RowChunk(Any[]) """ push!(chunk::RowChunk, data::AbstractVector{T}; validity=nothing, sel=nothing) -Add a non-string column slice to the chunk. +Record a non-string column slice on the chunk. - `validity`: optional `BitVector` where `true` = valid, `false` = null. - `sel`: optional `Vector{Int64}` of 1-based indices into `data` for scattered access. @@ -640,99 +674,116 @@ function Base.push!( validity::Union{Nothing, BitVector} = nothing, sel::Union{Nothing, Vector{Int64}} = nothing, ) where T - len = sel === nothing ? length(data) : length(sel) - - sel_ptr = if sel !== nothing - push!(chunk.preserve, sel) - pointer(sel) - else - Ptr{Int64}(C_NULL) - end - - validity_ptr = if validity !== nothing - push!(chunk.preserve, validity) - Ptr{UInt8}(pointer(validity.chunks)) - else - Ptr{UInt8}(C_NULL) - end - - push!(chunk.preserve, data) - push!(chunk.slices, ColumnSlice( - Ptr{Cvoid}(pointer(data)), - Ptr{Int64}(C_NULL), - validity_ptr, - sel_ptr, - Csize_t(len), - )) + push!(chunk.columns, _NumericCol(data, validity, sel)) return chunk end """ - push!(chunk::RowChunk, strings::Vector{String}; validity=nothing) + push!(chunk::RowChunk, strings::AbstractVector{<:AbstractString}; validity=nothing) -Add a string column slice to the chunk. +Record a string column slice on the chunk. Accepts any `AbstractString` element type +(`String`, `SubString{String}`, Arrow's `VariableSizeString`, …); the pointer/length +gather happens inside `append!` so there's no per-`push!` materialization or copy. - `validity`: optional `BitVector` where `true` = valid, `false` = null. """ function Base.push!( chunk::RowChunk, - strings::Vector{String}; + strings::AbstractVector{<:AbstractString}; validity::Union{Nothing, BitVector} = nothing, ) - n = length(strings) - is_nullable = validity !== nothing - str_ptrs = Vector{Ptr{UInt8}}(undef, n) - str_lens = Vector{Int64}(undef, n) - for i in 1:n - if is_nullable && !validity[i] - str_ptrs[i] = Ptr{UInt8}(C_NULL) - str_lens[i] = 0 - else - str_ptrs[i] = pointer(strings[i]) - str_lens[i] = ncodeunits(strings[i]) - end - end - push!(chunk.preserve, strings, str_ptrs, str_lens) - - validity_ptr = if validity !== nothing - push!(chunk.preserve, validity) - Ptr{UInt8}(pointer(validity.chunks)) - else - Ptr{UInt8}(C_NULL) - end - - push!(chunk.slices, ColumnSlice( - Ptr{Cvoid}(pointer(str_ptrs)), - pointer(str_lens), - validity_ptr, - Ptr{Int64}(C_NULL), - Csize_t(n), - )) + push!(chunk.columns, _StringCol(strings, validity)) return chunk end """ append!(writer::DataFileWriter, chunk::RowChunk) -Hand one `RowChunk` to the writer. Rust copies all slice data synchronously into per-column -buffers; the source arrays may be released the moment this call returns. When the -accumulated window reaches the coalesce size the writer auto-flushes a `RecordBatch` to -the encode pool. No reordering happens — `append!` calls are appended in order. +Hand one `RowChunk` to the writer. All per-column FFI prep (pointer-taking, validity bitmap +lookup, string ptr/len gather) happens here using writer-owned scratch buffers, so per-call +allocations stay flat. Rust then copies the slice data synchronously into per-column +buffers; the source arrays may be released the moment this call returns. + +When the accumulated window reaches the coalesce size the writer auto-flushes a +`RecordBatch` to the encode pool. `append!` calls are appended in order — no reordering. """ function Base.append!(writer::DataFileWriter, chunk::RowChunk) writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) - isempty(chunk.slices) && throw(IcebergException("RowChunk has no columns")) - ret = GC.@preserve chunk begin + n = length(chunk.columns) + n == 0 && throw(IcebergException("RowChunk has no columns")) + + # Grow writer scratch to fit `n` columns. Stays at this capacity for the writer's + # lifetime, so the steady-state cost is zero. + resize!(writer._slices, n) + while length(writer._str_ptrs) < n + push!(writer._str_ptrs, Ptr{UInt8}[]) + push!(writer._str_lens, Int64[]) + end + + ret = GC.@preserve writer chunk begin + @inbounds for i in 1:n + writer._slices[i] = _build_column_slice(writer, i, chunk.columns[i]) + end @ccall rust_lib.iceberg_writer_append( writer.ptr::Ptr{Cvoid}, - pointer(chunk.slices)::Ptr{ColumnSlice}, - length(chunk.slices)::Csize_t, + pointer(writer._slices)::Ptr{ColumnSlice}, + n::Csize_t, )::Int32 end ret == 0 || throw(IcebergException("append! failed (see close_writer for details)")) return writer end +# Materialize a `ColumnSlice` for a numeric column. No allocation — just pointer-taking. +@inline function _build_column_slice(::DataFileWriter, ::Int, col::_NumericCol) + data = col.data + sel_ptr = col.sel === nothing ? Ptr{Int64}(C_NULL) : pointer(col.sel) + validity_ptr = col.validity === nothing ? + Ptr{UInt8}(C_NULL) : + Ptr{UInt8}(pointer(col.validity.chunks)) + len = col.sel === nothing ? length(data) : length(col.sel) + return ColumnSlice( + Ptr{Cvoid}(pointer(data)), + Ptr{Int64}(C_NULL), + validity_ptr, + sel_ptr, + Csize_t(len), + ) +end + +# Materialize a `ColumnSlice` for a string column. Fills the writer's per-position +# `str_ptrs` / `str_lens` scratch in place — `resize!` is amortized O(1) once steady state +# is reached, so this is allocation-free across the streaming pipeline. +@inline function _build_column_slice(writer::DataFileWriter, idx::Int, col::_StringCol) + strings = col.strings + n = length(strings) + str_ptrs = writer._str_ptrs[idx] + str_lens = writer._str_lens[idx] + resize!(str_ptrs, n) + resize!(str_lens, n) + is_nullable = col.validity !== nothing + @inbounds for i in 1:n + if is_nullable && !col.validity[i] + str_ptrs[i] = Ptr{UInt8}(C_NULL) + str_lens[i] = 0 + else + s = strings[i] + str_ptrs[i] = pointer(s) + str_lens[i] = ncodeunits(s) + end + end + validity_ptr = col.validity === nothing ? + Ptr{UInt8}(C_NULL) : + Ptr{UInt8}(pointer(col.validity.chunks)) + return ColumnSlice( + Ptr{Cvoid}(pointer(str_ptrs)), + pointer(str_lens), + validity_ptr, + Ptr{Int64}(C_NULL), + Csize_t(n), + ) +end + """ flush!(writer::DataFileWriter) From d3a478453032e4a1bc04eb80814bd25218dc0ab7 Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Tue, 26 May 2026 13:14:18 +0200 Subject: [PATCH 21/27] writer: reusable RowChunk + empty!; build ColumnSlice inline at push! time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Numeric push! builds ColumnSlice inline like the pre-Option-D code — fully type-stable, no allocation beyond chunk.slices/preserve growth. String push! also builds ColumnSlice inline, but materializes ptr/len into a per-column-position pool owned by the RowChunk. Base.empty!(chunk) resets the working vectors without freeing the pool, so streaming pipelines that follow the `empty!; push!; append!` idiom pay zero per-iteration allocation for string-gather buffers (down from 9 GiB on the 100M×4-string benchmark). push!(chunk, ::AbstractVector{<:AbstractString}) accepts any AbstractString subtype directly (VariableSizeString, SubString{String}, …), so callers don't have to materialize through Vector{String}. String columns now also support `sel=` for scattered access — same shape as the numeric overload. The original API documented that strings couldn't support sel; that rationale (caller had to provide pre-built ptr/len in the old low-level overload) no longer applies once the gather lives in push!. Replaces the Option D commit (8ded709): deferring numeric prep into append! introduced an Any-typed wrapper struct that broke type stability and cost ~10-20% on numeric workloads. This commit puts numerics back on the typed path while keeping the string buffer-reuse win. Co-Authored-By: Claude Opus 4.7 --- src/writer.jl | 295 +++++++++++++++++++++++++++----------------------- 1 file changed, 160 insertions(+), 135 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 984c7ac..b66e579 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -86,24 +86,6 @@ struct ParquetWriterPropertiesFFI statistics_enabled::Bool end -""" - ColumnSlice - -FFI struct describing one column's contribution to a single `RowChunk`. Internal — -users build `RowChunk`s via `push!` instead of constructing `ColumnSlice` directly. - -All fields are 8 bytes; total struct size is 40 bytes with no padding (matches Rust's -`ColumnSlice` layout). Defined here (rather than next to `RowChunk`) so the -`DataFileWriter` struct can carry a `Vector{ColumnSlice}` scratch buffer. -""" -struct ColumnSlice - data_ptr::Ptr{Cvoid} - lengths_ptr::Ptr{Int64} - validity_ptr::Ptr{UInt8} - sel_ptr::Ptr{Int64} - len::Csize_t -end - """ DataFileWriter @@ -116,21 +98,12 @@ data files. The writer tracks any `DataFiles` produced by `close_writer` and automatically frees them when `free_writer!` is called, unless they have already been freed or consumed. - -The writer also owns per-column scratch buffers (`_slices`, `_str_ptrs`, -`_str_lens`) that are reused across `append!` calls. They grow lazily on first -use and stay sized for the steady state, so a typical streaming pipeline pays -no per-`append!` allocation for these. """ mutable struct DataFileWriter ptr::Ptr{Cvoid} table::Table # Keep reference to table to prevent GC colmeta::Dict{Symbol, Vector{Pair{String, String}}} # Column metadata with Iceberg field IDs data_files::Union{DataFiles, Nothing} # Track DataFiles for automatic cleanup - # Scratch buffers reused across append! calls. Sized lazily by `append!`. - _slices::Vector{ColumnSlice} # per-call FFI argument - _str_ptrs::Vector{Vector{Ptr{UInt8}}} # one buffer per column position - _str_lens::Vector{Vector{Int64}} # one buffer per column position end # Response type for writer creation @@ -259,10 +232,7 @@ function DataFileWriter(table::Table, config::WriterConfig) # This metadata is added to Arrow IPC data so iceberg-rust can match fields colmeta = get_column_metadata(table) - return DataFileWriter( - response.value, table, colmeta, nothing, - ColumnSlice[], Vector{Ptr{UInt8}}[], Vector{Int64}[], - ) + return DataFileWriter(response.value, table, colmeta, nothing) end # Convenience constructor with keyword arguments @@ -611,32 +581,29 @@ function iceberg_column_type(d::IcebergDecimal) end end -# ------------------------------------------------------------------------------------------ -# Internal column-entry types stored inside a `RowChunk`. -# -# `push!` only records a reference to the user's data plus optional validity/sel; no -# `pointer(...)`-taking and no allocation of ptr/len arrays happens at `push!` time. -# The full `ColumnSlice` (the FFI argument) is materialized in `append!` using writer-owned -# scratch — so per-`append!` allocations stay flat regardless of chunk size. -# ------------------------------------------------------------------------------------------ - -struct _NumericCol - data::Any # AbstractVector{T} for some T - validity::Union{Nothing, BitVector} - sel::Union{Nothing, Vector{Int64}} -end +""" + ColumnSlice + +FFI struct describing one column's contribution to a single `RowChunk`. Internal — +users build `RowChunk`s via `push!` instead of constructing `ColumnSlice` directly. -struct _StringCol - strings::Any # AbstractVector{<:AbstractString} - validity::Union{Nothing, BitVector} +All fields are 8 bytes; total struct size is 40 bytes with no padding (matches Rust's +`ColumnSlice` layout). +""" +struct ColumnSlice + data_ptr::Ptr{Cvoid} + lengths_ptr::Ptr{Int64} + validity_ptr::Ptr{UInt8} + sel_ptr::Ptr{Int64} + len::Csize_t end """ RowChunk A horizontal stripe of rows across all output columns — what a streaming producer hands -to the writer at each step. Build one by pushing column data in schema order; `push!` -just records the reference, so it allocates O(1) per column regardless of chunk size. +to the writer at each step. Build one by pushing column data in schema order, then call +`append!(writer, chunk)`. ```julia chunk = RowChunk() @@ -647,22 +614,59 @@ push!(chunk, tags; validity=valid_bv) # nullable strings append!(writer, chunk) ``` -A `RowChunk` is single-use: after `append!` returns Rust has copied all data, so the -source arrays may be released and the chunk discarded. The writer owns the scratch -buffers used to materialize the FFI arguments, so they are reused across `append!` -calls without per-call allocation. +For streaming pipelines that push many chunks, reuse the same `RowChunk` and call +`empty!(chunk)` at the top of each iteration. `empty!` clears the working vectors but +retains the chunk's internal pool of `str_ptrs` / `str_lens` buffers, so string columns +amortize their per-chunk gather work to zero. + +```julia +chunk = RowChunk() +for slice in upstream + empty!(chunk) + push!(chunk, slice.ids) + push!(chunk, slice.tags; validity=slice.v) + append!(writer, chunk) +end +``` + +Column order must be stable across iterations (it has to match the writer's schema +anyway). The internal string pool is keyed by column position. """ mutable struct RowChunk - columns::Vector{Any} # holds _NumericCol or _StringCol instances + slices::Vector{ColumnSlice} # FFI-ready, one entry per push! + preserve::Vector{Any} # GC roots for source data / validity / sel + # Per-column-position string scratch. Entry `i` holds the `str_ptrs` / `str_lens` + # buffers for the i-th pushed column when it was a string column. Non-string positions + # have unused (length-0) entries. Retained across `empty!` so streaming reuse is free. + str_ptrs::Vector{Vector{Ptr{UInt8}}} + str_lens::Vector{Vector{Int64}} end -RowChunk() = RowChunk(Any[]) +RowChunk() = RowChunk(ColumnSlice[], Any[], Vector{Ptr{UInt8}}[], Vector{Int64}[]) + +""" + empty!(chunk::RowChunk) + +Reset the chunk's working vectors so the next pass can refill them. The internal string +ptr/len pool is *not* freed — it stays sized at the previous high-water mark, so a +streaming loop that calls `empty!` then `push!` repeatedly pays zero per-iteration +allocation for the string-gather buffers. + +Drop the chunk and create a new one if you want to actually reclaim the pool memory. +""" +function Base.empty!(chunk::RowChunk) + empty!(chunk.slices) + empty!(chunk.preserve) + return chunk +end """ push!(chunk::RowChunk, data::AbstractVector{T}; validity=nothing, sel=nothing) -Record a non-string column slice on the chunk. +Add a non-string column slice to the chunk. Builds the FFI-ready `ColumnSlice` inline — +just `pointer(data)` and pointer arithmetic, no allocation beyond the chunk's own +`slices` / `preserve` growth. - `validity`: optional `BitVector` where `true` = valid, `false` = null. - `sel`: optional `Vector{Int64}` of 1-based indices into `data` for scattered access. @@ -674,116 +678,137 @@ function Base.push!( validity::Union{Nothing, BitVector} = nothing, sel::Union{Nothing, Vector{Int64}} = nothing, ) where T - push!(chunk.columns, _NumericCol(data, validity, sel)) + len = sel === nothing ? length(data) : length(sel) + + sel_ptr = if sel !== nothing + push!(chunk.preserve, sel) + pointer(sel) + else + Ptr{Int64}(C_NULL) + end + + validity_ptr = if validity !== nothing + push!(chunk.preserve, validity) + Ptr{UInt8}(pointer(validity.chunks)) + else + Ptr{UInt8}(C_NULL) + end + + push!(chunk.preserve, data) + push!(chunk.slices, ColumnSlice( + Ptr{Cvoid}(pointer(data)), + Ptr{Int64}(C_NULL), + validity_ptr, + sel_ptr, + Csize_t(len), + )) return chunk end """ - push!(chunk::RowChunk, strings::AbstractVector{<:AbstractString}; validity=nothing) + push!(chunk::RowChunk, strings::AbstractVector{<:AbstractString}; + validity=nothing, sel=nothing) -Record a string column slice on the chunk. Accepts any `AbstractString` element type -(`String`, `SubString{String}`, Arrow's `VariableSizeString`, …); the pointer/length -gather happens inside `append!` so there's no per-`push!` materialization or copy. +Add a string column slice to the chunk. Accepts any `AbstractString` element type +(`String`, `SubString{String}`, Arrow's `VariableSizeString`, …) — `pointer(s)` and +`ncodeunits(s)` are used directly, no materialization through `Vector{String}`. -- `validity`: optional `BitVector` where `true` = valid, `false` = null. +The ptr/len gather buffers live in the chunk's per-position pool and are reused across +`empty!`/refill cycles, so a streaming loop pays no allocation for them after the first +chunk. + +- `validity`: optional `BitVector` (length matches the number of output rows) where + `true` = valid, `false` = null. +- `sel`: optional `Vector{Int64}` of 1-based indices into `strings` for scattered access. + If omitted, all rows of `strings` are used sequentially. """ function Base.push!( chunk::RowChunk, strings::AbstractVector{<:AbstractString}; validity::Union{Nothing, BitVector} = nothing, + sel::Union{Nothing, Vector{Int64}} = nothing, ) - push!(chunk.columns, _StringCol(strings, validity)) + pos = length(chunk.slices) + 1 + # Grow the per-position pool to cover this column's slot. Lazy and retained across + # `empty!` calls, so subsequent iterations are no-ops here. + while length(chunk.str_ptrs) < pos + push!(chunk.str_ptrs, Ptr{UInt8}[]) + push!(chunk.str_lens, Int64[]) + end + str_ptrs = chunk.str_ptrs[pos] + str_lens = chunk.str_lens[pos] + + n = sel === nothing ? length(strings) : length(sel) + resize!(str_ptrs, n) # amortized O(1) once the pool is sized + resize!(str_lens, n) + is_nullable = validity !== nothing + if sel === nothing + @inbounds for i in 1:n + if is_nullable && !validity[i] + str_ptrs[i] = Ptr{UInt8}(C_NULL) + str_lens[i] = 0 + else + s = strings[i] + str_ptrs[i] = pointer(s) + str_lens[i] = ncodeunits(s) + end + end + else + @inbounds for i in 1:n + if is_nullable && !validity[i] + str_ptrs[i] = Ptr{UInt8}(C_NULL) + str_lens[i] = 0 + else + s = strings[sel[i]] + str_ptrs[i] = pointer(s) + str_lens[i] = ncodeunits(s) + end + end + push!(chunk.preserve, sel) + end + + push!(chunk.preserve, strings) + validity_ptr = if validity !== nothing + push!(chunk.preserve, validity) + Ptr{UInt8}(pointer(validity.chunks)) + else + Ptr{UInt8}(C_NULL) + end + + push!(chunk.slices, ColumnSlice( + Ptr{Cvoid}(pointer(str_ptrs)), + pointer(str_lens), + validity_ptr, + Ptr{Int64}(C_NULL), + Csize_t(n), + )) return chunk end """ append!(writer::DataFileWriter, chunk::RowChunk) -Hand one `RowChunk` to the writer. All per-column FFI prep (pointer-taking, validity bitmap -lookup, string ptr/len gather) happens here using writer-owned scratch buffers, so per-call -allocations stay flat. Rust then copies the slice data synchronously into per-column -buffers; the source arrays may be released the moment this call returns. +Hand one `RowChunk` to the writer. The chunk's `slices` are already FFI-ready, so this +just pins memory, fires the FFI, and returns. Rust copies all slice data synchronously +into per-column buffers; the source arrays may be released the moment this call returns. When the accumulated window reaches the coalesce size the writer auto-flushes a `RecordBatch` to the encode pool. `append!` calls are appended in order — no reordering. """ function Base.append!(writer::DataFileWriter, chunk::RowChunk) writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) - n = length(chunk.columns) - n == 0 && throw(IcebergException("RowChunk has no columns")) - - # Grow writer scratch to fit `n` columns. Stays at this capacity for the writer's - # lifetime, so the steady-state cost is zero. - resize!(writer._slices, n) - while length(writer._str_ptrs) < n - push!(writer._str_ptrs, Ptr{UInt8}[]) - push!(writer._str_lens, Int64[]) - end - - ret = GC.@preserve writer chunk begin - @inbounds for i in 1:n - writer._slices[i] = _build_column_slice(writer, i, chunk.columns[i]) - end + isempty(chunk.slices) && throw(IcebergException("RowChunk has no columns")) + ret = GC.@preserve chunk begin @ccall rust_lib.iceberg_writer_append( writer.ptr::Ptr{Cvoid}, - pointer(writer._slices)::Ptr{ColumnSlice}, - n::Csize_t, + pointer(chunk.slices)::Ptr{ColumnSlice}, + length(chunk.slices)::Csize_t, )::Int32 end ret == 0 || throw(IcebergException("append! failed (see close_writer for details)")) return writer end -# Materialize a `ColumnSlice` for a numeric column. No allocation — just pointer-taking. -@inline function _build_column_slice(::DataFileWriter, ::Int, col::_NumericCol) - data = col.data - sel_ptr = col.sel === nothing ? Ptr{Int64}(C_NULL) : pointer(col.sel) - validity_ptr = col.validity === nothing ? - Ptr{UInt8}(C_NULL) : - Ptr{UInt8}(pointer(col.validity.chunks)) - len = col.sel === nothing ? length(data) : length(col.sel) - return ColumnSlice( - Ptr{Cvoid}(pointer(data)), - Ptr{Int64}(C_NULL), - validity_ptr, - sel_ptr, - Csize_t(len), - ) -end - -# Materialize a `ColumnSlice` for a string column. Fills the writer's per-position -# `str_ptrs` / `str_lens` scratch in place — `resize!` is amortized O(1) once steady state -# is reached, so this is allocation-free across the streaming pipeline. -@inline function _build_column_slice(writer::DataFileWriter, idx::Int, col::_StringCol) - strings = col.strings - n = length(strings) - str_ptrs = writer._str_ptrs[idx] - str_lens = writer._str_lens[idx] - resize!(str_ptrs, n) - resize!(str_lens, n) - is_nullable = col.validity !== nothing - @inbounds for i in 1:n - if is_nullable && !col.validity[i] - str_ptrs[i] = Ptr{UInt8}(C_NULL) - str_lens[i] = 0 - else - s = strings[i] - str_ptrs[i] = pointer(s) - str_lens[i] = ncodeunits(s) - end - end - validity_ptr = col.validity === nothing ? - Ptr{UInt8}(C_NULL) : - Ptr{UInt8}(pointer(col.validity.chunks)) - return ColumnSlice( - Ptr{Cvoid}(pointer(str_ptrs)), - pointer(str_lens), - validity_ptr, - Ptr{Int64}(C_NULL), - Csize_t(n), - ) -end - """ flush!(writer::DataFileWriter) From dfeca6480214740f592b377527c6ce51f374496e Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 27 May 2026 13:25:46 +0200 Subject: [PATCH 22/27] Fix segfault --- iceberg_rust_ffi/src/record_batch_builder.rs | 35 ++++++++++++++------ iceberg_rust_ffi/src/writer_columns.rs | 7 ++-- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/iceberg_rust_ffi/src/record_batch_builder.rs b/iceberg_rust_ffi/src/record_batch_builder.rs index 94c72a9..fef8470 100644 --- a/iceberg_rust_ffi/src/record_batch_builder.rs +++ b/iceberg_rust_ffi/src/record_batch_builder.rs @@ -281,13 +281,15 @@ unsafe fn append_to_state( append_numeric(buf, slice, state.column_type, len)?; } ColumnValues::Bool(v) => { - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len) }; if slice.sel_ptr.is_null() { + let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len) }; v.extend_from_slice(src); } else { + // See `append_primitive!` for why scatter uses a raw pointer. + let src = slice.data_ptr as *const u8; let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; for &idx in sel { - v.push(src[(idx - 1) as usize]); + v.push(unsafe { *src.add((idx - 1) as usize) }); } } } @@ -347,20 +349,26 @@ const PREFETCH_DIST: usize = 16; macro_rules! append_primitive { ($buf:expr, $slice:expr, $len:expr, $T:ty) => {{ - let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $T, $len) }; if $slice.sel_ptr.is_null() { + let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $T, $len) }; $buf.extend_from_slice(unsafe { as_bytes(src) }); } else { + // Scatter: index into the source array via raw pointer arithmetic. The + // source array may be longer than `$len` (which is the output stripe + // length), so a `&[T]` of length `$len` would false-positive on any + // sel index ≥ $len. + let src = $slice.data_ptr as *const $T; let sel = unsafe { std::slice::from_raw_parts($slice.sel_ptr, $len) }; for (i, &idx) in sel.iter().enumerate() { if i + PREFETCH_DIST < $len { unsafe { prefetch_read( - src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize) as *const u8 + src.add((sel[i + PREFETCH_DIST] - 1) as usize) as *const u8 ) }; } - $buf.extend_from_slice(&src[(idx - 1) as usize].to_ne_bytes()); + let v = unsafe { *src.add((idx - 1) as usize) }; + $buf.extend_from_slice(&v.to_ne_bytes()); } } }}; @@ -370,15 +378,18 @@ macro_rules! append_primitive { // `$f` maps S → a value whose `.to_ne_bytes()` is written to buf. macro_rules! append_transform { ($buf:expr, $slice:expr, $len:expr, $S:ty, $f:expr) => {{ - let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $S, $len) }; if $slice.sel_ptr.is_null() { + let src = unsafe { std::slice::from_raw_parts($slice.data_ptr as *const $S, $len) }; for &v in src { $buf.extend_from_slice(&($f)(v).to_ne_bytes()); } } else { + // See `append_primitive!` for why scatter uses a raw pointer. + let src = $slice.data_ptr as *const $S; let sel = unsafe { std::slice::from_raw_parts($slice.sel_ptr, $len) }; for &idx in sel { - $buf.extend_from_slice(&($f)(src[(idx - 1) as usize]).to_ne_bytes()); + let v = unsafe { *src.add((idx - 1) as usize) }; + $buf.extend_from_slice(&($f)(v).to_ne_bytes()); } } }}; @@ -407,21 +418,25 @@ unsafe fn append_numeric( } COLUMN_TYPE_DECIMAL_INT128 | COLUMN_TYPE_UUID => { // 16-byte elements — no primitive type; copy as raw bytes. - let src = unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len * 16) }; if slice.sel_ptr.is_null() { + let src = + unsafe { std::slice::from_raw_parts(slice.data_ptr as *const u8, len * 16) }; buf.extend_from_slice(src); } else { + // See `append_primitive!` for why scatter uses a raw pointer. + let src = slice.data_ptr as *const u8; let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; for (i, &idx) in sel.iter().enumerate() { if i + PREFETCH_DIST < len { unsafe { prefetch_read( - src.as_ptr().add((sel[i + PREFETCH_DIST] - 1) as usize * 16), + src.add((sel[i + PREFETCH_DIST] - 1) as usize * 16), ) }; } let off = (idx - 1) as usize * 16; - buf.extend_from_slice(&src[off..off + 16]); + let chunk = unsafe { std::slice::from_raw_parts(src.add(off), 16) }; + buf.extend_from_slice(chunk); } } } diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index 3c93c2b..ac9e1a6 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -37,9 +37,12 @@ pub const COLUMN_TYPE_JULIA_TIMESTAMPTZ_NS: i32 = 17; /// builder will copy on `append`. All fields are 8 bytes; total struct size is 40 bytes /// with no padding. /// -/// - `sel_ptr = null` → sequential (identity) access: read `data[0..len]`. +/// - `sel_ptr = null` → sequential (identity) access: read `data[0..len]`. `data_ptr` +/// must be valid for `len` elements. /// - `sel_ptr != null` → scattered access: read `data[sel[i] - 1]` for `i in 0..len` -/// (1-based Julia indices). +/// (1-based Julia indices). `data_ptr` must be valid for `max(sel)` elements; the +/// source array is typically longer than `len`, so the gather path uses raw pointer +/// arithmetic rather than a `&[T]` of length `len`. /// - `validity_ptr = null` → all rows in this slice are valid. /// - `lengths_ptr != null` → string column: `data_ptr` is `*const *const u8`, /// `lengths_ptr` is `*const i64` of byte lengths per string. From 867c7bbe61db1669bcc9a09dc2aa6ab9422988c3 Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 27 May 2026 14:21:38 +0200 Subject: [PATCH 23/27] writer: source-aligned validity semantics; gather through sel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Validity bitmaps passed to `push!` are now source-aligned by contract (`length(validity) >= length(data)`, bit `i` describes `data[i]`), matching the natural shape of `NullableVector` / `Vector{Union{T,Missing}}` / Arrow nullable arrays. With `sel == nothing`, source and output positions coincide and the steady-state hot path is unchanged. Numeric path: Rust gathers validity through `sel` alongside the value gather in `append_to_state` — added a new branch before the existing memcpy and bit-by-bit paths. String path: Rust never sees `sel_ptr` for string columns (the value gather is pre-applied on the Julia side because Rust can't walk an `AbstractString` vector across the FFI). The validity gather now folds into that same Julia-side loop, materializing an output-aligned bitmap into a new per-column `RowChunk.str_validity` pool retained across `empty!` so streaming reuse stays zero-allocation. The consumer-visible contract is identical for numeric and string columns: pass a source-aligned `validity` either way; the library does the gather. Tests: - "multi-chunk coalescing" translated to source-aligned validity. - New "source-aligned validity with non-identity sel" testset covers both numeric and string columns with a 5-element source and sel=[3,1,4]. --- iceberg_rust_ffi/src/record_batch_builder.rs | 26 ++++- src/writer.jl | 75 +++++++++--- test/writer_tests.jl | 115 ++++++++++++++++++- 3 files changed, 195 insertions(+), 21 deletions(-) diff --git a/iceberg_rust_ffi/src/record_batch_builder.rs b/iceberg_rust_ffi/src/record_batch_builder.rs index fef8470..10a2ab5 100644 --- a/iceberg_rust_ffi/src/record_batch_builder.rs +++ b/iceberg_rust_ffi/src/record_batch_builder.rs @@ -237,10 +237,28 @@ unsafe fn append_to_state( } else if state.null_bits.len() < needed { state.null_bits.resize(needed, 0u8); } - // Copy validity bits. When source and destination are byte-aligned (out_start - // is a multiple of 8 — always true for flush-per-slice), one copy_nonoverlapping - // replaces the 4096-iteration per-bit loop. - if out_start % 8 == 0 { + // Copy validity bits. Three paths: + // (1) `sel_ptr != null` — source-aligned bitmap + non-identity sel: gather + // bit `sel[i] - 1` into output position `out_start + i`. + // (2) `out_start % 8 == 0` — byte-aligned destination (the steady state + // under flush-per-slice): one `copy_nonoverlapping` for the whole slice. + // (3) otherwise — bit-by-bit copy. + // The gather path uses raw pointer arithmetic on `validity_ptr` because the + // source bitmap covers `length(data)` bits (≥ `max(sel)`), which is typically + // larger than `len`. ORing into 0-initialized `null_bits` is safe even when + // `out_start + len` shares a byte with prior rows — we never write a 0 over a + // previously-set 1 because `set_bits_range(0, out_start)` already populated + // earlier rows in a separate phase. + if !slice.sel_ptr.is_null() { + let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; + for i in 0..len { + let src_idx = (sel[i] - 1) as usize; + let b = + unsafe { (*slice.validity_ptr.add(src_idx / 8) >> (src_idx % 8)) & 1 }; + let pos = out_start + i; + state.null_bits[pos / 8] |= b << (pos % 8); + } + } else if out_start % 8 == 0 { let dst = out_start / 8; let n_bytes = (len + 7) / 8; unsafe { diff --git a/src/writer.jl b/src/writer.jl index b66e579..b6d7985 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -607,10 +607,13 @@ to the writer at each step. Build one by pushing column data in schema order, th ```julia chunk = RowChunk() -push!(chunk, ids) # non-nullable numeric, sequential -push!(chunk, values; validity=valid_bv) # nullable numeric, sequential -push!(chunk, scores; sel=sel_indices) # non-nullable scattered -push!(chunk, tags; validity=valid_bv) # nullable strings +push!(chunk, ids) # non-nullable numeric, sequential +push!(chunk, values; validity=src_valid_bv) # nullable numeric, sequential +push!(chunk, source_scores; + validity=src_valid_bv, sel=sel_indices) # nullable scattered: validity is + # aligned to `source_scores`; the + # library gathers through `sel`. +push!(chunk, tags; validity=src_valid_bv) # nullable strings append!(writer, chunk) ``` @@ -635,14 +638,20 @@ anyway). The internal string pool is keyed by column position. mutable struct RowChunk slices::Vector{ColumnSlice} # FFI-ready, one entry per push! preserve::Vector{Any} # GC roots for source data / validity / sel - # Per-column-position string scratch. Entry `i` holds the `str_ptrs` / `str_lens` - # buffers for the i-th pushed column when it was a string column. Non-string positions - # have unused (length-0) entries. Retained across `empty!` so streaming reuse is free. + # Per-column-position string scratch. Entry `i` holds the `str_ptrs` / `str_lens` / + # `str_validity` buffers for the i-th pushed column when it was a string column. + # `str_validity` is the output-aligned validity bitmap Rust receives when both `sel` + # and `validity` are present on a string push — see that `push!` overload for why. + # Non-string positions have unused (length-0) entries. Retained across `empty!` so + # streaming reuse is free. str_ptrs::Vector{Vector{Ptr{UInt8}}} str_lens::Vector{Vector{Int64}} + str_validity::Vector{BitVector} end -RowChunk() = RowChunk(ColumnSlice[], Any[], Vector{Ptr{UInt8}}[], Vector{Int64}[]) +RowChunk() = RowChunk( + ColumnSlice[], Any[], Vector{Ptr{UInt8}}[], Vector{Int64}[], BitVector[], +) """ empty!(chunk::RowChunk) @@ -668,7 +677,11 @@ Add a non-string column slice to the chunk. Builds the FFI-ready `ColumnSlice` i just `pointer(data)` and pointer arithmetic, no allocation beyond the chunk's own `slices` / `preserve` growth. -- `validity`: optional `BitVector` where `true` = valid, `false` = null. +- `validity`: optional `BitVector` *aligned to `data`* — `length(validity) >= length(data)`, + bit `i` describes whether `data[i]` is valid (`true` = valid, `false` = null). When + `sel` is also provided, Rust gathers validity through `sel` alongside the value gather: + bit `sel[i] - 1` of the source bitmap becomes bit `i` of the output bitmap. When `sel` + is omitted, source and output positions coincide. - `sel`: optional `Vector{Int64}` of 1-based indices into `data` for scattered access. If omitted, all rows of `data` are used sequentially. """ @@ -717,10 +730,17 @@ The ptr/len gather buffers live in the chunk's per-position pool and are reused `empty!`/refill cycles, so a streaming loop pays no allocation for them after the first chunk. -- `validity`: optional `BitVector` (length matches the number of output rows) where - `true` = valid, `false` = null. +- `validity`: optional `BitVector` *aligned to `strings`* — `length(validity) >= length(strings)`, + bit `i` describes whether `strings[i]` is valid (`true` = valid, `false` = null). - `sel`: optional `Vector{Int64}` of 1-based indices into `strings` for scattered access. If omitted, all rows of `strings` are used sequentially. + +Unlike the numeric `push!`, the value gather is performed here on the Julia side +(`pointer(strings[sel[i]])` per row) because Rust can't walk a Julia `AbstractString` +vector across the FFI. When both `sel` and `validity` are supplied, the validity gather +is folded into that same loop and an output-aligned bitmap is materialized in the +chunk's per-position pool. The consumer-visible contract is therefore identical to the +numeric case — pass a source-aligned `validity` either way. """ function Base.push!( chunk::RowChunk, @@ -734,6 +754,7 @@ function Base.push!( while length(chunk.str_ptrs) < pos push!(chunk.str_ptrs, Ptr{UInt8}[]) push!(chunk.str_lens, Int64[]) + push!(chunk.str_validity, BitVector()) end str_ptrs = chunk.str_ptrs[pos] str_lens = chunk.str_lens[pos] @@ -741,7 +762,18 @@ function Base.push!( n = sel === nothing ? length(strings) : length(sel) resize!(str_ptrs, n) # amortized O(1) once the pool is sized resize!(str_lens, n) + is_nullable = validity !== nothing + # Rust never sees `sel_ptr` for string columns (the value gather is pre-applied + # below), so when `sel` *and* `validity` are both supplied we have to rewrite the + # source-aligned validity bitmap to an output-aligned one here. With `sel === nothing`, + # source and output positions coincide and we pass `validity` through unchanged. + needs_validity_gather = is_nullable && sel !== nothing + str_validity = needs_validity_gather ? chunk.str_validity[pos] : nothing + if needs_validity_gather + resize!(str_validity, n) + end + if sel === nothing @inbounds for i in 1:n if is_nullable && !validity[i] @@ -753,22 +785,35 @@ function Base.push!( str_lens[i] = ncodeunits(s) end end - else + elseif needs_validity_gather @inbounds for i in 1:n - if is_nullable && !validity[i] + src = sel[i] + if !validity[src] str_ptrs[i] = Ptr{UInt8}(C_NULL) str_lens[i] = 0 + str_validity[i] = false else - s = strings[sel[i]] + s = strings[src] str_ptrs[i] = pointer(s) str_lens[i] = ncodeunits(s) + str_validity[i] = true end end push!(chunk.preserve, sel) + else + @inbounds for i in 1:n + s = strings[sel[i]] + str_ptrs[i] = pointer(s) + str_lens[i] = ncodeunits(s) + end + push!(chunk.preserve, sel) end push!(chunk.preserve, strings) - validity_ptr = if validity !== nothing + validity_ptr = if needs_validity_gather + push!(chunk.preserve, str_validity) + Ptr{UInt8}(pointer(str_validity.chunks)) + elseif validity !== nothing push!(chunk.preserve, validity) Ptr{UInt8}(pointer(validity.chunks)) else diff --git a/test/writer_tests.jl b/test/writer_tests.jl index e7b5ffd..060be31 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1198,11 +1198,15 @@ end println("✅ Chunk 1 appended") # --- Chunk 2: rows 1-2 (score uses a scattered selection) --- - # score_src: [99.9, 3.3, 88.8], sel=[2,1] → values [3.3, 99.9], valid=[true,false] + # score_src: [99.9, 3.3, 88.8], sel=[2,1] → values [3.3, 99.9]. + # Source-aligned validity describes the source row, so the bits map as: + # source row 0 = 99.9 → false (output row 1 will be null) + # source row 1 = 3.3 → true (output row 0 is valid) + # source row 2 = 88.8 → unused (not selected); set to true for cleanliness. c2 = RustyIceberg.RowChunk() push!(c2, Int64[2, 3]) push!(c2, Float64[99.9, 3.3, 88.8]; - sel=Int64[2, 1], validity=BitVector([true, false])) + sel=Int64[2, 1], validity=BitVector([false, true, true])) push!(c2, ["", "gamma"]; validity=BitVector([false, true])) append!(writer, c2) println("✅ Chunk 2 appended (scattered score, nullable strings)") @@ -1275,6 +1279,113 @@ end println("\n✅ Streaming multi-chunk tests completed!") end +@testset "Writer streaming — source-aligned validity with non-identity sel" begin + # Regression test for the source-aligned-validity gather. The source arrays are + # length 5, sel=[3, 1, 4] picks 3 output rows, and the validity bitmaps are + # source-aligned (length 5). Under the old output-aligned semantics the library + # would have memcpy'd validity bits [0..3) into the output without gathering, and + # the null pattern would have been wrong. Under source-aligned semantics the + # numeric path gathers through `sel` in Rust and the string path gathers through + # `sel` on the Julia side of `push!` — the visible contract is the same. + println("Testing source-aligned validity + non-identity sel...") + + catalog_uri = get_catalog_uri() + props = get_catalog_properties() + + catalog = nothing + table = C_NULL + data_files = nothing + test_namespace = nothing + table_name = nothing + + try + catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props) + @test catalog !== nothing + + test_namespace = ["test_builder_srcval_$(round(Int, time() * 1000))"] + RustyIceberg.create_namespace(catalog, test_namespace) + + schema = Schema([ + Field(Int32(1), "id", IcebergLong(); required=true), + Field(Int32(2), "v", IcebergDouble(); required=false), + Field(Int32(3), "tag", IcebergString(); required=false), + ]) + + table_name = "builder_srcval_$(round(Int, time() * 1000))" + table = RustyIceberg.create_table(catalog, test_namespace, table_name, schema) + @test table != C_NULL + + # Source columns (length 5) and sel = [3, 1, 4]: + # output row 0 ← source row 3 → v=30.0 (valid), tag="gamma" (valid) + # output row 1 ← source row 1 → v=10.0 (valid), tag="alpha" (valid) + # output row 2 ← source row 4 → v=40.0 (null!), tag="" (null) + v_src = Float64[10.0, 20.0, 30.0, 40.0, 50.0] + v_validity = BitVector([true, false, true, false, true]) # source-aligned + tag_src = String["alpha", "beta", "gamma", "", "epsilon"] + tag_valid = BitVector([true, true, true, false, true]) # source-aligned + sel = Int64[3, 1, 4] + + data_files = RustyIceberg.with_data_file_writer(table) do writer + c = RustyIceberg.RowChunk() + push!(c, Int64[1, 2, 3]) + push!(c, v_src; sel=sel, validity=v_validity) + push!(c, tag_src; sel=sel, validity=tag_valid) + append!(writer, c) + end + @test data_files !== nothing && data_files.ptr != C_NULL + + updated_table = RustyIceberg.with_transaction(table, catalog) do tx + RustyIceberg.with_fast_append(tx) do action + RustyIceberg.add_data_files(action, data_files) + end + end + @test updated_table != C_NULL + + tbl = read_table_data(updated_table) + @test tbl !== nothing + @test length(tbl.id) == 3 + + perm = sortperm(tbl.id) + ids = tbl.id[perm] + vs = tbl.v[perm] + tags = tbl.tag[perm] + + # id was written [1, 2, 3] in chunk order, paired with output rows 0..2. + # After sortperm(tbl.id) the position is id ascending, so: + # ids[1]=1 ↔ output row 0 ↔ source row 3 → v=30.0, tag="gamma" + # ids[2]=2 ↔ output row 1 ↔ source row 1 → v=10.0, tag="alpha" + # ids[3]=3 ↔ output row 2 ↔ source row 4 → null, tag=null + @test ids == Int64[1, 2, 3] + @test !ismissing(vs[1]) && vs[1] ≈ 30.0 + @test !ismissing(vs[2]) && vs[2] ≈ 10.0 + @test ismissing(vs[3]) + + @test !ismissing(tags[1]) && tags[1] == "gamma" + @test !ismissing(tags[2]) && tags[2] == "alpha" + @test ismissing(tags[3]) + + RustyIceberg.free_table(updated_table) + finally + if data_files !== nothing && data_files.ptr != C_NULL + RustyIceberg.free_data_files!(data_files) + end + if table != C_NULL + RustyIceberg.free_table(table) + end + if table_name !== nothing && test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_table(catalog, test_namespace, table_name) + end + if test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_namespace(catalog, test_namespace) + end + if catalog !== nothing + RustyIceberg.free_catalog!(catalog) + end + end + + println("✅ Source-aligned validity + sel test completed") +end + @testset "Writer streaming — explicit flush! between windows" begin println("Testing explicit flush!(writer) between windows...") From 03121ad7dfe1caa57c78fb7777852ee2660d2fc4 Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 27 May 2026 14:53:01 +0200 Subject: [PATCH 24/27] writer: accept any contiguous AbstractVector{Int64} as sel Relaxes the `sel` parameter of both `push!` overloads from `Union{Nothing, Vector{Int64}}` to `Union{Nothing, AbstractVector{Int64}}` so callers can pass a `view(sel_buf, 1:n)` when their sel buffer carries stale capacity beyond the live region. `pointer(sel)` and `length(sel)` already work correctly for contiguous `SubArray{Int64,1,Vector{Int64},Tuple{UnitRange{Int}},true}` (verified: `pointer(view(v, 3:7)) == pointer(v) + 2*sizeof(Int64)`), so no body changes are needed. --- src/writer.jl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index b6d7985..67c00ed 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -682,14 +682,16 @@ just `pointer(data)` and pointer arithmetic, no allocation beyond the chunk's ow `sel` is also provided, Rust gathers validity through `sel` alongside the value gather: bit `sel[i] - 1` of the source bitmap becomes bit `i` of the output bitmap. When `sel` is omitted, source and output positions coincide. -- `sel`: optional `Vector{Int64}` of 1-based indices into `data` for scattered access. - If omitted, all rows of `data` are used sequentially. +- `sel`: optional contiguous `AbstractVector{Int64}` of 1-based indices into `data` for + scattered access. Output row count is `length(sel)`. If omitted, all rows of `data` + are used sequentially. A `view(sel_buf, 1:n)` is accepted — useful when the caller + has a sel buffer with stale capacity beyond the live region. """ function Base.push!( chunk::RowChunk, data::AbstractVector{T}; validity::Union{Nothing, BitVector} = nothing, - sel::Union{Nothing, Vector{Int64}} = nothing, + sel::Union{Nothing, AbstractVector{Int64}} = nothing, ) where T len = sel === nothing ? length(data) : length(sel) @@ -732,8 +734,9 @@ chunk. - `validity`: optional `BitVector` *aligned to `strings`* — `length(validity) >= length(strings)`, bit `i` describes whether `strings[i]` is valid (`true` = valid, `false` = null). -- `sel`: optional `Vector{Int64}` of 1-based indices into `strings` for scattered access. - If omitted, all rows of `strings` are used sequentially. +- `sel`: optional contiguous `AbstractVector{Int64}` of 1-based indices into `strings` + for scattered access. Output row count is `length(sel)`. If omitted, all rows of + `strings` are used sequentially. Unlike the numeric `push!`, the value gather is performed here on the Julia side (`pointer(strings[sel[i]])` per row) because Rust can't walk a Julia `AbstractString` @@ -746,7 +749,7 @@ function Base.push!( chunk::RowChunk, strings::AbstractVector{<:AbstractString}; validity::Union{Nothing, BitVector} = nothing, - sel::Union{Nothing, Vector{Int64}} = nothing, + sel::Union{Nothing, AbstractVector{Int64}} = nothing, ) pos = length(chunk.slices) + 1 # Grow the per-position pool to cover this column's slot. Lazy and retained across From e1512b00fa7ef62873a961a3ea773cc64e2023dd Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Wed, 27 May 2026 16:32:46 +0200 Subject: [PATCH 25/27] doc cleanup --- src/writer.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 67c00ed..e2eaa87 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -721,8 +721,10 @@ function Base.push!( end """ - push!(chunk::RowChunk, strings::AbstractVector{<:AbstractString}; - validity=nothing, sel=nothing) + push!( + chunk::RowChunk, strings::AbstractVector{<:AbstractString}; + validity=nothing, sel=nothing + ) Add a string column slice to the chunk. Accepts any `AbstractString` element type (`String`, `SubString{String}`, Arrow's `VariableSizeString`, …) — `pointer(s)` and From 15219d40e455cced63db7d2ee65062c7a75aa213 Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Thu, 28 May 2026 09:29:03 +0200 Subject: [PATCH 26/27] ci: satisfy rustfmt + clippy cargo fmt and address all clippy lints (div_ceil, is_multiple_of, vec![] over Vec::new+push, enumerate over index loop, size_of_val). Annotate get_or_init_builder with allow(mut_from_ref) since it relies on UnsafeCell interior mutability per the documented single-thread FFI contract. Co-Authored-By: Claude Opus 4.7 --- iceberg_rust_ffi/src/record_batch_builder.rs | 37 +++++++------------ iceberg_rust_ffi/src/writer.rs | 39 ++++++++++---------- 2 files changed, 33 insertions(+), 43 deletions(-) diff --git a/iceberg_rust_ffi/src/record_batch_builder.rs b/iceberg_rust_ffi/src/record_batch_builder.rs index 10a2ab5..91854ed 100644 --- a/iceberg_rust_ffi/src/record_batch_builder.rs +++ b/iceberg_rust_ffi/src/record_batch_builder.rs @@ -87,11 +87,7 @@ impl ColumnValues { bytes: Vec::new(), // Start empty; finalize_and_reset right-sizes to the actual slice length // after the first flush, so we never hold a 4MB coalesce_rows-sized Vec. - offsets: { - let mut v = Vec::new(); - v.push(0i32); - v - }, + offsets: vec![0i32], }, _ => { let bpr = column_bytes_per_row(column_type).max(8); // fallback 8 for unknown @@ -228,7 +224,7 @@ unsafe fn append_to_state( if state.is_nullable { if !slice.validity_ptr.is_null() { let out_start = state.rows; - let needed = (out_start + len + 7) / 8; + let needed = (out_start + len).div_ceil(8); if !state.has_nulls { // First null slice: backfill all prior rows as valid. state.null_bits.resize(needed, 0u8); @@ -251,16 +247,15 @@ unsafe fn append_to_state( // earlier rows in a separate phase. if !slice.sel_ptr.is_null() { let sel = unsafe { std::slice::from_raw_parts(slice.sel_ptr, len) }; - for i in 0..len { - let src_idx = (sel[i] - 1) as usize; - let b = - unsafe { (*slice.validity_ptr.add(src_idx / 8) >> (src_idx % 8)) & 1 }; + for (i, &s) in sel.iter().enumerate() { + let src_idx = (s - 1) as usize; + let b = unsafe { (*slice.validity_ptr.add(src_idx / 8) >> (src_idx % 8)) & 1 }; let pos = out_start + i; state.null_bits[pos / 8] |= b << (pos % 8); } - } else if out_start % 8 == 0 { + } else if out_start.is_multiple_of(8) { let dst = out_start / 8; - let n_bytes = (len + 7) / 8; + let n_bytes = len.div_ceil(8); unsafe { std::ptr::copy_nonoverlapping( slice.validity_ptr, @@ -270,7 +265,7 @@ unsafe fn append_to_state( } // Mask off garbage bits beyond `len` in the last byte so they don't // corrupt a subsequent coalesced slice that shares that byte. - if len % 8 != 0 { + if !len.is_multiple_of(8) { let tail = state.null_bits.last_mut().unwrap(); *tail &= (1u8 << (len % 8)) - 1; } @@ -284,7 +279,7 @@ unsafe fn append_to_state( } else if state.has_nulls { // All-valid slice but nulls seen earlier — extend bitmap with 1s. let out_start = state.rows; - let needed = (out_start + len + 7) / 8; + let needed = (out_start + len).div_ceil(8); if state.null_bits.len() < needed { state.null_bits.resize(needed, 0u8); } @@ -380,9 +375,7 @@ macro_rules! append_primitive { for (i, &idx) in sel.iter().enumerate() { if i + PREFETCH_DIST < $len { unsafe { - prefetch_read( - src.add((sel[i + PREFETCH_DIST] - 1) as usize) as *const u8 - ) + prefetch_read(src.add((sel[i + PREFETCH_DIST] - 1) as usize) as *const u8) }; } let v = unsafe { *src.add((idx - 1) as usize) }; @@ -447,9 +440,7 @@ unsafe fn append_numeric( for (i, &idx) in sel.iter().enumerate() { if i + PREFETCH_DIST < len { unsafe { - prefetch_read( - src.add((sel[i + PREFETCH_DIST] - 1) as usize * 16), - ) + prefetch_read(src.add((sel[i + PREFETCH_DIST] - 1) as usize * 16)) }; } let off = (idx - 1) as usize * 16; @@ -523,7 +514,7 @@ fn finalize_and_reset( ColumnValues::Bool(v) => { let cap = v.capacity(); let taken = std::mem::replace(v, Vec::with_capacity(cap)); - let mut bits = vec![0u8; (rows + 7) / 8]; + let mut bits = vec![0u8; rows.div_ceil(8)]; for (i, &b) in taken.iter().enumerate().take(rows) { if b != 0 { bits[i / 8] |= 1u8 << (i % 8); @@ -671,7 +662,5 @@ fn set_bits_range(bits: &mut [u8], start: usize, end: usize) { /// Reinterpret a typed slice as bytes. #[inline(always)] unsafe fn as_bytes(s: &[T]) -> &[u8] { - unsafe { - std::slice::from_raw_parts(s.as_ptr() as *const u8, s.len() * std::mem::size_of::()) - } + unsafe { std::slice::from_raw_parts(s.as_ptr() as *const u8, std::mem::size_of_val(s)) } } diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 8a70f8e..ae352a9 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -225,7 +225,10 @@ impl GlobalWorkerPool { .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) .is_ok() { - let mut guard = self.active_writers.lock().unwrap_or_else(|e| e.into_inner()); + let mut guard = self + .active_writers + .lock() + .unwrap_or_else(|e| e.into_inner()); guard.push(state.clone()); } } @@ -235,7 +238,10 @@ impl GlobalWorkerPool { if !state.registered.swap(false, Ordering::AcqRel) { return; } - let mut guard = self.active_writers.lock().unwrap_or_else(|e| e.into_inner()); + let mut guard = self + .active_writers + .lock() + .unwrap_or_else(|e| e.into_inner()); let target = state as *const WriterState; guard.retain(|s| Arc::as_ptr(s) != target); } @@ -243,7 +249,10 @@ impl GlobalWorkerPool { /// Snapshot the active-writers list. Returns Arc clones so subsequent encoding does /// not hold the list lock. fn snapshot(&self) -> Vec> { - let guard = self.active_writers.lock().unwrap_or_else(|e| e.into_inner()); + let guard = self + .active_writers + .lock() + .unwrap_or_else(|e| e.into_inner()); guard.clone() } } @@ -265,8 +274,7 @@ fn try_claim_writer(pool: &GlobalWorkerPool) -> Option> { if w.queue_len.load(Ordering::Acquire) == 0 { continue; } - if w - .busy + if w.busy .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) .is_ok() { @@ -476,9 +484,8 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { }) .ok() .expect("encode pool initialized twice"); - let pool_ref: &'static GlobalWorkerPool = GLOBAL_ENCODE_POOL - .get() - .expect("pool was just installed"); + let pool_ref: &'static GlobalWorkerPool = + GLOBAL_ENCODE_POOL.get().expect("pool was just installed"); // Spawn N async worker tasks on the tokio runtime. Each task runs // `encode_worker_loop`, which awaits at I/O boundaries inside `w.write()` — @@ -583,6 +590,7 @@ fn arrow_type_to_column_type(dt: &DataType) -> Result { /// # Safety /// Caller must ensure no other thread is accessing the writer at the same time. The FFI /// contract is one Julia thread per writer. +#[allow(clippy::mut_from_ref)] unsafe fn get_or_init_builder( writer_ref: &IcebergDataFileWriter, ) -> Result<&mut RecordBatchBuilder, anyhow::Error> { @@ -737,10 +745,7 @@ pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { // Set the poison flag BEFORE taking the writer out, so any encode task that // currently holds the writer outside the Mutex (across its `.await`) will see // `poisoned == true` when it goes to put the writer back, and drop it instead. - boxed - .writer_state - .poisoned - .store(true, Ordering::Release); + boxed.writer_state.poisoned.store(true, Ordering::Release); // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error // rather than writing to a partially-freed writer. let _ = boxed.writer_state.writer.lock().unwrap().take(); @@ -1178,10 +1183,8 @@ mod tests { // distinct writers. With <4 workers in the pool this would fail; on any modern // dev machine `available_parallelism() >= 4`. for chunk in completions.chunks(4) { - let distinct: std::collections::HashSet = chunk - .iter() - .map(|(wid, _)| writer_ids[wid]) - .collect(); + let distinct: std::collections::HashSet = + chunk.iter().map(|(wid, _)| writer_ids[wid]).collect(); assert_eq!( distinct.len(), 4, @@ -1220,9 +1223,7 @@ mod tests { handles.push(std::thread::spawn(move || { for batch_idx in 0..(BATCHES_PER_WRITER / 4) { for (wi, w) in writers.iter().enumerate() { - let id = (tid as i64) * 1_000_000 - + (wi as i64) * 10_000 - + batch_idx as i64; + let id = (tid as i64) * 1_000_000 + (wi as i64) * 10_000 + batch_idx as i64; push(pool, w, batch_with_id(id)); } } From 85a23b8639ee40b83f8b26156f3c0a638eb0c93e Mon Sep 17 00:00:00 2001 From: Richard Gankema Date: Thu, 28 May 2026 12:44:40 +0200 Subject: [PATCH 27/27] Bump version to 0.8.2; align writer exception codes with deferred-error pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use DATA_SCHEMA_MISMATCH (matching `_write_ipc_bytes`) for append!/flush! non-zero FFI returns — these are deferred-error placeholders; the real classified error surfaces via close_writer. Co-Authored-By: Claude Opus 4.7 --- Project.toml | 2 +- iceberg_rust_ffi/Cargo.lock | 2 +- iceberg_rust_ffi/Cargo.toml | 2 +- src/writer.jl | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index d1ab521..6790cdb 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "RustyIceberg" uuid = "390bdf5b-b624-43dc-a846-0ef7a3405804" -version = "0.8.1" +version = "0.8.2" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock index 952e2f1..cd3c359 100644 --- a/iceberg_rust_ffi/Cargo.lock +++ b/iceberg_rust_ffi/Cargo.lock @@ -1641,7 +1641,7 @@ dependencies = [ [[package]] name = "iceberg_rust_ffi" -version = "0.8.1" +version = "0.8.2" dependencies = [ "anyhow", "arrow-array", diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml index 9f203fe..85a4c99 100644 --- a/iceberg_rust_ffi/Cargo.toml +++ b/iceberg_rust_ffi/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iceberg_rust_ffi" -version = "0.8.1" +version = "0.8.2" edition = "2021" [lib] diff --git a/src/writer.jl b/src/writer.jl index 8567bfe..9b63326 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -882,8 +882,8 @@ function Base.append!(writer::DataFileWriter, chunk::RowChunk) )::Int32 end ret == 0 || throw(IcebergException( - INTERNAL, - "Internal error (please report this as a bug)", + DATA_SCHEMA_MISMATCH, + "Column not found in table schema", "append! failed (see close_writer for details)", )) return writer @@ -907,8 +907,8 @@ function flush!(writer::DataFileWriter) )) ret = @ccall rust_lib.iceberg_writer_flush(writer.ptr::Ptr{Cvoid})::Int32 ret == 0 || throw(IcebergException( - INTERNAL, - "Internal error (please report this as a bug)", + DATA_SCHEMA_MISMATCH, + "Column not found in table schema", "flush! failed (see close_writer for details)", )) return writer