From 3b33e83288dc4171a8bc79eef6e481959152c7e3 Mon Sep 17 00:00:00 2001
From: Paul Reinlein
Date: Wed, 11 Feb 2026 13:11:45 -0500
Subject: [PATCH] Example optimization hunt

---
 .../lading-optimize-hunt/assets/db.yaml            |  8 ++
 .../assets/db/block-read-at-zero-copy.yaml         | 25 ++++++
 .../db/trace-agent-v04-buffer-reuse.yaml           | 20 +++++
 .../lading-optimize-review/assets/db.yaml          |  6 ++
 .../db/opt-block-cache-read-at-zero-copy.yaml      | 49 +++++++++++
 .../db/opt-trace-agent-v04-buffer-reuse.yaml       | 28 ++++++
 lading_payload/src/block.rs                        | 88 ++++++++++++-------
 7 files changed, 190 insertions(+), 34 deletions(-)
 create mode 100644 .claude/skills/lading-optimize-hunt/assets/db/block-read-at-zero-copy.yaml
 create mode 100644 .claude/skills/lading-optimize-hunt/assets/db/trace-agent-v04-buffer-reuse.yaml
 create mode 100644 .claude/skills/lading-optimize-review/assets/db/opt-block-cache-read-at-zero-copy.yaml
 create mode 100644 .claude/skills/lading-optimize-review/assets/db/opt-trace-agent-v04-buffer-reuse.yaml

diff --git a/.claude/skills/lading-optimize-hunt/assets/db.yaml b/.claude/skills/lading-optimize-hunt/assets/db.yaml
index 5148e3c78..46150080b 100644
--- a/.claude/skills/lading-optimize-hunt/assets/db.yaml
+++ b/.claude/skills/lading-optimize-hunt/assets/db.yaml
@@ -19,3 +19,11 @@ entries:
     technique: buffer-reuse
     status: success
     file: assets/db/datadog-logs-buffer-reuse.yaml
+  - target: lading_payload/src/trace_agent/v04.rs::V04::to_bytes
+    technique: buffer-reuse
+    status: failure
+    file: assets/db/trace-agent-v04-buffer-reuse.yaml
+  - target: lading_payload/src/block.rs::Cache::read_at
+    technique: zero-copy-slice
+    status: success
+    file: assets/db/block-read-at-zero-copy.yaml
diff --git a/.claude/skills/lading-optimize-hunt/assets/db/block-read-at-zero-copy.yaml b/.claude/skills/lading-optimize-hunt/assets/db/block-read-at-zero-copy.yaml
new file mode 100644
index 000000000..cef82c4f3
--- /dev/null
+++ b/.claude/skills/lading-optimize-hunt/assets/db/block-read-at-zero-copy.yaml
@@ -0,0 +1,25 @@
+target: lading_payload/src/block.rs::Cache::read_at
+technique: zero-copy-slice
+status: success
+date: 2026-02-11
+measurements:
+  time:
+    cache_read_at_1KiB: -86.9% (36.7ns -> 4.8ns)
+    cache_read_at_64KiB: -99.5% (1.02us -> 4.9ns)
+    cache_read_at_1MiB: ~0% (expected, spans multiple blocks)
+  memory: N/A (macro benchmark does not exercise read_at)
+  allocations: N/A (macro benchmark does not exercise read_at)
+lessons: |
+  Bytes::slice() provides zero-copy reads when the entire read fits within
+  a single block. The optimization checks this condition before allocating
+  any output buffer. For single-block reads (the common case in logrotate_fs),
+  this eliminates both the BytesMut allocation and the memcpy, reducing
+  the operation to an O(1) reference count increment.
+
+  Key factors for success:
+  1. Block data is stored as immutable Bytes (already reference-counted)
+  2. Most reads are smaller than block size (256KiB blocks, typical reads 1-64KiB)
+  3. The fast path check (size <= bytes_in_block) is a single comparison
+  4. The slow path (multi-block reads) is unchanged from the original
+
+  Review: Approved 5/5 by all personas.
diff --git a/.claude/skills/lading-optimize-hunt/assets/db/trace-agent-v04-buffer-reuse.yaml b/.claude/skills/lading-optimize-hunt/assets/db/trace-agent-v04-buffer-reuse.yaml
new file mode 100644
index 000000000..bcc292bb0
--- /dev/null
+++ b/.claude/skills/lading-optimize-hunt/assets/db/trace-agent-v04-buffer-reuse.yaml
@@ -0,0 +1,20 @@
+target: lading_payload/src/trace_agent/v04.rs::V04::to_bytes
+technique: buffer-reuse
+status: failure
+date: 2026-02-11
+reason: |
+  Serialization cost dominates wall-clock time. Buffer allocation/deallocation
+  for Vec::with_capacity(max_bytes) in the growth loop and binary search is
+  handled efficiently by the system allocator. Criterion showed no timing
+  improvement at any payload size (1MiB: +0.34%, 10MiB: +0.11%, 100MiB: -0.49%,
+  all within noise). Memory stats showed a 65.6% reduction in total allocated
+  bytes, but this did not translate to a timing improvement.
+lessons: |
+  Buffer reuse in to_bytes() is a cold path for trace_agent. The system
+  allocator (macOS) recycles large allocations efficiently via mmap/munmap,
+  so repeated Vec::with_capacity(N) + drop cycles for large N don't cost
+  much in wall-clock time. The hot path is msgpack serialization via
+  rmp_serde, not buffer allocation. Future optimization should target:
+  1. Incremental size tracking to avoid re-serialization in binary search
+  2. BTreeMap creation cost in create_span() (per-span allocation)
+  3. Serialization format efficiency (struct_map overhead)
diff --git a/.claude/skills/lading-optimize-review/assets/db.yaml b/.claude/skills/lading-optimize-review/assets/db.yaml
index c4b6256e3..3107851f3 100644
--- a/.claude/skills/lading-optimize-review/assets/db.yaml
+++ b/.claude/skills/lading-optimize-review/assets/db.yaml
@@ -5,3 +5,9 @@ entries:
   - id: payload-cache-inline
     verdict: approved
     file: assets/db/opt-payload-cache-inline.yaml
+  - id: trace-agent-v04-buffer-reuse
+    verdict: rejected
+    file: assets/db/opt-trace-agent-v04-buffer-reuse.yaml
+  - id: block-cache-read-at-zero-copy
+    verdict: approved
+    file: assets/db/opt-block-cache-read-at-zero-copy.yaml
diff --git a/.claude/skills/lading-optimize-review/assets/db/opt-block-cache-read-at-zero-copy.yaml b/.claude/skills/lading-optimize-review/assets/db/opt-block-cache-read-at-zero-copy.yaml
new file mode 100644
index 000000000..cd09919a0
--- /dev/null
+++ b/.claude/skills/lading-optimize-review/assets/db/opt-block-cache-read-at-zero-copy.yaml
@@ -0,0 +1,49 @@
+id: block-cache-read-at-zero-copy
+target: lading_payload/src/block.rs::Cache::read_at
+technique: zero-copy-slice
+verdict: approved
+date: 2026-02-11
+votes:
+  duplicate_hunter: approve
+  skeptic: approve
+  conservative: approve
+  rust_expert: approve
+  greybeard: approve
+measurements:
+  cache_read_at_1KiB:
+    time_baseline: 36.691 ns
+    time_optimized: 4.812 ns
+    time_change: -86.9%
+    throughput_baseline: 26 GiB/s
+    throughput_optimized: 198 GiB/s
+    throughput_change: +663%
+  cache_read_at_64KiB:
+    time_baseline: 1.019 us
+    time_optimized: 4.882 ns
+    time_change: -99.5%
+    throughput_baseline: 60 GiB/s
+    throughput_optimized: 12502 GiB/s
+    throughput_change: +20737%
+  cache_read_at_1MiB:
+    time_baseline: 13.962 us
+    time_optimized: 14.049 us
+    time_change: ~0% (expected, spans multiple blocks)
+  macro: N/A (payloadtool does not exercise read_at; uses advance() instead)
+reason: |
+  When a read fits entirely within a single block (the common case for
+  logrotate_fs file reads), read_at returns Bytes::slice() instead of allocating
+  BytesMut and copying. This is O(1) reference counting vs O(n) memcpy.
+  For reads spanning multiple blocks, it falls back to the original copy behavior.
+  Unanimous 5/5 approval based on extraordinary micro-benchmark results.
+lessons: |
+  Bytes::slice() is a powerful zero-copy optimization for read paths where
+  the source data is already in Bytes format and the read is contained
+  within a single buffer. The key insight is checking whether the read
+  fits within one block before allocating any output buffer. This pattern
+  applies whenever:
+  1. Source data is stored as Bytes (immutable, reference-counted)
+  2. Reads are typically smaller than the source buffer
+  3. The fast path (single buffer) is the common case
+  Macro benchmark gap: read_at is exercised by the logrotate_fs file generator,
+  not by the TCP/HTTP generators that payloadtool fingerprint configs use.
+  A file_gen fingerprint config would close this gap.
diff --git a/.claude/skills/lading-optimize-review/assets/db/opt-trace-agent-v04-buffer-reuse.yaml b/.claude/skills/lading-optimize-review/assets/db/opt-trace-agent-v04-buffer-reuse.yaml
new file mode 100644
index 000000000..f620ba5a5
--- /dev/null
+++ b/.claude/skills/lading-optimize-review/assets/db/opt-trace-agent-v04-buffer-reuse.yaml
@@ -0,0 +1,28 @@
+id: trace-agent-v04-buffer-reuse
+target: lading_payload/src/trace_agent/v04.rs::V04::to_bytes
+technique: buffer-reuse
+verdict: rejected
+date: 2026-02-11
+votes:
+  duplicate_hunter: approve
+  skeptic: reject
+  conservative: approve
+  rust_expert: approve
+  greybeard: approve
+reason: |
+  Skeptic rejected: Criterion micro-benchmarks showed no timing improvement
+  at any payload size (1MiB, 10MiB, 100MiB, all within the noise threshold).
+  The >=5% timing gate was not met. While payloadtool --memory-stats showed
+  a 65.6% reduction in total allocated memory (33.5 MiB -> 11.5 MiB), this
+  was measured from a single run without statistical rigor. The system
+  allocator handles large allocation recycling efficiently enough that the
+  redundant Vec::with_capacity(max_bytes) calls in the growth loop and
+  binary search don't manifest as a performance bottleneck.
+lessons: |
+  Buffer reuse in trace_agent to_bytes() reduces allocator pressure but
+  doesn't improve wall-clock time because msgpack serialization dominates
+  the cost. The system allocator (macOS) recycles large allocations
+  efficiently via mmap/munmap. This is a cold path for allocation overhead.
+  Future optimization efforts on trace_agent should target serialization
+  cost (e.g., incremental size tracking to avoid re-serialization in the
+  binary search) rather than allocation cost.
diff --git a/lading_payload/src/block.rs b/lading_payload/src/block.rs
index daf6b82dc..3a6d1d327 100644
--- a/lading_payload/src/block.rs
+++ b/lading_payload/src/block.rs
@@ -469,12 +469,14 @@ impl Cache {
 
     /// Read data starting from a given offset and up to the specified size.
     ///
+    /// When the entire read fits within a single block, returns a zero-copy
+    /// slice of the block's `Bytes`. Otherwise falls back to copying across
+    /// block boundaries.
+    ///
     /// # Panics
     ///
     /// Function will panic if reads are larger than machine word bytes wide.
     pub fn read_at(&self, offset: u64, size: usize) -> Bytes {
-        let mut data = BytesMut::with_capacity(size);
-
         let (blocks, total_cycle_size) = match self {
             Cache::Fixed {
                 blocks,
@@ -487,45 +489,63 @@ impl Cache {
             ),
         };
 
-        let mut remaining = size;
-        let mut current_offset =
+        let current_offset =
             usize::try_from(offset).expect("offset larger than machine word bytes");
+        let offset_within_cycle = current_offset % total_cycle_size;
+
+        // Find the starting block.
+        let mut block_start = 0;
+        for block in blocks {
+            let block_size = block.total_bytes.get() as usize;
+            if offset_within_cycle < block_start + block_size {
+                let block_offset = offset_within_cycle - block_start;
+                let bytes_in_block = block_size - block_offset;
+
+                // Fast path: entire read fits in this one block. Return a
+                // zero-copy slice, avoiding allocation and memcpy.
+                if size <= bytes_in_block {
+                    return block.bytes.slice(block_offset..block_offset + size);
+                }
-
-        while remaining > 0 {
-            // The plan is this. We treat the blocks as one infinite cycle. We
-            // map our offset into the domain of the blocks, then seek forward
-            // until we find the block we need to start reading from. Then we
-            // read into `data`.
-
-            let offset_within_cycle = current_offset % total_cycle_size;
-            let mut block_start = 0;
-            for block in blocks {
-                let block_size = block.total_bytes.get() as usize;
-                if offset_within_cycle < block_start + block_size {
-                    // Offset is within this block. Begin reading into `data`.
-                    let block_offset = offset_within_cycle - block_start;
-                    let bytes_in_block = (block_size - block_offset).min(remaining);
-
-                    data.extend_from_slice(
-                        &block.bytes[block_offset..block_offset + bytes_in_block],
-                    );
-
-                    remaining -= bytes_in_block;
-                    current_offset += bytes_in_block;
-                    break;
+
+                // Slow path: read spans multiple blocks. Allocate and copy.
+                let mut data = BytesMut::with_capacity(size);
+                data.extend_from_slice(&block.bytes[block_offset..block_offset + bytes_in_block]);
+                let mut remaining = size - bytes_in_block;
+                let mut current_offset = current_offset + bytes_in_block;
+
+                while remaining > 0 {
+                    let offset_within_cycle = current_offset % total_cycle_size;
+                    let mut inner_block_start = 0;
+                    for blk in blocks {
+                        let blk_size = blk.total_bytes.get() as usize;
+                        if offset_within_cycle < inner_block_start + blk_size {
+                            let blk_offset = offset_within_cycle - inner_block_start;
+                            let bytes_avail = (blk_size - blk_offset).min(remaining);
+                            data.extend_from_slice(
+                                &blk.bytes[blk_offset..blk_offset + bytes_avail],
+                            );
+                            remaining -= bytes_avail;
+                            current_offset += bytes_avail;
+                            break;
+                        }
+                        inner_block_start += blk_size;
+                    }
+
+                    if remaining > 0 && inner_block_start >= total_cycle_size {
+                        error!("Offset exceeds total cycle size");
+                        break;
+                    }
                 }
-                block_start += block_size;
-            }
-
-            // If we couldn't find a block this suggests something seriously
-            // wacky has happened.
-            if remaining > 0 && block_start >= total_cycle_size {
-                error!("Offset exceeds total cycle size");
-                break;
+                return data.freeze();
             }
+            block_start += block_size;
         }
-
-        data.freeze()
+        // If we get here, no block contained the offset. This shouldn't
+        // happen with a valid cache, but return empty bytes defensively.
+        error!("Offset exceeds total cycle size");
+        Bytes::new()
     }
 }
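
For reference, a minimal standalone sketch of the single-block fast path that the lessons above describe. This is not part of the patch: the Block struct and the free-standing read_at signature below are simplified illustrative assumptions, and the slow path is abridged; only the Bytes::slice() fast-path check mirrors the actual change to Cache::read_at in lading_payload/src/block.rs.

// Illustrative sketch only: `Block` here is a simplified stand-in, not
// lading's real block type.
use bytes::{Bytes, BytesMut};

struct Block {
    bytes: Bytes, // immutable, reference-counted block data
}

fn read_at(blocks: &[Block], offset: usize, size: usize) -> Bytes {
    // Treat the blocks as one repeating cycle and map the offset into it.
    let total_cycle_size: usize = blocks.iter().map(|b| b.bytes.len()).sum();
    let mut pos = offset % total_cycle_size;

    for block in blocks {
        let block_size = block.bytes.len();
        if pos < block_size {
            let bytes_in_block = block_size - pos;
            if size <= bytes_in_block {
                // Fast path: the read fits in one block, so slicing the Bytes
                // only bumps a reference count. No allocation, no memcpy.
                return block.bytes.slice(pos..pos + size);
            }
            // Slow path (abridged): allocate and copy what this block holds,
            // then continue copying from the following blocks as in the patch.
            let mut data = BytesMut::with_capacity(size);
            data.extend_from_slice(&block.bytes[pos..]);
            return data.freeze();
        }
        pos -= block_size;
    }
    Bytes::new()
}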