8 changes: 8 additions & 0 deletions .claude/skills/lading-optimize-hunt/assets/db.yaml
@@ -19,3 +19,11 @@ entries:
    technique: buffer-reuse
    status: success
    file: assets/db/datadog-logs-buffer-reuse.yaml
  - target: lading_payload/src/trace_agent/v04.rs::V04::to_bytes
    technique: buffer-reuse
    status: failure
    file: assets/db/trace-agent-v04-buffer-reuse.yaml
  - target: lading_payload/src/block.rs::Cache::read_at
    technique: zero-copy-slice
    status: success
    file: assets/db/block-read-at-zero-copy.yaml
25 changes: 25 additions & 0 deletions .claude/skills/lading-optimize-hunt/assets/db/block-read-at-zero-copy.yaml
@@ -0,0 +1,25 @@
target: lading_payload/src/block.rs::Cache::read_at
technique: zero-copy-slice
status: success
date: 2026-02-11
measurements:
  time:
    cache_read_at_1KiB: -86.9% (36.7ns -> 4.8ns)
    cache_read_at_64KiB: -99.5% (1.02us -> 4.9ns)
    cache_read_at_1MiB: ~0% (expected, spans multiple blocks)
  memory: N/A (macro benchmark does not exercise read_at)
  allocations: N/A (macro benchmark does not exercise read_at)
lessons: |
  Bytes::slice() provides zero-copy reads when the entire read fits within
  a single block. The optimization checks this condition before allocating
  any output buffer. For single-block reads (the common case in logrotate_fs),
  this eliminates both the BytesMut allocation and the memcpy, reducing
  the operation to an O(1) reference count increment.

  Key factors for success:
  1. Block data is stored as immutable Bytes (already reference-counted)
  2. Most reads are smaller than block size (256KiB blocks, typical reads 1-64KiB)
  3. The fast path check (size <= bytes_in_block) is a single comparison
  4. The slow path (multi-block reads) is unchanged from the original

  Review: Approved 5/5 by all personas.
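For reference, a minimal standalone sketch of the fast-path/slow-path split described in the lessons above, using the bytes crate. The Block struct, the linear (non-cycling) block layout, and the function signature here are simplified illustrations, not the actual lading_payload::block API.

```rust
use bytes::{Bytes, BytesMut};

// Illustrative stand-in for a cache block: immutable, reference-counted bytes.
struct Block {
    bytes: Bytes,
}

// Read `size` bytes at `offset` from a run of blocks. If the read fits in a
// single block, return a zero-copy slice (an O(1) refcount bump); otherwise
// allocate once and copy across blocks.
fn read_at(blocks: &[Block], offset: usize, size: usize) -> Bytes {
    let mut block_start = 0;
    for (i, block) in blocks.iter().enumerate() {
        let block_len = block.bytes.len();
        if offset < block_start + block_len {
            let block_offset = offset - block_start;
            let bytes_in_block = block_len - block_offset;

            // Fast path: no allocation, no memcpy.
            if size <= bytes_in_block {
                return block.bytes.slice(block_offset..block_offset + size);
            }

            // Slow path: copy out of this block, then the following ones.
            // (Simplified: does not wrap around the block list like the cache.)
            let mut data = BytesMut::with_capacity(size);
            data.extend_from_slice(&block.bytes[block_offset..]);
            let mut remaining = size - bytes_in_block;
            for next in &blocks[i + 1..] {
                let take = remaining.min(next.bytes.len());
                data.extend_from_slice(&next.bytes[..take]);
                remaining -= take;
                if remaining == 0 {
                    break;
                }
            }
            return data.freeze();
        }
        block_start += block_len;
    }
    Bytes::new()
}

fn main() {
    let blocks = vec![
        Block { bytes: Bytes::from_static(b"hello world, ") },
        Block { bytes: Bytes::from_static(b"this is block two") },
    ];
    assert_eq!(read_at(&blocks, 6, 5), Bytes::from_static(b"world")); // fast path
    assert_eq!(read_at(&blocks, 7, 10), Bytes::from_static(b"orld, this")); // slow path
}
```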
20 changes: 20 additions & 0 deletions .claude/skills/lading-optimize-hunt/assets/db/trace-agent-v04-buffer-reuse.yaml
@@ -0,0 +1,20 @@
target: lading_payload/src/trace_agent/v04.rs::V04::to_bytes
technique: buffer-reuse
status: failure
date: 2026-02-11
reason: |
  Serialization cost dominates wall-clock time. Buffer allocation/deallocation
  for Vec::with_capacity(max_bytes) in the growth loop and binary search is
  handled efficiently by the system allocator. Criterion showed no timing
  improvement at any payload size (1MiB: +0.34%, 10MiB: +0.11%, 100MiB: -0.49%,
  all within noise). Memory stats showed a 65.6% reduction in total allocated
  bytes, but this did not translate into a timing improvement.
lessons: |
  Buffer reuse in to_bytes() is a cold path for trace_agent. The system
  allocator (macOS) recycles large allocations efficiently via mmap/munmap,
  so repeated Vec::with_capacity(N) + drop cycles for large N don't cost
  much in wall-clock time. The hot path is msgpack serialization via
  rmp_serde, not buffer allocation. Future optimization should target:
  1. Incremental size tracking to avoid re-serialization in the binary search
  2. BTreeMap creation cost in create_span() (per-span allocation)
  3. Serialization format efficiency (struct_map overhead)
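As an aside, a generic sketch of the buffer-reuse technique that was evaluated here (and found not to matter for wall-clock time): one scratch buffer allocated up front and cleared between probes of a sizing binary search, rather than a fresh Vec::with_capacity per probe. serialize_first_n is a hypothetical stand-in for the msgpack serialization step; this is not the actual V04::to_bytes code.

```rust
// Hypothetical stand-in for serializing the first `n` items into `out`.
fn serialize_first_n(items: &[u64], n: usize, out: &mut Vec<u8>) {
    for item in &items[..n] {
        out.extend_from_slice(&item.to_le_bytes());
    }
}

// Find the largest prefix of `items` whose serialized form fits in `max_bytes`,
// reusing one scratch buffer across every probe of the binary search.
fn largest_fitting_prefix(items: &[u64], max_bytes: usize) -> usize {
    let mut scratch = Vec::with_capacity(max_bytes); // single allocation
    let (mut lo, mut hi) = (0, items.len());
    while lo < hi {
        let mid = lo + (hi - lo + 1) / 2;
        scratch.clear(); // keeps capacity, drops contents
        serialize_first_n(items, mid, &mut scratch);
        if scratch.len() <= max_bytes {
            lo = mid;
        } else {
            hi = mid - 1;
        }
    }
    lo
}

fn main() {
    let items: Vec<u64> = (0..1_000).collect();
    // 100-byte budget: 12 items of 8 bytes each fit (96 bytes).
    assert_eq!(largest_fitting_prefix(&items, 100), 12);
}
```

The measurements recorded above suggest this buys little in practice: the re-serialization inside the loop, not the allocation, is where the time goes.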
6 changes: 6 additions & 0 deletions .claude/skills/lading-optimize-review/assets/db.yaml
@@ -5,3 +5,9 @@ entries:
  - id: payload-cache-inline
    verdict: approved
    file: assets/db/opt-payload-cache-inline.yaml
  - id: trace-agent-v04-buffer-reuse
    verdict: rejected
    file: assets/db/opt-trace-agent-v04-buffer-reuse.yaml
  - id: block-cache-read-at-zero-copy
    verdict: approved
    file: assets/db/opt-block-cache-read-at-zero-copy.yaml
49 changes: 49 additions & 0 deletions .claude/skills/lading-optimize-review/assets/db/opt-block-cache-read-at-zero-copy.yaml
@@ -0,0 +1,49 @@
id: block-cache-read-at-zero-copy
target: lading_payload/src/block.rs::Cache::read_at
technique: zero-copy-slice
verdict: approved
date: 2026-02-11
votes:
  duplicate_hunter: approve
  skeptic: approve
  conservative: approve
  rust_expert: approve
  greybeard: approve
measurements:
  cache_read_at_1KiB:
    time_baseline: 36.691 ns
    time_optimized: 4.812 ns
    time_change: -86.9%
    throughput_baseline: 26 GiB/s
    throughput_optimized: 198 GiB/s
    throughput_change: +663%
  cache_read_at_64KiB:
    time_baseline: 1.019 us
    time_optimized: 4.882 ns
    time_change: -99.5%
    throughput_baseline: 60 GiB/s
    throughput_optimized: 12502 GiB/s
    throughput_change: +20737%
  cache_read_at_1MiB:
    time_baseline: 13.962 us
    time_optimized: 14.049 us
    time_change: ~0% (expected, spans multiple blocks)
  macro: N/A (payloadtool does not exercise read_at; uses advance() instead)
reason: |
  When a read fits entirely within a single block (the common case for
  logrotate_fs file reads), read_at returns a Bytes::slice() view instead
  of allocating a BytesMut and copying. This is O(1) reference counting
  versus an O(n) memcpy. Reads spanning multiple blocks fall back to the
  original copy behavior. Unanimous 5/5 approval based on the extraordinary
  micro-benchmark results.
lessons: |
  Bytes::slice() is a powerful zero-copy optimization for read paths where
  the source data is already in Bytes format and the read is contained
  within a single buffer. The key insight is checking whether the read
  fits within one block before allocating any output buffer. This pattern
  applies whenever:
  1. Source data is stored as Bytes (immutable, reference-counted)
  2. Reads are typically smaller than the source buffer
  3. The fast path (single buffer) is the common case
  Macro benchmark gap: read_at is exercised by the logrotate_fs file
  generator, not by the TCP/HTTP generators that payloadtool fingerprint
  configs use. A file_gen fingerprint config would close this gap.
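A quick standalone check, assuming the bytes crate, of the property this approval rests on: Bytes::slice returns a view into the same allocation, so a single-block read is a reference-count bump rather than a copy.

```rust
use bytes::Bytes;

fn main() {
    // One backing allocation standing in for a 256 KiB block.
    let block = Bytes::from(vec![0u8; 256 * 1024]);

    // A read that fits within the block: slice() does not copy, it points
    // into the same allocation at the requested offset.
    let read = block.slice(1024..1024 + 4096);
    assert_eq!(read.len(), 4096);
    assert_eq!(read.as_ptr(), block.as_ptr().wrapping_add(1024));

    // Dropping the original keeps the slice alive; the allocation is freed
    // only when the last reference goes away.
    drop(block);
    assert_eq!(read.len(), 4096);
}
```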
28 changes: 28 additions & 0 deletions .claude/skills/lading-optimize-review/assets/db/opt-trace-agent-v04-buffer-reuse.yaml
@@ -0,0 +1,28 @@
id: trace-agent-v04-buffer-reuse
target: lading_payload/src/trace_agent/v04.rs::V04::to_bytes
technique: buffer-reuse
verdict: rejected
date: 2026-02-11
votes:
  duplicate_hunter: approve
  skeptic: reject
  conservative: approve
  rust_expert: approve
  greybeard: approve
reason: |
  Skeptic rejected: Criterion micro-benchmarks showed no timing improvement
  at any payload size (1MiB, 10MiB, 100MiB, all within the noise threshold),
  so the >=5% timing gate was not met. While payloadtool --memory-stats showed
  a 65.6% reduction in total allocated memory (33.5 MiB -> 11.5 MiB), this
  was measured from a single run without statistical rigor. The system
  allocator handles large-allocation recycling efficiently enough that the
  redundant Vec::with_capacity(max_bytes) calls in the growth loop and
  binary search don't manifest as a performance bottleneck.
lessons: |
  Buffer reuse in trace_agent to_bytes() reduces allocator pressure but
  doesn't improve wall-clock time because msgpack serialization dominates
  the cost. The system allocator (macOS) recycles large allocations
  efficiently via mmap/munmap. This is a cold path for allocation overhead.
  Future optimization efforts on trace_agent should target serialization
  cost (e.g., incremental size tracking to avoid re-serialization in the
  binary search) rather than allocation cost.
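A sketch of the "incremental size tracking" direction suggested above: serialize each item once, keep a running prefix sum of serialized sizes, and pick the cutoff with a lookup instead of re-serializing inside a binary search. encode_item is a hypothetical stand-in for per-span msgpack encoding, and the sketch ignores msgpack's length-dependent array header.

```rust
// Hypothetical per-item encoder; real code would msgpack-encode a span.
fn encode_item(item: u64, out: &mut Vec<u8>) {
    out.extend_from_slice(&item.to_le_bytes());
}

// Serialize every item at most once, tracking cumulative size, then choose
// the largest prefix that fits in `max_bytes` without re-serializing anything.
fn largest_fitting_prefix(items: &[u64], max_bytes: usize) -> (usize, Vec<u8>) {
    let mut buf = Vec::new();
    let mut prefix_sizes = Vec::with_capacity(items.len() + 1);
    prefix_sizes.push(0usize);
    for &item in items {
        encode_item(item, &mut buf);
        prefix_sizes.push(buf.len()); // size after including this item
        if buf.len() > max_bytes {
            break; // already over budget; no need to encode the rest
        }
    }
    // First index whose cumulative size exceeds the budget, minus one.
    let n = prefix_sizes.partition_point(|&sz| sz <= max_bytes) - 1;
    buf.truncate(prefix_sizes[n]);
    (n, buf)
}

fn main() {
    let items: Vec<u64> = (0..1_000).collect();
    let (n, payload) = largest_fitting_prefix(&items, 100);
    assert_eq!(n, 12); // 12 items * 8 bytes = 96 bytes <= 100
    assert_eq!(payload.len(), 96);
}
```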
88 changes: 54 additions & 34 deletions lading_payload/src/block.rs
@@ -469,12 +469,14 @@ impl Cache {
 
     /// Read data starting from a given offset and up to the specified size.
     ///
+    /// When the entire read fits within a single block, returns a zero-copy
+    /// slice of the block's `Bytes`. Otherwise falls back to copying across
+    /// block boundaries.
+    ///
     /// # Panics
     ///
     /// Function will panic if reads are larger than machine word bytes wide.
     pub fn read_at(&self, offset: u64, size: usize) -> Bytes {
-        let mut data = BytesMut::with_capacity(size);
-
         let (blocks, total_cycle_size) = match self {
             Cache::Fixed {
                 blocks,
@@ -487,45 +489,63 @@
             ),
         };
 
-        let mut remaining = size;
-        let mut current_offset =
+        let current_offset =
             usize::try_from(offset).expect("offset larger than machine word bytes");
+        let offset_within_cycle = current_offset % total_cycle_size;
 
-        while remaining > 0 {
-            // The plan is this. We treat the blocks as one infinite cycle. We
-            // map our offset into the domain of the blocks, then seek forward
-            // until we find the block we need to start reading from. Then we
-            // read into `data`.
-
-            let offset_within_cycle = current_offset % total_cycle_size;
-            let mut block_start = 0;
-            for block in blocks {
-                let block_size = block.total_bytes.get() as usize;
-                if offset_within_cycle < block_start + block_size {
-                    // Offset is within this block. Begin reading into `data`.
-                    let block_offset = offset_within_cycle - block_start;
-                    let bytes_in_block = (block_size - block_offset).min(remaining);
-
-                    data.extend_from_slice(
-                        &block.bytes[block_offset..block_offset + bytes_in_block],
-                    );
-
-                    remaining -= bytes_in_block;
-                    current_offset += bytes_in_block;
-                    break;
-                }
-                block_start += block_size;
-            }
-
-            // If we couldn't find a block this suggests something seriously
-            // wacky has happened.
-            if remaining > 0 && block_start >= total_cycle_size {
-                error!("Offset exceeds total cycle size");
-                break;
-            }
-        }
-
-        data.freeze()
+        // Find the starting block.
+        let mut block_start = 0;
+        for block in blocks {
+            let block_size = block.total_bytes.get() as usize;
+            if offset_within_cycle < block_start + block_size {
+                let block_offset = offset_within_cycle - block_start;
+                let bytes_in_block = block_size - block_offset;
+
+                // Fast path: entire read fits in this one block. Return a
+                // zero-copy slice, avoiding allocation and memcpy.
+                if size <= bytes_in_block {
+                    return block.bytes.slice(block_offset..block_offset + size);
+                }
+
+                // Slow path: read spans multiple blocks. Allocate and copy.
+                let mut data = BytesMut::with_capacity(size);
+                data.extend_from_slice(&block.bytes[block_offset..block_offset + bytes_in_block]);
+                let mut remaining = size - bytes_in_block;
+                let mut current_offset = current_offset + bytes_in_block;
+
+                while remaining > 0 {
+                    let offset_within_cycle = current_offset % total_cycle_size;
+                    let mut inner_block_start = 0;
+                    for blk in blocks {
+                        let blk_size = blk.total_bytes.get() as usize;
+                        if offset_within_cycle < inner_block_start + blk_size {
+                            let blk_offset = offset_within_cycle - inner_block_start;
+                            let bytes_avail = (blk_size - blk_offset).min(remaining);
+                            data.extend_from_slice(
+                                &blk.bytes[blk_offset..blk_offset + bytes_avail],
+                            );
+                            remaining -= bytes_avail;
+                            current_offset += bytes_avail;
+                            break;
+                        }
+                        inner_block_start += blk_size;
+                    }
+
+                    if remaining > 0 && inner_block_start >= total_cycle_size {
+                        error!("Offset exceeds total cycle size");
+                        break;
+                    }
+                }
+
+                return data.freeze();
+            }
+            block_start += block_size;
+        }
+
+        // If we get here, no block contained the offset. This shouldn't
+        // happen with a valid cache, but return empty bytes defensively.
+        error!("Offset exceeds total cycle size");
+        Bytes::new()
     }
 }