diff --git a/libdd-trace-utils/benches/deserialization_v05.rs b/libdd-trace-utils/benches/deserialization_v05.rs new file mode 100644 index 0000000000..03fe37bc21 --- /dev/null +++ b/libdd-trace-utils/benches/deserialization_v05.rs @@ -0,0 +1,235 @@ +// Copyright 2024-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! Microbenchmarks for the V05 msgpack decode path +//! (`libdd_trace_utils::msgpack_decoder::v05`). +//! +//! Unlike V04, the V05 wire format encodes a shared string dictionary up front and every span +//! references its string fields (service, name, resource, type, and every meta/metrics key/value) +//! by index into that dictionary. Decoding therefore performs a dictionary lookup-and-clone for +//! each string slot, which is real per-span work on the agent ingestion hot path and is otherwise +//! uncovered by benchmarks. +//! +//! We vary two axes that drive that cost: +//! * the number of spans (work scales with span count), and +//! * the string-sharing ratio: a "high sharing" payload reuses a small dictionary across all +//! spans (the common case for tracer traffic, where service/name/tag keys repeat), while a "low +//! sharing" payload gives each span its own unique strings (a larger dictionary and worse cache +//! behavior). + +use std::alloc::System; +use std::collections::HashMap; + +use criterion::{black_box, criterion_group, BenchmarkId, Criterion, Throughput}; +use libdd_common::bench_utils::{memory_allocated_measurement, AllocatedBytesMeasurement}; +use libdd_trace_utils::msgpack_decoder; + +/// A V05 span is a fixed 12-element tuple. String fields (service, name, resource, type, and the +/// meta/metrics keys/values) are `u32` indices into the shared dictionary. 
+type V05Span = (
+    u32,               // service (dict index)
+    u32,               // name (dict index)
+    u32,               // resource (dict index)
+    u64,               // trace_id
+    u64,               // span_id
+    u64,               // parent_id
+    i64,               // start
+    i64,               // duration
+    i32,               // error
+    HashMap<u32, u32>, // meta (dict index -> dict index)
+    HashMap<u32, f64>, // metrics (dict index -> value)
+    u32,               // type (dict index)
+);
+
+type V05Payload = (Vec<String>, Vec<Vec<V05Span>>);
+
+/// Number of meta tags per span. Picked to resemble a typical instrumented span (service/runtime
+/// metadata, a couple of resource attributes, thread info, etc.).
+const META_TAGS_PER_SPAN: usize = 8;
+/// Number of metrics per span (sampling priority, a couple of measured values).
+const METRICS_PER_SPAN: usize = 3;
+
+/// Builds a representative V05 payload.
+///
+/// `unique_per_span` controls the string-sharing ratio:
+/// * `false` (high sharing): all spans draw their strings from a single small shared dictionary,
+///   mirroring real tracer traffic where service names and tag keys repeat heavily.
+/// * `true` (low sharing): each span contributes its own unique strings, producing a large
+///   dictionary with little reuse.
+///
+/// Values are deterministic; only the `HashMap` entry order of meta/metrics may vary per run.
+fn build_v05_payload(num_traces: usize, spans_per_trace: usize, unique_per_span: bool) -> Vec<u8> {
+    let mut dict = Vec::new();
+    let intern = |s: String, dict: &mut Vec<String>| -> u32 {
+        let idx = u32::try_from(dict.len()).expect("V05 dictionary index must fit in u32");
+        dict.push(s);
+        idx
+    };
+
+    // Shared dictionary entries reused by every span in the "high sharing" scenario.
+    let shared_service = intern("test-service".to_string(), &mut dict);
+    let shared_name = intern("test-service.handler".to_string(), &mut dict);
+    let shared_resource = intern("GET /api/v1/resource".to_string(), &mut dict);
+    let shared_type = intern("web".to_string(), &mut dict);
+
+    // Shared meta keys/values (typical tag keys that repeat across all spans).
+    let shared_meta: Vec<(u32, u32)> = [
+        ("env", "production"),
+        ("version", "1.2.3"),
+        ("runtime-id", "f0e1d2c3-b4a5-6789-0abc-def012345678"),
+        ("language", "rust"),
+        ("component", "http"),
+        ("span.kind", "server"),
+        ("http.method", "GET"),
+        ("http.status_code", "200"),
+    ]
+    .iter()
+    .take(META_TAGS_PER_SPAN)
+    .map(|(k, v)| {
+        (
+            intern((*k).to_string(), &mut dict),
+            intern((*v).to_string(), &mut dict),
+        )
+    })
+    .collect();
+
+    let shared_metric_keys: Vec<_> = ["_sampling_priority_v1", "_dd.measured", "_dd.top_level"]
+        .iter()
+        .take(METRICS_PER_SPAN)
+        .map(|k| intern((*k).to_string(), &mut dict))
+        .collect();
+
+    let mut traces = Vec::with_capacity(num_traces);
+
+    for trace_idx in 0..num_traces {
+        let mut spans = Vec::with_capacity(spans_per_trace);
+        let root_span_id = 100_000_000_000 + trace_idx as u64;
+
+        for span_idx in 0..spans_per_trace {
+            let span_id = root_span_id + span_idx as u64 + 1;
+            let parent_id = if span_idx == 0 { 0 } else { root_span_id };
+
+            let (service, name, resource, ty, meta, metric_keys) = if unique_per_span {
+                // Low sharing: every span interns its own unique strings.
+                let service = intern(format!("service-{trace_idx}-{span_idx}"), &mut dict);
+                let name = intern(format!("op-{trace_idx}-{span_idx}"), &mut dict);
+                let resource = intern(format!("GET /api/{trace_idx}/{span_idx}"), &mut dict);
+                let ty = intern(format!("type-{}", span_idx % 4), &mut dict);
+
+                let meta: Vec<(u32, u32)> = (0..META_TAGS_PER_SPAN)
+                    .map(|m| {
+                        (
+                            intern(format!("tag.key.{trace_idx}.{span_idx}.{m}"), &mut dict),
+                            intern(format!("tag-value-{trace_idx}-{span_idx}-{m}"), &mut dict),
+                        )
+                    })
+                    .collect();
+                let metric_keys: Vec<u32> = (0..METRICS_PER_SPAN)
+                    .map(|m| intern(format!("metric.{trace_idx}.{span_idx}.{m}"), &mut dict))
+                    .collect();
+
+                (service, name, resource, ty, meta, metric_keys)
+            } else {
+                // High sharing: reuse the shared dictionary entries.
+                (
+                    shared_service,
+                    shared_name,
+                    shared_resource,
+                    shared_type,
+                    shared_meta.clone(),
+                    shared_metric_keys.clone(),
+                )
+            };
+
+            let meta_map: HashMap<u32, u32> = meta.into_iter().collect();
+            let metrics_map: HashMap<u32, f64> = metric_keys
+                .into_iter()
+                .enumerate()
+                .map(|(i, k)| (k, i as f64 + 1.0))
+                .collect();
+
+            spans.push((
+                service,
+                name,
+                resource,
+                100_000_000_000u64 + trace_idx as u64,
+                span_id,
+                parent_id,
+                1_700_000_000_000_000_000i64,
+                123_456i64,
+                0i32,
+                meta_map,
+                metrics_map,
+                ty,
+            ));
+        }
+        traces.push(spans);
+    }
+
+    let payload: V05Payload = (dict, traces);
+    rmp_serde::to_vec(&payload).expect("Failed to serialize V05 test payload.")
+}
+
+/// Runs the V05 decode benchmark matrix against the given criterion harness, which lets us reuse
+/// the same scenarios for both the wall-time and the bytes-allocated measurements.
+///
+/// Note: `Throughput::Bytes` is reported per scenario so each series can be read as a decode rate,
+/// but high-sharing and low-sharing payloads have very different encoded sizes for the same span
+/// count, so their throughput numbers are *not* directly comparable across the sharing dimension.
+fn bench_v05_matrix<M: criterion::measurement::Measurement + 'static>(
+    c: &mut Criterion<M>,
+    group_name: &str,
+) {
+    // A representative chunked payload: 20 traces (the upper-bound trace count for a tracer flush)
+    // with a varying number of spans each. Span counts stay realistic for a single flush while
+    // exercising the dictionary-dedup path across many spans.
+    const NUM_TRACES: usize = 20;
+    let span_counts = [10, 100, 500];
+
+    let mut group = c.benchmark_group(group_name);
+
+    for &spans_per_trace in &span_counts {
+        for (sharing_label, unique_per_span) in [("high_sharing", false), ("low_sharing", true)] {
+            let data = build_v05_payload(NUM_TRACES, spans_per_trace, unique_per_span);
+            let data_as_bytes = libdd_tinybytes::Bytes::copy_from_slice(&data);
+
+            group.throughput(Throughput::Bytes(data.len() as u64));
+            group.bench_with_input(
+                BenchmarkId::new(sharing_label, spans_per_trace * NUM_TRACES),
+                &data_as_bytes,
+                |b, data_as_bytes| {
+                    b.iter_batched(
+                        || data_as_bytes.clone(),
+                        |data_as_bytes| {
+                            let result = black_box(msgpack_decoder::v05::from_bytes(data_as_bytes));
+                            assert!(result.is_ok());
+                            // Return the result to avoid measuring the deallocation time.
+                            result
+                        },
+                        criterion::BatchSize::LargeInput,
+                    );
+                },
+            );
+        }
+    }
+
+    group.finish();
+}
+
+fn deserialize_msgpack_v05(c: &mut Criterion) {
+    bench_v05_matrix(c, "msgpack_decoder::v05");
+}
+
+/// Allocation-measured counterpart. The dictionary-dedup path clones a `Bytes` slice for every
+/// string slot, so the amount allocated per decode is the metric most directly affected by the
+/// sharing ratio.
+fn deserialize_msgpack_v05_allocs(c: &mut Criterion<AllocatedBytesMeasurement<System>>) {
+    bench_v05_matrix(c, "msgpack_decoder::v05 (allocs)");
+}
+
+criterion_group!(deserialize_v05_benches, deserialize_msgpack_v05);
+criterion_group!(
+    name = deserialize_v05_alloc_benches;
+    config = memory_allocated_measurement(&super::GLOBAL);
+    targets = deserialize_msgpack_v05_allocs
+);
diff --git a/libdd-trace-utils/benches/main.rs b/libdd-trace-utils/benches/main.rs
index 0d86f25ee5..20a32599e6 100644
--- a/libdd-trace-utils/benches/main.rs
+++ b/libdd-trace-utils/benches/main.rs
@@ -10,10 +10,13 @@ use libdd_common::bench_utils::ReportingAllocator;
 pub static GLOBAL: ReportingAllocator<System> = ReportingAllocator::new(System);
 
 mod deserialization;
+mod deserialization_v05;
 mod serialization;
 
 criterion_main!(
     serialization::serialize_benches,
     deserialization::deserialize_benches,
-    deserialization::deserialize_alloc_benches
+    deserialization::deserialize_alloc_benches,
+    deserialization_v05::deserialize_v05_benches,
+    deserialization_v05::deserialize_v05_alloc_benches
 );