From 24a9fdccdb49a77327f337b84ee2a8263226c73e Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Tue, 23 Jun 2026 17:46:22 -0400 Subject: [PATCH 1/2] Overhaul set benchmarks: split Immutable / SingleThreaded, add Set.copyOf Mirror the map-benchmark overhaul for sets. Replace the single SetBenchmark (shared mutable counter under @Threads(8); contains_treeSet bug that queried HASH_SET) with two classes that each pick the right threading model: - ImmutableSetBenchmark: fixed read-only membership shared across threads (@State(Scope.Benchmark)); array / sortedArray / HashSet / TreeSet / Set.copyOf (the JDK compact SetN the agent actually uses for config sets, via CollectionUtils.tryMakeImmutableSet). hit/miss split, per-thread cursor. - SingleThreadedSetBenchmark: per-thread mutable lifecycle (@State(Scope.Thread)); create/clone + contains/iterate, plus a Collections.synchronizedSet case for the uncontended synchronization tax (per-thread => bias never revoked; biased-locking story across JVMs). StringIndex rows fold in later. Result blocks empty pending a fresh multi-JVM run. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../trace/util/ImmutableSetBenchmark.java | 165 +++++++++++++++++ .../java/datadog/trace/util/SetBenchmark.java | 128 ------------- .../util/SingleThreadedSetBenchmark.java | 172 ++++++++++++++++++ 3 files changed, 337 insertions(+), 128 deletions(-) create mode 100644 internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java delete mode 100644 internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java create mode 100644 internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java diff --git a/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java new file mode 100644 index 00000000000..8dc45eed908 --- /dev/null +++ b/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java @@ -0,0 +1,165 @@ +package datadog.trace.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.TreeSet; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Membership over a small, fixed, read-only string set shared across threads — split into hit and + * miss lookups (different cost shapes per structure). + * + *

The set is built once and only read, so a single shared instance ({@link Scope#Benchmark}) + * read by all {@code @Threads} is realistic and contention-free. This is the read-mostly + * counterpart to the per-thread mutable {@link SingleThreadedSetBenchmark}, and mirrors {@link + * ImmutableMapBenchmark} on the set side. Sets in the tracer skew strongly toward this fixed, + * read-only shape. + * + *

Strategies compared: + * + *

+ * + *

Lookups are interned (the {@code ==} fast path where a structure has one); misses are short + * and never present. (Results pending a fresh multi-JVM run — {@code Set.copyOf} only materializes + * the compact form on Java 10+.) + */ +@Fork(2) +@Warmup(iterations = 2) +@Measurement(iterations = 3) +@Threads(8) +@State(Scope.Benchmark) +public class ImmutableSetBenchmark { + static final String[] STRINGS = { + "foo", "bar", "baz", "quux", "hello", "world", + "service", "queryString", "lorem", "ipsum", "dolem", "sit" + }; + + /** Distinct String instances that are never present, for the miss path. */ + static final String[] MISSES = newMisses(); + + static String[] newMisses() { + String[] misses = new String[STRINGS.length * 4]; + for (int i = 0; i < misses.length; ++i) { + misses[i] = "dne-" + i; + } + return misses; + } + + // Built once, never mutated -- safe to share across the reader threads. + String[] array; + String[] sortedArray; + HashSet hashSet; + TreeSet treeSet; + Set copyOfSet; + + @Setup(Level.Trial) + public void setUp() { + array = STRINGS; + sortedArray = Arrays.copyOf(STRINGS, STRINGS.length); + Arrays.sort(sortedArray); + hashSet = new HashSet<>(Arrays.asList(STRINGS)); + treeSet = new TreeSet<>(Arrays.asList(STRINGS)); + copyOfSet = CollectionUtils.tryMakeImmutableSet(Arrays.asList(STRINGS)); + } + + /** Per-thread lookup cursor so each reader thread cycles keys independently. 
*/ + @State(Scope.Thread) + public static class Cursor { + int hitIndex = 0; + int missIndex = 0; + + String nextHit() { + int i = hitIndex + 1; + if (i >= STRINGS.length) { + i = 0; + } + hitIndex = i; + return STRINGS[i]; + } + + String nextMiss() { + int i = missIndex + 1; + if (i >= MISSES.length) { + i = 0; + } + missIndex = i; + return MISSES[i]; + } + } + + static boolean arrayContains(String[] array, String needle) { + for (String s : array) { + if (needle.equals(s)) { + return true; + } + } + return false; + } + + @Benchmark + public boolean array_hit(Cursor cursor) { + return arrayContains(array, cursor.nextHit()); + } + + @Benchmark + public boolean array_miss(Cursor cursor) { + return arrayContains(array, cursor.nextMiss()); + } + + @Benchmark + public boolean sortedArray_hit(Cursor cursor) { + return Arrays.binarySearch(sortedArray, cursor.nextHit()) >= 0; + } + + @Benchmark + public boolean sortedArray_miss(Cursor cursor) { + return Arrays.binarySearch(sortedArray, cursor.nextMiss()) >= 0; + } + + @Benchmark + public boolean hashSet_hit(Cursor cursor) { + return hashSet.contains(cursor.nextHit()); + } + + @Benchmark + public boolean hashSet_miss(Cursor cursor) { + return hashSet.contains(cursor.nextMiss()); + } + + @Benchmark + public boolean treeSet_hit(Cursor cursor) { + return treeSet.contains(cursor.nextHit()); + } + + @Benchmark + public boolean treeSet_miss(Cursor cursor) { + return treeSet.contains(cursor.nextMiss()); + } + + @Benchmark + public boolean copyOf_hit(Cursor cursor) { + return copyOfSet.contains(cursor.nextHit()); + } + + @Benchmark + public boolean copyOf_miss(Cursor cursor) { + return copyOfSet.contains(cursor.nextMiss()); + } +} diff --git a/internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java deleted file mode 100644 index 144e4748400..00000000000 --- a/internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java +++ /dev/null @@ -1,128 +0,0 @@ 
-package datadog.trace.util; - -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.TreeSet; -import java.util.concurrent.ThreadLocalRandom; -import java.util.function.Supplier; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; - -/** - * - * - *

- * - * - * MacBook M1 - 8 threads - Java 21 - * 1/3 not found rate - * - * Benchmark Mode Cnt Score Error Units - * SetBenchmark.contains_array thrpt 6 645561886.327 ± 100781717.494 ops/s - * SetBenchmark.contains_hashSet thrpt 6 1536236680.235 ± 114966961.506 ops/s - * SetBenchmark.contains_sortedArray thrpt 6 571476939.441 ± 21334620.460 ops/s - * SetBenchmark.contains_treeSet thrpt 6 1557663759.411 ± 95343683.124 ops/s - * - */ -@Fork(2) -@Warmup(iterations = 2) -@Measurement(iterations = 3) -@Threads(8) -public class SetBenchmark { - static final String[] STRINGS = - new String[] { - "foo", - "bar", - "baz", - "quux", - "hello", - "world", - "service", - "queryString", - "lorem", - "ipsum", - "dolem", - "sit" - }; - - static T init(Supplier supplier) { - return supplier.get(); - } - - static final String[] LOOKUPS = - init( - () -> { - String[] lookups = Arrays.copyOf(STRINGS, STRINGS.length * 10); - - for (int i = 0; i < STRINGS.length; ++i) { - lookups[STRINGS.length + i] = new String(STRINGS[i]); - } - - // 2 / 3 of the key look-ups miss the set - for (int i = STRINGS.length * 2; i < lookups.length; ++i) { - lookups[i] = "dne-" + ThreadLocalRandom.current().nextInt(); - } - - Collections.shuffle(Arrays.asList(lookups)); - return lookups; - }); - - static int sharedLookupIndex = 0; - - static String nextString() { - int localIndex = ++sharedLookupIndex; - if (localIndex >= LOOKUPS.length) { - sharedLookupIndex = localIndex = 0; - } - return LOOKUPS[localIndex]; - } - - static final String[] ARRAY = STRINGS; - - @Benchmark - public boolean contains_array() { - String needle = nextString(); - for (String str : ARRAY) { - if (needle.equals(str)) return true; - } - return false; - } - - static final String[] SORTED_ARRAY = - init( - () -> { - String[] sorted = Arrays.copyOf(STRINGS, STRINGS.length); - Arrays.sort(sorted); - return sorted; - }); - - @Benchmark - public boolean contains_sortedArray() { - return (Arrays.binarySearch(SORTED_ARRAY, nextString()) != 
-1); - } - - static final HashSet HASH_SET = new HashSet<>(Arrays.asList(STRINGS)); - - @Benchmark - public boolean contains_hashSet() { - return HASH_SET.contains(nextString()); - } - - static final TreeSet TREE_SET = new TreeSet<>(Arrays.asList(STRINGS)); - - @Benchmark - public boolean contains_treeSet() { - return HASH_SET.contains(nextString()); - } -} diff --git a/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java new file mode 100644 index 00000000000..b90fbbeb288 --- /dev/null +++ b/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java @@ -0,0 +1,172 @@ +package datadog.trace.util; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.TreeSet; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Single-threaded (uncontended) set usage: each thread builds, reads, and discards its own + * sets. Per-thread state ({@link Scope#Thread}); mirrors {@link SingleThreadedMapBenchmark} on the + * set side. Running at {@code @Threads(8)} keeps allocation / GC interactions visible without lock + * contention. + * + *

Sets in the tracer skew read-only/fixed (see {@link ImmutableSetBenchmark}); this covers the + * mutable-lifecycle case for completeness and — via {@link Collections#synchronizedSet} — the + * uncontended synchronization tax. Because each thread owns its synchronized set, the + * monitor is only ever locked by one thread: biased locking ≈ free on Java ≤ 11, full uncontended + * CAS on Java 15+ (biased locking disabled by default, JEP 374). The unsynchronized {@code hashSet} + * {@code contains}/{@code iterate} methods are the in-harness baseline; the tax is the delta. + * (Results pending a fresh multi-JVM run.) + */ +@Fork(2) +@Warmup(iterations = 2) +@Measurement(iterations = 3) +@Threads(8) +@State(Scope.Thread) +public class SingleThreadedSetBenchmark { + static final String[] ELEMENTS = { + "foo", "bar", "baz", "quux", "hello", "world", + "service", "queryString", "lorem", "ipsum", "dolem", "sit" + }; + + // Distinct String instances so lookups exercise equals(), not identity. + static final String[] EQUAL_ELEMENTS = newEqualElements(); + + static String[] newEqualElements() { + String[] copies = new String[ELEMENTS.length]; + for (int i = 0; i < ELEMENTS.length; ++i) { + copies[i] = new String(ELEMENTS[i]); + } + return copies; + } + + static void fill(Set set) { + for (String s : ELEMENTS) { + set.add(s); + } + } + + // Per-thread prebuilt sets for the read + clone benchmarks (built once per trial, per thread). 
+ HashSet hashSet; + Set synchronizedSet; + TreeSet treeSet; + LinkedHashSet linkedHashSet; + int index = 0; + + @Setup(Level.Trial) + public void setUp() { + hashSet = new HashSet<>(Arrays.asList(ELEMENTS)); + synchronizedSet = Collections.synchronizedSet(new HashSet<>(hashSet)); + treeSet = new TreeSet<>(Arrays.asList(ELEMENTS)); + linkedHashSet = new LinkedHashSet<>(Arrays.asList(ELEMENTS)); + } + + String nextLookup() { + if (++index >= EQUAL_ELEMENTS.length) { + index = 0; + } + return EQUAL_ELEMENTS[index]; + } + + // ---- construction: build cost + allocation ---- + + @Benchmark + public Set create_hashSet() { + HashSet set = new HashSet<>(); + fill(set); + return set; + } + + @Benchmark + public Set create_hashSet_sized() { + HashSet set = new HashSet<>(ELEMENTS.length); + fill(set); + return set; + } + + @Benchmark + public Set create_synchronizedSet() { + Set set = Collections.synchronizedSet(new HashSet<>()); + fill(set); + return set; + } + + @Benchmark + public Set create_treeSet() { + TreeSet set = new TreeSet<>(); + fill(set); + return set; + } + + @Benchmark + public Set create_linkedHashSet() { + LinkedHashSet set = new LinkedHashSet<>(); + fill(set); + return set; + } + + // ---- copy ---- + + @Benchmark + public Set clone_hashSet() { + return new HashSet<>(hashSet); + } + + @Benchmark + public Set clone_synchronizedSet() { + return Collections.synchronizedSet(new HashSet<>(hashSet)); + } + + @Benchmark + public Set clone_treeSet() { + return new TreeSet<>(treeSet); + } + + @Benchmark + public Set clone_linkedHashSet() { + return new LinkedHashSet<>(linkedHashSet); + } + + // ---- read: unsynchronized baseline vs uncontended synchronized (biased-locking story) ---- + + @Benchmark + public boolean contains_hashSet() { + return hashSet.contains(nextLookup()); + } + + @Benchmark + public boolean contains_synchronizedSet() { + return synchronizedSet.contains(nextLookup()); + } + + @Benchmark + public void iterate_hashSet(Blackhole blackhole) { + for 
(String s : hashSet) { + blackhole.consume(s); + } + } + + @Benchmark + public void iterate_synchronizedSet(Blackhole blackhole) { + // Collections.synchronizedSet requires the caller to synchronize during iteration; this is the + // correct usage and measures one (uncontended) monitor acquire around the traversal. + synchronized (synchronizedSet) { + for (String s : synchronizedSet) { + blackhole.consume(s); + } + } + } +} From 5a14e62e10ad9138bb90c6643ff5111b39a43145 Mon Sep 17 00:00:00 2001 From: Douglas Q Hawkins Date: Wed, 24 Jun 2026 07:15:12 -0400 Subject: [PATCH 2/2] Add Java 17 results to set benchmark Javadocs ImmutableSetBenchmark: HashSet fastest; Set.copyOf (SetN) ~10% behind on hit, the compact form the agent uses for fixed config sets. SingleThreadedSetBenchmark: uncontended synchronizedSet tax ~37% on contains (biased locking off, Java 17), near-zero on iterate. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../trace/util/ImmutableSetBenchmark.java | 25 +++++++++++++++-- .../util/SingleThreadedSetBenchmark.java | 28 ++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java index 8dc45eed908..13312c6c576 100644 --- a/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java +++ b/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java @@ -37,8 +37,29 @@ * * *

Lookups are interned (the {@code ==} fast path where a structure has one); misses are short - * and never present. (Results pending a fresh multi-JVM run — {@code Set.copyOf} only materializes - * the compact form on Java 10+.) + * and never present. + * + *

Java 17 results (Apple M1, {@code @Fork(2)}, {@code @Threads(8)}; M ops/s = millions): + * + *

{@code
+ * Structure              hit     miss
+ * hashSet               2159     1751    (fastest)
+ * copyOf (SetN)         1946     1633
+ * array                  926      584
+ * sortedArray            664      588
+ * treeSet                642      593
+ * }
+ * + *

Key findings: + * + *

    + *
  • {@code HashSet} is fastest; {@code Set.copyOf} ({@code SetN}) trails by only ~10% + * on hit and ~7% on miss — and it's the compact, array-backed form the agent already uses for + * fixed config sets, so it's a strong default when the set is immutable. + *
  • {@code array} / {@code sortedArray} / {@code treeSet} cluster at ~580–930M — they scan, + * binary-search, or tree-walk per lookup, so they reach under half of {@code hashSet}'s + * throughput. {@code array} drops most on miss (926 → 584M): a miss scans every element. + *
*/ @Fork(2) @Warmup(iterations = 2) diff --git a/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java index b90fbbeb288..f9e9b69179a 100644 --- a/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java +++ b/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java @@ -29,7 +29,33 @@ * monitor is only ever locked by one thread: biased locking ≈ free on Java ≤ 11, full uncontended * CAS on Java 15+ (biased locking disabled by default, JEP 374). The unsynchronized {@code hashSet} * {@code contains}/{@code iterate} methods are the in-harness baseline; the tax is the delta. - * (Results pending a fresh multi-JVM run.) + * + *

Java 17 results (Apple M1, {@code @Fork(2)}, {@code @Threads(8)}; M ops/s = millions): + * + *

{@code
+ * contains_hashSet            1291
+ * contains_synchronizedSet     808    (~37% slower — the uncontended sync tax)
+ * iterate_hashSet              91
+ * iterate_synchronizedSet      90    (one monitor acquire amortized over the walk)
+ *
+ * create_hashSet         81    clone_hashSet          48
+ * create_hashSet_sized   78    clone_synchronizedSet  47
+ * create_linkedHashSet   61    clone_linkedHashSet    59
+ * create_synchronizedSet 41    clone_treeSet          83
+ * create_treeSet         36
+ * }
+ * + *

Key findings: + * + *

    + *
  • Uncontended synchronization tax on {@code contains} is ~37% (1291 → 808M ops/s) even + * with no contention: biased locking is disabled on Java 17 (JEP 374), so every call + * pays the full per-lock CAS. On {@code iterate} it nearly vanishes: a single monitor + * acquire amortized over the traversal. + *
  • Construction: {@code TreeSet} is the slowest to build (~36M); the {@code synchronizedSet} + * wrapper roughly halves {@code create} (81 → 41M) but is nearly free on {@code clone} + * (48 → 47M). Allocation-path numbers carry more run-to-run variance than the read paths. + *
*/ @Fork(2) @Warmup(iterations = 2)