diff --git a/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java new file mode 100644 index 00000000000..13312c6c576 --- /dev/null +++ b/internal-api/src/jmh/java/datadog/trace/util/ImmutableSetBenchmark.java @@ -0,0 +1,186 @@ +package datadog.trace.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.TreeSet; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; + +/** + * Membership over a small, fixed, read-only string set shared across threads — split into hit and + * miss lookups (different cost shapes per structure). + * + *

The set is built once and only read, so a single shared instance ({@link Scope#Benchmark}) + * read by all {@code @Threads} is realistic and contention-free. This is the read-mostly + * counterpart to the per-thread mutable {@link SingleThreadedSetBenchmark}, and mirrors {@link + * ImmutableMapBenchmark} on the set side. Sets in the tracer skew strongly toward this fixed, + * read-only shape. + * + *

Strategies compared: + * + *

+ * + *

Hit lookups reuse the interned {@code STRINGS} instances (so they take the {@code ==} fast path + * where a structure has one); miss lookups are short, distinct strings that are never present. + * + *

Java 17 results (Apple M1, {@code @Fork(2)}, {@code @Threads(8)}; M ops/s = millions): + * + *

{@code
+ * Structure              hit     miss
+ * hashSet               2159     1751    (fastest)
+ * copyOf (SetN)         1946     1633
+ * array                  926      584
+ * sortedArray            664      588
+ * treeSet                642      593
+ * }
+ * + *

Key findings: + * + *

+ */
+@Fork(2)
+@Warmup(iterations = 2)
+@Measurement(iterations = 3)
+@Threads(8)
+@State(Scope.Benchmark)
+public class ImmutableSetBenchmark {
+  /** Literal (interned) members of every structure; hit lookups are drawn from this same array. */
+  static final String[] STRINGS = {
+    "foo", "bar", "baz", "quux", "hello", "world",
+    "service", "queryString", "lorem", "ipsum", "dolem", "sit"
+  };
+
+  /** Distinct String instances that are never present, for the miss path. */
+  static final String[] MISSES = newMisses();
+
+  /**
+   * Builds the miss keys: {@code 4 * STRINGS.length} strings of the form {@code "dne-<i>"}, none of
+   * which appears in {@link #STRINGS}.
+   *
+   * @return a freshly allocated array of never-present lookup keys
+   */
+  static String[] newMisses() {
+    String[] misses = new String[STRINGS.length * 4];
+    for (int i = 0; i < misses.length; ++i) {
+      misses[i] = "dne-" + i;
+    }
+    return misses;
+  }
+
+  // Built once, never mutated -- safe to share across the reader threads.
+  // Element types are spelled out (no raw types) so every lookup is type-checked by the compiler.
+  String[] array;
+  String[] sortedArray;
+  HashSet<String> hashSet;
+  TreeSet<String> treeSet;
+  Set<String> copyOfSet;
+
+  /** Populates every structure with the same {@link #STRINGS} elements, once per trial. */
+  @Setup(Level.Trial)
+  public void setUp() {
+    // The linear-scan variant reads STRINGS directly; the binary-search variant needs a sorted copy.
+    array = STRINGS;
+    sortedArray = Arrays.copyOf(STRINGS, STRINGS.length);
+    Arrays.sort(sortedArray);
+    hashSet = new HashSet<>(Arrays.asList(STRINGS));
+    treeSet = new TreeSet<>(Arrays.asList(STRINGS));
+    // NOTE(review): assumes tryMakeImmutableSet returns a Set of the input's element type -- confirm
+    // against CollectionUtils.
+    copyOfSet = CollectionUtils.tryMakeImmutableSet(Arrays.asList(STRINGS));
+  }
+
+  /** Per-thread lookup cursor so each reader thread cycles keys independently. */
+  @State(Scope.Thread)
+  public static class Cursor {
+    int hitIndex = 0;
+    int missIndex = 0;
+
+    /** Advances the hit cursor (wrapping to 0 at the end) and returns that element of STRINGS. */
+    String nextHit() {
+      int i = hitIndex + 1;
+      if (i >= STRINGS.length) {
+        i = 0;
+      }
+      hitIndex = i;
+      return STRINGS[i];
+    }
+
+    /** Advances the miss cursor (wrapping to 0 at the end) and returns that element of MISSES. */
+    String nextMiss() {
+      int i = missIndex + 1;
+      if (i >= MISSES.length) {
+        i = 0;
+      }
+      missIndex = i;
+      return MISSES[i];
+    }
+  }
+
+  /**
+   * Linear membership scan.
+   *
+   * @param array elements to search
+   * @param needle non-null key to look for
+   * @return {@code true} if some element of {@code array} equals {@code needle}
+   */
+  static boolean arrayContains(String[] array, String needle) {
+    for (String s : array) {
+      if (needle.equals(s)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  @Benchmark
+  public boolean array_hit(Cursor cursor) {
+    return arrayContains(array, cursor.nextHit());
+  }
+
+  @Benchmark
+  public boolean array_miss(Cursor cursor) {
+    return arrayContains(array, cursor.nextMiss());
+  }
+
+  @Benchmark
+  public boolean sortedArray_hit(Cursor cursor) {
+    // binarySearch returns a negative insertion-point encoding on a miss, so test the sign.
+    return Arrays.binarySearch(sortedArray, cursor.nextHit()) >= 0;
+  }
+
+  @Benchmark
+  public boolean sortedArray_miss(Cursor cursor) {
+    return Arrays.binarySearch(sortedArray, cursor.nextMiss()) >= 0;
+  }
+
+  @Benchmark
+  public boolean hashSet_hit(Cursor cursor) {
+    return hashSet.contains(cursor.nextHit());
+  }
+
+  @Benchmark
+  public boolean hashSet_miss(Cursor cursor) {
+    return hashSet.contains(cursor.nextMiss());
+  }
+
+  @Benchmark
+  public boolean treeSet_hit(Cursor cursor) {
+    return treeSet.contains(cursor.nextHit());
+  }
+
+  @Benchmark
+  public boolean treeSet_miss(Cursor cursor) {
+    return treeSet.contains(cursor.nextMiss());
+  }
+
+  @Benchmark
+  public boolean copyOf_hit(Cursor cursor) {
+    return copyOfSet.contains(cursor.nextHit());
+  }
+
+  @Benchmark
+  public boolean copyOf_miss(Cursor cursor) {
+    return copyOfSet.contains(cursor.nextMiss());
+  }
+}
diff --git a/internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java
deleted file mode 100644
index 144e4748400..00000000000
--- a/internal-api/src/jmh/java/datadog/trace/util/SetBenchmark.java
+++ /dev/null
@@ -1,128 +0,0 @@
-package datadog.trace.util; - -import java.util.Arrays; -import java.util.Collections; -import java.util.HashSet; -import java.util.TreeSet; -import java.util.concurrent.ThreadLocalRandom; -import java.util.function.Supplier; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Fork; -import org.openjdk.jmh.annotations.Measurement; -import org.openjdk.jmh.annotations.Threads; -import org.openjdk.jmh.annotations.Warmup; - -/** - * - * - *
    - * Benchmark showing possible ways to represent and check if a set includes an elememt... - *
  • (RECOMMENDED) HashSet - on par with TreeSet - idiomatic - *
  • (RECOMMENDED) TreeMap - on par with HashSet - better solution if custom comparator is - * needed (see CaseInsensitiveMapBenchmark) - *
  • array - slower than HashSet - *
  • sortedArray - slowest - slower than array for common case of small arrays - *
- * - * - * MacBook M1 - 8 threads - Java 21 - * 1/3 not found rate - * - * Benchmark Mode Cnt Score Error Units - * SetBenchmark.contains_array thrpt 6 645561886.327 ± 100781717.494 ops/s - * SetBenchmark.contains_hashSet thrpt 6 1536236680.235 ± 114966961.506 ops/s - * SetBenchmark.contains_sortedArray thrpt 6 571476939.441 ± 21334620.460 ops/s - * SetBenchmark.contains_treeSet thrpt 6 1557663759.411 ± 95343683.124 ops/s - * - */ -@Fork(2) -@Warmup(iterations = 2) -@Measurement(iterations = 3) -@Threads(8) -public class SetBenchmark { - static final String[] STRINGS = - new String[] { - "foo", - "bar", - "baz", - "quux", - "hello", - "world", - "service", - "queryString", - "lorem", - "ipsum", - "dolem", - "sit" - }; - - static T init(Supplier supplier) { - return supplier.get(); - } - - static final String[] LOOKUPS = - init( - () -> { - String[] lookups = Arrays.copyOf(STRINGS, STRINGS.length * 10); - - for (int i = 0; i < STRINGS.length; ++i) { - lookups[STRINGS.length + i] = new String(STRINGS[i]); - } - - // 2 / 3 of the key look-ups miss the set - for (int i = STRINGS.length * 2; i < lookups.length; ++i) { - lookups[i] = "dne-" + ThreadLocalRandom.current().nextInt(); - } - - Collections.shuffle(Arrays.asList(lookups)); - return lookups; - }); - - static int sharedLookupIndex = 0; - - static String nextString() { - int localIndex = ++sharedLookupIndex; - if (localIndex >= LOOKUPS.length) { - sharedLookupIndex = localIndex = 0; - } - return LOOKUPS[localIndex]; - } - - static final String[] ARRAY = STRINGS; - - @Benchmark - public boolean contains_array() { - String needle = nextString(); - for (String str : ARRAY) { - if (needle.equals(str)) return true; - } - return false; - } - - static final String[] SORTED_ARRAY = - init( - () -> { - String[] sorted = Arrays.copyOf(STRINGS, STRINGS.length); - Arrays.sort(sorted); - return sorted; - }); - - @Benchmark - public boolean contains_sortedArray() { - return (Arrays.binarySearch(SORTED_ARRAY, nextString()) != 
-1); - } - - static final HashSet HASH_SET = new HashSet<>(Arrays.asList(STRINGS)); - - @Benchmark - public boolean contains_hashSet() { - return HASH_SET.contains(nextString()); - } - - static final TreeSet TREE_SET = new TreeSet<>(Arrays.asList(STRINGS)); - - @Benchmark - public boolean contains_treeSet() { - return HASH_SET.contains(nextString()); - } -} diff --git a/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java b/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java new file mode 100644 index 00000000000..f9e9b69179a --- /dev/null +++ b/internal-api/src/jmh/java/datadog/trace/util/SingleThreadedSetBenchmark.java @@ -0,0 +1,198 @@ +package datadog.trace.util; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.TreeSet; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +/** + * Single-threaded (uncontended) set usage: each thread builds, reads, and discards its own + * sets. Per-thread state ({@link Scope#Thread}); mirrors {@link SingleThreadedMapBenchmark} on the + * set side. Running at {@code @Threads(8)} keeps allocation / GC interactions visible without lock + * contention. + * + *

Sets in the tracer skew read-only/fixed (see {@link ImmutableSetBenchmark}); this covers the + * mutable-lifecycle case for completeness and — via {@link Collections#synchronizedSet} — the + * uncontended synchronization tax. Because each thread owns its synchronized set, the + * monitor is only ever locked by one thread: biased locking makes that ≈ free on Java ≤ 14 (where + * it is on by default), while Java 15+ pays a full uncontended CAS (biased locking disabled by + * default, JEP 374). The unsynchronized {@code hashSet} {@code contains}/{@code iterate} methods + * are the in-harness baseline; the tax is the delta. + * + *

Java 17 results (Apple M1, {@code @Fork(2)}, {@code @Threads(8)}; M ops/s = millions): + * + *

{@code
+ * contains_hashSet            1291
+ * contains_synchronizedSet     808    (~37% slower — the uncontended sync tax)
+ * iterate_hashSet              91
+ * iterate_synchronizedSet      90    (one monitor acquire amortized over the walk)
+ *
+ * create_hashSet         81    clone_hashSet          48
+ * create_hashSet_sized   78    clone_synchronizedSet  47
+ * create_linkedHashSet   61    clone_linkedHashSet    59
+ * create_synchronizedSet 41    clone_treeSet          83
+ * create_treeSet         36
+ * }
+ * + *

Key findings: + * + *

    + *
  • Uncontended synchronization tax on {@code contains} is ~37% (1291 → 808M ops/s) even + * with no contention: biased locking is disabled by default on Java 17 (JEP 374), so every call + * pays the full per-lock CAS cost. On {@code iterate} it nearly vanishes: a single monitor + * acquire amortized over the traversal. + *
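  • Construction: {@code TreeSet} is the slowest to build (~36M); filling through the + * {@code synchronizedSet} wrapper roughly halves throughput versus plain {@code HashSet} + * (81 → 41M, one monitor acquire per {@code add}), while wrapping a bulk copy costs almost + * nothing (48 → 47M). (Allocation-path numbers carry more run-to-run variance than the read + * paths.) + *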
+ */
+@Fork(2)
+@Warmup(iterations = 2)
+@Measurement(iterations = 3)
+@Threads(8)
+@State(Scope.Thread)
+public class SingleThreadedSetBenchmark {
+  /** Elements every benchmarked set is filled with. */
+  static final String[] ELEMENTS = {
+    "foo", "bar", "baz", "quux", "hello", "world",
+    "service", "queryString", "lorem", "ipsum", "dolem", "sit"
+  };
+
+  // Distinct String instances so lookups exercise equals(), not identity.
+  static final String[] EQUAL_ELEMENTS = newEqualElements();
+
+  /**
+   * Copies each element of {@link #ELEMENTS} into a new {@code String} object.
+   *
+   * @return an array of strings equal to, but not identical with, the entries of {@code ELEMENTS}
+   */
+  static String[] newEqualElements() {
+    String[] copies = new String[ELEMENTS.length];
+    for (int i = 0; i < ELEMENTS.length; ++i) {
+      // new String(...) is deliberate: it forces a separate instance for each lookup key.
+      copies[i] = new String(ELEMENTS[i]);
+    }
+    return copies;
+  }
+
+  /**
+   * Adds every entry of {@link #ELEMENTS} to {@code set}, one {@code add} call per element.
+   *
+   * @param set the (initially empty) set under construction
+   */
+  static void fill(Set<String> set) {
+    for (String s : ELEMENTS) {
+      set.add(s);
+    }
+  }
+
+  // Per-thread prebuilt sets for the read + clone benchmarks (built once per trial, per thread).
+  // Parameterized with <String>: the iterate_* benchmarks loop with a String variable, which a raw
+  // collection type would not allow.
+  HashSet<String> hashSet;
+  Set<String> synchronizedSet;
+  TreeSet<String> treeSet;
+  LinkedHashSet<String> linkedHashSet;
+  int index = 0;
+
+  /** Builds this thread's private sets from {@link #ELEMENTS}. */
+  @Setup(Level.Trial)
+  public void setUp() {
+    hashSet = new HashSet<>(Arrays.asList(ELEMENTS));
+    // Wraps its own copy, so the synchronized set shares no storage with hashSet.
+    synchronizedSet = Collections.synchronizedSet(new HashSet<>(hashSet));
+    treeSet = new TreeSet<>(Arrays.asList(ELEMENTS));
+    linkedHashSet = new LinkedHashSet<>(Arrays.asList(ELEMENTS));
+  }
+
+  /** Advances the per-thread cursor (wrapping to 0) and returns that entry of EQUAL_ELEMENTS. */
+  String nextLookup() {
+    if (++index >= EQUAL_ELEMENTS.length) {
+      index = 0;
+    }
+    return EQUAL_ELEMENTS[index];
+  }
+
+  // ---- construction: build cost + allocation ----
+
+  @Benchmark
+  public Set<String> create_hashSet() {
+    HashSet<String> set = new HashSet<>();
+    fill(set);
+    return set;
+  }
+
+  @Benchmark
+  public Set<String> create_hashSet_sized() {
+    // NOTE(review): an initial capacity of 12 is rounded up to a 16-bucket table with a resize
+    // threshold of 12 -- the same geometry as the default constructor -- so this variant is expected
+    // to track create_hashSet for this element count.
+    HashSet<String> set = new HashSet<>(ELEMENTS.length);
+    fill(set);
+    return set;
+  }
+
+  @Benchmark
+  public Set<String> create_synchronizedSet() {
+    // Every add() issued by fill goes through the synchronized wrapper.
+    Set<String> set = Collections.synchronizedSet(new HashSet<>());
+    fill(set);
+    return set;
+  }
+
+  @Benchmark
+  public Set<String> create_treeSet() {
+    TreeSet<String> set = new TreeSet<>();
+    fill(set);
+    return set;
+  }
+
+  @Benchmark
+  public Set<String> create_linkedHashSet() {
+    LinkedHashSet<String> set = new LinkedHashSet<>();
+    fill(set);
+    return set;
+  }
+
+  // ---- copy ----
+
+  @Benchmark
+  public Set<String> clone_hashSet() {
+    return new HashSet<>(hashSet);
+  }
+
+  @Benchmark
+  public Set<String> clone_synchronizedSet() {
+    return Collections.synchronizedSet(new HashSet<>(hashSet));
+  }
+
+  @Benchmark
+  public Set<String> clone_treeSet() {
+    return new TreeSet<>(treeSet);
+  }
+
+  @Benchmark
+  public Set<String> clone_linkedHashSet() {
+    return new LinkedHashSet<>(linkedHashSet);
+  }
+
+  // ---- read: unsynchronized baseline vs uncontended synchronized (biased-locking story) ----
+
+  @Benchmark
+  public boolean contains_hashSet() {
+    return hashSet.contains(nextLookup());
+  }
+
+  @Benchmark
+  public boolean contains_synchronizedSet() {
+    return synchronizedSet.contains(nextLookup());
+  }
+
+  @Benchmark
+  public void iterate_hashSet(Blackhole blackhole) {
+    for (String s : hashSet) {
+      blackhole.consume(s);
+    }
+  }
+
+  @Benchmark
+  public void iterate_synchronizedSet(Blackhole blackhole) {
+    // Collections.synchronizedSet requires the caller to synchronize during iteration; this is the
+    // correct usage and measures one (uncontended) monitor acquire around the traversal.
+    synchronized (synchronizedSet) {
+      for (String s : synchronizedSet) {
+        blackhole.consume(s);
+      }
+    }
+  }
+}