diff --git a/.gitlab/sanitizer-tests/.gitlab-ci.yml b/.gitlab/sanitizer-tests/.gitlab-ci.yml index 2c63425b2..1bf7ca232 100644 --- a/.gitlab/sanitizer-tests/.gitlab-ci.yml +++ b/.gitlab/sanitizer-tests/.gitlab-ci.yml @@ -47,7 +47,6 @@ gtest-asan-amd64: extends: .sanitizer_job - allow_failure: true tags: [ "arch:amd64" ] image: $BUILD_IMAGE_X64 variables: @@ -56,12 +55,10 @@ gtest-asan-amd64: gtest-tsan-amd64: extends: .sanitizer_job - allow_failure: true # docker-in-docker:amd64 = Kata Containers (kata-qemu micro VMs). # Kata maps host-guest communication structures at fixed high addresses # that land in TSan's shadow region regardless of LLVM version or sysctl. # TSan on amd64 requires a non-Kata runner (EC2 or bare metal). - # Kept allow_failure so it runs and provides coverage if the environment is fixed. tags: [ "docker-in-docker:amd64" ] image: $BUILD_IMAGE_X64 variables: @@ -75,12 +72,14 @@ gtest-tsan-amd64: echo "=== $(basename $binary) ===" GTEST_DEATH_TEST_STYLE=threadsafe "$binary" rc=$? - [ $rc -ne 0 ] && { echo "FAILED: $(basename $binary) exited $rc"; exit $rc; } + if [ $rc -ne 0 ]; then + echo "FAILED: $(basename $binary) exited $rc" + exit $rc + fi done gtest-asan-arm64: extends: .sanitizer_job - allow_failure: true tags: [ "arch:arm64" ] image: $BUILD_IMAGE_ARM64 variables: @@ -89,7 +88,6 @@ gtest-asan-arm64: gtest-tsan-arm64: extends: .sanitizer_job - allow_failure: true # docker-in-docker:arm64 = EC2 VM. sysctl works directly. # vm.mmap_rnd_bits=28 is sufficient — TSan's LLVM re-exec handles the rare # case where a library lands in the shadow region by re-running the process @@ -111,5 +109,8 @@ gtest-tsan-arm64: echo "=== $(basename $binary) ===" GTEST_DEATH_TEST_STYLE=threadsafe "$binary" rc=$? - [ $rc -ne 0 ] && { echo "FAILED: $(basename $binary) exited $rc"; exit $rc; } + if [ $rc -ne 0 ]; then + echo "FAILED: $(basename $binary) exited $rc" + exit $rc + fi done diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt index 275e99321..b2be2b586 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt @@ -250,24 +250,38 @@ object ConfigurationPresets { config.compilerArgs.set(tsanCompilerArgs + commonLinuxCompilerArgs(version)) val libtsan = PlatformUtils.locateLibtsan(compiler) + // Use the library name from the resolved path so that clang's own + // libclang_rt.tsan-.so is linked by name (not as -ltsan). val tsanLinkerArgs = if (libtsan != null) { + val tsanLibDir = File(libtsan).parent + val tsanLibName = File(libtsan).nameWithoutExtension.removePrefix("lib") listOf( - "-L${File(libtsan).parent}", - "-ltsan", + "-L$tsanLibDir", + "-l$tsanLibName", + "-Wl,-rpath,$tsanLibDir", "-fsanitize=thread", "-fno-omit-frame-pointer" ) } else { - emptyList() + listOf("-fsanitize=thread", "-fno-omit-frame-pointer") } config.linkerArgs.set(commonLinuxLinkerArgs() + tsanLinkerArgs) - if (libtsan != null) { - config.testEnvironment.apply { + config.testEnvironment.apply { + if (libtsan != null) { put("LD_PRELOAD", libtsan) - put("TSAN_OPTIONS", "suppressions=$rootDir/gradle/sanitizers/tsan.supp") + // handle_segv=0 / handle_sigbus=0: let the JVM handle these signals + // (SafeFetch, NullPointerException, memory bus errors). + // use_sigaltstack=0: JVM manages its own alternate signal stack. + // halt_on_error=0: report all races; process exits with code 66 at end. + // abort_on_error=0: use exit() not abort() so Java shutdown hooks run. + // io_sync=0: disable TSan's own FD-tracking, which races internally + // when the JVM concurrently closes/reads file descriptors. + put("TSAN_OPTIONS", "handle_segv=0:handle_sigbus=0:use_sigaltstack=0:halt_on_error=0:abort_on_error=0:io_sync=0:suppressions=$rootDir/gradle/sanitizers/tsan.supp") } + // fork() is unsupported under TSan; threadsafe style uses execve instead. + put("GTEST_DEATH_TEST_STYLE", "threadsafe") } } Platform.MACOS -> { diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt index 064ed6ba0..6f7d59ba3 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt @@ -128,13 +128,11 @@ class GtestTaskBuilder( } private fun buildLinkTask(compileTask: TaskProvider): TaskProvider { - // For executables, clang's -fsanitize=address statically embeds the full - // ASan runtime (--whole-archive libclang_rt.asan*.a). Adding an explicit - // -lclang_rt.asan or -lasan on top produces a second dynamic NEEDED entry, - // which triggers "incompatible ASan runtimes" at startup (two __asan_init - // calls). Strip the explicit sanitizer -l/-L/-rpath flags here so the - // executable relies solely on clang's automatic static embedding. - val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang.*/)") + // Strip explicit sanitizer -l/-L/-rpath flags so the executable relies solely + // on clang's automatic static embedding of the sanitizer runtime. Mixing + // clang's statically-embedded runtime with an explicit GCC libtsan/libasan + // causes "incompatible runtimes" at startup (two __tsan_init/__asan_init calls). + val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-ltsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-lclang_rt\\.tsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang/.*)") val linkerArgs = config.linkerArgs.get().filter { !sanitizerLibPattern.containsMatchIn(it) } val objDir = project.file("${project.layout.buildDirectory.get()}/obj/gtest/${config.name}/$testName") val binary = project.file("${project.layout.buildDirectory.get()}/bin/gtest/${config.name}_$testName/$testName") diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt index 86a187893..b9b38d424 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt @@ -145,7 +145,23 @@ object PlatformUtils { return locateLibrary("libasan", compiler) } - fun locateLibtsan(compiler: String = "gcc"): String? = locateLibrary("libtsan", compiler) + fun locateLibtsan(compiler: String = "gcc"): String? { + if (currentPlatform != Platform.LINUX) return null + // For clang, prefer the architecture-specific clang_rt.tsan library over + // GCC's libtsan. GCC 11's libtsan only handles 39-bit VMA on aarch64; + // clang's compiler-rt tsan handles both 39-bit and 48-bit VMA. + if (compiler.contains("clang")) { + val archSuffix = when (currentArchitecture) { + Architecture.X64 -> "x86_64" + Architecture.ARM64 -> "aarch64" + Architecture.X86 -> "i386" + Architecture.ARM -> "arm" + } + val clangTsan = locateLibrary("libclang_rt.tsan-$archSuffix", compiler) + if (clangTsan != null) return clangTsan + } + return locateLibrary("libtsan", compiler) + } fun checkFuzzerSupport(): Boolean { return try { diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt index 36752d0f9..56827cb28 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt @@ -259,7 +259,11 @@ class ProfilerTestPlugin : Plugin { // https://github.com/eclipse-openj9/openj9/issues/23514 !PlatformUtils.isTestJvmJ9() } - "tsan" -> testTask.onlyIf { PlatformUtils.locateLibtsan() != null } + // TSan + JVM integration tests are incompatible: the profiler's signal + // handlers (SIGPROF at 1ms) are TSan-instrumented; when a signal fires + // while TSan is updating its shadow memory it causes re-entrance and a + // SIGSEGV. TSan coverage is provided by the C++ gtest suite (gtestTsan). + "tsan" -> testTask.onlyIf { false } } } } @@ -347,7 +351,7 @@ class ProfilerTestPlugin : Plugin { // https://github.com/eclipse-openj9/openj9/issues/23514 !PlatformUtils.isTestJvmJ9() } - "tsan" -> execTask.onlyIf { PlatformUtils.locateLibtsan() != null } + "tsan" -> execTask.onlyIf { false } // same reason as testTask above } } } diff --git a/ddprof-lib/src/main/cpp/callTraceHashTable.cpp b/ddprof-lib/src/main/cpp/callTraceHashTable.cpp index 5446e8082..03e0dd980 100644 --- a/ddprof-lib/src/main/cpp/callTraceHashTable.cpp +++ b/ddprof-lib/src/main/cpp/callTraceHashTable.cpp @@ -251,7 +251,10 @@ u64 CallTraceHashTable::put(int num_frames, ASGCT_CallFrame *frames, bool truncated, u64 weight) { u64 hash = calcHash(num_frames, frames, truncated); - LongHashTable *table = _table; + // ACQUIRE pairs with the ACQ_REL CAS in the expansion path below, ensuring + // that if another thread published a new (expanded) table we see its fully + // initialised contents. + LongHashTable *table = __atomic_load_n(&_table, __ATOMIC_ACQUIRE); if (table == nullptr) { // Table allocation failed or was cleared - drop sample Counters::increment(CALLTRACE_STORAGE_DROPPED); diff --git a/ddprof-lib/src/main/cpp/refCountGuard.cpp b/ddprof-lib/src/main/cpp/refCountGuard.cpp index 3b91bea08..583c3d8d4 100644 --- a/ddprof-lib/src/main/cpp/refCountGuard.cpp +++ b/ddprof-lib/src/main/cpp/refCountGuard.cpp @@ -170,7 +170,14 @@ void RefCountGuard::waitForRefCountToClear(void* table_to_delete) { bool all_clear = true; for (int i = 0; i < MAX_THREADS; ++i) { uint32_t count = __atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE); - if (count == 0) continue; + if (count == 0) { + // Check active_ptr to cover the non-reentrant constructor's activation window: + // active_ptr is stored (RELEASE) before count++ (RELEASE), so count==0 with + // active_ptr set means the thread is in the window and must be waited for. + void* aptr = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE); + if (aptr == table_to_delete) { all_clear = false; break; } + continue; + } if (slotReferences(refcount_slots[i], table_to_delete)) { all_clear = false; break; } } if (all_clear) return; @@ -183,7 +190,11 @@ void RefCountGuard::waitForRefCountToClear(void* table_to_delete) { bool all_clear = true; for (int i = 0; i < MAX_THREADS; ++i) { uint32_t count = __atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE); - if (count == 0) continue; + if (count == 0) { + void* aptr = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE); + if (aptr == table_to_delete) { all_clear = false; break; } + continue; + } if (slotReferences(refcount_slots[i], table_to_delete)) { all_clear = false; break; } } if (all_clear) return; @@ -208,6 +219,10 @@ void RefCountGuard::waitForAllRefCountsToClear() { bool any = false; for (int i = 0; i < MAX_THREADS; ++i) { if (__atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE) > 0) { any = true; break; } + // Also check active_ptr: non-reentrant constructor stores it (RELEASE) before + // count++ (RELEASE), so count==0 with active_ptr!=null means the thread is in + // the activation window and must be waited for. + if (__atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE) != nullptr) { any = true; break; } } if (!any) return; spinPause(); @@ -215,11 +230,12 @@ void RefCountGuard::waitForAllRefCountsToClear() { const int MAX_WAIT_ITERATIONS = 5000; struct timespec sleep_time = {0, 100000}; - int last_nonzero_slot = -1; // for timeout diagnostic below + int last_nonzero_slot = -1; for (int wait_count = 0; wait_count < MAX_WAIT_ITERATIONS; ++wait_count) { bool any = false; for (int i = 0; i < MAX_THREADS; ++i) { if (__atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE) > 0) { any = true; last_nonzero_slot = i; break; } + if (__atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE) != nullptr) { any = true; last_nonzero_slot = i; break; } } if (!any) return; nanosleep(&sleep_time, nullptr); diff --git a/ddprof-lib/src/main/cpp/refCountGuard.h b/ddprof-lib/src/main/cpp/refCountGuard.h index 7b2e1c0fd..5f5f25ad5 100644 --- a/ddprof-lib/src/main/cpp/refCountGuard.h +++ b/ddprof-lib/src/main/cpp/refCountGuard.h @@ -15,17 +15,16 @@ * Each slot occupies a full cache line (64 bytes) to eliminate false sharing. * * ACTIVATION PROTOCOL (pointer-first): - * - Constructor: store active_ptr first, then increment count - * - Destructor: decrement count first, then clear active_ptr - * - Scanner: check count; if 0, skip slot (treats it as inactive) + * - Constructor: store active_ptr (RELEASE) first, then increment count (RELEASE) + * - Destructor: decrement count (RELEASE) first, then clear active_ptr (RELEASE) + * - Scanner: load count (ACQUIRE); if 0, also load active_ptr (ACQUIRE); + * treat the slot as active if either count > 0 or active_ptr != null * - * There is a brief activation window between the store of active_ptr and the - * increment of count where the scanner sees count=0 and may skip the slot. - * For signal handlers this window is never observed in practice: handlers - * complete within microseconds while a buffer can only be cleared after TWO - * full dump cycles (typically 60+ seconds). If the window were hit, the caller - * observes a miss (e.g. 0 or equivalent sentinel) and handles it gracefully - * — a dropped trace or generic vtable frame, not a crash. + * The scanner checks active_ptr even when count==0 to cover the non-reentrant + * constructor's activation window (active_ptr stored but count not yet incremented) + * and the destructor's deactivation window (count decremented but active_ptr not yet + * cleared). The RELEASE/ACQUIRE pairing on active_ptr itself guarantees the scanner + * sees the stored value if it was written before the load in program order. * * REENTRANT NESTING: when a signal fires inside an outer guard (same thread), * the displaced active_ptr is parked in outer_stack[] so the scanner can still @@ -64,7 +63,7 @@ struct alignas(DEFAULT_CACHE_LINE_SIZE) RefCountSlot { * Correctness: * - Pointer stored BEFORE count increment (activation) * - Count decremented BEFORE pointer cleared (deactivation) - * - Scanner checks count first, ensuring consistent view + * - Scanner checks both count and active_ptr to close the activation window * * Reentrancy: * - A signal handler may create a RefCountGuard while a JNI thread already diff --git a/ddprof-lib/src/main/cpp/stringDictionary.h b/ddprof-lib/src/main/cpp/stringDictionary.h index 98c905faf..05bb62d3d 100644 --- a/ddprof-lib/src/main/cpp/stringDictionary.h +++ b/ddprof-lib/src/main/cpp/stringDictionary.h @@ -347,7 +347,10 @@ class StringDictionaryBuffer { return stored_id; } } - if (!row->next) { + // Relaxed is fine here: the optimization hint may be stale; the CAS + // below will handle that, and the ACQUIRE load of row->next below + // provides the necessary happens-before for the newly-created SBTable's contents. + if (!__atomic_load_n(&row->next, __ATOMIC_RELAXED)) { SBTable* nt = static_cast(calloc(1, sizeof(SBTable))); if (nt == nullptr) return 0; if (!__sync_bool_compare_and_swap(&row->next, nullptr, nt)) { diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h index e31c18bba..ff28e3550 100644 --- a/ddprof-lib/src/main/cpp/thread.h +++ b/ddprof-lib/src/main/cpp/thread.h @@ -87,11 +87,18 @@ class ProfiledThread : public ThreadLocalData { #ifdef UNIT_TEST // Simulates the moment inside release() after pthread_setspecific(NULL) but // before delete — the race window the clearCurrentThreadTLS fix covers. - static void clearCurrentThreadTLS() { + // Returns the detached pointer so the caller can delete it after assertions. + static ProfiledThread* clearCurrentThreadTLS() { if (__atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE)) { + ProfiledThread *pt = (ProfiledThread *)pthread_getspecific(_tls_key); pthread_setspecific(_tls_key, nullptr); + return pt; } + return nullptr; } + // Deletes a ProfiledThread returned by clearCurrentThreadTLS(). + // Needed because the destructor is private. + static void deleteForTest(ProfiledThread *pt) { delete pt; } #endif static ProfiledThread *current(); diff --git a/ddprof-lib/src/test/cpp/refCountGuard_ut.cpp b/ddprof-lib/src/test/cpp/refCountGuard_ut.cpp new file mode 100644 index 000000000..6fac0bbe2 --- /dev/null +++ b/ddprof-lib/src/test/cpp/refCountGuard_ut.cpp @@ -0,0 +1,142 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "refCountGuard.h" +#include +#include +#include +#include +#include + +// ── Activation-window regression tests ──────────────────────────────────── +// +// The non-reentrant RefCountGuard constructor stores active_ptr (RELEASE) +// BEFORE incrementing count (RELEASE). Between those two stores the slot has +// count==0 but active_ptr!=null: the "activation window". +// +// Both wait functions must also check active_ptr when count==0, otherwise they +// return early and the caller can free the resource while the holder is still in +// the window and will access it after the free (heap-use-after-free). +// +// These tests exercise exactly that scenario: +// 1. Holder stores active_ptr, sleeps to keep the window open. +// 2. Cleaner calls the wait function, then frees the resource. +// 3. Holder wakes, increments count (exits window), writes to the resource. +// +// Without the fix: cleaner's wait returns immediately (count==0 → skip slot), +// freeing the resource before holder uses it. +// ASAN reports: heap-use-after-free on holder's write. +// TSAN reports: data race / use-after-free between the delete and the write. +// +// With the fix: cleaner's wait observes active_ptr!=null and blocks until +// holder clears it (after finishing the write), so the free is safe. +// +// NOTE: These tests bypass getThreadRefCountSlot() and manipulate +// refcount_slots[] directly. They use the last slot (MAX_THREADS-1) which +// is unlikely to be claimed by any real thread during the test, and always +// restore it to the clean state on exit. + +class RefCountGuardActivationWindowTest : public ::testing::Test { +protected: + static constexpr int TEST_SLOT = RefCountGuard::MAX_THREADS - 1; + + void SetUp() override { + auto& slot = RefCountGuard::refcount_slots[TEST_SLOT]; + __atomic_store_n(&slot.count, 0u, __ATOMIC_SEQ_CST); + __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_SEQ_CST); + for (int i = 0; i < RefCountSlot::OUTER_STACK_DEPTH; ++i) { + __atomic_store_n(&slot.outer_stack[i], nullptr, __ATOMIC_SEQ_CST); + } + } + + void TearDown() override { + // Restore to clean state regardless of test outcome. + auto& slot = RefCountGuard::refcount_slots[TEST_SLOT]; + __atomic_store_n(&slot.count, 0u, __ATOMIC_SEQ_CST); + __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_SEQ_CST); + } +}; + +// waitForAllRefCountsToClear must not return while a slot is in the activation window. +TEST_F(RefCountGuardActivationWindowTest, WaitForAllBlocksDuringActivationWindow) { + auto& slot = RefCountGuard::refcount_slots[TEST_SLOT]; + + char* resource = new char[64]; + std::memset(resource, 0xAB, 64); + + // Keep a stable pointer for the holder's write so the compiler cannot fold it + // away even after resource is set to nullptr by the cleaner. + volatile char* stable = resource; + + std::atomic window_entered{false}; + + std::thread holder([&] { + // Activation window start: store active_ptr (RELEASE), count still 0. + __atomic_store_n(&slot.active_ptr, resource, __ATOMIC_RELEASE); + window_entered.store(true, std::memory_order_release); + + // Hold the window open long enough for the cleaner to enter its wait loop. + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + // Activation window end: count++ (RELEASE). + __atomic_fetch_add(&slot.count, 1u, __ATOMIC_RELEASE); + + // Write to resource. Under the bug this is heap-use-after-free (ASAN/TSAN catch it). + *stable = 0xCD; + + // Destroy guard: count--, active_ptr = null. + __atomic_fetch_sub(&slot.count, 1u, __ATOMIC_RELEASE); + __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_RELEASE); + }); + + // Ensure holder is in the activation window before the cleaner proceeds. + while (!window_entered.load(std::memory_order_acquire)) { /* spin */ } + + // With the fix: blocks until holder clears active_ptr (after the write). + // Without the fix: returns immediately because count==0. + RefCountGuard::waitForAllRefCountsToClear(); + + // Free the resource. With the fix the holder has already finished writing. + // Without the fix the holder is still asleep and will write to freed memory. + delete[] resource; + resource = nullptr; + + holder.join(); +} + +// waitForRefCountToClear(ptr) must not return while the target ptr is in the activation window. +TEST_F(RefCountGuardActivationWindowTest, WaitForSpecificBlocksDuringActivationWindow) { + auto& slot = RefCountGuard::refcount_slots[TEST_SLOT]; + + char* resource = new char[64]; + std::memset(resource, 0xAB, 64); + + volatile char* stable = resource; + + std::atomic window_entered{false}; + + std::thread holder([&] { + __atomic_store_n(&slot.active_ptr, resource, __ATOMIC_RELEASE); + window_entered.store(true, std::memory_order_release); + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + __atomic_fetch_add(&slot.count, 1u, __ATOMIC_RELEASE); + + *stable = 0xCD; + + __atomic_fetch_sub(&slot.count, 1u, __ATOMIC_RELEASE); + __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_RELEASE); + }); + + while (!window_entered.load(std::memory_order_acquire)) { /* spin */ } + + RefCountGuard::waitForRefCountToClear(resource); + + delete[] resource; + resource = nullptr; + + holder.join(); +} diff --git a/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp b/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp index a0dc97934..545ee172f 100644 --- a/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp +++ b/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp @@ -114,7 +114,7 @@ static void *t02_body(void *) { g_t02_seen.store(kNotYetRun, std::memory_order_relaxed); // Simulate the race window: TLS cleared but object not yet freed. - ProfiledThread::clearCurrentThreadTLS(); + ProfiledThread *detached = ProfiledThread::clearCurrentThreadTLS(); // Signal delivered in the race window must see null, not a dangling pointer. pthread_kill(pthread_self(), SIGVTALRM); @@ -123,6 +123,9 @@ static void *t02_body(void *) { // release() with TLS already null must not double-free. ProfiledThread::release(); + // Complete the simulated teardown: delete the object (mirrors what freeKey + // would do). Destructor is private so we need the test helper. + ProfiledThread::deleteForTest(detached); return nullptr; } diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java index 3648a7c5f..e3c4dfd1f 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java @@ -56,7 +56,7 @@ private int profiledWork(Shape... shapes) { // is dropped, the receiver class name will not appear next to a vtable stub in JFR. @RetryingTest(5) public void testVtableReceiverFrameInCpuSamples() throws Exception { - Assumptions.assumeFalse(Platform.isZing() || Platform.isJ9()); + Assumptions.assumeFalse(Platform.isZing() || Platform.isJ9() || Platform.isGraal()); waitForProfilerReady(2000); int result = profiledWork(new Circle(), new Square(), new Triangle()); System.err.println(result); diff --git a/gradle/sanitizers/tsan.supp b/gradle/sanitizers/tsan.supp index cf1012df0..91b8999af 100644 --- a/gradle/sanitizers/tsan.supp +++ b/gradle/sanitizers/tsan.supp @@ -8,6 +8,10 @@ race:libj9*.so race:libjli.so # Suppress data races in libz.so race:libz.so +# Suppress races in JDK NIO/IO/java.so: file-descriptor management in the JVM +# is not TSan-clean (concurrent close vs read on the same fd). +race:libnio.so +race:libjava.so # # Suppress data race in G1Allocator::survivor_attempt_allocation diff --git a/utils/run-docker-tests.sh b/utils/run-docker-tests.sh index a29f95192..bfddc95c4 100755 --- a/utils/run-docker-tests.sh +++ b/utils/run-docker-tests.sh @@ -315,6 +315,25 @@ EOF else CLANG_RT_PKG="" fi + # On aarch64, install clang-17 + libclang-rt-17-dev from the LLVM apt repository. + # GCC 11's libtsan only knows 39-bit VMA; the linuxkit kernel (Docker Desktop) + # uses 48-bit VMA, so GCC 11's TSan crashes with "unexpected memory mapping". + # Clang-17's compiler-rt TSan runtime handles both 39-bit and 48-bit VMA. + # libclang-rt-17-dev provides libclang_rt.tsan-aarch64.{a,so} that clang-17 + # needs when linking executables with -fsanitize=thread. + if [[ "$ARCH" == "aarch64" ]]; then + CLANG17_BLOCK=' +# Install clang-17 + compiler-rt for TSan 48-bit VMA support (aarch64 only) +RUN wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key \ + | gpg --dearmor -o /etc/apt/trusted.gpg.d/llvm.gpg \ + && echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main" \ + > /etc/apt/sources.list.d/llvm.list \ + && apt-get update \ + && apt-get install -y --no-install-recommends clang-17 libclang-rt-17-dev \ + && rm -rf /var/lib/apt/lists/*' + else + CLANG17_BLOCK='' + fi cat > "$DOCKERFILE_DIR/Dockerfile.base" <