diff --git a/.gitlab/sanitizer-tests/.gitlab-ci.yml b/.gitlab/sanitizer-tests/.gitlab-ci.yml
index 2c63425b2..1bf7ca232 100644
--- a/.gitlab/sanitizer-tests/.gitlab-ci.yml
+++ b/.gitlab/sanitizer-tests/.gitlab-ci.yml
@@ -47,7 +47,6 @@
 
 gtest-asan-amd64:
   extends: .sanitizer_job
-  allow_failure: true
   tags: [ "arch:amd64" ]
   image: $BUILD_IMAGE_X64
   variables:
@@ -56,12 +55,10 @@ gtest-asan-amd64:
 
 gtest-tsan-amd64:
   extends: .sanitizer_job
-  allow_failure: true
   # docker-in-docker:amd64 = Kata Containers (kata-qemu micro VMs).
   # Kata maps host-guest communication structures at fixed high addresses
   # that land in TSan's shadow region regardless of LLVM version or sysctl.
   # TSan on amd64 requires a non-Kata runner (EC2 or bare metal).
-  # Kept allow_failure so it runs and provides coverage if the environment is fixed.
   tags: [ "docker-in-docker:amd64" ]
   image: $BUILD_IMAGE_X64
   variables:
@@ -75,12 +72,14 @@ gtest-tsan-amd64:
           echo "=== $(basename $binary) ==="
           GTEST_DEATH_TEST_STYLE=threadsafe "$binary"
           rc=$?
-          [ $rc -ne 0 ] && { echo "FAILED: $(basename $binary) exited $rc"; exit $rc; }
+          if [ $rc -ne 0 ]; then
+            echo "FAILED: $(basename $binary) exited $rc"
+            exit $rc
+          fi
         done
 
 gtest-asan-arm64:
   extends: .sanitizer_job
-  allow_failure: true
   tags: [ "arch:arm64" ]
   image: $BUILD_IMAGE_ARM64
   variables:
@@ -89,7 +88,6 @@ gtest-asan-arm64:
 
 gtest-tsan-arm64:
   extends: .sanitizer_job
-  allow_failure: true
   # docker-in-docker:arm64 = EC2 VM. sysctl works directly.
   # vm.mmap_rnd_bits=28 is sufficient — TSan's LLVM re-exec handles the rare
   # case where a library lands in the shadow region by re-running the process
@@ -111,5 +109,8 @@ gtest-tsan-arm64:
           echo "=== $(basename $binary) ==="
           GTEST_DEATH_TEST_STYLE=threadsafe "$binary"
           rc=$?
-          [ $rc -ne 0 ] && { echo "FAILED: $(basename $binary) exited $rc"; exit $rc; }
+          if [ $rc -ne 0 ]; then
+            echo "FAILED: $(basename $binary) exited $rc"
+            exit $rc
+          fi
         done
diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt
index 275e99321..b2be2b586 100644
--- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt
+++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt
@@ -250,24 +250,38 @@ object ConfigurationPresets {
                 config.compilerArgs.set(tsanCompilerArgs + commonLinuxCompilerArgs(version))
 
                 val libtsan = PlatformUtils.locateLibtsan(compiler)
+                // Use the library name from the resolved path so that clang's own
+                // libclang_rt.tsan-<arch>.so is linked by name (not as -ltsan).
                 val tsanLinkerArgs = if (libtsan != null) {
+                    val tsanLibDir = File(libtsan).parent
+                    val tsanLibName = File(libtsan).nameWithoutExtension.removePrefix("lib")
                     listOf(
-                        "-L${File(libtsan).parent}",
-                        "-ltsan",
+                        "-L$tsanLibDir",
+                        "-l$tsanLibName",
+                        "-Wl,-rpath,$tsanLibDir",
                         "-fsanitize=thread",
                         "-fno-omit-frame-pointer"
                     )
                 } else {
-                    emptyList()
+                    listOf("-fsanitize=thread", "-fno-omit-frame-pointer")
                 }
 
                 config.linkerArgs.set(commonLinuxLinkerArgs() + tsanLinkerArgs)
 
-                if (libtsan != null) {
-                    config.testEnvironment.apply {
+                config.testEnvironment.apply {
+                    if (libtsan != null) {
                         put("LD_PRELOAD", libtsan)
-                        put("TSAN_OPTIONS", "suppressions=$rootDir/gradle/sanitizers/tsan.supp")
+                        // handle_segv=0 / handle_sigbus=0: let the JVM handle these signals
+                        //   (SafeFetch, NullPointerException, memory bus errors).
+                        // use_sigaltstack=0: JVM manages its own alternate signal stack.
+                        // halt_on_error=0: report all races; process exits with code 66 at end.
+                        // abort_on_error=0: use exit() not abort() so Java shutdown hooks run.
+                        // io_sync=0: disable TSan's own FD-tracking, which races internally
+                        //   when the JVM concurrently closes/reads file descriptors.
+                        put("TSAN_OPTIONS", "handle_segv=0:handle_sigbus=0:use_sigaltstack=0:halt_on_error=0:abort_on_error=0:io_sync=0:suppressions=$rootDir/gradle/sanitizers/tsan.supp")
                     }
+                    // fork() is unsupported under TSan; threadsafe style uses execve instead.
+                    put("GTEST_DEATH_TEST_STYLE", "threadsafe")
                 }
             }
             Platform.MACOS -> {
diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt
index 064ed6ba0..6f7d59ba3 100644
--- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt
+++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt
@@ -128,13 +128,11 @@ class GtestTaskBuilder(
     }
 
     private fun buildLinkTask(compileTask: TaskProvider<NativeCompileTask>): TaskProvider<NativeLinkExecutableTask> {
-        // For executables, clang's -fsanitize=address statically embeds the full
-        // ASan runtime (--whole-archive libclang_rt.asan*.a). Adding an explicit
-        // -lclang_rt.asan or -lasan on top produces a second dynamic NEEDED entry,
-        // which triggers "incompatible ASan runtimes" at startup (two __asan_init
-        // calls). Strip the explicit sanitizer -l/-L/-rpath flags here so the
-        // executable relies solely on clang's automatic static embedding.
-        val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang.*/)")
+        // Strip explicit sanitizer -l/-L/-rpath flags so the executable relies solely
+        // on clang's automatic static embedding of the sanitizer runtime. Mixing
+        // clang's statically-embedded runtime with an explicit GCC libtsan/libasan
+        // causes "incompatible runtimes" at startup (two __tsan_init/__asan_init calls).
+        val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-ltsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-lclang_rt\\.tsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang/.*)")
         val linkerArgs = config.linkerArgs.get().filter { !sanitizerLibPattern.containsMatchIn(it) }
         val objDir = project.file("${project.layout.buildDirectory.get()}/obj/gtest/${config.name}/$testName")
         val binary = project.file("${project.layout.buildDirectory.get()}/bin/gtest/${config.name}_$testName/$testName")
diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt
index 86a187893..b9b38d424 100644
--- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt
+++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt
@@ -145,7 +145,23 @@ object PlatformUtils {
         return locateLibrary("libasan", compiler)
     }
 
-    fun locateLibtsan(compiler: String = "gcc"): String? = locateLibrary("libtsan", compiler)
+    fun locateLibtsan(compiler: String = "gcc"): String? {
+        if (currentPlatform != Platform.LINUX) return null
+        // For clang, prefer the architecture-specific clang_rt.tsan library over
+        // GCC's libtsan. GCC 11's libtsan only handles 39-bit VMA on aarch64;
+        // clang's compiler-rt tsan handles both 39-bit and 48-bit VMA.
+        if (compiler.contains("clang")) {
+            val archSuffix = when (currentArchitecture) {
+                Architecture.X64 -> "x86_64"
+                Architecture.ARM64 -> "aarch64"
+                Architecture.X86 -> "i386"
+                Architecture.ARM -> "arm"
+            }
+            val clangTsan = locateLibrary("libclang_rt.tsan-$archSuffix", compiler)
+            if (clangTsan != null) return clangTsan
+        }
+        return locateLibrary("libtsan", compiler)
+    }
 
     fun checkFuzzerSupport(): Boolean {
         return try {
diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt
index 36752d0f9..56827cb28 100644
--- a/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt
+++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt
@@ -259,7 +259,11 @@ class ProfilerTestPlugin : Plugin<Project> {
                     // https://github.com/eclipse-openj9/openj9/issues/23514
                     !PlatformUtils.isTestJvmJ9()
                 }
-                "tsan" -> testTask.onlyIf { PlatformUtils.locateLibtsan() != null }
+                // TSan + JVM integration tests are incompatible: the profiler's signal
+                // handlers (SIGPROF at 1ms) are TSan-instrumented; when a signal fires
+                // while TSan is updating its shadow memory it causes re-entrance and a
+                // SIGSEGV.  TSan coverage is provided by the C++ gtest suite (gtestTsan).
+                "tsan" -> testTask.onlyIf { false }
             }
         }
     }
@@ -347,7 +351,7 @@ class ProfilerTestPlugin : Plugin<Project> {
                     // https://github.com/eclipse-openj9/openj9/issues/23514
                     !PlatformUtils.isTestJvmJ9()
                 }
-                "tsan" -> execTask.onlyIf { PlatformUtils.locateLibtsan() != null }
+                "tsan" -> execTask.onlyIf { false }  // same reason as testTask above
             }
         }
     }
diff --git a/ddprof-lib/src/main/cpp/callTraceHashTable.cpp b/ddprof-lib/src/main/cpp/callTraceHashTable.cpp
index 5446e8082..03e0dd980 100644
--- a/ddprof-lib/src/main/cpp/callTraceHashTable.cpp
+++ b/ddprof-lib/src/main/cpp/callTraceHashTable.cpp
@@ -251,7 +251,10 @@ u64 CallTraceHashTable::put(int num_frames, ASGCT_CallFrame *frames,
                           bool truncated, u64 weight) {
   u64 hash = calcHash(num_frames, frames, truncated);
 
-  LongHashTable *table = _table;
+  // ACQUIRE pairs with the ACQ_REL CAS in the expansion path below, ensuring
+  // that if another thread published a new (expanded) table we see its fully
+  // initialised contents.
+  LongHashTable *table = __atomic_load_n(&_table, __ATOMIC_ACQUIRE);
   if (table == nullptr) {
     // Table allocation failed or was cleared - drop sample
     Counters::increment(CALLTRACE_STORAGE_DROPPED);
diff --git a/ddprof-lib/src/main/cpp/refCountGuard.cpp b/ddprof-lib/src/main/cpp/refCountGuard.cpp
index 3b91bea08..583c3d8d4 100644
--- a/ddprof-lib/src/main/cpp/refCountGuard.cpp
+++ b/ddprof-lib/src/main/cpp/refCountGuard.cpp
@@ -170,7 +170,14 @@ void RefCountGuard::waitForRefCountToClear(void* table_to_delete) {
         bool all_clear = true;
         for (int i = 0; i < MAX_THREADS; ++i) {
             uint32_t count = __atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE);
-            if (count == 0) continue;
+            if (count == 0) {
+                // Check active_ptr to cover the non-reentrant constructor's activation window:
+                // active_ptr is stored (RELEASE) before count++ (RELEASE), so count==0 with
+                // active_ptr set means the thread is in the window and must be waited for.
+                void* aptr = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE);
+                if (aptr == table_to_delete) { all_clear = false; break; }
+                continue;
+            }
             if (slotReferences(refcount_slots[i], table_to_delete)) { all_clear = false; break; }
         }
         if (all_clear) return;
@@ -183,7 +190,11 @@ void RefCountGuard::waitForRefCountToClear(void* table_to_delete) {
         bool all_clear = true;
         for (int i = 0; i < MAX_THREADS; ++i) {
             uint32_t count = __atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE);
-            if (count == 0) continue;
+            if (count == 0) {
+                void* aptr = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE);
+                if (aptr == table_to_delete) { all_clear = false; break; }
+                continue;
+            }
             if (slotReferences(refcount_slots[i], table_to_delete)) { all_clear = false; break; }
         }
         if (all_clear) return;
@@ -208,6 +219,10 @@ void RefCountGuard::waitForAllRefCountsToClear() {
         bool any = false;
         for (int i = 0; i < MAX_THREADS; ++i) {
             if (__atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE) > 0) { any = true; break; }
+            // Also check active_ptr: non-reentrant constructor stores it (RELEASE) before
+            // count++ (RELEASE), so count==0 with active_ptr!=null means the thread is in
+            // the activation window and must be waited for.
+            if (__atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE) != nullptr) { any = true; break; }
         }
         if (!any) return;
         spinPause();
@@ -215,11 +230,12 @@ void RefCountGuard::waitForAllRefCountsToClear() {
 
     const int MAX_WAIT_ITERATIONS = 5000;
     struct timespec sleep_time = {0, 100000};
-    int last_nonzero_slot = -1;  // for timeout diagnostic below
+    int last_nonzero_slot = -1;
     for (int wait_count = 0; wait_count < MAX_WAIT_ITERATIONS; ++wait_count) {
         bool any = false;
         for (int i = 0; i < MAX_THREADS; ++i) {
             if (__atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE) > 0) { any = true; last_nonzero_slot = i; break; }
+            if (__atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE) != nullptr) { any = true; last_nonzero_slot = i; break; }
         }
         if (!any) return;
         nanosleep(&sleep_time, nullptr);
diff --git a/ddprof-lib/src/main/cpp/refCountGuard.h b/ddprof-lib/src/main/cpp/refCountGuard.h
index 7b2e1c0fd..5f5f25ad5 100644
--- a/ddprof-lib/src/main/cpp/refCountGuard.h
+++ b/ddprof-lib/src/main/cpp/refCountGuard.h
@@ -15,17 +15,16 @@
  * Each slot occupies a full cache line (64 bytes) to eliminate false sharing.
  *
  * ACTIVATION PROTOCOL (pointer-first):
- * - Constructor: store active_ptr first, then increment count
- * - Destructor:  decrement count first, then clear active_ptr
- * - Scanner:     check count; if 0, skip slot (treats it as inactive)
+ * - Constructor: store active_ptr (RELEASE) first, then increment count (RELEASE)
+ * - Destructor:  decrement count (RELEASE) first, then clear active_ptr (RELEASE)
+ * - Scanner:     load count (ACQUIRE); if 0, also load active_ptr (ACQUIRE);
+ *                treat the slot as active if either count > 0 or active_ptr != null
  *
- * There is a brief activation window between the store of active_ptr and the
- * increment of count where the scanner sees count=0 and may skip the slot.
- * For signal handlers this window is never observed in practice: handlers
- * complete within microseconds while a buffer can only be cleared after TWO
- * full dump cycles (typically 60+ seconds).  If the window were hit, the caller
- * observes a miss (e.g. 0 or equivalent sentinel) and handles it gracefully
- * — a dropped trace or generic vtable frame, not a crash.
+ * The scanner checks active_ptr even when count==0 to cover the non-reentrant
+ * constructor's activation window (active_ptr stored but count not yet incremented)
+ * and the destructor's deactivation window (count decremented but active_ptr not yet
+ * cleared).  The RELEASE/ACQUIRE pairing on active_ptr itself guarantees the scanner
+ * sees the stored value if it was written before the load in program order.
  *
  * REENTRANT NESTING: when a signal fires inside an outer guard (same thread),
  * the displaced active_ptr is parked in outer_stack[] so the scanner can still
@@ -64,7 +63,7 @@ struct alignas(DEFAULT_CACHE_LINE_SIZE) RefCountSlot {
  * Correctness:
  * - Pointer stored BEFORE count increment (activation)
  * - Count decremented BEFORE pointer cleared (deactivation)
- * - Scanner checks count first, ensuring consistent view
+ * - Scanner checks both count and active_ptr to close the activation window
  *
  * Reentrancy:
  * - A signal handler may create a RefCountGuard while a JNI thread already
diff --git a/ddprof-lib/src/main/cpp/stringDictionary.h b/ddprof-lib/src/main/cpp/stringDictionary.h
index 98c905faf..05bb62d3d 100644
--- a/ddprof-lib/src/main/cpp/stringDictionary.h
+++ b/ddprof-lib/src/main/cpp/stringDictionary.h
@@ -347,7 +347,10 @@ class StringDictionaryBuffer {
                     return stored_id;
                 }
             }
-            if (!row->next) {
+            // Relaxed is fine here: the optimization hint may be stale; the CAS
+            // below will handle that, and the ACQUIRE load of row->next below
+            // provides the necessary happens-before for the newly-created SBTable's contents.
+            if (!__atomic_load_n(&row->next, __ATOMIC_RELAXED)) {
                 SBTable* nt = static_cast<SBTable*>(calloc(1, sizeof(SBTable)));
                 if (nt == nullptr) return 0;
                 if (!__sync_bool_compare_and_swap(&row->next, nullptr, nt)) {
diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h
index e31c18bba..ff28e3550 100644
--- a/ddprof-lib/src/main/cpp/thread.h
+++ b/ddprof-lib/src/main/cpp/thread.h
@@ -87,11 +87,18 @@ class ProfiledThread : public ThreadLocalData {
 #ifdef UNIT_TEST
   // Simulates the moment inside release() after pthread_setspecific(NULL) but
   // before delete — the race window the clearCurrentThreadTLS fix covers.
-  static void clearCurrentThreadTLS() {
+  // Returns the detached pointer so the caller can delete it after assertions.
+  static ProfiledThread* clearCurrentThreadTLS() {
     if (__atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE)) {
+      ProfiledThread *pt = (ProfiledThread *)pthread_getspecific(_tls_key);
       pthread_setspecific(_tls_key, nullptr);
+      return pt;
     }
+    return nullptr;
   }
+  // Deletes a ProfiledThread returned by clearCurrentThreadTLS().
+  // Needed because the destructor is private.
+  static void deleteForTest(ProfiledThread *pt) { delete pt; }
 #endif
 
   static ProfiledThread *current();
diff --git a/ddprof-lib/src/test/cpp/refCountGuard_ut.cpp b/ddprof-lib/src/test/cpp/refCountGuard_ut.cpp
new file mode 100644
index 000000000..6fac0bbe2
--- /dev/null
+++ b/ddprof-lib/src/test/cpp/refCountGuard_ut.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2026, Datadog, Inc.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "refCountGuard.h"
+#include <atomic>
+#include <chrono>
+#include <cstring>
+#include <gtest/gtest.h>
+#include <thread>
+
+// ── Activation-window regression tests ────────────────────────────────────
+//
+// The non-reentrant RefCountGuard constructor stores active_ptr (RELEASE)
+// BEFORE incrementing count (RELEASE).  Between those two stores the slot has
+// count==0 but active_ptr!=null: the "activation window".
+//
+// Both wait functions must also check active_ptr when count==0, otherwise they
+// return early and the caller can free the resource while the holder is still in
+// the window and will access it after the free (heap-use-after-free).
+//
+// These tests exercise exactly that scenario:
+//   1. Holder stores active_ptr, sleeps to keep the window open.
+//   2. Cleaner calls the wait function, then frees the resource.
+//   3. Holder wakes, increments count (exits window), writes to the resource.
+//
+// Without the fix: cleaner's wait returns immediately (count==0 → skip slot),
+//   freeing the resource before holder uses it.
+//   ASAN reports: heap-use-after-free on holder's write.
+//   TSAN reports: data race / use-after-free between the delete and the write.
+//
+// With the fix: cleaner's wait observes active_ptr!=null and blocks until
+//   holder clears it (after finishing the write), so the free is safe.
+//
+// NOTE: These tests bypass getThreadRefCountSlot() and manipulate
+//   refcount_slots[] directly.  They use the last slot (MAX_THREADS-1) which
+//   is unlikely to be claimed by any real thread during the test, and always
+//   restore it to the clean state on exit.
+
+class RefCountGuardActivationWindowTest : public ::testing::Test {
+protected:
+    static constexpr int TEST_SLOT = RefCountGuard::MAX_THREADS - 1;
+
+    void SetUp() override {
+        auto& slot = RefCountGuard::refcount_slots[TEST_SLOT];
+        __atomic_store_n(&slot.count, 0u, __ATOMIC_SEQ_CST);
+        __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_SEQ_CST);
+        for (int i = 0; i < RefCountSlot::OUTER_STACK_DEPTH; ++i) {
+            __atomic_store_n(&slot.outer_stack[i], nullptr, __ATOMIC_SEQ_CST);
+        }
+    }
+
+    void TearDown() override {
+        // Restore to clean state regardless of test outcome.
+        auto& slot = RefCountGuard::refcount_slots[TEST_SLOT];
+        __atomic_store_n(&slot.count, 0u, __ATOMIC_SEQ_CST);
+        __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_SEQ_CST);
+    }
+};
+
+// waitForAllRefCountsToClear must not return while a slot is in the activation window.
+TEST_F(RefCountGuardActivationWindowTest, WaitForAllBlocksDuringActivationWindow) {
+    auto& slot = RefCountGuard::refcount_slots[TEST_SLOT];
+
+    char* resource = new char[64];
+    std::memset(resource, 0xAB, 64);
+
+    // Keep a stable pointer for the holder's write so the compiler cannot fold it
+    // away even after resource is set to nullptr by the cleaner.
+    volatile char* stable = resource;
+
+    std::atomic<bool> window_entered{false};
+
+    std::thread holder([&] {
+        // Activation window start: store active_ptr (RELEASE), count still 0.
+        __atomic_store_n(&slot.active_ptr, resource, __ATOMIC_RELEASE);
+        window_entered.store(true, std::memory_order_release);
+
+        // Hold the window open long enough for the cleaner to enter its wait loop.
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+
+        // Activation window end: count++ (RELEASE).
+        __atomic_fetch_add(&slot.count, 1u, __ATOMIC_RELEASE);
+
+        // Write to resource. Under the bug this is heap-use-after-free (ASAN/TSAN catch it).
+        *stable = 0xCD;
+
+        // Destroy guard: count--, active_ptr = null.
+        __atomic_fetch_sub(&slot.count, 1u, __ATOMIC_RELEASE);
+        __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_RELEASE);
+    });
+
+    // Ensure holder is in the activation window before the cleaner proceeds.
+    while (!window_entered.load(std::memory_order_acquire)) { /* spin */ }
+
+    // With the fix: blocks until holder clears active_ptr (after the write).
+    // Without the fix: returns immediately because count==0.
+    RefCountGuard::waitForAllRefCountsToClear();
+
+    // Free the resource.  With the fix the holder has already finished writing.
+    // Without the fix the holder is still asleep and will write to freed memory.
+    delete[] resource;
+    resource = nullptr;
+
+    holder.join();
+}
+
+// waitForRefCountToClear(ptr) must not return while the target ptr is in the activation window.
+TEST_F(RefCountGuardActivationWindowTest, WaitForSpecificBlocksDuringActivationWindow) {
+    auto& slot = RefCountGuard::refcount_slots[TEST_SLOT];
+
+    char* resource = new char[64];
+    std::memset(resource, 0xAB, 64);
+
+    volatile char* stable = resource;
+
+    std::atomic<bool> window_entered{false};
+
+    std::thread holder([&] {
+        __atomic_store_n(&slot.active_ptr, resource, __ATOMIC_RELEASE);
+        window_entered.store(true, std::memory_order_release);
+
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+
+        __atomic_fetch_add(&slot.count, 1u, __ATOMIC_RELEASE);
+
+        *stable = 0xCD;
+
+        __atomic_fetch_sub(&slot.count, 1u, __ATOMIC_RELEASE);
+        __atomic_store_n(&slot.active_ptr, nullptr, __ATOMIC_RELEASE);
+    });
+
+    while (!window_entered.load(std::memory_order_acquire)) { /* spin */ }
+
+    RefCountGuard::waitForRefCountToClear(resource);
+
+    delete[] resource;
+    resource = nullptr;
+
+    holder.join();
+}
diff --git a/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp b/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp
index a0dc97934..545ee172f 100644
--- a/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp
+++ b/ddprof-lib/src/test/cpp/thread_teardown_safety_ut.cpp
@@ -114,7 +114,7 @@ static void *t02_body(void *) {
   g_t02_seen.store(kNotYetRun, std::memory_order_relaxed);
 
   // Simulate the race window: TLS cleared but object not yet freed.
-  ProfiledThread::clearCurrentThreadTLS();
+  ProfiledThread *detached = ProfiledThread::clearCurrentThreadTLS();
 
   // Signal delivered in the race window must see null, not a dangling pointer.
   pthread_kill(pthread_self(), SIGVTALRM);
@@ -123,6 +123,9 @@ static void *t02_body(void *) {
 
   // release() with TLS already null must not double-free.
   ProfiledThread::release();
+  // Complete the simulated teardown: delete the object (mirrors what freeKey
+  // would do). Destructor is private so we need the test helper.
+  ProfiledThread::deleteForTest(detached);
   return nullptr;
 }
 
diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java
index 3648a7c5f..e3c4dfd1f 100644
--- a/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java
+++ b/ddprof-test/src/test/java/com/datadoghq/profiler/cpu/VtableReceiverFrameTest.java
@@ -56,7 +56,7 @@ private int profiledWork(Shape... shapes) {
     // is dropped, the receiver class name will not appear next to a vtable stub in JFR.
     @RetryingTest(5)
     public void testVtableReceiverFrameInCpuSamples() throws Exception {
-        Assumptions.assumeFalse(Platform.isZing() || Platform.isJ9());
+        Assumptions.assumeFalse(Platform.isZing() || Platform.isJ9() || Platform.isGraal());
         waitForProfilerReady(2000);
         int result = profiledWork(new Circle(), new Square(), new Triangle());
         System.err.println(result);
diff --git a/gradle/sanitizers/tsan.supp b/gradle/sanitizers/tsan.supp
index cf1012df0..91b8999af 100644
--- a/gradle/sanitizers/tsan.supp
+++ b/gradle/sanitizers/tsan.supp
@@ -8,6 +8,10 @@ race:libj9*.so
 race:libjli.so
 # Suppress data races in libz.so
 race:libz.so
+# Suppress races in JDK NIO/IO/java.so: file-descriptor management in the JVM
+# is not TSan-clean (concurrent close vs read on the same fd).
+race:libnio.so
+race:libjava.so
 
 
 # # Suppress data race in G1Allocator::survivor_attempt_allocation
diff --git a/utils/run-docker-tests.sh b/utils/run-docker-tests.sh
index a29f95192..bfddc95c4 100755
--- a/utils/run-docker-tests.sh
+++ b/utils/run-docker-tests.sh
@@ -315,6 +315,25 @@ EOF
         else
             CLANG_RT_PKG=""
         fi
+        # On aarch64, install clang-17 + libclang-rt-17-dev from the LLVM apt repository.
+        # GCC 11's libtsan only knows 39-bit VMA; the linuxkit kernel (Docker Desktop)
+        # uses 48-bit VMA, so GCC 11's TSan crashes with "unexpected memory mapping".
+        # Clang-17's compiler-rt TSan runtime handles both 39-bit and 48-bit VMA.
+        # libclang-rt-17-dev provides libclang_rt.tsan-aarch64.{a,so} that clang-17
+        # needs when linking executables with -fsanitize=thread.
+        if [[ "$ARCH" == "aarch64" ]]; then
+            CLANG17_BLOCK='
+# Install clang-17 + compiler-rt for TSan 48-bit VMA support (aarch64 only)
+RUN wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key \
+        | gpg --dearmor -o /etc/apt/trusted.gpg.d/llvm.gpg \
+    && echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main" \
+        > /etc/apt/sources.list.d/llvm.list \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends clang-17 libclang-rt-17-dev \
+    && rm -rf /var/lib/apt/lists/*'
+        else
+            CLANG17_BLOCK=''
+        fi
         cat > "$DOCKERFILE_DIR/Dockerfile.base" <<EOF
 FROM ubuntu:22.04
 
@@ -324,14 +343,16 @@ ENV DEBIAN_FRONTEND=noninteractive
 # Install build dependencies
 # - libasan/libtsan for GCC sanitizers
 # - libclang-rt-dev for clang sanitizers and libFuzzer (x64 only)
+# - gnupg for LLVM apt key import
 # - openssh-client for git clone over SSH
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
-        curl wget bash make g++ clang git jq cmake \
+        curl wget gnupg bash make g++ clang git jq cmake \
         libgtest-dev libgmock-dev tar binutils libc6-dbg \
         ca-certificates linux-libc-dev \
         libasan6 libtsan0 ${CLANG_RT_PKG} openssh-client build-essential gdb && \
     rm -rf /var/lib/apt/lists/*
+${CLANG17_BLOCK}
 
 # Set up Gradle cache directory
 ENV GRADLE_USER_HOME=/gradle-cache
@@ -456,13 +477,32 @@ fi
 if ! $GTEST_ENABLED; then
     GRADLE_CMD="$GRADLE_CMD -Pskip-gtest"
 fi
+# On aarch64 glibc, TSan needs clang-17's embedded runtime (supports 48-bit VMA).
+# GCC 11's libtsan is linked by default but only knows 39-bit VMA, causing a crash.
+if [[ "$CONFIG" == "tsan" ]] && [[ "$ARCH" == "aarch64" ]] && [[ "$LIBC" == "glibc" ]]; then
+    GRADLE_CMD="$GRADLE_CMD -Pnative.forceCompiler=clang++-17"
+fi
 GRADLE_CMD="$GRADLE_CMD --no-daemon --parallel --build-cache --no-watch-fs"
 
+# On aarch64 glibc TSan: reduce ASLR entropy so TSan's shadow doesn't conflict with
+# initial library load addresses. Requires --privileged for sysctl in the container.
+# Do NOT set kernel.randomize_va_space=0: ld-linux-aarch64.so loads at 0x2000000000
+# (TSan's shadow start) with full ASLR off.
+SYSCTL_PREP=""
+NEEDS_PRIVILEGED=false
+if [[ "$CONFIG" == "tsan" ]] && [[ "$ARCH" == "aarch64" ]] && [[ "$LIBC" == "glibc" ]]; then
+    SYSCTL_PREP="sysctl -w vm.mmap_rnd_bits=28 2>/dev/null || true && "
+    NEEDS_PRIVILEGED=true
+fi
+
 # Build Docker run command base
 DOCKER_CMD="docker run --rm"
 if $SHELL_MODE; then
     DOCKER_CMD="$DOCKER_CMD -it --init --ulimit core=-1 --cap-add=SYS_PTRACE"
 fi
+if $NEEDS_PRIVILEGED; then
+    DOCKER_CMD="$DOCKER_CMD --privileged"
+fi
 DOCKER_CMD="$DOCKER_CMD $DOCKER_PLATFORM"
 DOCKER_CMD="$DOCKER_CMD -e LIBC=$LIBC"
 DOCKER_CMD="$DOCKER_CMD -e SANITIZER=$CONFIG"
@@ -477,7 +517,7 @@ if $MOUNT_MODE; then
     if $SHELL_MODE; then
         CONTAINER_CMD="/bin/bash"
     else
-        CONTAINER_CMD="$GRADLE_CMD"
+        CONTAINER_CMD="${SYSCTL_PREP}${GRADLE_CMD}"
     fi
 
     echo ""
@@ -494,7 +534,7 @@ else
     if $SHELL_MODE; then
         CLONE_CMD="git clone --depth 1 file:///source /workspace && cd /workspace && /bin/bash"
     else
-        CLONE_CMD="git clone --depth 1 file:///source /workspace && cd /workspace && $GRADLE_CMD"
+        CLONE_CMD="git clone --depth 1 file:///source /workspace && cd /workspace && ${SYSCTL_PREP}${GRADLE_CMD}"
     fi
 
     echo ""