Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions .gitlab/sanitizer-tests/.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@

gtest-asan-amd64:
extends: .sanitizer_job
allow_failure: true
tags: [ "arch:amd64" ]
image: $BUILD_IMAGE_X64
variables:
Expand All @@ -56,12 +55,10 @@ gtest-asan-amd64:

gtest-tsan-amd64:
extends: .sanitizer_job
allow_failure: true
# docker-in-docker:amd64 = Kata Containers (kata-qemu micro VMs).
# Kata maps host-guest communication structures at fixed high addresses
# that land in TSan's shadow region regardless of LLVM version or sysctl.
# TSan on amd64 requires a non-Kata runner (EC2 or bare metal).
# Kept allow_failure so it runs and provides coverage if the environment is fixed.
tags: [ "docker-in-docker:amd64" ]
image: $BUILD_IMAGE_X64
variables:
Expand All @@ -75,12 +72,14 @@ gtest-tsan-amd64:
echo "=== $(basename $binary) ==="
GTEST_DEATH_TEST_STYLE=threadsafe "$binary"
rc=$?
[ $rc -ne 0 ] && { echo "FAILED: $(basename $binary) exited $rc"; exit $rc; }
if [ $rc -ne 0 ]; then
echo "FAILED: $(basename $binary) exited $rc"
exit $rc
fi
done

gtest-asan-arm64:
extends: .sanitizer_job
allow_failure: true
tags: [ "arch:arm64" ]
image: $BUILD_IMAGE_ARM64
variables:
Expand All @@ -89,7 +88,6 @@ gtest-asan-arm64:

gtest-tsan-arm64:
extends: .sanitizer_job
allow_failure: true
# docker-in-docker:arm64 = EC2 VM. sysctl works directly.
# vm.mmap_rnd_bits=28 is sufficient — TSan's LLVM re-exec handles the rare
# case where a library lands in the shadow region by re-running the process
Expand All @@ -111,5 +109,8 @@ gtest-tsan-arm64:
echo "=== $(basename $binary) ==="
GTEST_DEATH_TEST_STYLE=threadsafe "$binary"
rc=$?
[ $rc -ne 0 ] && { echo "FAILED: $(basename $binary) exited $rc"; exit $rc; }
if [ $rc -ne 0 ]; then
echo "FAILED: $(basename $binary) exited $rc"
exit $rc
fi
done
Original file line number Diff line number Diff line change
Expand Up @@ -250,24 +250,38 @@ object ConfigurationPresets {
config.compilerArgs.set(tsanCompilerArgs + commonLinuxCompilerArgs(version))

val libtsan = PlatformUtils.locateLibtsan(compiler)
// Use the library name from the resolved path so that clang's own
// libclang_rt.tsan-<arch>.so is linked by name (not as -ltsan).
val tsanLinkerArgs = if (libtsan != null) {
val tsanLibDir = File(libtsan).parent
val tsanLibName = File(libtsan).nameWithoutExtension.removePrefix("lib")
listOf(
"-L${File(libtsan).parent}",
"-ltsan",
"-L$tsanLibDir",
"-l$tsanLibName",
"-Wl,-rpath,$tsanLibDir",
"-fsanitize=thread",
"-fno-omit-frame-pointer"
)
} else {
emptyList()
listOf("-fsanitize=thread", "-fno-omit-frame-pointer")
}

config.linkerArgs.set(commonLinuxLinkerArgs() + tsanLinkerArgs)

if (libtsan != null) {
config.testEnvironment.apply {
config.testEnvironment.apply {
if (libtsan != null) {
put("LD_PRELOAD", libtsan)
put("TSAN_OPTIONS", "suppressions=$rootDir/gradle/sanitizers/tsan.supp")
// handle_segv=0 / handle_sigbus=0: let the JVM handle these signals
// (SafeFetch, NullPointerException, memory bus errors).
// use_sigaltstack=0: JVM manages its own alternate signal stack.
// halt_on_error=0: report all races; process exits with code 66 at end.
// abort_on_error=0: use exit() not abort() so Java shutdown hooks run.
// io_sync=0: disable TSan's own FD-tracking, which races internally
// when the JVM concurrently closes/reads file descriptors.
put("TSAN_OPTIONS", "handle_segv=0:handle_sigbus=0:use_sigaltstack=0:halt_on_error=0:abort_on_error=0:io_sync=0:suppressions=$rootDir/gradle/sanitizers/tsan.supp")
}
// fork() is unsupported under TSan; threadsafe style uses execve instead.
put("GTEST_DEATH_TEST_STYLE", "threadsafe")
}
}
Platform.MACOS -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,11 @@ class GtestTaskBuilder(
}

private fun buildLinkTask(compileTask: TaskProvider<NativeCompileTask>): TaskProvider<NativeLinkExecutableTask> {
// For executables, clang's -fsanitize=address statically embeds the full
// ASan runtime (--whole-archive libclang_rt.asan*.a). Adding an explicit
// -lclang_rt.asan or -lasan on top produces a second dynamic NEEDED entry,
// which triggers "incompatible ASan runtimes" at startup (two __asan_init
// calls). Strip the explicit sanitizer -l/-L/-rpath flags here so the
// executable relies solely on clang's automatic static embedding.
val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang.*/)")
// Strip explicit sanitizer -l/-L/-rpath flags so the executable relies solely
// on clang's automatic static embedding of the sanitizer runtime. Mixing
// clang's statically-embedded runtime with an explicit GCC libtsan/libasan
// causes "incompatible runtimes" at startup (two __tsan_init/__asan_init calls).
val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-ltsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-lclang_rt\\.tsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang/.*)")
val linkerArgs = config.linkerArgs.get().filter { !sanitizerLibPattern.containsMatchIn(it) }
val objDir = project.file("${project.layout.buildDirectory.get()}/obj/gtest/${config.name}/$testName")
val binary = project.file("${project.layout.buildDirectory.get()}/bin/gtest/${config.name}_$testName/$testName")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,23 @@ object PlatformUtils {
return locateLibrary("libasan", compiler)
}

fun locateLibtsan(compiler: String = "gcc"): String? = locateLibrary("libtsan", compiler)
fun locateLibtsan(compiler: String = "gcc"): String? {
if (currentPlatform != Platform.LINUX) return null
// For clang, prefer the architecture-specific clang_rt.tsan library over
// GCC's libtsan. GCC 11's libtsan only handles 39-bit VMA on aarch64;
// clang's compiler-rt tsan handles both 39-bit and 48-bit VMA.
if (compiler.contains("clang")) {
val archSuffix = when (currentArchitecture) {
Architecture.X64 -> "x86_64"
Architecture.ARM64 -> "aarch64"
Architecture.X86 -> "i386"
Architecture.ARM -> "arm"
}
val clangTsan = locateLibrary("libclang_rt.tsan-$archSuffix", compiler)
if (clangTsan != null) return clangTsan
}
return locateLibrary("libtsan", compiler)
}

fun checkFuzzerSupport(): Boolean {
return try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,11 @@ class ProfilerTestPlugin : Plugin<Project> {
// https://github.com/eclipse-openj9/openj9/issues/23514
!PlatformUtils.isTestJvmJ9()
}
"tsan" -> testTask.onlyIf { PlatformUtils.locateLibtsan() != null }
// TSan + JVM integration tests are incompatible: the profiler's signal
// handlers (SIGPROF at 1ms) are TSan-instrumented; when a signal fires
// while TSan is updating its shadow memory it causes re-entrance and a
// SIGSEGV. TSan coverage is provided by the C++ gtest suite (gtestTsan).
"tsan" -> testTask.onlyIf { false }
Comment thread
jbachorik marked this conversation as resolved.
Comment thread
jbachorik marked this conversation as resolved.
}
}
}
Expand Down Expand Up @@ -347,7 +351,7 @@ class ProfilerTestPlugin : Plugin<Project> {
// https://github.com/eclipse-openj9/openj9/issues/23514
!PlatformUtils.isTestJvmJ9()
}
"tsan" -> execTask.onlyIf { PlatformUtils.locateLibtsan() != null }
"tsan" -> execTask.onlyIf { false } // same reason as testTask above
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion ddprof-lib/src/main/cpp/callTraceHashTable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,10 @@ u64 CallTraceHashTable::put(int num_frames, ASGCT_CallFrame *frames,
bool truncated, u64 weight) {
u64 hash = calcHash(num_frames, frames, truncated);

LongHashTable *table = _table;
// ACQUIRE pairs with the ACQ_REL CAS in the expansion path below, ensuring
// that if another thread published a new (expanded) table we see its fully
// initialised contents.
LongHashTable *table = __atomic_load_n(&_table, __ATOMIC_ACQUIRE);
if (table == nullptr) {
// Table allocation failed or was cleared - drop sample
Counters::increment(CALLTRACE_STORAGE_DROPPED);
Expand Down
22 changes: 19 additions & 3 deletions ddprof-lib/src/main/cpp/refCountGuard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,14 @@ void RefCountGuard::waitForRefCountToClear(void* table_to_delete) {
bool all_clear = true;
for (int i = 0; i < MAX_THREADS; ++i) {
uint32_t count = __atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE);
if (count == 0) continue;
if (count == 0) {
// Check active_ptr to cover the non-reentrant constructor's activation window:
// active_ptr is stored (RELEASE) before count++ (RELEASE), so count==0 with
// active_ptr set means the thread is in the window and must be waited for.
void* aptr = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE);
if (aptr == table_to_delete) { all_clear = false; break; }
continue;
}
if (slotReferences(refcount_slots[i], table_to_delete)) { all_clear = false; break; }
}
if (all_clear) return;
Expand All @@ -183,7 +190,11 @@ void RefCountGuard::waitForRefCountToClear(void* table_to_delete) {
bool all_clear = true;
for (int i = 0; i < MAX_THREADS; ++i) {
uint32_t count = __atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE);
if (count == 0) continue;
if (count == 0) {
void* aptr = __atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE);
if (aptr == table_to_delete) { all_clear = false; break; }
continue;
}
if (slotReferences(refcount_slots[i], table_to_delete)) { all_clear = false; break; }
}
if (all_clear) return;
Expand All @@ -208,18 +219,23 @@ void RefCountGuard::waitForAllRefCountsToClear() {
bool any = false;
for (int i = 0; i < MAX_THREADS; ++i) {
if (__atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE) > 0) { any = true; break; }
// Also check active_ptr: non-reentrant constructor stores it (RELEASE) before
// count++ (RELEASE), so count==0 with active_ptr!=null means the thread is in
// the activation window and must be waited for.
if (__atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE) != nullptr) { any = true; break; }
}
if (!any) return;
spinPause();
}

const int MAX_WAIT_ITERATIONS = 5000;
struct timespec sleep_time = {0, 100000};
int last_nonzero_slot = -1; // for timeout diagnostic below
int last_nonzero_slot = -1;
for (int wait_count = 0; wait_count < MAX_WAIT_ITERATIONS; ++wait_count) {
bool any = false;
for (int i = 0; i < MAX_THREADS; ++i) {
if (__atomic_load_n(&refcount_slots[i].count, __ATOMIC_ACQUIRE) > 0) { any = true; last_nonzero_slot = i; break; }
if (__atomic_load_n(&refcount_slots[i].active_ptr, __ATOMIC_ACQUIRE) != nullptr) { any = true; last_nonzero_slot = i; break; }
}
if (!any) return;
nanosleep(&sleep_time, nullptr);
Expand Down
21 changes: 10 additions & 11 deletions ddprof-lib/src/main/cpp/refCountGuard.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,16 @@
* Each slot occupies a full cache line (64 bytes) to eliminate false sharing.
*
* ACTIVATION PROTOCOL (pointer-first):
* - Constructor: store active_ptr first, then increment count
* - Destructor: decrement count first, then clear active_ptr
* - Scanner: check count; if 0, skip slot (treats it as inactive)
* - Constructor: store active_ptr (RELEASE) first, then increment count (RELEASE)
* - Destructor: decrement count (RELEASE) first, then clear active_ptr (RELEASE)
* - Scanner: load count (ACQUIRE); if 0, also load active_ptr (ACQUIRE);
* treat the slot as active if either count > 0 or active_ptr != null
*
* There is a brief activation window between the store of active_ptr and the
* increment of count where the scanner sees count=0 and may skip the slot.
* For signal handlers this window is never observed in practice: handlers
* complete within microseconds while a buffer can only be cleared after TWO
* full dump cycles (typically 60+ seconds). If the window were hit, the caller
* observes a miss (e.g. 0 or equivalent sentinel) and handles it gracefully
* — a dropped trace or generic vtable frame, not a crash.
* The scanner checks active_ptr even when count==0 to cover the non-reentrant
* constructor's activation window (active_ptr stored but count not yet incremented)
* and the destructor's deactivation window (count decremented but active_ptr not yet
* cleared). The RELEASE/ACQUIRE pairing on active_ptr itself guarantees the scanner
* sees the stored value if it was written before the load in program order.
*
* REENTRANT NESTING: when a signal fires inside an outer guard (same thread),
* the displaced active_ptr is parked in outer_stack[] so the scanner can still
Expand Down Expand Up @@ -64,7 +63,7 @@ struct alignas(DEFAULT_CACHE_LINE_SIZE) RefCountSlot {
* Correctness:
* - Pointer stored BEFORE count increment (activation)
* - Count decremented BEFORE pointer cleared (deactivation)
* - Scanner checks count first, ensuring consistent view
* - Scanner checks both count and active_ptr to close the activation window
*
* Reentrancy:
* - A signal handler may create a RefCountGuard while a JNI thread already
Expand Down
5 changes: 4 additions & 1 deletion ddprof-lib/src/main/cpp/stringDictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,10 @@ class StringDictionaryBuffer {
return stored_id;
}
}
if (!row->next) {
// Relaxed is fine here: the optimization hint may be stale; the CAS
// below will handle that, and the ACQUIRE load of row->next below
// provides the necessary happens-before for the newly-created SBTable's contents.
if (!__atomic_load_n(&row->next, __ATOMIC_RELAXED)) {
SBTable* nt = static_cast<SBTable*>(calloc(1, sizeof(SBTable)));
if (nt == nullptr) return 0;
if (!__sync_bool_compare_and_swap(&row->next, nullptr, nt)) {
Expand Down
9 changes: 8 additions & 1 deletion ddprof-lib/src/main/cpp/thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,18 @@ class ProfiledThread : public ThreadLocalData {
#ifdef UNIT_TEST
// Simulates the moment inside release() after pthread_setspecific(NULL) but
// before delete — the race window the clearCurrentThreadTLS fix covers.
static void clearCurrentThreadTLS() {
// Returns the detached pointer so the caller can delete it after assertions.
static ProfiledThread* clearCurrentThreadTLS() {
if (__atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE)) {
ProfiledThread *pt = (ProfiledThread *)pthread_getspecific(_tls_key);
pthread_setspecific(_tls_key, nullptr);
return pt;
}
return nullptr;
}
// Deletes a ProfiledThread returned by clearCurrentThreadTLS().
// Needed because the destructor is private.
static void deleteForTest(ProfiledThread *pt) { delete pt; }
#endif

static ProfiledThread *current();
Expand Down
Loading
Loading