From 0ed3fb5629f377d008ffb097a0b72bd67a692814 Mon Sep 17 00:00:00 2001 From: "erwan.viollet" Date: Tue, 23 Jun 2026 15:25:22 +0200 Subject: [PATCH 1/2] fix(profiler): close two TOCTOU races between SIGPROF handler and JFR lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU profiler sends SIGPROF to all threads via per-thread kernel timers. The signal handler checks _enabled and, if true, calls recordSample() which accesses JFR buffers. Two races existed around the recording cycle transition (default every 60 s) where JFR structures could be in mid-init or mid-teardown while the handler was active: Race 1 — stop() side (TOCTOU on _enabled vs _jfr.stop()): A handler that passed the _enabled=true check could still be executing inside recordSample() when disableEngines() set _enabled=false and _jfr.stop() freed JFR buffers — use-after-free → SIGSEGV. Fix: add an _inflight counter (incremented on handler entry, decremented on all exits). CTimer::stop() calls drainInflight() after deleting per- thread timers, spinning until _inflight==0 before returning to the caller that proceeds to _jfr.stop(). Any handler that fires after disableEngines() sees _enabled=false and returns early without touching JFR. Race 2 — start() side (enableEngines() before _jfr.start()): enableEngines() set _enabled=true before _jfr.start() had completed. A SIGPROF in that window would see _enabled=true and call recordSample() on partially-initialized JFR structures. Fix: move enableEngines() to after _jfr.start() and _cpu_engine->start() have both returned successfully (just before _state.store(RUNNING)). Validated empirically: a controlled reproducer in DataDog/profiling-backend (AnalysisEndpointTest.testResourceExhausted with a 60 s recording period) showed ~60% failure rate without the fix (SIGSEGV / hang), 0% with both fixes applied (12/12 iterations clean). Each fix alone only partially addressed the failures, confirming both races were independently active. Co-Authored-By: Claude Sonnet 4.5 --- ddprof-lib/src/main/cpp/ctimer.h | 13 +++++++++++++ ddprof-lib/src/main/cpp/ctimer_linux.cpp | 20 +++++++++++++++++++- ddprof-lib/src/main/cpp/profiler.cpp | 12 +++++++++--- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/ddprof-lib/src/main/cpp/ctimer.h b/ddprof-lib/src/main/cpp/ctimer.h index c99d2cd0e..ce2c6b96f 100644 --- a/ddprof-lib/src/main/cpp/ctimer.h +++ b/ddprof-lib/src/main/cpp/ctimer.h @@ -28,6 +28,10 @@ class CTimer : public Engine { protected: // This is accessed from signal handlers, so must be async-signal-safe static bool _enabled; + // Count of signal handlers currently executing past the _enabled check. + // stop() drains this to zero before tearing down JFR structures, closing + // the TOCTOU window between the _enabled check and JFR buffer access. + static int _inflight; static long _interval; static CStack _cstack; static int _signal; @@ -57,6 +61,15 @@ class CTimer : public Engine { __atomic_store_n(&_enabled, enabled, __ATOMIC_RELEASE); } + // Spin until all signal handlers that passed the _enabled=true check have + // returned. Must be called with _enabled already false (after disableEngines()), + // before any JFR teardown that handlers could race against. + static void drainInflight() { + while (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) > 0) { + sched_yield(); + } + } + // Get the signal number used by CTimer (0 if not initialized) static int getSignal() { return _signal; } }; diff --git a/ddprof-lib/src/main/cpp/ctimer_linux.cpp b/ddprof-lib/src/main/cpp/ctimer_linux.cpp index e0d2b8b0c..1d487d896 100644 --- a/ddprof-lib/src/main/cpp/ctimer_linux.cpp +++ b/ddprof-lib/src/main/cpp/ctimer_linux.cpp @@ -49,6 +49,7 @@ int CTimer::_max_timers = 0; int *CTimer::_timers = NULL; CStack CTimer::_cstack; bool CTimer::_enabled = false; +int CTimer::_inflight = 0; int CTimer::_signal; int CTimer::registerThread(int tid) { @@ -180,6 +181,10 @@ void CTimer::stop() { for (int i = 0; i < _max_timers; i++) { unregisterThread(i); } + // _enabled is already false (disableEngines() is called before stop()). + // Spin until all handlers that passed the _enabled=true check have returned, + // so it is safe to tear down JFR structures on return. + drainInflight(); } Error CTimerJvmti::check(Arguments &args) { @@ -210,12 +215,15 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { } Counters::increment(CTIMER_SIGNAL_OWN); + __atomic_fetch_add(&_inflight, 1, __ATOMIC_ACQUIRE); CriticalSection cs; if (!cs.entered()) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); return; } int saved_errno = errno; if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); errno = saved_errno; return; } @@ -245,6 +253,7 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { Profiler::instance()->recordSampleDelegated(ucontext, _interval, tid, BCI_CPU, &event); Shims::instance().setSighandlerTid(-1); + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); errno = saved_errno; } @@ -261,17 +270,24 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { } Counters::increment(CTIMER_SIGNAL_OWN); + // Increment before the _enabled check so drainInflight() reliably waits + // for all handlers currently past the check. + __atomic_fetch_add(&_inflight, 1, __ATOMIC_ACQUIRE); + // Atomically try to enter critical section - prevents all reentrancy races CriticalSection cs; if (!cs.entered()) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); return; // Another critical section is active, defer profiling } // Save the current errno value int saved_errno = errno; // we want to ensure memory order because of the possibility the instance gets // cleared - if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) + if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); return; + } int tid = 0; ProfiledThread *current = ProfiledThread::currentSignalSafe(); assert(current == nullptr || !current->isDeepCrashHandler()); @@ -282,6 +298,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { if (current != nullptr && JVMThread::isInitialized() && JVMThread::current() == nullptr && current->inInitWindow()) { current->tickInitWindow(); + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); errno = saved_errno; return; } @@ -298,6 +315,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { Profiler::instance()->recordSample(ucontext, _interval, tid, BCI_CPU, 0, &event); Shims::instance().setSighandlerTid(-1); + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); // we need to avoid spoiling the value of errno (tsan report) errno = saved_errno; } diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index def717294..f14abca91 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -1381,8 +1381,6 @@ Error Profiler::start(Arguments &args, bool reset) { _libs->updateBuildIds(); } - enableEngines(); - // Refresher must be running before the trap fires: dlopen_hook's // signal-context branch only marks dirty and relies on the refresher // to call refresh() within REFRESH_INTERVAL_NS (500 ms). @@ -1396,7 +1394,6 @@ Error Profiler::start(Arguments &args, bool reset) { _num_context_attributes = args._context_attributes.size(); error = _jfr.start(args, reset); if (error) { - disableEngines(); switchLibraryTrap(false); _libs->stopRefresher(); return error; @@ -1471,6 +1468,15 @@ Error Profiler::start(Arguments &args, bool reset) { // TODO: find a better way to resolve the thread name. onThreadStart(nullptr, nullptr, nullptr); + // enableEngines() (_enabled=true) is intentionally deferred until here, + // after _jfr.start() and all engine starts. Previously it was called + // before _jfr.start(), creating a TOCTOU race: a SIGPROF delivered + // between enableEngines() and _jfr.start() completing would enter + // recordSample() on partially-initialized JFR structures. + // Paired with drainInflight() in CTimer::stop() which closes the + // symmetric race on the stop side. + enableEngines(); + _state.store(RUNNING, std::memory_order_release); _start_time = time(NULL); __atomic_add_fetch(&_epoch, 1, __ATOMIC_RELAXED); From 81184abc5c4fb22748b1ba1058be3df992e386de Mon Sep 17 00:00:00 2001 From: "erwan.viollet" Date: Wed, 24 Jun 2026 10:12:13 +0200 Subject: [PATCH 2/2] fix(profiler): bound drainInflight() with 200ms timeout to preserve liveness Without a timeout, drainInflight() spins indefinitely if a SIGPROF handler is stuck (e.g. deadlock inside recordSample()). This would prevent _jfr.stop() from running and hang JVM shutdown. Use clock_gettime(CLOCK_MONOTONIC) for a real wall-clock bound of 200ms. If the deadline fires, log a warning and proceed; in that pathological case the caller's JFR teardown may race with the stuck handler, but the JVM is not permanently deadlocked. In normal operation (handlers complete in microseconds) the timeout is never reached. Refs: PROF-15201 Co-Authored-By: Claude Sonnet 4.5 --- ddprof-lib/src/main/cpp/ctimer.h | 9 +++--- ddprof-lib/src/main/cpp/ctimer_linux.cpp | 39 ++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/ddprof-lib/src/main/cpp/ctimer.h b/ddprof-lib/src/main/cpp/ctimer.h index ce2c6b96f..e0d57c2cc 100644 --- a/ddprof-lib/src/main/cpp/ctimer.h +++ b/ddprof-lib/src/main/cpp/ctimer.h @@ -64,11 +64,10 @@ class CTimer : public Engine { // Spin until all signal handlers that passed the _enabled=true check have // returned. Must be called with _enabled already false (after disableEngines()), // before any JFR teardown that handlers could race against. - static void drainInflight() { - while (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) > 0) { - sched_yield(); - } - } + // Bounded by DRAIN_TIMEOUT_NS; logs a warning and proceeds if the timeout + // fires (avoids a hang if a handler is stuck, at the cost of the theoretical + // race in that pathological case). + static void drainInflight(); // Get the signal number used by CTimer (0 if not initialized) static int getSignal() { return _signal; } diff --git a/ddprof-lib/src/main/cpp/ctimer_linux.cpp b/ddprof-lib/src/main/cpp/ctimer_linux.cpp index 1d487d896..f17183e9d 100644 --- a/ddprof-lib/src/main/cpp/ctimer_linux.cpp +++ b/ddprof-lib/src/main/cpp/ctimer_linux.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #ifndef SIGEV_THREAD_ID @@ -177,13 +178,47 @@ Error CTimer::start(Arguments &args) { return Error::OK; } +// 200 ms: long enough for any legitimate recordSample() call to finish, +// short enough to avoid a perceptible hang if a handler is somehow stuck. +static const long DRAIN_TIMEOUT_NS = 200000000L; + +void CTimer::drainInflight() { + if (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) == 0) { + return; // fast path: nothing in flight + } + + struct timespec deadline; + clock_gettime(CLOCK_MONOTONIC, &deadline); + deadline.tv_nsec += DRAIN_TIMEOUT_NS; + if (deadline.tv_nsec >= 1000000000L) { + deadline.tv_sec += 1; + deadline.tv_nsec -= 1000000000L; + } + + while (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) > 0) { + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + if (now.tv_sec > deadline.tv_sec || + (now.tv_sec == deadline.tv_sec && now.tv_nsec >= deadline.tv_nsec)) { + Log::warn("CTimer::drainInflight: timed out after %ldms waiting for " + "%d in-flight SIGPROF handler(s); proceeding with JFR teardown. " + "This may indicate a stuck signal handler.", + DRAIN_TIMEOUT_NS / 1000000L, + __atomic_load_n(&_inflight, __ATOMIC_ACQUIRE)); + break; + } + sched_yield(); + } +} + void CTimer::stop() { for (int i = 0; i < _max_timers; i++) { unregisterThread(i); } // _enabled is already false (disableEngines() is called before stop()). - // Spin until all handlers that passed the _enabled=true check have returned, - // so it is safe to tear down JFR structures on return. + // Wait for any handler that passed the _enabled=true check to finish before + // returning; the caller (Profiler::stop) will proceed to _jfr.stop() which + // frees JFR structures those handlers may still be accessing. drainInflight(); }