diff --git a/ddprof-lib/src/main/cpp/ctimer.h b/ddprof-lib/src/main/cpp/ctimer.h index c99d2cd0e..e0d57c2cc 100644 --- a/ddprof-lib/src/main/cpp/ctimer.h +++ b/ddprof-lib/src/main/cpp/ctimer.h @@ -28,6 +28,10 @@ class CTimer : public Engine { protected: // This is accessed from signal handlers, so must be async-signal-safe static bool _enabled; + // Count of signal handlers currently executing past the _enabled check. + // stop() drains this to zero before tearing down JFR structures, closing + // the TOCTOU window between the _enabled check and JFR buffer access. + static int _inflight; static long _interval; static CStack _cstack; static int _signal; @@ -57,6 +61,14 @@ class CTimer : public Engine { __atomic_store_n(&_enabled, enabled, __ATOMIC_RELEASE); } + // Spin until all signal handlers that passed the _enabled=true check have + // returned. Must be called with _enabled already false (after disableEngines()), + // before any JFR teardown that handlers could race against. + // Bounded by DRAIN_TIMEOUT_NS; logs a warning and proceeds if the timeout + // fires (avoids a hang if a handler is stuck, at the cost of the theoretical + // race in that pathological case). + static void drainInflight(); + // Get the signal number used by CTimer (0 if not initialized) static int getSignal() { return _signal; } }; diff --git a/ddprof-lib/src/main/cpp/ctimer_linux.cpp b/ddprof-lib/src/main/cpp/ctimer_linux.cpp index e0d2b8b0c..f17183e9d 100644 --- a/ddprof-lib/src/main/cpp/ctimer_linux.cpp +++ b/ddprof-lib/src/main/cpp/ctimer_linux.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #ifndef SIGEV_THREAD_ID @@ -49,6 +50,7 @@ int CTimer::_max_timers = 0; int *CTimer::_timers = NULL; CStack CTimer::_cstack; bool CTimer::_enabled = false; +int CTimer::_inflight = 0; int CTimer::_signal; int CTimer::registerThread(int tid) { @@ -176,10 +178,48 @@ Error CTimer::start(Arguments &args) { return Error::OK; } +// 200 ms: long enough for any legitimate recordSample() call to finish, +// short enough to avoid a perceptible hang if a handler is somehow stuck. +static const long DRAIN_TIMEOUT_NS = 200000000L; + +void CTimer::drainInflight() { + if (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) == 0) { + return; // fast path: nothing in flight + } + + struct timespec deadline; + clock_gettime(CLOCK_MONOTONIC, &deadline); + deadline.tv_nsec += DRAIN_TIMEOUT_NS; + if (deadline.tv_nsec >= 1000000000L) { + deadline.tv_sec += 1; + deadline.tv_nsec -= 1000000000L; + } + + while (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) > 0) { + struct timespec now; + clock_gettime(CLOCK_MONOTONIC, &now); + if (now.tv_sec > deadline.tv_sec || + (now.tv_sec == deadline.tv_sec && now.tv_nsec >= deadline.tv_nsec)) { + Log::warn("CTimer::drainInflight: timed out after %ldms waiting for " + "%d in-flight SIGPROF handler(s); proceeding with JFR teardown. " + "This may indicate a stuck signal handler.", + DRAIN_TIMEOUT_NS / 1000000L, + __atomic_load_n(&_inflight, __ATOMIC_ACQUIRE)); + break; + } + sched_yield(); + } +} + void CTimer::stop() { for (int i = 0; i < _max_timers; i++) { unregisterThread(i); } + // _enabled is already false (disableEngines() is called before stop()). + // Wait for any handler that passed the _enabled=true check to finish before + // returning; the caller (Profiler::stop) will proceed to _jfr.stop() which + // frees JFR structures those handlers may still be accessing. + drainInflight(); } Error CTimerJvmti::check(Arguments &args) { @@ -210,12 +250,15 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { } Counters::increment(CTIMER_SIGNAL_OWN); + __atomic_fetch_add(&_inflight, 1, __ATOMIC_ACQUIRE); CriticalSection cs; if (!cs.entered()) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); return; } int saved_errno = errno; if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); errno = saved_errno; return; } @@ -245,6 +288,7 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { Profiler::instance()->recordSampleDelegated(ucontext, _interval, tid, BCI_CPU, &event); Shims::instance().setSighandlerTid(-1); + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); errno = saved_errno; } @@ -261,17 +305,24 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { } Counters::increment(CTIMER_SIGNAL_OWN); + // Increment before the _enabled check so drainInflight() reliably waits + // for all handlers currently past the check. + __atomic_fetch_add(&_inflight, 1, __ATOMIC_ACQUIRE); + // Atomically try to enter critical section - prevents all reentrancy races CriticalSection cs; if (!cs.entered()) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); return; // Another critical section is active, defer profiling } // Save the current errno value int saved_errno = errno; // we want to ensure memory order because of the possibility the instance gets // cleared - if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) + if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) { + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); return; + } int tid = 0; ProfiledThread *current = ProfiledThread::currentSignalSafe(); assert(current == nullptr || !current->isDeepCrashHandler()); @@ -282,6 +333,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { if (current != nullptr && JVMThread::isInitialized() && JVMThread::current() == nullptr && current->inInitWindow()) { current->tickInitWindow(); + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); errno = saved_errno; return; } @@ -298,6 +350,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { Profiler::instance()->recordSample(ucontext, _interval, tid, BCI_CPU, 0, &event); Shims::instance().setSighandlerTid(-1); + __atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE); // we need to avoid spoiling the value of errno (tsan report) errno = saved_errno; } diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index def717294..f14abca91 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -1381,8 +1381,6 @@ Error Profiler::start(Arguments &args, bool reset) { _libs->updateBuildIds(); } - enableEngines(); - // Refresher must be running before the trap fires: dlopen_hook's // signal-context branch only marks dirty and relies on the refresher // to call refresh() within REFRESH_INTERVAL_NS (500 ms). @@ -1396,7 +1394,6 @@ Error Profiler::start(Arguments &args, bool reset) { _num_context_attributes = args._context_attributes.size(); error = _jfr.start(args, reset); if (error) { - disableEngines(); switchLibraryTrap(false); _libs->stopRefresher(); return error; @@ -1471,6 +1468,15 @@ Error Profiler::start(Arguments &args, bool reset) { // TODO: find a better way to resolve the thread name. onThreadStart(nullptr, nullptr, nullptr); + // enableEngines() (_enabled=true) is intentionally deferred until here, + // after _jfr.start() and all engine starts. Previously it was called + // before _jfr.start(), creating a TOCTOU race: a SIGPROF delivered + // between enableEngines() and _jfr.start() completing would enter + // recordSample() on partially-initialized JFR structures. + // Paired with drainInflight() in CTimer::stop() which closes the + // symmetric race on the stop side. + enableEngines(); + _state.store(RUNNING, std::memory_order_release); _start_time = time(NULL); __atomic_add_fetch(&_epoch, 1, __ATOMIC_RELAXED);