Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ddprof-lib/src/main/cpp/ctimer.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class CTimer : public Engine {
protected:
// This is accessed from signal handlers, so must be async-signal-safe
static bool _enabled;
// Count of signal handlers currently executing past the _enabled check.
// stop() drains this to zero before tearing down JFR structures, closing
// the TOCTOU window between the _enabled check and JFR buffer access.
static int _inflight;
static long _interval;
static CStack _cstack;
static int _signal;
Expand Down Expand Up @@ -57,6 +61,14 @@ class CTimer : public Engine {
__atomic_store_n(&_enabled, enabled, __ATOMIC_RELEASE);
}

// Spin until all signal handlers that passed the _enabled=true check have
// returned. Must be called with _enabled already false (after disableEngines()),
// before any JFR teardown that handlers could race against.
// Bounded by DRAIN_TIMEOUT_NS; logs a warning and proceeds if the timeout
// fires (avoids a hang if a handler is stuck, at the cost of the theoretical
// race in that pathological case).
static void drainInflight();

// Get the signal number used by CTimer (0 if not initialized)
static int getSignal() { return _signal; }
};
Expand Down
55 changes: 54 additions & 1 deletion ddprof-lib/src/main/cpp/ctimer_linux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <time.h>
#include <unistd.h>

#ifndef SIGEV_THREAD_ID
Expand All @@ -49,6 +50,7 @@ int CTimer::_max_timers = 0;
int *CTimer::_timers = NULL;
CStack CTimer::_cstack;
bool CTimer::_enabled = false;
int CTimer::_inflight = 0;
int CTimer::_signal;

int CTimer::registerThread(int tid) {
Expand Down Expand Up @@ -176,10 +178,48 @@ Error CTimer::start(Arguments &args) {
return Error::OK;
}

// 200 ms: long enough for any legitimate recordSample() call to finish,
// short enough to avoid a perceptible hang if a handler is somehow stuck.
static const long DRAIN_TIMEOUT_NS = 200000000L;

void CTimer::drainInflight() {
if (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) == 0) {
return; // fast path: nothing in flight
}

struct timespec deadline;
clock_gettime(CLOCK_MONOTONIC, &deadline);
deadline.tv_nsec += DRAIN_TIMEOUT_NS;
if (deadline.tv_nsec >= 1000000000L) {
deadline.tv_sec += 1;
deadline.tv_nsec -= 1000000000L;
}

while (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) > 0) {
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
if (now.tv_sec > deadline.tv_sec ||
(now.tv_sec == deadline.tv_sec && now.tv_nsec >= deadline.tv_nsec)) {
Log::warn("CTimer::drainInflight: timed out after %ldms waiting for "
"%d in-flight SIGPROF handler(s); proceeding with JFR teardown. "
"This may indicate a stuck signal handler.",
DRAIN_TIMEOUT_NS / 1000000L,
__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE));
break;
}
sched_yield();
}
}

void CTimer::stop() {
for (int i = 0; i < _max_timers; i++) {
unregisterThread(i);
}
// _enabled is already false (disableEngines() is called before stop()).
// Wait for any handler that passed the _enabled=true check to finish before
// returning; the caller (Profiler::stop) will proceed to _jfr.stop() which
// frees JFR structures those handlers may still be accessing.
drainInflight();
}

Error CTimerJvmti::check(Arguments &args) {
Expand Down Expand Up @@ -210,12 +250,15 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
}
Counters::increment(CTIMER_SIGNAL_OWN);

__atomic_fetch_add(&_inflight, 1, __ATOMIC_ACQUIRE);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Balance JVMTI in-flight counter on init-window return

With jvmtistacks on Linux, this increment is not balanced when the handler takes the current->inInitWindow() early return a few lines below. A single sample delivered during that thread-init window permanently leaves _inflight positive, causing every later CTimer::stop() to wait until the 200 ms timeout and then proceed without knowing whether any real handler is still in flight, which defeats the drain this patch adds.

Useful? React with 👍 / 👎.

CriticalSection cs;
if (!cs.entered()) {
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
return;
}
int saved_errno = errno;
if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) {
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
errno = saved_errno;
return;
}
Expand Down Expand Up @@ -245,6 +288,7 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
Profiler::instance()->recordSampleDelegated(ucontext, _interval, tid,
BCI_CPU, &event);
Shims::instance().setSighandlerTid(-1);
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
errno = saved_errno;
}

Expand All @@ -261,17 +305,24 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
}
Counters::increment(CTIMER_SIGNAL_OWN);

// Increment before the _enabled check so drainInflight() reliably waits
// for all handlers currently past the check.
__atomic_fetch_add(&_inflight, 1, __ATOMIC_ACQUIRE);

// Atomically try to enter critical section - prevents all reentrancy races
CriticalSection cs;
if (!cs.entered()) {
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
return; // Another critical section is active, defer profiling
}
// Save the current errno value
int saved_errno = errno;
// we want to ensure memory order because of the possibility the instance gets
// cleared
if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE))
if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) {
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
return;
}
int tid = 0;
ProfiledThread *current = ProfiledThread::currentSignalSafe();
assert(current == nullptr || !current->isDeepCrashHandler());
Expand All @@ -282,6 +333,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
if (current != nullptr && JVMThread::isInitialized() && JVMThread::current() == nullptr
&& current->inInitWindow()) {
current->tickInitWindow();
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
errno = saved_errno;
return;
}
Expand All @@ -298,6 +350,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
Profiler::instance()->recordSample(ucontext, _interval, tid, BCI_CPU, 0,
&event);
Shims::instance().setSighandlerTid(-1);
__atomic_fetch_sub(&_inflight, 1, __ATOMIC_RELEASE);
// we need to avoid spoiling the value of errno (tsan report)
errno = saved_errno;
}
Expand Down
12 changes: 9 additions & 3 deletions ddprof-lib/src/main/cpp/profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1381,8 +1381,6 @@ Error Profiler::start(Arguments &args, bool reset) {
_libs->updateBuildIds();
}

enableEngines();

// Refresher must be running before the trap fires: dlopen_hook's
// signal-context branch only marks dirty and relies on the refresher
// to call refresh() within REFRESH_INTERVAL_NS (500 ms).
Expand All @@ -1396,7 +1394,6 @@ Error Profiler::start(Arguments &args, bool reset) {
_num_context_attributes = args._context_attributes.size();
error = _jfr.start(args, reset);
if (error) {
disableEngines();
switchLibraryTrap(false);
_libs->stopRefresher();
return error;
Expand Down Expand Up @@ -1471,6 +1468,15 @@ Error Profiler::start(Arguments &args, bool reset) {
// TODO: find a better way to resolve the thread name.
onThreadStart(nullptr, nullptr, nullptr);

// enableEngines() (_enabled=true) is intentionally deferred until here,
// after _jfr.start() and all engine starts. Previously it was called
// before _jfr.start(), creating a TOCTOU race: a SIGPROF delivered
// between enableEngines() and _jfr.start() completing would enter
// recordSample() on partially-initialized JFR structures.
// Paired with drainInflight() in CTimer::stop() which closes the
// symmetric race on the stop side.
enableEngines();

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enable wall engine before launching its worker

When wall profiling is requested, _wall_engine->start() has already created the BaseWallClock pthread before this call runs; that thread enters timerLoopCommon() and returns immediately if _enabled is still false. On a normal first start or restart _enabled is false, so if the wall thread is scheduled before line 1478, wall-clock sampling silently dies for the whole recording instead of being enabled by this later call.

Useful? React with 👍 / 👎.


_state.store(RUNNING, std::memory_order_release);
_start_time = time(NULL);
__atomic_add_fetch(&_epoch, 1, __ATOMIC_RELAXED);
Expand Down
Loading