diff --git a/base/timing.jl b/base/timing.jl index 0860f588d8b3c..63a244229f841 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -68,6 +68,12 @@ function gc_alloc_count(diff::GC_Diff) diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc end +# Retrieve the specified thread's, or the aggregated value across all threads +# of wait time on all the locks for which wait-time measurement has been +# enabled (for RAI, currently only the codegen lock). +lock_waiting_time_ns(tid::Integer) = ccall(:jl_get_thread_lock_waiting_time, UInt64, (Cint,), tid-1) +lock_waiting_time_ns() = ccall(:jl_get_lock_waiting_time, UInt64, ()) + # cumulative total time spent on compilation and recompilation, in nanoseconds function cumulative_compile_time_ns() comp = ccall(:jl_cumulative_compile_time_ns, UInt64, ()) diff --git a/src/julia_threads.h b/src/julia_threads.h index fbfd5ed7d328e..e5c81873de438 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -204,6 +204,8 @@ typedef struct _jl_tls_states_t { uint64_t sleep_leave; ) + _Atomic(uint64_t) lock_waiting_time; + // some hidden state (usually just because we don't have the type's size declaration) #ifdef JL_LIBRARY_EXPORTS uv_mutex_t sleep_lock; diff --git a/src/threading.c b/src/threading.c index 0254ded69a58c..fa152547e7ce2 100644 --- a/src/threading.c +++ b/src/threading.c @@ -382,6 +382,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid) memset(bt_data, 0, sizeof(jl_bt_element_t) * (JL_MAX_BT_SIZE + 1)); ptls->bt_data = bt_data; small_arraylist_new(&ptls->locks, 0); + jl_atomic_store_relaxed(&ptls->lock_waiting_time, 0); jl_init_thread_heap(ptls); uv_mutex_init(&ptls->sleep_lock); @@ -808,6 +809,7 @@ void _jl_mutex_init(jl_mutex_t *lock, const char *name) JL_NOTSAFEPOINT void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint) { + jl_ptls_t ptls = self->ptls; jl_task_t *owner = jl_atomic_load_relaxed(&lock->owner); if (owner == self) { lock->count++; @@ -826,12 +828,15 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint) if (owner == NULL && jl_atomic_cmpswap(&lock->owner, &owner, self)) { lock->count = 1; jl_profile_lock_acquired(lock); - if (lock->record_waiting_time) - self->lock_waiting_time += jl_hrtime() - t0; + if (lock->record_waiting_time) { + uint64_t waiting_time = jl_hrtime() - t0; + self->lock_waiting_time += waiting_time; + ptls->lock_waiting_time += waiting_time; + } return; } if (safepoint) { - jl_gc_safepoint_(self->ptls); + jl_gc_safepoint_(ptls); } if (jl_running_under_rr(0)) { // when running under `rr`, use system mutexes rather than spin locking @@ -927,6 +932,34 @@ void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock) } } +JL_DLLEXPORT uint64_t jl_get_thread_lock_waiting_time(int16_t tid) +{ + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + if (tid < nthreads) { + jl_ptls_t ptls = all_tls_states[tid]; + if (ptls) { + return jl_atomic_load_relaxed(&ptls->lock_waiting_time); + } + } + return 0; +} + +JL_DLLEXPORT uint64_t jl_get_lock_waiting_time(void) +{ + uint64_t waiting_time = 0; + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + int ngcthreads = jl_n_gcthreads; + int nmutatorthreads = nthreads - ngcthreads; + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < nmutatorthreads; i++) { + jl_ptls_t ptls = all_tls_states[i]; + if (ptls) { + waiting_time += jl_atomic_load_relaxed(&ptls->lock_waiting_time); + } + } + return waiting_time; +} // Make gc alignment available for threading // see threads.jl alignment