From 2fc91352f54ee9bfb389b15710ca096154682a25 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Thu, 19 Feb 2026 18:56:51 +0000 Subject: [PATCH 1/4] Add per-thread lock waiting time Alongside per-task lock waiting time. Also add an accessor for a thread's lock waiting time and also for a combined, all-threads lock waiting time. --- src/julia_threads.h | 2 ++ src/threading.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/julia_threads.h b/src/julia_threads.h index fbfd5ed7d328e..e5c81873de438 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -204,6 +204,8 @@ typedef struct _jl_tls_states_t { uint64_t sleep_leave; ) + _Atomic(uint64_t) lock_waiting_time; + // some hidden state (usually just because we don't have the type's size declaration) #ifdef JL_LIBRARY_EXPORTS uv_mutex_t sleep_lock; diff --git a/src/threading.c b/src/threading.c index 0254ded69a58c..23dc6c1d9f30b 100644 --- a/src/threading.c +++ b/src/threading.c @@ -382,6 +382,7 @@ jl_ptls_t jl_init_threadtls(int16_t tid) memset(bt_data, 0, sizeof(jl_bt_element_t) * (JL_MAX_BT_SIZE + 1)); ptls->bt_data = bt_data; small_arraylist_new(&ptls->locks, 0); + jl_atomic_store_relaxed(&ptls->lock_waiting_time, 0); jl_init_thread_heap(ptls); uv_mutex_init(&ptls->sleep_lock); @@ -808,6 +809,7 @@ void _jl_mutex_init(jl_mutex_t *lock, const char *name) JL_NOTSAFEPOINT void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint) { + jl_ptls_t ptls = self->ptls; jl_task_t *owner = jl_atomic_load_relaxed(&lock->owner); if (owner == self) { lock->count++; @@ -826,12 +828,15 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint) if (owner == NULL && jl_atomic_cmpswap(&lock->owner, &owner, self)) { lock->count = 1; jl_profile_lock_acquired(lock); - if (lock->record_waiting_time) - self->lock_waiting_time += jl_hrtime() - t0; + if (lock->record_waiting_time) { + uint64_t waiting_time = jl_hrtime() - t0; + self->lock_waiting_time += waiting_time; + ptls->lock_waiting_time += waiting_time; + } return; } if (safepoint) { - jl_gc_safepoint_(self->ptls); + jl_gc_safepoint_(ptls); } if (jl_running_under_rr(0)) { // when running under `rr`, use system mutexes rather than spin locking @@ -927,6 +932,32 @@ void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock) } } +JL_DLLEXPORT uint64_t jl_get_thread_lock_waiting_time(int64_t tid) +{ + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + if (tid < nthreads) { + jl_ptls_t ptls = all_tls_states[tid]; + if (ptls) { + return jl_atomic_load_relaxed(&ptls->lock_waiting_time); + } + } + return 0; +} + +JL_DLLEXPORT uint64_t jl_get_lock_waiting_time() +{ + uint64_t waiting_time = 0; + int nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < nthreads; i++) {} + jl_ptls_t ptls = all_tls_states[i]; + if (ptls) { + waiting_time += jl_atomic_load_relaxed(&ptls->lock_waiting_time); + } + } + return waiting_time; +} // Make gc alignment available for threading // see threads.jl alignment From 82e40cf0bfd7db65a7bf6a1f23490e9382bcfe56 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Thu, 19 Feb 2026 19:10:43 +0000 Subject: [PATCH 2/4] Add accessors --- base/timing.jl | 3 +++ src/threading.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/base/timing.jl b/base/timing.jl index 0860f588d8b3c..4dd955ca71c1e 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -68,6 +68,9 @@ function gc_alloc_count(diff::GC_Diff) diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc end +lock_waiting_time(tid::Int16) = ccall(:jl_get_thread_lock_waiting_time, UInt64, (Int16,), tid) +lock_waiting_time() = ccall(:jl_get_lock_waiting_time, UInt64, ()) + # cumulative total time spent on compilation and recompilation, in nanoseconds function cumulative_compile_time_ns() comp = ccall(:jl_cumulative_compile_time_ns, UInt64, ()) diff --git a/src/threading.c b/src/threading.c index 23dc6c1d9f30b..46faeb16351e9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -945,12 +945,12 @@ JL_DLLEXPORT uint64_t jl_get_thread_lock_waiting_time(int64_t tid) return 0; } -JL_DLLEXPORT uint64_t jl_get_lock_waiting_time() +JL_DLLEXPORT uint64_t jl_get_lock_waiting_time(void) { uint64_t waiting_time = 0; int nthreads = jl_atomic_load_acquire(&jl_n_threads); jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - for (int i = 0; i < nthreads; i++) {} + for (int i = 0; i < nthreads; i++) { jl_ptls_t ptls = all_tls_states[i]; if (ptls) { waiting_time += jl_atomic_load_relaxed(&ptls->lock_waiting_time); From c93630170c912e23efe1d7a0034aa9e6a948fdc8 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Thu, 19 Feb 2026 21:56:18 +0000 Subject: [PATCH 3/4] Address review comments --- base/timing.jl | 7 +++++-- src/threading.c | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/base/timing.jl b/base/timing.jl index 4dd955ca71c1e..59943aa9a24bc 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -68,8 +68,11 @@ function gc_alloc_count(diff::GC_Diff) diff.malloc + diff.realloc + diff.poolalloc + diff.bigalloc end -lock_waiting_time(tid::Int16) = ccall(:jl_get_thread_lock_waiting_time, UInt64, (Int16,), tid) -lock_waiting_time() = ccall(:jl_get_lock_waiting_time, UInt64, ()) +# Retrieve the specified thread's, or the aggregated value across all threads +# of wait time on all the locks for which wait-time measurement has been +# enabled (for RAI, currently only the codegen lock). +lock_waiting_time_ns(tid::Int16) = ccall(:jl_get_thread_lock_waiting_time, UInt64, (Int16,), tid) +lock_waiting_time_ns() = ccall(:jl_get_lock_waiting_time, UInt64, ()) # cumulative total time spent on compilation and recompilation, in nanoseconds function cumulative_compile_time_ns() diff --git a/src/threading.c b/src/threading.c index 46faeb16351e9..0bed0c64144d2 100644 --- a/src/threading.c +++ b/src/threading.c @@ -949,8 +949,10 @@ JL_DLLEXPORT uint64_t jl_get_lock_waiting_time(void) { uint64_t waiting_time = 0; int nthreads = jl_atomic_load_acquire(&jl_n_threads); + int ngcthreads = jl_atomic_load_acquire(&jl_n_gcthreads); + int nmutatorthreads = nthreads - ngcthreads; jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - for (int i = 0; i < nthreads; i++) { + for (int i = 0; i < nmutatorthreads; i++) { jl_ptls_t ptls = all_tls_states[i]; if (ptls) { waiting_time += jl_atomic_load_relaxed(&ptls->lock_waiting_time); From eeb4e13f1422be2002534e070a87534a7ed750d5 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Fri, 20 Feb 2026 00:19:07 +0000 Subject: [PATCH 4/4] Address review comment And small tweaks to conform to standard API. --- base/timing.jl | 2 +- src/threading.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/base/timing.jl b/base/timing.jl index 59943aa9a24bc..63a244229f841 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -71,7 +71,7 @@ end # Retrieve the specified thread's, or the aggregated value across all threads # of wait time on all the locks for which wait-time measurement has been # enabled (for RAI, currently only the codegen lock). -lock_waiting_time_ns(tid::Int16) = ccall(:jl_get_thread_lock_waiting_time, UInt64, (Int16,), tid) +lock_waiting_time_ns(tid::Integer) = ccall(:jl_get_thread_lock_waiting_time, UInt64, (Cint,), tid-1) lock_waiting_time_ns() = ccall(:jl_get_lock_waiting_time, UInt64, ()) # cumulative total time spent on compilation and recompilation, in nanoseconds diff --git a/src/threading.c b/src/threading.c index 0bed0c64144d2..fa152547e7ce2 100644 --- a/src/threading.c +++ b/src/threading.c @@ -932,9 +932,9 @@ void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock) } } -JL_DLLEXPORT uint64_t jl_get_thread_lock_waiting_time(int64_t tid) +JL_DLLEXPORT uint64_t jl_get_thread_lock_waiting_time(int16_t tid) { - int nthreads = jl_atomic_load_acquire(&jl_n_threads); + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); if (tid < nthreads) { jl_ptls_t ptls = all_tls_states[tid]; @@ -948,8 +948,8 @@ JL_DLLEXPORT uint64_t jl_get_thread_lock_waiting_time(int64_t tid) JL_DLLEXPORT uint64_t jl_get_lock_waiting_time(void) { uint64_t waiting_time = 0; - int nthreads = jl_atomic_load_acquire(&jl_n_threads); - int ngcthreads = jl_atomic_load_acquire(&jl_n_gcthreads); + int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + int ngcthreads = jl_n_gcthreads; int nmutatorthreads = nthreads - ngcthreads; jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); for (int i = 0; i < nmutatorthreads; i++) {