From 385f66b03932b39a7749d95ff0914a350bd6eaba Mon Sep 17 00:00:00 2001
From: d-netto
Date: Tue, 14 Jan 2025 10:09:43 -0300
Subject: [PATCH] add option to dump backtrace of stop-the-world straggler

---
 src/julia_internal.h |  3 +++
 src/options.h        |  2 ++
 src/safepoint.c      | 21 +++++++++++++++++++--
 src/stackwalk.c      | 18 ++++++++++++++++++
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/julia_internal.h b/src/julia_internal.h
index 00d603f26c7f2..3e4967c9d4dca 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -215,6 +215,9 @@ typedef struct {
     size_t bt_size;
     int tid;
 } jl_record_backtrace_result_t;
+// Suspend the thread of `ptls2` and record its backtrace into `bt_data` (capacity `max_bt_size`); returns the recorded size, or 0 if the thread could not be suspended.
+JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, struct _jl_bt_element_t *bt_data,
+                                                   size_t max_bt_size) JL_NOTSAFEPOINT;
 JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
                                                               size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT;
 extern volatile struct _jl_bt_element_t *profile_bt_data_prof;
diff --git a/src/options.h b/src/options.h
index 0715069faab32..944b61ff56593 100644
--- a/src/options.h
+++ b/src/options.h
@@ -22,6 +22,8 @@
 // Must be bigger than GC_MAX_SZCLASS.
 #define ARRAY_CACHE_ALIGN_THRESHOLD 2048
 
+#define DUMP_BACKTRACE_OF_STOP_THE_WORLD_STRAGGLER
+
 // codegen options ------------------------------------------------------------
 
 // (Experimental) Use MCJIT ELF, even where it's not the native format
diff --git a/src/safepoint.c b/src/safepoint.c
index 7eab653edd089..d4b0508615786 100644
--- a/src/safepoint.c
+++ b/src/safepoint.c
@@ -149,10 +149,27 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
                 // Use system mutexes rather than spin locking to minimize wasted CPU time
                 // while we wait for other threads reach a safepoint.
                 // This is particularly important when run under rr.
+                // If `DUMP_BACKTRACE_OF_STOP_THE_WORLD_STRAGGLER` is set, try to print backtrace of
+                // straggler after 120 seconds of waiting.
+                const int64_t timeout = (int64_t)120 * 1000000000; // 120 seconds, expressed in nanoseconds as `uv_cond_timedwait` expects
+                int ret = 0;
                 uv_mutex_lock(&safepoint_lock);
-                if (!jl_atomic_load_relaxed(&ptls2->gc_state))
-                    uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
+                if (!jl_atomic_load_relaxed(&ptls2->gc_state)) {
+                    ret = uv_cond_timedwait(&safepoint_cond_begin, &safepoint_lock, timeout);
+                }
                 uv_mutex_unlock(&safepoint_lock);
+                // If we woke up because of a timeout, print the backtrace of the straggler
+                if (ret == UV_ETIMEDOUT) {
+                #ifdef DUMP_BACKTRACE_OF_STOP_THE_WORLD_STRAGGLER
+                    // Record the straggler's backtrace into this thread's buffer; the returned size (0 on failure) bounds the print loop
+                    jl_ptls_t ptls = jl_current_task->ptls;
+                    size_t bt_size = jl_try_record_thread_backtrace(ptls2, ptls->bt_data, JL_MAX_BT_SIZE);
+                    // Print the backtrace of the straggler
+                    for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(ptls->bt_data + i)) {
+                        jl_print_bt_entry_codeloc(ptls->bt_data + i);
+                    }
+                #endif
+                }
             }
         }
     }
diff --git a/src/stackwalk.c b/src/stackwalk.c
index f1d807908cf42..4cf12b914f4cf 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -1260,6 +1260,24 @@ static void suspend(void *ctx)
     suspenddata->success = jl_thread_suspend_and_get_state(suspenddata->old, 1, suspenddata->c);
 }
 
+JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
+{
+    int16_t tid = ptls2->tid;
+    jl_task_t *t = NULL;
+    bt_context_t c; // handed to `suspend` via `suspenddata` to receive the stopped thread's state
+    bt_context_t *context = &c; // must point at `c`: a NULL context would not describe the suspended thread
+    suspend_t suspenddata = {tid, &c};
+    jl_with_stackwalk_lock(suspend, &suspenddata);
+    if (!suspenddata.success) {
+        return 0; // thread could not be suspended: nothing was recorded
+    }
+    // thread is stopped, safe to read the task it was running before we stopped it
+    t = jl_atomic_load_relaxed(&ptls2->current_task);
+    size_t bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, ptls2->previous_task ? NULL : t->gcstack);
+    jl_thread_resume(tid);
+    return bt_size;
+}
+
 JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT
 {
     int16_t tid = INT16_MAX;