From 11c692e1084a631bb40d5348b7074ed58c634714 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Fri, 17 Jan 2025 20:54:15 -0300 Subject: [PATCH] add option to dump backtrace of stop-the-world straggler (#57045) This is still a work in progress, but it should help determine what a straggler thread was doing during the stop-the-world phase and why it failed to reach a safepoint in a timely manner. We've encountered long TTSP issues in production, and this tool should provide a valuable means to accurately diagnose them. --- base/options.jl | 1 + src/jloptions.c | 12 ++++++++++++ src/jloptions.h | 1 + src/julia_internal.h | 2 ++ src/safepoint.c | 31 +++++++++++++++++++++++++++---- src/stackwalk.c | 19 +++++++++++++++++++ test/cmdlineargs.jl | 6 ++++++ test/threads_exec.jl | 25 +++++++++++++++++++++++++ 8 files changed, 93 insertions(+), 4 deletions(-) diff --git a/base/options.jl b/base/options.jl index 7e7808bd5c047..3281ec0de98d2 100644 --- a/base/options.jl +++ b/base/options.jl @@ -62,6 +62,7 @@ struct JLOptions trace_compile_timing::Int8 trim::Int8 task_metrics::Int8 + timeout_for_safepoint_straggler_s::Int16 end # This runs early in the sysimage != is not defined yet diff --git a/src/jloptions.c b/src/jloptions.c index c68b5ce193d98..2c5a9074eb465 100644 --- a/src/jloptions.c +++ b/src/jloptions.c @@ -153,6 +153,7 @@ JL_DLLEXPORT void jl_init_options(void) 0, // trace_compile_timing JL_TRIM_NO, // trim 0, // task_metrics + -1, // timeout_for_safepoint_straggler_s }; jl_options_initialized = 1; } @@ -311,6 +312,8 @@ static const char opts_hidden[] = " --output-asm Generate an assembly file (.s)\n" " --output-incremental={yes|no*} Generate an incremental output file (rather than\n" " complete)\n" + " --timeout-for-safepoint-straggler If this value is set, then we will dump the backtrace for a thread\n" + " that fails to reach a safepoint within the specified time\n" " --trace-compile={stderr|name} Print precompile statements for methods compiled\n" " during execution or save to stderr or a path. Methods that\n" " were recompiled are printed in yellow or with a trailing\n" @@ -346,6 +349,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) opt_warn_scope, opt_inline, opt_polly, + opt_timeout_for_safepoint_straggler, opt_trace_compile, opt_trace_compile_timing, opt_trace_dispatch, @@ -427,6 +431,7 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) { "warn-scope", required_argument, 0, opt_warn_scope }, { "inline", required_argument, 0, opt_inline }, { "polly", required_argument, 0, opt_polly }, + { "timeout-for-safepoint-straggler", required_argument, 0, opt_timeout_for_safepoint_straggler }, { "trace-compile", required_argument, 0, opt_trace_compile }, { "trace-compile-timing", no_argument, 0, opt_trace_compile_timing }, { "trace-dispatch", required_argument, 0, opt_trace_dispatch }, @@ -970,6 +975,13 @@ JL_DLLEXPORT void jl_parse_opts(int *argcp, char ***argvp) else jl_errorf("julia: invalid argument to --permalloc-pkgimg={yes|no} (%s)", optarg); break; + case opt_timeout_for_safepoint_straggler: + errno = 0; + long timeout = strtol(optarg, &endptr, 10); + if (errno != 0 || optarg == endptr || timeout < 1 || timeout > INT16_MAX) + jl_errorf("julia: --timeout-for-safepoint-straggler=; seconds must be an integer between 1 and %d", INT16_MAX); + jl_options.timeout_for_safepoint_straggler_s = (int16_t)timeout; + break; case opt_trim: if (optarg == NULL || !strcmp(optarg,"safe")) jl_options.trim = JL_TRIM_SAFE; diff --git a/src/jloptions.h b/src/jloptions.h index 211122242cbbd..a8cc4a9a9e33d 100644 --- a/src/jloptions.h +++ b/src/jloptions.h @@ -66,6 +66,7 @@ typedef struct { int8_t trace_compile_timing; int8_t trim; int8_t task_metrics; + int16_t timeout_for_safepoint_straggler_s; } jl_options_t; #endif diff --git a/src/julia_internal.h b/src/julia_internal.h index 00d603f26c7f2..3e4967c9d4dca 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -215,6 +215,8 @@ typedef struct { size_t bt_size; int tid; } jl_record_backtrace_result_t; +JL_DLLEXPORT JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, struct _jl_bt_element_t *bt_data, + size_t max_bt_size) JL_NOTSAFEPOINT; JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT; extern volatile struct _jl_bt_element_t *profile_bt_data_prof; diff --git a/src/safepoint.c b/src/safepoint.c index 7eab653edd089..66bea539861f8 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -149,10 +149,33 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) // Use system mutexes rather than spin locking to minimize wasted CPU time // while we wait for other threads reach a safepoint. // This is particularly important when run under rr. - uv_mutex_lock(&safepoint_lock); - if (!jl_atomic_load_relaxed(&ptls2->gc_state)) - uv_cond_wait(&safepoint_cond_begin, &safepoint_lock); - uv_mutex_unlock(&safepoint_lock); + if (jl_options.timeout_for_safepoint_straggler_s == -1) { // timeout was not specified: no need to dump the backtrace + uv_mutex_lock(&safepoint_lock); + if (!jl_atomic_load_relaxed(&ptls2->gc_state)) { + uv_cond_wait(&safepoint_cond_begin, &safepoint_lock); + } + uv_mutex_unlock(&safepoint_lock); + } + else { + const int64_t timeout = jl_options.timeout_for_safepoint_straggler_s * 1000000000; // convert to nanoseconds + int ret = 0; + uv_mutex_lock(&safepoint_lock); + if (!jl_atomic_load_relaxed(&ptls2->gc_state)) { + ret = uv_cond_timedwait(&safepoint_cond_begin, &safepoint_lock, timeout); + } + uv_mutex_unlock(&safepoint_lock); + // If we woke up because of a timeout, print the backtrace of the straggler + if (ret == UV_ETIMEDOUT) { + jl_safe_printf("===== Thread %d failed to reach safepoint after %d seconds, printing backtrace below =====\n", ptls2->tid + 1, jl_options.timeout_for_safepoint_straggler_s); + // Try to record the backtrace of the straggler using `jl_try_record_thread_backtrace` + jl_ptls_t ptls = jl_current_task->ptls; + size_t bt_size = jl_try_record_thread_backtrace(ptls2, ptls->bt_data, JL_MAX_BT_SIZE); + // Print the backtrace of the straggler + for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(ptls->bt_data + i)) { + jl_print_bt_entry_codeloc(ptls->bt_data + i); + } + } + } } } } diff --git a/src/stackwalk.c b/src/stackwalk.c index f1d807908cf42..14dc5709671dc 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1260,6 +1260,25 @@ static void suspend(void *ctx) suspenddata->success = jl_thread_suspend_and_get_state(suspenddata->old, 1, suspenddata->c); } +JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT +{ + int16_t tid = ptls2->tid; + jl_task_t *t = NULL; + bt_context_t *context = NULL; + bt_context_t c; + suspend_t suspenddata = {tid, &c}; + jl_with_stackwalk_lock(suspend, &suspenddata); + if (!suspenddata.success) { + return 0; + } + // thread is stopped, safe to read the task it was running before we stopped it + t = jl_atomic_load_relaxed(&ptls2->current_task); + context = &c; + size_t bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, ptls2->previous_task ? NULL : t->gcstack); + jl_thread_resume(tid); + return bt_size; +} + JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT { int16_t tid = INT16_MAX; diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 74f953250cd37..3ff7836223b84 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -1238,3 +1238,9 @@ end @test parse(UInt64,read(`$exename --heap-size-hint=$str -E "Base.JLOptions().heap_size_hint"`, String)) == val end end + +@testset "--timeout-for-safepoint-straggler" begin + exename = `$(Base.julia_cmd())` + timeout = 120 + @test parse(Int,read(`$exename --timeout-for-safepoint-straggler=$timeout -E "Base.JLOptions().timeout_for_safepoint_straggler_s"`, String)) == timeout +end diff --git a/test/threads_exec.jl b/test/threads_exec.jl index d77cf06905f44..65e5ef57c911b 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -1536,4 +1536,29 @@ end end end +@testset "--timeout-for-safepoint-straggler command-line flag" begin + program = " + function main() + t = Threads.@spawn begin + ccall(:uv_sleep, Cvoid, (Cuint,), 5000) + end + # Force a GC + ccall(:uv_sleep, Cvoid, (Cuint,), 1000) + GC.gc() + wait(t) + end + main() + " + tmp_output_filename = tempname() + tmp_output_file = open(tmp_output_filename, "w") + if isnothing(tmp_output_file) + error("Failed to open file $tmp_output_filename") + end + run(pipeline(`$(Base.julia_cmd()) --threads=4 --timeout-for-safepoint-straggler=1 -e $program`, stderr=tmp_output_file)) + # Check whether we printed the straggler's backtrace + @test !isempty(read(tmp_output_filename, String)) + close(tmp_output_file) + rm(tmp_output_filename) +end + end # main testset