Skip to content

Commit

Permalink
add option to dump backtrace of stop-the-world straggler
Browse files Browse the repository at this point in the history
  • Loading branch information
d-netto committed Jan 16, 2025
1 parent 316f0fe commit 385f66b
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ typedef struct {
size_t bt_size;
int tid;
} jl_record_backtrace_result_t;
JL_DLLEXPORT JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, struct _jl_bt_element_t *bt_data,
size_t max_bt_size) JL_NOTSAFEPOINT;
// Prototype for `jl_record_backtrace` (defined in src/stackwalk.c).
// The returned `jl_record_backtrace_result_t` carries a `bt_size` and a `tid` field.
// NOTE(review): presumably records the backtrace of task `t` into `bt_data`
// (capacity `max_bt_size`); confirm against the definition.
JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data,
size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT;
extern volatile struct _jl_bt_element_t *profile_bt_data_prof;
Expand Down
2 changes: 2 additions & 0 deletions src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
// Must be bigger than GC_MAX_SZCLASS.
#define ARRAY_CACHE_ALIGN_THRESHOLD 2048

// When defined, `jl_gc_wait_for_the_world` tries to record and print the
// backtrace of a thread that has not reached a safepoint by the time the
// timed wait on it expires.
#define DUMP_BACKTRACE_OF_STOP_THE_WORLD_STRAGGLER

// codegen options ------------------------------------------------------------

// (Experimental) Use MCJIT ELF, even where it's not the native format
Expand Down
21 changes: 19 additions & 2 deletions src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,27 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads)
// Use system mutexes rather than spin locking to minimize wasted CPU time
// while we wait for other threads reach a safepoint.
// This is particularly important when run under rr.
// If `DUMP_BACKTRACE_OF_STOP_THE_WORLD_STRAGGLER` is set, try to print backtrace of
// straggler after 120 seconds of waiting.
const int64_t timeout = 120 * 1000000; // 120 seconds
int ret = 0;
uv_mutex_lock(&safepoint_lock);
if (!jl_atomic_load_relaxed(&ptls2->gc_state))
uv_cond_wait(&safepoint_cond_begin, &safepoint_lock);
if (!jl_atomic_load_relaxed(&ptls2->gc_state)) {
ret = uv_cond_timedwait(&safepoint_cond_begin, &safepoint_lock, timeout);
}
uv_mutex_unlock(&safepoint_lock);
// If we woke up because of a timeout, print the backtrace of the straggler
if (ret == UV_ETIMEDOUT) {
#ifdef DUMP_BACKTRACE_OF_STOP_THE_WORLD_STRAGGLER
// Try to record the backtrace of the straggler using `jl_try_record_thread_backtrace`
jl_ptls_t ptls = jl_current_task->ptls;
jl_try_record_thread_backtrace(ptls2, ptls->bt_data, JL_MAX_BT_SIZE);
// Print the backtrace of the straggler
for (size_t i = 0; i < ptls->bt_size; i += jl_bt_entry_size(ptls->bt_data + i)) {
jl_print_bt_entry_codeloc(ptls->bt_data + i);
}
#endif
}
}
}
}
Expand Down
18 changes: 18 additions & 0 deletions src/stackwalk.c
Original file line number Diff line number Diff line change
Expand Up @@ -1260,6 +1260,24 @@ static void suspend(void *ctx)
suspenddata->success = jl_thread_suspend_and_get_state(suspenddata->old, 1, suspenddata->c);
}

// Try to record a backtrace of the thread that owns `ptls2`.
//
// The target thread is suspended through `jl_with_stackwalk_lock(suspend, ...)`,
// which also fills in the thread's register context `c`. The stack is then
// walked from that context and the thread is resumed before returning.
//
// ptls2:       thread-local state of the thread to inspect
// bt_data:     output buffer handed to `rec_backtrace_ctx`
// max_bt_size: capacity of `bt_data`, handed to `rec_backtrace_ctx`
//
// Returns the size reported by `rec_backtrace_ctx`, or 0 if the thread could
// not be suspended.
JL_DLLEXPORT size_t jl_try_record_thread_backtrace(jl_ptls_t ptls2, jl_bt_element_t *bt_data, size_t max_bt_size) JL_NOTSAFEPOINT
{
    int16_t tid = ptls2->tid;
    bt_context_t c;
    suspend_t suspenddata = {tid, &c};
    jl_with_stackwalk_lock(suspend, &suspenddata);
    if (!suspenddata.success) {
        return 0;
    }
    // thread is stopped, safe to read the task it was running before we stopped it
    jl_task_t *t = jl_atomic_load_relaxed(&ptls2->current_task);
    // Walk from the context captured while suspending the thread. Passing a
    // NULL context here would leave the unwinder with nothing to start from.
    size_t bt_size = rec_backtrace_ctx(bt_data, max_bt_size, &c, ptls2->previous_task ? NULL : t->gcstack);
    jl_thread_resume(tid);
    return bt_size;
}

JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT
{
int16_t tid = INT16_MAX;
Expand Down

0 comments on commit 385f66b

Please sign in to comment.