diff --git a/src/julia_internal.h b/src/julia_internal.h
index f8dd49eab65a3..17deddfdcf5f6 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -208,8 +208,7 @@ JL_DLLEXPORT void jl_lock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
 JL_DLLEXPORT void jl_unlock_profile(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
 JL_DLLEXPORT void jl_lock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
 JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
-int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER;
-void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE;
+void jl_with_stackwalk_lock(void (*f)(void*) JL_NOTSAFEPOINT, void *ctx) JL_NOTSAFEPOINT;
 
 arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT;
 typedef struct {
diff --git a/src/signals-mach.c b/src/signals-mach.c
index 24508a8902d5e..1c4af2cf9d033 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -714,13 +714,10 @@ static void jl_unlock_profile_mach(int dlsymlock, int keymgr_locked)
     jl_unlock_profile();
 }
 
-int jl_lock_stackwalk(void)
-{
-    return jl_lock_profile_mach(1);
-}
-
-void jl_unlock_stackwalk(int lockret)
+void jl_with_stackwalk_lock(void (*f)(void*), void *ctx)
 {
+    int lockret = jl_lock_profile_mach(1);
+    f(ctx);
     jl_unlock_profile_mach(1, lockret);
 }
 
diff --git a/src/signals-unix.c b/src/signals-unix.c
index 394c4a108b647..91d3378068f84 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -308,20 +308,27 @@ int exc_reg_is_write_fault(uintptr_t esr) {
 #else
 #include <poll.h>
 #include <sys/eventfd.h>
+#include <link.h>
 
-int jl_lock_stackwalk(void)
+typedef struct {
+    void (*f)(void*) JL_NOTSAFEPOINT;
+    void *ctx;
+} callback_t;
+static int with_dl_iterate_phdr_lock(struct dl_phdr_info *info, size_t size, void *data)
 {
     jl_lock_profile();
-    return 0;
+    callback_t *callback = (callback_t*)data;
+    callback->f(callback->ctx);
+    jl_unlock_profile();
+    return 1; // only call this once
 }
 
-void jl_unlock_stackwalk(int lockret)
+void jl_with_stackwalk_lock(void (*f)(void*), void *ctx)
 {
-    (void)lockret;
-    jl_unlock_profile();
+    callback_t callback = {f, ctx};
+    dl_iterate_phdr(with_dl_iterate_phdr_lock, &callback);
 }
 
-
 #if defined(_OS_LINUX_) && (defined(_CPU_X86_64_) || defined(_CPU_X86_))
 int is_write_fault(void *context) {
     ucontext_t *ctx = (ucontext_t*)context;
@@ -435,7 +442,7 @@ JL_NO_ASAN static void segv_handler(int sig, siginfo_t *info, void *context)
 }
 
 pthread_mutex_t in_signal_lock; // shared with jl_delete_thread
-static bt_context_t *signal_context; // protected by in_signal_lock
+static bt_context_t *usr2_signal_context; // protected by in_signal_lock
 static int exit_signal_cond = -1;
 static int signal_caught_cond = -1;
 static int signals_inflight = 0;
@@ -507,7 +514,7 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
     request = jl_atomic_load_acquire(&ptls2->signal_request);
     assert(request == 0 || request == -1); (void) request;
     jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
-    *ctx = *signal_context;
+    *ctx = *usr2_signal_context;
     return 1;
 }
 
@@ -587,8 +594,8 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
     if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1))
         return;
     if (request == 1) {
-        signal_context = jl_to_bt_context(ctx);
-        // acknowledge that we saw the signal_request and set signal_context
+        usr2_signal_context = jl_to_bt_context(ctx);
+        // acknowledge that we saw the signal_request and set usr2_signal_context
         int err;
         eventfd_t got = 1;
         err = write(signal_caught_cond, &got, sizeof(eventfd_t));
@@ -602,7 +609,7 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
         if (err != sizeof(eventfd_t)) abort();
         assert(got == 1);
         request = jl_atomic_exchange(&ptls->signal_request, -1);
-        signal_context = NULL;
+        usr2_signal_context = NULL;
         assert(request == 2 || request == 3 || request == 4);
     }
     int err;
@@ -806,7 +813,7 @@ void trigger_profile_peek(void)
     jl_safe_printf("\n======================================================================================\n");
     jl_safe_printf("Information request received. A stacktrace will print followed by a %.1f second profile\n", profile_peek_duration);
     jl_safe_printf("======================================================================================\n");
-    if (profile_bt_size_max == 0){
+    if (profile_bt_size_max == 0) {
         // If the buffer hasn't been initialized, initialize with default size
         // Keep these values synchronized with Profile.default_init()
         if (jl_profile_init(10000000, 1000000) == -1) {
@@ -821,59 +828,93 @@ void trigger_profile_peek(void)
         profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9);
 }
 
-// assumes holding `jl_lock_stackwalk`
-void jl_profile_thread_unix(int tid, bt_context_t *signal_context)
+#if !defined(JL_DISABLE_LIBUNWIND)
+
+static jl_bt_element_t signal_bt_data[JL_MAX_BT_SIZE + 1];
+static size_t signal_bt_size = 0;
+static void do_critical_profile(void *ctx)
 {
-    if (jl_profile_is_buffer_full()) {
-        // Buffer full: Delete the timer
-        jl_profile_stop_timer();
-        return;
-    }
-    // notify thread to stop
-    if (!jl_thread_suspend_and_get_state(tid, 1, signal_context))
-        return;
-    // unwinding can fail, so keep track of the current state
-    // and restore from the SEGV handler if anything happens.
-    jl_jmp_buf *old_buf = jl_get_safe_restore();
-    jl_jmp_buf buf;
-
-    jl_set_safe_restore(&buf);
-    if (jl_setjmp(buf, 0)) {
-        jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-    } else {
-        // Get backtrace data
-        profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
-                profile_bt_size_max - profile_bt_size_cur - 1, signal_context, NULL);
+    bt_context_t signal_context;
+    // sample each thread, round-robin style in reverse order
+    // (so that thread zero gets notified last)
+    int nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    for (int i = nthreads; i-- > 0; ) {
+        // notify thread to stop
+        if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
+            continue;
+
+        // do backtrace on thread contexts for critical signals
+        // this part must be signal-handler safe
+        signal_bt_size += rec_backtrace_ctx(signal_bt_data + signal_bt_size,
+                JL_MAX_BT_SIZE / nthreads - 1,
+                &signal_context, NULL);
+        signal_bt_data[signal_bt_size++].uintptr = 0;
+        jl_thread_resume(i);
     }
-    jl_set_safe_restore(old_buf);
+}
 
-    jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+static void do_profile(void *ctx)
+{
+    bt_context_t signal_context;
+    int nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    int *randperm = profile_get_randperm(nthreads);
+    for (int idx = nthreads; idx-- > 0; ) {
+        // Stop the threads in the random order.
+        int tid = randperm[idx];
+        // do backtrace for profiler
+        if (!profile_running)
+            return;
+        if (jl_profile_is_buffer_full()) {
+            // Buffer full: Delete the timer
+            jl_profile_stop_timer();
+            return;
+        }
+        // notify thread to stop
+        if (!jl_thread_suspend_and_get_state(tid, 1, &signal_context))
+            return;
+        // unwinding can fail, so keep track of the current state
+        // and restore from the SEGV handler if anything happens.
+        jl_jmp_buf *old_buf = jl_get_safe_restore();
+        jl_jmp_buf buf;
+
+        jl_set_safe_restore(&buf);
+        if (jl_setjmp(buf, 0)) {
+            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+        }
+        else {
+            // Get backtrace data
+            profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur,
+                    profile_bt_size_max - profile_bt_size_cur - 1, &signal_context, NULL);
+        }
+        jl_set_safe_restore(old_buf);
 
-    // store threadid but add 1 as 0 is preserved to indicate end of block
-    profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls2->tid + 1;
+        jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
 
-    // store task id (never null)
-    profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
+        // store threadid but add 1 as 0 is preserved to indicate end of block
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls2->tid + 1;
 
-    // store cpu cycle clock
-    profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+        // store task id (never null)
+        profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task);
 
-    // store whether thread is sleeping (don't ever encode a state as `0` since is preserved to indicate end of block)
-    int state = jl_atomic_load_relaxed(&ptls2->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
-    profile_bt_data_prof[profile_bt_size_cur++].uintptr = state;
+        // store cpu cycle clock
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
 
-    // Mark the end of this block with two 0's
-    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
-    profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+        // store whether thread is sleeping (don't ever encode a state as `0` since is preserved to indicate end of block)
+        int state = jl_atomic_load_relaxed(&ptls2->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING;
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = state;
 
-    // notify thread to resume
-    jl_thread_resume(tid);
+        // Mark the end of this block with two 0's
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+
+        // notify thread to resume
+        jl_thread_resume(tid);
+    }
 }
+#endif
 
 static void *signal_listener(void *arg)
 {
-    static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1];
-    static size_t bt_size = 0;
     sigset_t sset;
     int sig, critical, profile;
     jl_sigsetset(&sset);
@@ -1005,28 +1046,10 @@ static void *signal_listener(void *arg)
             }
         }
 
-        int nthreads = jl_atomic_load_acquire(&jl_n_threads);
-        bt_size = 0;
+        signal_bt_size = 0;
 #if !defined(JL_DISABLE_LIBUNWIND)
-        bt_context_t signal_context;
         if (critical) {
-            int lockret = jl_lock_stackwalk();
-            // sample each thread, round-robin style in reverse order
-            // (so that thread zero gets notified last)
-            for (int i = nthreads; i-- > 0; ) {
-                // notify thread to stop
-                if (!jl_thread_suspend_and_get_state(i, 1, &signal_context))
-                    continue;
-
-                // do backtrace on thread contexts for critical signals
-                // this part must be signal-handler safe
-                bt_size += rec_backtrace_ctx(bt_data + bt_size,
-                        JL_MAX_BT_SIZE / nthreads - 1,
-                        &signal_context, NULL);
-                bt_data[bt_size++].uintptr = 0;
-                jl_thread_resume(i);
-            }
-            jl_unlock_stackwalk(lockret);
+            jl_with_stackwalk_lock(do_critical_profile, NULL);
         }
         else if (profile) {
             if (profile_all_tasks) {
@@ -1034,17 +1057,7 @@ static void *signal_listener(void *arg)
                 jl_profile_task();
             }
             else {
-                int lockret = jl_lock_stackwalk();
-                int *randperm = profile_get_randperm(nthreads);
-                for (int idx = nthreads; idx-- > 0; ) {
-                    // Stop the threads in the random order.
-                    int i = randperm[idx];
-                    // do backtrace for profiler
-                    if (profile_running) {
-                        jl_profile_thread_unix(i, &signal_context);
-                    }
-                }
-                jl_unlock_stackwalk(lockret);
+                jl_with_stackwalk_lock(do_profile, NULL);
             }
         }
 #ifndef HAVE_MACH
@@ -1065,11 +1078,12 @@ static void *signal_listener(void *arg)
 //#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309L && !HAVE_KEVENT
 //            si_code = info.si_code;
 //#endif
-            jl_exit_thread0(sig, bt_data, bt_size);
+            jl_exit_thread0(sig, signal_bt_data, signal_bt_size);
         }
         else if (critical) {
             // critical in this case actually means SIGINFO request
 #ifndef SIGINFO // SIGINFO already prints something similar automatically
+            int nthreads = jl_atomic_load_acquire(&jl_n_threads);
             int n_threads_running = 0;
             for (int idx = nthreads; idx-- > 0; ) {
                 jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[idx];
@@ -1080,8 +1094,8 @@ static void *signal_listener(void *arg)
 
             jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
             size_t i;
-            for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) {
-                jl_print_bt_entry_codeloc(bt_data + i);
+            for (i = 0; i < signal_bt_size; i += jl_bt_entry_size(signal_bt_data + i)) {
+                jl_print_bt_entry_codeloc(signal_bt_data + i);
             }
         }
     }
diff --git a/src/signals-win.c b/src/signals-win.c
index dbf95fdb19791..c8ae74f52dba4 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -383,20 +383,25 @@ void jl_thread_resume(int tid)
     }
 }
 
-int jl_lock_stackwalk(void)
+void jl_lock_stackwalk(void)
 {
     uv_mutex_lock(&jl_in_stackwalk);
     jl_lock_profile();
-    return 0;
 }
 
-void jl_unlock_stackwalk(int lockret)
+void jl_unlock_stackwalk(void)
 {
-    (void)lockret;
     jl_unlock_profile();
     uv_mutex_unlock(&jl_in_stackwalk);
 }
 
+void jl_with_stackwalk_lock(void (*f)(void*), void *ctx)
+{
+    jl_lock_stackwalk();
+    f(ctx);
+    jl_unlock_stackwalk();
+}
+
 
 static DWORD WINAPI profile_bt( LPVOID lparam )
 {
@@ -416,10 +421,10 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
             }
             else {
                 // TODO: bring this up to parity with other OS by adding loop over tid here
-                int lockret = jl_lock_stackwalk();
+                jl_lock_stackwalk();
                 CONTEXT ctxThread;
                 if (!jl_thread_suspend_and_get_state(0, 0, &ctxThread)) {
-                    jl_unlock_stackwalk(lockret);
+                    jl_unlock_stackwalk();
                     fputs("failed to suspend main thread. aborting profiling.", stderr);
                     jl_profile_stop_timer();
                     break;
@@ -446,7 +451,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam )
                 // Mark the end of this block with two 0's
                 profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
                 profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
-                jl_unlock_stackwalk(lockret);
+                jl_unlock_stackwalk();
                 jl_thread_resume(0);
                 jl_check_profile_autostop();
             }
diff --git a/src/stackwalk.c b/src/stackwalk.c
index 28adef030395e..f1d807908cf42 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -1249,6 +1249,17 @@ return 0;
 #endif
 }
 
+typedef struct {
+    int16_t old;
+    bt_context_t *c;
+    int success;
+} suspend_t;
+static void suspend(void *ctx)
+{
+    suspend_t *suspenddata = (suspend_t*)ctx;
+    suspenddata->success = jl_thread_suspend_and_get_state(suspenddata->old, 1, suspenddata->c);
+}
+
 JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT
 {
     int16_t tid = INT16_MAX;
@@ -1270,15 +1281,14 @@ JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_b
     bt_context_t c;
     int16_t old;
     for (old = -1; !jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid; old = -1) {
-        int lockret = jl_lock_stackwalk();
         // if this task is already running somewhere, we need to stop the thread it is running on and query its state
-        if (!jl_thread_suspend_and_get_state(old, 1, &c)) {
-            jl_unlock_stackwalk(lockret);
+        suspend_t suspenddata = {old, &c};
+        jl_with_stackwalk_lock(suspend, &suspenddata);
+        if (!suspenddata.success) {
             if (jl_atomic_load_relaxed(&t->tid) != old)
                 continue;
             return result;
         }
-        jl_unlock_stackwalk(lockret);
         if (jl_atomic_load_relaxed(&t->tid) == old) {
             jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[old];
             if (ptls2->previous_task == t || // we might print the wrong stack here, since we can't know whether we executed the swapcontext yet or not, but it at least avoids trying to access the state inside uc_mcontext which might not be set yet