From cec018a650023965cefe6c96de960164726ac597 Mon Sep 17 00:00:00 2001 From: Ken Raffenetti Date: Mon, 25 Nov 2024 10:54:23 -0600 Subject: [PATCH] ch4/ofi: Add NIC information to error output On systems with multiple NICs, it could be helpful to know which NIC an error was detected on in case there are hardware issues that need investigating. Add NIC information to error checking macros. For now, we report the default NIC used by each process. TODO: extend to support multi-nic usage and take the device number as input for more fine-grained reporting. --- maint/extracterrmsgs | 6 +++--- .../ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h | 10 ++++++---- .../netmod/ofi/coll/ofi_bcast_tree_tagged.h | 10 ++++++---- src/mpid/ch4/netmod/ofi/errnames.txt | 12 +++++------ src/mpid/ch4/netmod/ofi/ofi_events.c | 16 +++++++++------ src/mpid/ch4/netmod/ofi/ofi_impl.h | 20 +++++++++++++------ src/mpid/ch4/netmod/ofi/ofi_spawn.c | 10 ++++++---- src/mpid/ch4/netmod/ofi/ofi_win.h | 5 +++-- 8 files changed, 54 insertions(+), 35 deletions(-) diff --git a/maint/extracterrmsgs b/maint/extracterrmsgs index a0fd42e63b7..3f0b6ec2ee5 100755 --- a/maint/extracterrmsgs +++ b/maint/extracterrmsgs @@ -680,12 +680,12 @@ sub ProcessFile # add longnames since we omit errnames.txt for these $longnames{"**ofid_$name"} = "OFI call $name failed"; $longnamesDefined{"**ofid_$name"} = "$filename:$linecount"; - $longnames{"**ofid_$name %s"} = "OFI call $name failed (%s)"; - $longnamesDefined{"**ofid_$name %s"} = "$filename:$linecount"; + $longnames{"**ofid_$name %s %s"} = "OFI call $name failed (default nic=%s: %s)"; + $longnamesDefined{"**ofid_$name %s %s"} = "$filename:$linecount"; } $generic_msgs{"**ofid_$name"}++; - $specific_msgs{"**ofid_$name %s"}++; + $specific_msgs{"**ofid_$name %s %s"}++; next; } diff --git a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h index 70808b157e3..9cb56278a27 100644 --- a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h 
+++ b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h @@ -70,9 +70,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(rcv_cntr, 1, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { @@ -89,9 +90,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(snd_cntr, num_children, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { diff --git a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h index d6af249d73e..c8f42a0da9d 100644 --- a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h +++ b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h @@ -69,9 +69,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(rcv_cntr, 1, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait 
%s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { @@ -88,9 +89,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(snd_cntr, num_children, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { diff --git a/src/mpid/ch4/netmod/ofi/errnames.txt b/src/mpid/ch4/netmod/ofi/errnames.txt index 71e76435af4..447771a7d86 100644 --- a/src/mpid/ch4/netmod/ofi/errnames.txt +++ b/src/mpid/ch4/netmod/ofi/errnames.txt @@ -10,14 +10,14 @@ # Most of the libfabric call error names are generated from MPIDI_OFI_CALL macros, # some of them are explicitly used via MPIR_ERR_CHKANDJUMP4, they need be listed here. 
**ofid_cancel:OFI cancel failed -**ofid_cancel %s:OFI cancel failed (%s) +**ofid_cancel %s %s:OFI cancel failed (default nic=%s: %s) **ofid_cntr_open:OFI Counter open failed -**ofid_cntr_open %s:OFI OFI Counter open failed (%s) +**ofid_cntr_open %s %s:OFI Counter open failed (default nic=%s: %s) **ofid_cntr_wait:OFI Counter wait failed -**ofid_cntr_wait %s:OFI OFI Counter wait failed (%s) +**ofid_cntr_wait %s %s:OFI Counter wait failed (default nic=%s: %s) **ofid_enable_trigger:OFI triggered ops enable failed -**ofid_enable_trigger %s:OFI triggered ops enable failed (%s) +**ofid_enable_trigger %s %s:OFI triggered ops enable failed (default nic=%s: %s) **ofid_issue_trigger:OFI triggered ops issue failed -**ofid_issue_trigger %s:OFI triggered ops issue failed (%s) +**ofid_issue_trigger %s %s:OFI triggered ops issue failed (default nic=%s: %s) **ofid_poll:OFI poll failed -**ofid_poll %s:OFI poll failed (%s) +**ofid_poll %s %s:OFI poll failed (default nic=%s: %s) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 364dae25ec6..178cbb80ca7 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -737,8 +737,10 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret) break; default: - MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", - "**ofid_poll %s", fi_strerror(e.err)); + MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", + "**ofid_poll %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, + fi_strerror(e.err)); } break; @@ -781,15 +783,17 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret) break; default: - MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", - "**ofid_poll %s", fi_strerror(e.err)); + MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", + "**ofid_poll %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(e.err)); } break; default: - MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", - 
"**ofid_poll %s", fi_strerror(errno)); + MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", + "**ofid_poll %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(errno)); } fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index a48116affd3..df45c3c32c1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -38,6 +38,10 @@ ATTRIBUTE((unused)); #define MPIDI_OFI_WIN(win) ((win)->dev.netmod.ofi) +#define MPIDI_OFI_NIC_NAME(nic) (MPIDI_OFI_global.prov_use[nic] ? \ + MPIDI_OFI_global.prov_use[nic]->domain_attr->name : "(n/a)") +#define MPIDI_OFI_DEFAULT_NIC_NAME (MPIDI_OFI_NIC_NAME(0)) + int MPIDI_OFI_progress_uninlined(int vci); int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); @@ -55,15 +59,16 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); #define MPIDI_OFI_PROGRESS_WHILE(cond, vci) \ while (cond) MPIDI_OFI_PROGRESS(vci) -#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP1 -#define MPIDI_OFI_CALL(FUNC,STR) \ +#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP2 +#define MPIDI_OFI_CALL(FUNC,STR) \ do { \ ssize_t _ret = FUNC; \ MPIDI_OFI_ERR(_ret<0, \ mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ } while (0) @@ -78,7 +83,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ if (_retry > 0) { \ _retry--; \ @@ -123,7 +129,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ mpi_errno = MPIDI_OFI_progress_do_queue(vci_); \ if (mpi_errno != MPI_SUCCESS) \ @@ -167,7 +174,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); 
mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ } while (0) diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 66635fa157b..20adc54b3b1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -55,8 +55,9 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s int rc; rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) &req.context); if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP1(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s", fi_strerror(-rc)); + MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", + "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, + fi_strerror(-rc)); } while (!req.done) { @@ -111,8 +112,9 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) int rc; rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].rx, (void *) &req.context); if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP1(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s", fi_strerror(-rc)); + MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", + "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, + fi_strerror(-rc)); } while (!req.done) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_win.h b/src/mpid/ch4/netmod/ofi/ofi_win.h index e56f06ec1d4..975e684b003 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_win.h +++ b/src/mpid/ch4/netmod/ofi/ofi_win.h @@ -51,9 +51,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci) if (itercount == 1000 && MPIDI_OFI_COUNTER_WAIT_OBJECTS) { ret = fi_cntr_wait(MPIDI_OFI_WIN(win).cmpl_cntr, tcount, 0); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + 
"**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); itercount = 0; DEBUG_PROGRESS_CHECK; }