diff --git a/maint/extracterrmsgs b/maint/extracterrmsgs index a0fd42e63b7..3f0b6ec2ee5 100755 --- a/maint/extracterrmsgs +++ b/maint/extracterrmsgs @@ -680,12 +680,12 @@ sub ProcessFile # add longnames since we omit errnames.txt for these $longnames{"**ofid_$name"} = "OFI call $name failed"; $longnamesDefined{"**ofid_$name"} = "$filename:$linecount"; - $longnames{"**ofid_$name %s"} = "OFI call $name failed (%s)"; - $longnamesDefined{"**ofid_$name %s"} = "$filename:$linecount"; + $longnames{"**ofid_$name %s %s"} = "OFI call $name failed (default nic=%s: %s)"; + $longnamesDefined{"**ofid_$name %s %s"} = "$filename:$linecount"; } $generic_msgs{"**ofid_$name"}++; - $specific_msgs{"**ofid_$name %s"}++; + $specific_msgs{"**ofid_$name %s %s"}++; next; } diff --git a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h index 70808b157e3..9cb56278a27 100644 --- a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h +++ b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_rma.h @@ -70,9 +70,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(rcv_cntr, 1, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { @@ -89,9 +90,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_rma(void *buffer, i if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(snd_cntr, num_children, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - 
"**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { diff --git a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h index d6af249d73e..c8f42a0da9d 100644 --- a/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h +++ b/src/mpid/ch4/netmod/ofi/coll/ofi_bcast_tree_tagged.h @@ -69,9 +69,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(rcv_cntr, 1, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { @@ -88,9 +89,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_Bcast_intra_triggered_tagged(void *buffer if (0 != strcmp(MPIDI_OFI_global.prov_use[0]->fabric_attr->prov_name, "cxi")) { do { ret = fi_cntr_wait(snd_cntr, num_children, 1); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != -FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); MPID_Progress_test(NULL); } while (ret == -FI_ETIMEDOUT); } else { diff --git a/src/mpid/ch4/netmod/ofi/errnames.txt b/src/mpid/ch4/netmod/ofi/errnames.txt index 71e76435af4..447771a7d86 100644 --- a/src/mpid/ch4/netmod/ofi/errnames.txt +++ b/src/mpid/ch4/netmod/ofi/errnames.txt @@ -10,14 +10,14 @@ # Most of the libfabric call error names are generated from 
MPIDI_OFI_CALL macros, # some of them are explicitly used via MPIR_ERR_CHKANDJUMP4, they need be listed here. **ofid_cancel:OFI cancel failed -**ofid_cancel %s:OFI cancel failed (%s) +**ofid_cancel %s %s:OFI cancel failed (default nic=%s: %s) **ofid_cntr_open:OFI Counter open failed -**ofid_cntr_open %s:OFI OFI Counter open failed (%s) +**ofid_cntr_open %s %s:OFI Counter open failed (default nic=%s: %s) **ofid_cntr_wait:OFI Counter wait failed -**ofid_cntr_wait %s:OFI OFI Counter wait failed (%s) +**ofid_cntr_wait %s %s:OFI Counter wait failed (default nic=%s: %s) **ofid_enable_trigger:OFI triggered ops enable failed -**ofid_enable_trigger %s:OFI triggered ops enable failed (%s) +**ofid_enable_trigger %s %s:OFI triggered ops enable failed (default nic=%s: %s) **ofid_issue_trigger:OFI triggered ops issue failed -**ofid_issue_trigger %s:OFI triggered ops issue failed (%s) +**ofid_issue_trigger %s %s:OFI triggered ops issue failed (default nic=%s: %s) **ofid_poll:OFI poll failed -**ofid_poll %s:OFI poll failed (%s) +**ofid_poll %s %s:OFI poll failed (default nic=%s: %s) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 364dae25ec6..178cbb80ca7 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -737,8 +737,10 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret) break; default: - MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", - "**ofid_poll %s", fi_strerror(e.err)); + MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", + "**ofid_poll %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, + fi_strerror(e.err)); } break; @@ -781,15 +783,17 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret) break; default: - MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", - "**ofid_poll %s", fi_strerror(e.err)); + MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", + "**ofid_poll %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME,
fi_strerror(e.err)); } break; default: - MPIR_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", - "**ofid_poll %s", fi_strerror(errno)); + MPIR_ERR_SETFATALANDJUMP2(mpi_errno, MPI_ERR_OTHER, "**ofid_poll", + "**ofid_poll %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(errno)); } fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index a48116affd3..df45c3c32c1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -38,6 +38,10 @@ ATTRIBUTE((unused)); #define MPIDI_OFI_WIN(win) ((win)->dev.netmod.ofi) +#define MPIDI_OFI_NIC_NAME(nic) (MPIDI_OFI_global.prov_use[nic] ? \ + MPIDI_OFI_global.prov_use[nic]->domain_attr->name : "(n/a)") +#define MPIDI_OFI_DEFAULT_NIC_NAME (MPIDI_OFI_NIC_NAME(0)) + int MPIDI_OFI_progress_uninlined(int vci); int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); @@ -55,15 +59,16 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); #define MPIDI_OFI_PROGRESS_WHILE(cond, vci) \ while (cond) MPIDI_OFI_PROGRESS(vci) -#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP1 -#define MPIDI_OFI_CALL(FUNC,STR) \ +#define MPIDI_OFI_ERR MPIR_ERR_CHKANDJUMP2 +#define MPIDI_OFI_CALL(FUNC,STR) \ do { \ ssize_t _ret = FUNC; \ MPIDI_OFI_ERR(_ret<0, \ mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ } while (0) @@ -78,7 +83,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ if (_retry > 0) { \ _retry--; \ @@ -123,7 +129,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ mpi_errno = MPIDI_OFI_progress_do_queue(vci_); \ if 
(mpi_errno != MPI_SUCCESS) \ @@ -167,7 +174,8 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret); mpi_errno, \ MPI_ERR_OTHER, \ "**ofid_"#STR, \ - "**ofid_"#STR" %s", \ + "**ofid_"#STR" %s %s", \ + MPIDI_OFI_DEFAULT_NIC_NAME, \ fi_strerror(-_ret)); \ } while (0) diff --git a/src/mpid/ch4/netmod/ofi/ofi_spawn.c b/src/mpid/ch4/netmod/ofi/ofi_spawn.c index 66635fa157b..20adc54b3b1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_spawn.c +++ b/src/mpid/ch4/netmod/ofi/ofi_spawn.c @@ -55,8 +55,9 @@ int MPIDI_OFI_dynamic_send(uint64_t remote_gpid, int tag, const void *buf, int s int rc; rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].tx, (void *) &req.context); if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP1(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s", fi_strerror(-rc)); + MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", + "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, + fi_strerror(-rc)); } while (!req.done) { @@ -111,8 +112,9 @@ int MPIDI_OFI_dynamic_recv(int tag, void *buf, int size, int timeout) int rc; rc = fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].rx, (void *) &req.context); if (rc && rc != -FI_ENOENT) { - MPIR_ERR_CHKANDJUMP1(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", - "**ofid_cancel %s", fi_strerror(-rc)); + MPIR_ERR_CHKANDJUMP2(rc < 0, mpi_errno, MPI_ERR_OTHER, "**ofid_cancel", + "**ofid_cancel %s %s", MPIDI_OFI_DEFAULT_NIC_NAME, + fi_strerror(-rc)); } while (!req.done) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_win.h b/src/mpid/ch4/netmod/ofi/ofi_win.h index e56f06ec1d4..975e684b003 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_win.h +++ b/src/mpid/ch4/netmod/ofi/ofi_win.h @@ -51,9 +51,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_win_do_progress(MPIR_Win * win, int vci) if (itercount == 1000 && MPIDI_OFI_COUNTER_WAIT_OBJECTS) { ret = fi_cntr_wait(MPIDI_OFI_WIN(win).cmpl_cntr, tcount, 0); - MPIR_ERR_CHKANDJUMP1(ret < 0 && ret != -FI_ETIMEDOUT, + MPIR_ERR_CHKANDJUMP2(ret < 0 && ret != 
-FI_ETIMEDOUT, mpi_errno, MPI_ERR_RMA_RANGE, - "**ofid_cntr_wait", "**ofid_cntr_wait %s", fi_strerror(-ret)); + "**ofid_cntr_wait", "**ofid_cntr_wait %s %s", + MPIDI_OFI_DEFAULT_NIC_NAME, fi_strerror(-ret)); itercount = 0; DEBUG_PROGRESS_CHECK; }