Skip to content

Commit

Permalink
ch4/ofi: sparsely poll global progress in MPIDI_OFI_retry_progress
Browse files Browse the repository at this point in the history
In most cases, we only need to poll per-vci OFI progress to resolve the
EAGAIN issue. Only poll global progress sparsely -- once every 256 retries
(when `(retry & 0xff) == 0`) in this commit.
  • Loading branch information
hzhou authored and raffenet committed Jan 8, 2025
1 parent cec018a commit 75d5c4b
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
10 changes: 3 additions & 7 deletions src/mpid/ch4/netmod/ofi/ofi_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,7 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
* for recursive locking in more than one lock (currently limited
* to one due to scalar TLS counter), this lock yielding
* operation can be avoided since we are inside a finite loop. */ \
MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vci_); \
mpi_errno = MPIDI_OFI_retry_progress(); \
MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vci_); \
mpi_errno = MPIDI_OFI_retry_progress(vci_, _retry); \
MPIR_ERR_CHECK(mpi_errno); \
} while (1); \
} while (0)
Expand All @@ -113,9 +111,7 @@ int MPIDI_OFI_handle_cq_error(int vci, int nic, ssize_t ret);
_retry--; \
MPIR_ERR_CHKANDJUMP(_retry == 0, mpi_errno, MPIX_ERR_EAGAIN, "**eagain"); \
} \
MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vci_); \
mpi_errno = MPIDI_OFI_retry_progress(); \
MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vci_); \
mpi_errno = MPIDI_OFI_retry_progress(vci_, _retry); \
} \
} while (0)

Expand Down Expand Up @@ -295,7 +291,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_mr_bind(struct fi_info *prov, struct fid_
#define MPIDI_OFI_LOCAL_MR_KEY 0
#define MPIDI_OFI_COLL_MR_KEY 1
#define MPIDI_OFI_INVALID_MR_KEY 0xFFFFFFFFFFFFFFFFULL
int MPIDI_OFI_retry_progress(void);
int MPIDI_OFI_retry_progress(int vci, int retry);
int MPIDI_OFI_recv_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * rreq);
int MPIDI_OFI_recv_huge_control(int vci, MPIR_Context_id_t comm_id, int rank, int tag,
MPIDI_OFI_huge_remote_info_t * info);
Expand Down
15 changes: 13 additions & 2 deletions src/mpid/ch4/netmod/ofi/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,23 @@
#include "ofi_impl.h"
#include "ofi_events.h"

int MPIDI_OFI_retry_progress(void)
int MPIDI_OFI_retry_progress(int vci, int retry)
{
/* We do not call progress on hooks from the netmod level
* because it is not reentrant safe.
*/
return MPID_Progress_test(NULL);
int mpi_errno;
/* Call global progress sparingly. The assumption is that netmod progress on
* its own will resolve most resource-busy (EAGAIN) conditions; fall back to
* global progress only when that does not resolve them. */
if ((retry & 0xff) == 0) {
MPIDI_OFI_THREAD_CS_EXIT_VCI_OPTIONAL(vci);
mpi_errno = MPID_Progress_test(NULL);
MPIDI_OFI_THREAD_CS_ENTER_VCI_OPTIONAL(vci);
} else {
mpi_errno = MPIDI_OFI_progress_uninlined(vci);
}
return mpi_errno;
}

typedef struct MPIDI_OFI_mr_key_allocator_t {
Expand Down

0 comments on commit 75d5c4b

Please sign in to comment.