From 86a0f9fa3590a0957c9eb183c98021e45b8b06b6 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 11 Feb 2025 18:42:58 +0000 Subject: [PATCH] [v2.0.x] prov/efa: Fix the unexp_pkt clean up. Currently there are two issues with the rxe->unexp_pkt: 1. rxe->unexp_pkt is assigned to a pkt entry when the rxe was allocated as an unexpected rx entry. After user posts a receive and such rxe is matched, it should clean this unexp_pkt field when processing the pkts. Otherwise it will cause a double free error in `efa_rdm_rxe_handle_error` which will release the unexp_pkt again if it's not NULL. 2. rxe->unexp_pkt is a linked list for multi-req protocols (medium, runting). The current code just call efa_rdm_pke_release_rx to release it, which is wrong, because efa_rdm_pke_release_rx requires the pkt entry to be unlinked. This patch introduces a new function efa_rdm_pke_release_rx_list to unlink and release each pkt entry in the linked list. Signed-off-by: Shi Jin (cherry picked from commit 36b974dc27c67fd0268e0ab575fd01592f756992) --- prov/efa/Makefile.include | 1 + prov/efa/src/rdm/efa_rdm_ope.c | 2 +- prov/efa/src/rdm/efa_rdm_pke.c | 21 +++++++++++ prov/efa/src/rdm/efa_rdm_pke.h | 2 ++ prov/efa/src/rdm/efa_rdm_srx.c | 9 ++++- prov/efa/test/efa_unit_test_mocks.c | 11 ++++++ prov/efa/test/efa_unit_test_mocks.h | 6 ++++ prov/efa/test/efa_unit_test_pke.c | 40 +++++++++++++++++++++ prov/efa/test/efa_unit_test_srx.c | 54 +++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 3 ++ prov/efa/test/efa_unit_tests.h | 2 ++ 11 files changed, 149 insertions(+), 2 deletions(-) diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 81e0fab0aed..b846767f086 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -162,6 +162,7 @@ prov_efa_test_efa_unit_test_LDFLAGS = $(cmocka_rpath) $(efa_LDFLAGS) $(cmocka_LD -Wl,--wrap=ofi_cudaMalloc \ -Wl,--wrap=ofi_copy_from_hmem_iov \ -Wl,--wrap=efa_rdm_pke_read \ + -Wl,--wrap=efa_rdm_pke_proc_matched_rtm \ -Wl,--wrap=efa_device_support_unsolicited_write_recv if HAVE_EFADV_CQ_EX diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c index f24d9c0150e..3c80068f134 100644 --- a/prov/efa/src/rdm/efa_rdm_ope.c +++ b/prov/efa/src/rdm/efa_rdm_ope.c @@ -592,7 +592,7 @@ void efa_rdm_rxe_handle_error(struct efa_rdm_ope *rxe, int err, int prov_errno) dlist_remove(&rxe->queued_entry); if (rxe->unexp_pkt) { - efa_rdm_pke_release_rx(rxe->unexp_pkt); + efa_rdm_pke_release_rx_list(rxe->unexp_pkt); rxe->unexp_pkt = NULL; } diff --git a/prov/efa/src/rdm/efa_rdm_pke.c b/prov/efa/src/rdm/efa_rdm_pke.c index 6b97eccda1c..508aeee575f 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.c +++ b/prov/efa/src/rdm/efa_rdm_pke.c @@ -165,6 +165,27 @@ void efa_rdm_pke_release_rx(struct efa_rdm_pke *pkt_entry) efa_rdm_pke_release(pkt_entry); } +/** + * @brief Release all rx packet entries that are linked + * This can happen when medium / runting protocol is used. + * This function will unlink the pkt entries in the list before + * releasing each pkt entry as it is required by efa_rdm_pke_release_rx + * (see above). + * @param pkt_entry the head of the pkt entry linked list + */ +void efa_rdm_pke_release_rx_list(struct efa_rdm_pke *pkt_entry) +{ + struct efa_rdm_pke *curr, *next; + + curr = pkt_entry; + while (curr) { + next = curr->next; + curr->next = NULL; + efa_rdm_pke_release_rx(curr); + curr = next; + } +} + void efa_rdm_pke_copy(struct efa_rdm_pke *dest, struct efa_rdm_pke *src) { diff --git a/prov/efa/src/rdm/efa_rdm_pke.h b/prov/efa/src/rdm/efa_rdm_pke.h index 223822ce595..8942709c560 100644 --- a/prov/efa/src/rdm/efa_rdm_pke.h +++ b/prov/efa/src/rdm/efa_rdm_pke.h @@ -215,6 +215,8 @@ void efa_rdm_pke_release_tx(struct efa_rdm_pke *pkt_entry); void efa_rdm_pke_release_rx(struct efa_rdm_pke *pkt_entry); +void efa_rdm_pke_release_rx_list(struct efa_rdm_pke *pkt_entry); + void efa_rdm_pke_release(struct efa_rdm_pke *pkt_entry); void efa_rdm_pke_append(struct efa_rdm_pke *dst, diff --git a/prov/efa/src/rdm/efa_rdm_srx.c b/prov/efa/src/rdm/efa_rdm_srx.c index 4efbe01da72..6625a0c30b8 100644 --- a/prov/efa/src/rdm/efa_rdm_srx.c +++ b/prov/efa/src/rdm/efa_rdm_srx.c @@ -62,6 +62,12 @@ static int efa_rdm_srx_start(struct fi_peer_rx_entry *peer_rxe) rxe->state = EFA_RDM_RXE_MATCHED; + /** + * Since the rxe is now matched, we need to clean the unexp_pkt + * as the pkts are now processed. + */ + rxe->unexp_pkt = NULL; + ret = efa_rdm_pke_proc_matched_rtm(pkt_entry); if (OFI_UNLIKELY(ret)) { /* If we run out of memory registrations, we fall back to @@ -101,7 +107,8 @@ static int efa_rdm_srx_discard(struct fi_peer_rx_entry *peer_rxe) EFA_WARN(FI_LOG_EP_CTRL, "Discarding unmatched unexpected rxe: %p pkt_entry %p\n", rxe, rxe->unexp_pkt); - efa_rdm_pke_release_rx(rxe->unexp_pkt); + efa_rdm_pke_release_rx_list(rxe->unexp_pkt); + rxe->unexp_pkt = NULL; efa_rdm_rxe_release_internal(rxe); return FI_SUCCESS; } diff --git a/prov/efa/test/efa_unit_test_mocks.c b/prov/efa/test/efa_unit_test_mocks.c index 75dd2bad732..23999edbb56 100644 --- a/prov/efa/test/efa_unit_test_mocks.c +++ b/prov/efa/test/efa_unit_test_mocks.c @@ -202,6 +202,11 @@ int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope) return mock(); } +ssize_t efa_mock_efa_rdm_pke_proc_matched_rtm_no_op(struct efa_rdm_pke *pkt_entry) +{ + return FI_SUCCESS; +} + bool efa_mock_efa_device_support_unsolicited_write_recv() { return mock(); @@ -223,6 +228,7 @@ struct efa_unit_test_mocks g_efa_unit_test_mocks = { #endif .ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov, .efa_rdm_pke_read = __real_efa_rdm_pke_read, + .efa_rdm_pke_proc_matched_rtm = __real_efa_rdm_pke_proc_matched_rtm, .efa_device_support_unsolicited_write_recv = __real_efa_device_support_unsolicited_write_recv, .ibv_is_fork_initialized = __real_ibv_is_fork_initialized, #if HAVE_EFADV_QUERY_MR @@ -358,6 +364,11 @@ int __wrap_efa_rdm_pke_read(struct efa_rdm_ope *ope) return g_efa_unit_test_mocks.efa_rdm_pke_read(ope); } +int __wrap_efa_rdm_pke_proc_matched_rtm(struct efa_rdm_pke *pkt_entry) +{ + return g_efa_unit_test_mocks.efa_rdm_pke_proc_matched_rtm(pkt_entry); +} + bool __wrap_efa_device_support_unsolicited_write_recv(void) { return g_efa_unit_test_mocks.efa_device_support_unsolicited_write_recv(); diff --git a/prov/efa/test/efa_unit_test_mocks.h b/prov/efa/test/efa_unit_test_mocks.h index 3e764c91fb1..853577b0708 100644 --- a/prov/efa/test/efa_unit_test_mocks.h +++ b/prov/efa/test/efa_unit_test_mocks.h @@ -91,6 +91,10 @@ bool __real_efa_device_support_unsolicited_write_recv(); int efa_mock_efa_rdm_pke_read_return_mock(struct efa_rdm_ope *ope); +ssize_t __real_efa_rdm_pke_proc_matched_rtm(struct efa_rdm_pke *pkt_entry); + +ssize_t efa_mock_efa_rdm_pke_proc_matched_rtm_no_op(struct efa_rdm_pke *pkt_entry); + bool efa_mock_efa_device_support_unsolicited_write_recv(void); struct efa_unit_test_mocks @@ -124,6 +128,8 @@ struct efa_unit_test_mocks int (*efa_rdm_pke_read)(struct efa_rdm_ope *ope); + ssize_t (*efa_rdm_pke_proc_matched_rtm)(struct efa_rdm_pke *pkt_entry); + bool (*efa_device_support_unsolicited_write_recv)(void); enum ibv_fork_status (*ibv_is_fork_initialized)(void); diff --git a/prov/efa/test/efa_unit_test_pke.c b/prov/efa/test/efa_unit_test_pke.c index d52ccf76cc3..eeabac447a9 100644 --- a/prov/efa/test/efa_unit_test_pke.c +++ b/prov/efa/test/efa_unit_test_pke.c @@ -68,3 +68,43 @@ void test_efa_rdm_pke_handle_longcts_rtm_send_completion(struct efa_resource **s efa_rdm_pke_release_tx(pkt_entry); } + +/** + * @brief Test the efa_rdm_pke_release_rx_list function + * + * @param state + */ +void test_efa_rdm_pke_release_rx_list(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_pke *pke, *curr; + int i; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + + /* Fake a rx pkt entry */ + pke = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pke); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + + /* link multiple pkes to this pke */ + for (i = 1; i < 10; i++) { + curr = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(curr); + efa_rdm_pke_append(pke, curr); + } + + /* Release all entries in the linked list */ + efa_rdm_pke_release_rx_list(pke); + + /** + * Now the rx pkt buffer pool should be empty so we can destroy it + * Otherwise there will be an assertion error on the use cnt is + * is non-zero + */ + ofi_bufpool_destroy(efa_rdm_ep->efa_rx_pkt_pool); + efa_rdm_ep->efa_rx_pkt_pool = NULL; +} diff --git a/prov/efa/test/efa_unit_test_srx.c b/prov/efa/test/efa_unit_test_srx.c index e0bff95169b..6d6c0b3a322 100644 --- a/prov/efa/test/efa_unit_test_srx.c +++ b/prov/efa/test/efa_unit_test_srx.c @@ -4,6 +4,7 @@ #include "efa_unit_tests.h" #include "ofi_util.h" #include "efa_rdm_ep.h" +#include "efa_rdm_msg.h" /** * @brief This test validates whether the default min_multi_recv size is correctly @@ -65,3 +66,56 @@ void test_efa_srx_lock(struct efa_resource **state) util_domain.domain_fid.fid); assert_true(((void *) srx_ctx->lock == (void *) &efa_domain->srx_lock)); } + + +/** + * @brief Test srx's start ops is updating the rxe status correctly + * + * @param state + */ +void test_efa_srx_unexp_pkt(struct efa_resource **state) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct util_srx_ctx *srx_ctx; + struct efa_rdm_ope *rxe; + struct efa_rdm_pke *pke; + struct efa_unit_test_eager_rtm_pkt_attr pke_attr = { + .msg_id = 0, + .connid = 0x1234 + }; + + g_efa_unit_test_mocks.efa_rdm_pke_proc_matched_rtm = &efa_mock_efa_rdm_pke_proc_matched_rtm_no_op; + + efa_unit_test_resource_construct(resource, FI_EP_RDM, EFA_FABRIC_NAME); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + srx_ctx = efa_rdm_ep_get_peer_srx_ctx(efa_rdm_ep); + + /* Fake a rx pkt entry */ + pke = efa_rdm_pke_alloc(efa_rdm_ep, efa_rdm_ep->efa_rx_pkt_pool, EFA_RDM_PKE_FROM_EFA_RX_POOL); + assert_non_null(pke); + efa_rdm_ep->efa_rx_pkts_posted = efa_rdm_ep_get_rx_pool_size(efa_rdm_ep); + pke->addr = FI_ADDR_UNSPEC; + + efa_unit_test_eager_msgrtm_pkt_construct(pke, &pke_attr); + /** + * Allocate an rxe with the rx pkt. + * Since there is no recv posted, the rxe must be unexpected + */ + ofi_genlock_lock(srx_ctx->lock); + rxe = efa_rdm_msg_alloc_rxe_for_msgrtm(efa_rdm_ep, &pke); + assert_true(rxe->state == EFA_RDM_RXE_UNEXP); + assert_true(rxe->unexp_pkt == pke); + + /* Start progressing the unexpected rxe */ + srx_ctx->peer_srx.peer_ops->start_msg(rxe->peer_rxe); + + /* Make sure rxe is updated as mateched and unexp_pkt is NULL */ + assert_true(rxe->state == EFA_RDM_RXE_MATCHED); + assert_true(rxe->unexp_pkt == NULL); + + efa_rdm_pke_release_rx(pke); + efa_rdm_rxe_release(rxe); + ofi_genlock_unlock(srx_ctx->lock); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 380a1136c9b..2f12aefb8d5 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -61,6 +61,7 @@ static int efa_unit_test_mocks_teardown(void **state) #endif .ofi_copy_from_hmem_iov = __real_ofi_copy_from_hmem_iov, .efa_rdm_pke_read = __real_efa_rdm_pke_read, + .efa_rdm_pke_proc_matched_rtm = __real_efa_rdm_pke_proc_matched_rtm, .efa_device_support_unsolicited_write_recv = __real_efa_device_support_unsolicited_write_recv, .ibv_is_fork_initialized = __real_ibv_is_fork_initialized, }; @@ -169,6 +170,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_srx_min_multi_recv_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_srx_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_srx_lock, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_srx_unexp_pkt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rnr_queue_and_resend, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ope_prepare_to_post_send_with_no_enough_tx_pkts, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ope_prepare_to_post_send_host_memory, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), @@ -194,6 +196,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_peer_select_readbase_rtm_no_runt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_peer_select_readbase_rtm_do_runt, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_pke_get_available_copy_methods_align128, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_pke_release_rx_list, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_domain_open_ops_wrong_name, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_domain_open_ops_mr_query, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_cq_ibv_cq_poll_list_same_tx_rx_cq_single_ep, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index aa1bfa8e784..978a4ad7091 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -183,6 +183,7 @@ void test_efa_use_device_rdma_opt_old(); void test_efa_srx_min_multi_recv_size(); void test_efa_srx_cq(); void test_efa_srx_lock(); +void test_efa_srx_unexp_pkt(); void test_efa_rnr_queue_and_resend(); void test_efa_rdm_ope_prepare_to_post_send_with_no_enough_tx_pkts(); void test_efa_rdm_ope_prepare_to_post_send_host_memory(); @@ -224,6 +225,7 @@ void test_efa_rdm_peer_move_overflow_pke_to_recvwin(); void test_efa_rdm_peer_keep_pke_in_overflow_list(); void test_efa_rdm_peer_append_overflow_pke_to_recvwin(); void test_efa_rdm_pke_handle_longcts_rtm_send_completion(); +void test_efa_rdm_pke_release_rx_list(); static inline int efa_unit_test_get_dlist_length(struct dlist_entry *head)