From 12c0be48642360aa768aec34356a95f3ac42ab5c Mon Sep 17 00:00:00 2001 From: Amedeo Sapio Date: Thu, 14 Mar 2024 01:28:57 +0000 Subject: [PATCH] rdma: remove Send-After-Send ordering requirement With this commit, the plugin does not ask for Send-After-Send ordering to libfabric. The only case where out-of-order sends can be a problem is if a receiver sends a connect response followed by a control message, and they arrive in the opposite order on the sender side. To deal with this rare case, we add a list of control messages in the send communicator to store those early control messages, and process them after the connect response is received. Signed-off-by: Amedeo Sapio --- include/nccl_ofi_rdma.h | 9 ++++++++ src/nccl_ofi_rdma.c | 49 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h index 0373846cb..4bf1d0745 100644 --- a/include/nccl_ofi_rdma.h +++ b/include/nccl_ofi_rdma.h @@ -381,6 +381,15 @@ typedef struct nccl_net_ofi_rdma_send_comm { * response message */ nccl_ofi_rdma_connection_info_t conn_msg; + /* + * Due to out-of-order delivery, it might happen that a + * control message is received before the connect response + * message. In this case we save the control message in + * the send communicator until the connect response message + * is received. + */ + nccl_ofi_deque_t *early_ctrl; + uint16_t next_msg_seq_num; nccl_ofi_msgbuff_t *msgbuff; diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index e492bb2df..f6ea29973 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -1269,6 +1269,7 @@ static inline int process_completions(struct fi_cq_data_entry *cq_entry, uint64_ nccl_net_ofi_rdma_listen_comm_t *l_comm = NULL; nccl_net_ofi_rdma_send_comm_t *s_comm = NULL; nccl_net_ofi_rdma_recv_comm_t *r_comm = NULL; + nccl_ofi_deque_elem_t *early_ctrl_deque_elem = NULL; for (comp_idx = 0; comp_idx < num_cqes; comp_idx++) { /* The context for these operations is req. @@ -1385,6 +1386,28 @@ static inline int process_completions(struct fi_cq_data_entry *cq_entry, uint64_ NCCL_OFI_WARN("Failed to repost bounce buff"); goto exit; } + + /* Check early ctrl list */ + if (OFI_UNLIKELY(!nccl_ofi_deque_isempty(s_comm->early_ctrl))) { + do { + ret = nccl_ofi_deque_remove_front(s_comm->early_ctrl, + &early_ctrl_deque_elem); + if (OFI_UNLIKELY(ret != 0)) { + NCCL_OFI_WARN( + "Failed to get element from early ctrl list"); + goto exit; + } + + req = container_of(early_ctrl_deque_elem, + nccl_net_ofi_rdma_req_t, pending_reqs_elem); + ctrl_msg = get_bounce_ctrl_msg( + get_bounce_data(req)->bounce_fl_item); + ret = handle_ctrl_recv(s_comm, ctrl_msg->msg_seq_num, req); + if (OFI_UNLIKELY(ret != 0)) { + goto exit; + } + } while (!nccl_ofi_deque_isempty(s_comm->early_ctrl)); + } } else if (*msg_type == NCCL_OFI_RDMA_MSG_CTRL) { /* CTRL receive completion */ assert(sizeof(nccl_net_ofi_rdma_ctrl_msg_t) == cq_entry[comp_idx].len); @@ -1401,7 +1424,15 @@ static inline int process_completions(struct fi_cq_data_entry *cq_entry, uint64_ NCCL_OFI_TRACE_SEND_CTRL_RECV(r_comm->base.base.dev_id, rail->rail_id, s_comm, ctrl_msg->msg_seq_num); - ret = handle_ctrl_recv(s_comm, ctrl_msg->msg_seq_num, req); + if (OFI_UNLIKELY(s_comm->conn_msg.type == NCCL_OFI_RDMA_MSG_CONN)) { + /* CTRL message received before CONN_RESP. This is a rare case. + Insert req into early_ctrl deque of the send communicator */ + ret = nccl_ofi_deque_insert_back(s_comm->early_ctrl, + &req->pending_reqs_elem); + } else { + ret = handle_ctrl_recv(s_comm, ctrl_msg->msg_seq_num, req); + } + if (OFI_UNLIKELY(ret != 0)) { goto exit; } @@ -4622,6 +4653,12 @@ static int send_close(nccl_net_ofi_rdma_send_comm_t *s_comm) req->free(req, false); } + ret = nccl_ofi_deque_finalize(s_comm->early_ctrl); + if (ret != 0) { + NCCL_OFI_WARN("Failed to finalize early_ctrl deque: %d", ret); + goto exit; + } + /* Release request freelist */ ret = nccl_ofi_freelist_fini(s_comm->nccl_ofi_reqs_fl); if (ret != 0) { @@ -4910,6 +4947,12 @@ static inline int create_send_comm(nccl_net_ofi_conn_handle_t *handle, /* Allocate communicator rails array */ ret_s_comm->num_rails = num_rails; + ret = nccl_ofi_deque_init(&ret_s_comm->early_ctrl); + if (ret != 0) { + NCCL_OFI_WARN("Failed to init early_ctrl deque: %d", ret); + goto error; + } + /* Insert remote name into AV of first rail */ ret = fi_av_insert(first_rail->av, (void *)handle->ep_name, 1, @@ -5763,8 +5806,8 @@ static void get_hints(struct fi_info *hints) hints->mode = 0; - hints->tx_attr->msg_order = FI_ORDER_SAS; - hints->rx_attr->msg_order = FI_ORDER_SAS; + hints->tx_attr->msg_order = FI_ORDER_NONE; + hints->rx_attr->msg_order = FI_ORDER_NONE; hints->ep_attr->type = FI_EP_RDM;