Skip to content

Commit

Permalink
rdma: remove Send-After-Send ordering requirement
Browse files Browse the repository at this point in the history
With this commit, the plugin does not ask for Send-After-Send ordering to
libfabric. The only case where out-of-order sends can be a problem is if a
receiver sends a connect response followed by a control message, and they
arrive in the opposite order on the sender side. To deal with this rare
case, we add a list of control messages in the send communicator to
store those early control messages, and process them after the connect
response is received.

Signed-off-by: Amedeo Sapio <asapio@amazon.com>
  • Loading branch information
AmedeoSapio committed Mar 14, 2024
1 parent f2065f1 commit 12c0be4
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 3 deletions.
9 changes: 9 additions & 0 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,15 @@ typedef struct nccl_net_ofi_rdma_send_comm {
* response message */
nccl_ofi_rdma_connection_info_t conn_msg;

/*
* Due to out-of-order delivery, it might happen that a
* control message is received before the connect response
* message. In this case we save the control message in
* the send communicator until the connect response message
* is received.
*/
nccl_ofi_deque_t *early_ctrl;

uint16_t next_msg_seq_num;

nccl_ofi_msgbuff_t *msgbuff;
Expand Down
49 changes: 46 additions & 3 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -1269,6 +1269,7 @@ static inline int process_completions(struct fi_cq_data_entry *cq_entry, uint64_
nccl_net_ofi_rdma_listen_comm_t *l_comm = NULL;
nccl_net_ofi_rdma_send_comm_t *s_comm = NULL;
nccl_net_ofi_rdma_recv_comm_t *r_comm = NULL;
nccl_ofi_deque_elem_t *early_ctrl_deque_elem = NULL;

for (comp_idx = 0; comp_idx < num_cqes; comp_idx++) {
/* The context for these operations is req.
Expand Down Expand Up @@ -1385,6 +1386,28 @@ static inline int process_completions(struct fi_cq_data_entry *cq_entry, uint64_
NCCL_OFI_WARN("Failed to repost bounce buff");
goto exit;
}

/* Check early ctrl list */
if (OFI_UNLIKELY(!nccl_ofi_deque_isempty(s_comm->early_ctrl))) {
do {
ret = nccl_ofi_deque_remove_front(s_comm->early_ctrl,
&early_ctrl_deque_elem);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN(
"Failed to get element from early ctrl list");
goto exit;
}

req = container_of(early_ctrl_deque_elem,
nccl_net_ofi_rdma_req_t, pending_reqs_elem);
ctrl_msg = get_bounce_ctrl_msg(
get_bounce_data(req)->bounce_fl_item);
ret = handle_ctrl_recv(s_comm, ctrl_msg->msg_seq_num, req);
if (OFI_UNLIKELY(ret != 0)) {
goto exit;
}
} while (!nccl_ofi_deque_isempty(s_comm->early_ctrl));
}
} else if (*msg_type == NCCL_OFI_RDMA_MSG_CTRL) {
/* CTRL receive completion */
assert(sizeof(nccl_net_ofi_rdma_ctrl_msg_t) == cq_entry[comp_idx].len);
Expand All @@ -1401,7 +1424,15 @@ static inline int process_completions(struct fi_cq_data_entry *cq_entry, uint64_
NCCL_OFI_TRACE_SEND_CTRL_RECV(r_comm->base.base.dev_id, rail->rail_id, s_comm,
ctrl_msg->msg_seq_num);

ret = handle_ctrl_recv(s_comm, ctrl_msg->msg_seq_num, req);
if (OFI_UNLIKELY(s_comm->conn_msg.type == NCCL_OFI_RDMA_MSG_CONN)) {
/* CTRL message received before CONN_RESP. This is a rare case.
Insert req into early_ctrl deque of the send communicator */
ret = nccl_ofi_deque_insert_back(s_comm->early_ctrl,
&req->pending_reqs_elem);
} else {
ret = handle_ctrl_recv(s_comm, ctrl_msg->msg_seq_num, req);
}

if (OFI_UNLIKELY(ret != 0)) {
goto exit;
}
Expand Down Expand Up @@ -4622,6 +4653,12 @@ static int send_close(nccl_net_ofi_rdma_send_comm_t *s_comm)
req->free(req, false);
}

ret = nccl_ofi_deque_finalize(s_comm->early_ctrl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to finalize early_ctrl deque: %d", ret);
goto exit;
}

/* Release request freelist */
ret = nccl_ofi_freelist_fini(s_comm->nccl_ofi_reqs_fl);
if (ret != 0) {
Expand Down Expand Up @@ -4910,6 +4947,12 @@ static inline int create_send_comm(nccl_net_ofi_conn_handle_t *handle,
/* Allocate communicator rails array */
ret_s_comm->num_rails = num_rails;

ret = nccl_ofi_deque_init(&ret_s_comm->early_ctrl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init early_ctrl deque: %d", ret);
goto error;
}

/* Insert remote name into AV of first rail */
ret = fi_av_insert(first_rail->av,
(void *)handle->ep_name, 1,
Expand Down Expand Up @@ -5763,8 +5806,8 @@ static void get_hints(struct fi_info *hints)

hints->mode = 0;

hints->tx_attr->msg_order = FI_ORDER_SAS;
hints->rx_attr->msg_order = FI_ORDER_SAS;
hints->tx_attr->msg_order = FI_ORDER_NONE;
hints->rx_attr->msg_order = FI_ORDER_NONE;

hints->ep_attr->type = FI_EP_RDM;

Expand Down

0 comments on commit 12c0be4

Please sign in to comment.