diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h index 9620ff5a7..1fe2b9296 100644 --- a/include/nccl_ofi_rdma.h +++ b/include/nccl_ofi_rdma.h @@ -686,6 +686,10 @@ struct nccl_net_ofi_ep_rail { size_t max_bounce_posted; /* Mutex for bounce buffer operations */ pthread_mutex_t bounce_mutex; + /* Back-pointer to associated buffer freelist */ + nccl_ofi_freelist_t *bounce_buff_fl; + /* Size of bounce buffers */ + size_t bounce_buff_size; }; /* @@ -730,12 +734,12 @@ struct nccl_net_ofi_rdma_ep { /* Pending requests queue */ nccl_ofi_deque_t *pending_reqs_queue; - /* Free list of bounce buffers */ - nccl_ofi_freelist_t *bounce_buff_fl; + /* Free list of bounce buffers (ctrl rail) */ + nccl_ofi_freelist_t *bounce_buff_ctrl_fl; + /* Free list of bounce buffers (data rails) */ + nccl_ofi_freelist_t *bounce_buff_data_fl; /* Free list of bounce buffer requests */ nccl_ofi_freelist_t *bounce_buff_reqs_fl; - /* Size of bounce buffers */ - size_t bounce_buff_size; /* True if this ep is stored in the thread-local store */ bool thread_local_ep; diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 9bf337b94..668f5148e 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -2069,7 +2069,7 @@ static inline int free_bounce_req(nccl_net_ofi_rdma_req_t *req, nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep; /* Free buffer */ if (bounce_data->bounce_fl_item) { - nccl_ofi_freelist_entry_free(ep->bounce_buff_fl, bounce_data->bounce_fl_item); + nccl_ofi_freelist_entry_free(bounce_data->rail->bounce_buff_fl, bounce_data->bounce_fl_item); } return free_base_req(NULL, ep->bounce_buff_reqs_fl, req, false); } @@ -2087,9 +2087,11 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t * rdma_req_bounce_data_t *bounce_data = get_bounce_data(req); + nccl_ofi_freelist_t *bounce_buff_fl = rail->bounce_buff_fl; + nccl_net_ofi_rdma_bounce_fl_item_t *bounce_fl_item = (nccl_net_ofi_rdma_bounce_fl_item_t *)nccl_ofi_freelist_entry_alloc( - ep->bounce_buff_fl); + bounce_buff_fl); if (!bounce_fl_item) { NCCL_OFI_WARN("Failed to allocate bounce_fl_item"); req->free(req, false); @@ -2098,7 +2100,7 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t * assert(NCCL_OFI_IS_PTR_ALIGNED(&bounce_fl_item->bounce_msg, BOUNCE_BUFFER_ALIGNMENT)); bounce_data->bounce_fl_item = bounce_fl_item; - bounce_data->buff_len = ep->bounce_buff_size; + bounce_data->buff_len = rail->bounce_buff_size; bounce_data->rail = rail; bounce_data->ep = ep; return req; @@ -5035,8 +5037,7 @@ static int post_bounce_buffer(nccl_net_ofi_rdma_req_t *req, /* Reset memcheck guards of bounce buffer freelist entry to * accessible but undefined to cover cases where the buffer * gets re-posted */ - nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep; - nccl_ofi_freelist_entry_set_undefined(ep->bounce_buff_fl, + nccl_ofi_freelist_entry_set_undefined(ep_rail->bounce_buff_fl, bounce_fl_item); req->state = NCCL_OFI_RDMA_REQ_CREATED; @@ -5650,7 +5651,8 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) int ret = 0; ep->bounce_buff_reqs_fl = NULL; - ep->bounce_buff_fl = NULL; + ep->bounce_buff_ctrl_fl = NULL; + ep->bounce_buff_data_fl = NULL; ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t), ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, @@ -5660,12 +5662,26 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) return ret; } - ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + ep->bounce_buff_size, + size_t ctrl_bounce_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX( + sizeof(nccl_net_ofi_rdma_ctrl_msg_t), + sizeof(nccl_ofi_rdma_connection_info_t)), + sizeof(nccl_net_ofi_rdma_close_msg_t)); + ep->control_rail.bounce_buff_size = ctrl_bounce_buff_size; + ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + ctrl_bounce_buff_size, + ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, + freelist_regmr_host_fn, freelist_deregmr_host_fn, + ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_ctrl_fl); + if (ret != 0) { + NCCL_OFI_WARN("Failed to init bounce_buff_ctrl_fl"); + goto error; + } + + ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + eager_max_size, ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, freelist_regmr_host_fn, freelist_deregmr_host_fn, - ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_fl); + ep, false, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_data_fl); if (ret != 0) { - NCCL_OFI_WARN("Failed to init bounce_buff_fl"); + NCCL_OFI_WARN("Failed to init bounce_buff_data_fl"); goto error; } @@ -5686,6 +5702,7 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) if (ret != 0) { goto error; } + ep->control_rail.bounce_buff_fl = ep->bounce_buff_ctrl_fl; for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) { nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id); @@ -5705,6 +5722,8 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) } goto error; } + rail->bounce_buff_fl = ep->bounce_buff_data_fl; + rail->bounce_buff_size = eager_max_size; } return ret; @@ -5714,9 +5733,13 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl); ep->bounce_buff_reqs_fl = NULL; } - if (ep->bounce_buff_fl != NULL) { - nccl_ofi_freelist_fini(ep->bounce_buff_fl); - ep->bounce_buff_fl = NULL; + if (ep->bounce_buff_data_fl != NULL) { + nccl_ofi_freelist_fini(ep->bounce_buff_data_fl); + ep->bounce_buff_data_fl = NULL; + } + if (ep->bounce_buff_ctrl_fl != NULL) { + nccl_ofi_freelist_fini(ep->bounce_buff_ctrl_fl); + ep->bounce_buff_ctrl_fl = NULL; } return ret; @@ -5734,9 +5757,15 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) static inline int fini_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) { int ret = 0; - ret = nccl_ofi_freelist_fini(ep->bounce_buff_fl); + ret = nccl_ofi_freelist_fini(ep->bounce_buff_ctrl_fl); + if (ret != 0) { + NCCL_OFI_WARN("Failed to fini bounce_buff_ctrl_fl"); + return ret; + } + + ret = nccl_ofi_freelist_fini(ep->bounce_buff_data_fl); if (ret != 0) { - NCCL_OFI_WARN("Failed to fini bounce_buff_fl"); + NCCL_OFI_WARN("Failed to fini bounce_buff_data_fl"); return ret; } @@ -6553,9 +6582,6 @@ static int create_ep(nccl_net_ofi_rdma_device_t *device, /* Initialize number of rail */ ep->num_rails = num_rails; - ep->bounce_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size), - sizeof(nccl_ofi_rdma_connection_info_t)); - ep->rails = (nccl_net_ofi_ep_rail_t *)calloc(ep->num_rails, sizeof(nccl_net_ofi_ep_rail_t)); if (!ep->rails) {