diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 063a7bac2..9bf337b94 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -5649,6 +5649,9 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) { int ret = 0; + ep->bounce_buff_reqs_fl = NULL; + ep->bounce_buff_fl = NULL; + ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t), ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, &ep->bounce_buff_reqs_fl); @@ -5663,9 +5666,7 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_fl); if (ret != 0) { NCCL_OFI_WARN("Failed to init bounce_buff_fl"); - if (nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl)) - NCCL_OFI_WARN("Also failed to freelist_fini bounce_buff_reqs_fl"); - return ret; + goto error; } /* @@ -5682,6 +5683,9 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) ); ep->control_rail.num_bounce_posted = 0; ret = nccl_net_ofi_mutex_init(&ep->control_rail.bounce_mutex, NULL); + if (ret != 0) { + goto error; + } for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) { nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id); @@ -5691,7 +5695,28 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep) rail->max_bounce_posted = NCCL_OFI_DIV_CEIL( ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails ); - nccl_net_ofi_mutex_init(&rail->bounce_mutex, NULL); + rail->num_bounce_posted = 0; + ret = nccl_net_ofi_mutex_init(&rail->bounce_mutex, NULL); + if (ret != 0) { + /* Cleanup previous mutexes */ + nccl_net_ofi_mutex_destroy(&ep->control_rail.bounce_mutex); + for (int i = 0; i < rail_id; ++i) { + nccl_net_ofi_mutex_destroy(&(get_rail(ep, i)->bounce_mutex)); + } + goto error; + } + } + + return ret; + +error: + if (ep->bounce_buff_reqs_fl != NULL) { + nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl); + ep->bounce_buff_reqs_fl = NULL; + } + if (ep->bounce_buff_fl != NULL) { + nccl_ofi_freelist_fini(ep->bounce_buff_fl); + ep->bounce_buff_fl = NULL; } return ret;