Skip to content

Commit

Permalink
rdma: refactor cleanup in init_bounce_buffers
Browse files Browse the repository at this point in the history
Make cleanup more modular and check that mutex initialization succeeds

Signed-off-by: Eric Raut <eraut@amazon.com>
  • Loading branch information
rauteric committed Sep 19, 2024
1 parent af6eb8f commit 55311f6
Showing 1 changed file with 29 additions and 4 deletions.
33 changes: 29 additions & 4 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -5649,6 +5649,9 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;

ep->bounce_buff_reqs_fl = NULL;
ep->bounce_buff_fl = NULL;

ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t),
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
&ep->bounce_buff_reqs_fl);
Expand All @@ -5663,9 +5666,7 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_fl");
if (nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl))
NCCL_OFI_WARN("Also failed to freelist_fini bounce_buff_reqs_fl");
return ret;
goto error;
}

/*
Expand All @@ -5682,6 +5683,9 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
);
ep->control_rail.num_bounce_posted = 0;
ret = nccl_net_ofi_mutex_init(&ep->control_rail.bounce_mutex, NULL);
if (ret != 0) {
goto error;
}

for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
Expand All @@ -5691,7 +5695,28 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
rail->max_bounce_posted = NCCL_OFI_DIV_CEIL(
ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails
);
nccl_net_ofi_mutex_init(&rail->bounce_mutex, NULL);
rail->num_bounce_posted = 0;
ret = nccl_net_ofi_mutex_init(&rail->bounce_mutex, NULL);
if (ret != 0) {
/* Cleanup previous mutexes */
nccl_net_ofi_mutex_destroy(&ep->control_rail.bounce_mutex);
for (int i = 0; i < rail_id; ++i) {
nccl_net_ofi_mutex_destroy(&(get_rail(ep, i)->bounce_mutex));
}
goto error;
}
}

return ret;

error:
if (ep->bounce_buff_reqs_fl != NULL) {
nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl);
ep->bounce_buff_reqs_fl = NULL;
}
if (ep->bounce_buff_fl != NULL) {
nccl_ofi_freelist_fini(ep->bounce_buff_fl);
ep->bounce_buff_fl = NULL;
}

return ret;
Expand Down

0 comments on commit 55311f6

Please sign in to comment.