Skip to content

Commit

Permalink
rdma: add separate bounce buffer for data (eager messages)
Browse files Browse the repository at this point in the history
Separate the bounce buffer freelists into a smaller-sized freelist for
control messages and a larger-sized freelist for data (eager) messages.

Signed-off-by: Eric Raut <eraut@amazon.com>
  • Loading branch information
rauteric committed Sep 19, 2024
1 parent 55311f6 commit 19af9ec
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 21 deletions.
12 changes: 8 additions & 4 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,10 @@ struct nccl_net_ofi_ep_rail {
size_t max_bounce_posted;
/* Mutex for bounce buffer operations */
pthread_mutex_t bounce_mutex;
/* Back-pointer to associated buffer freelist */
nccl_ofi_freelist_t *bounce_buff_fl;
/* Size of bounce buffers */
size_t bounce_buff_size;
};

/*
Expand Down Expand Up @@ -730,12 +734,12 @@ struct nccl_net_ofi_rdma_ep {
/* Pending requests queue */
nccl_ofi_deque_t *pending_reqs_queue;

/* Free list of bounce buffers */
nccl_ofi_freelist_t *bounce_buff_fl;
/* Free list of bounce buffers (ctrl rail) */
nccl_ofi_freelist_t *bounce_buff_ctrl_fl;
/* Free list of bounce buffers (data rails) */
nccl_ofi_freelist_t *bounce_buff_data_fl;
/* Free list of bounce buffer requests */
nccl_ofi_freelist_t *bounce_buff_reqs_fl;
/* Size of bounce buffers */
size_t bounce_buff_size;

/* True if this ep is stored in the thread-local store */
bool thread_local_ep;
Expand Down
60 changes: 43 additions & 17 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -2069,7 +2069,7 @@ static inline int free_bounce_req(nccl_net_ofi_rdma_req_t *req,
nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep;
/* Free buffer */
if (bounce_data->bounce_fl_item) {
nccl_ofi_freelist_entry_free(ep->bounce_buff_fl, bounce_data->bounce_fl_item);
nccl_ofi_freelist_entry_free(bounce_data->rail->bounce_buff_fl, bounce_data->bounce_fl_item);
}
return free_base_req(NULL, ep->bounce_buff_reqs_fl, req, false);
}
Expand All @@ -2087,9 +2087,11 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *

rdma_req_bounce_data_t *bounce_data = get_bounce_data(req);

nccl_ofi_freelist_t *bounce_buff_fl = rail->bounce_buff_fl;

nccl_net_ofi_rdma_bounce_fl_item_t *bounce_fl_item =
(nccl_net_ofi_rdma_bounce_fl_item_t *)nccl_ofi_freelist_entry_alloc(
ep->bounce_buff_fl);
bounce_buff_fl);
if (!bounce_fl_item) {
NCCL_OFI_WARN("Failed to allocate bounce_fl_item");
req->free(req, false);
Expand All @@ -2098,7 +2100,7 @@ static inline nccl_net_ofi_rdma_req_t *alloc_bounce_req(nccl_net_ofi_rdma_ep_t *
assert(NCCL_OFI_IS_PTR_ALIGNED(&bounce_fl_item->bounce_msg, BOUNCE_BUFFER_ALIGNMENT));

bounce_data->bounce_fl_item = bounce_fl_item;
bounce_data->buff_len = ep->bounce_buff_size;
bounce_data->buff_len = rail->bounce_buff_size;
bounce_data->rail = rail;
bounce_data->ep = ep;
return req;
Expand Down Expand Up @@ -5035,8 +5037,7 @@ static int post_bounce_buffer(nccl_net_ofi_rdma_req_t *req,
/* Reset memcheck guards of bounce buffer freelist entry to
* accessible but undefined to cover cases where the buffer
* gets re-posted */
nccl_net_ofi_rdma_ep_t *ep = bounce_data->ep;
nccl_ofi_freelist_entry_set_undefined(ep->bounce_buff_fl,
nccl_ofi_freelist_entry_set_undefined(ep_rail->bounce_buff_fl,
bounce_fl_item);

req->state = NCCL_OFI_RDMA_REQ_CREATED;
Expand Down Expand Up @@ -5650,7 +5651,8 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
int ret = 0;

ep->bounce_buff_reqs_fl = NULL;
ep->bounce_buff_fl = NULL;
ep->bounce_buff_ctrl_fl = NULL;
ep->bounce_buff_data_fl = NULL;

ret = nccl_ofi_freelist_init(sizeof(nccl_net_ofi_rdma_req_t),
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
Expand All @@ -5660,12 +5662,26 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
return ret;
}

ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + ep->bounce_buff_size,
size_t ctrl_bounce_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(
sizeof(nccl_net_ofi_rdma_ctrl_msg_t),
sizeof(nccl_ofi_rdma_connection_info_t)),
sizeof(nccl_net_ofi_rdma_close_msg_t));
ep->control_rail.bounce_buff_size = ctrl_bounce_buff_size;
ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + ctrl_bounce_buff_size,
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_ctrl_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_ctrl_fl");
goto error;
}

ret = nccl_ofi_freelist_init_mr(sizeof(nccl_net_ofi_rdma_bounce_fl_item_t) + eager_max_size,
ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0,
freelist_regmr_host_fn, freelist_deregmr_host_fn,
ep, 0, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_fl);
ep, false, BOUNCE_BUFFER_ALIGNMENT, &ep->bounce_buff_data_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to init bounce_buff_fl");
NCCL_OFI_WARN("Failed to init bounce_buff_data_fl");
goto error;
}

Expand All @@ -5686,6 +5702,7 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
if (ret != 0) {
goto error;
}
ep->control_rail.bounce_buff_fl = ep->bounce_buff_ctrl_fl;

for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) {
nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
Expand All @@ -5705,6 +5722,8 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
}
goto error;
}
rail->bounce_buff_fl = ep->bounce_buff_data_fl;
rail->bounce_buff_size = eager_max_size;
}

return ret;
Expand All @@ -5714,9 +5733,13 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
nccl_ofi_freelist_fini(ep->bounce_buff_reqs_fl);
ep->bounce_buff_reqs_fl = NULL;
}
if (ep->bounce_buff_fl != NULL) {
nccl_ofi_freelist_fini(ep->bounce_buff_fl);
ep->bounce_buff_fl = NULL;
if (ep->bounce_buff_data_fl != NULL) {
nccl_ofi_freelist_fini(ep->bounce_buff_data_fl);
ep->bounce_buff_data_fl = NULL;
}
if (ep->bounce_buff_ctrl_fl != NULL) {
nccl_ofi_freelist_fini(ep->bounce_buff_ctrl_fl);
ep->bounce_buff_ctrl_fl = NULL;
}

return ret;
Expand All @@ -5734,9 +5757,15 @@ static inline int init_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
static inline int fini_bounce_buffers(nccl_net_ofi_rdma_ep_t *ep)
{
int ret = 0;
ret = nccl_ofi_freelist_fini(ep->bounce_buff_fl);
ret = nccl_ofi_freelist_fini(ep->bounce_buff_ctrl_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini bounce_buff_ctrl_fl");
return ret;
}

ret = nccl_ofi_freelist_fini(ep->bounce_buff_data_fl);
if (ret != 0) {
NCCL_OFI_WARN("Failed to fini bounce_buff_fl");
NCCL_OFI_WARN("Failed to fini bounce_buff_data_fl");
return ret;
}

Expand Down Expand Up @@ -6553,9 +6582,6 @@ static int create_ep(nccl_net_ofi_rdma_device_t *device,
/* Initialize number of rails */
ep->num_rails = num_rails;

ep->bounce_buff_size = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size),
sizeof(nccl_ofi_rdma_connection_info_t));

ep->rails = (nccl_net_ofi_ep_rail_t *)calloc(ep->num_rails,
sizeof(nccl_net_ofi_ep_rail_t));
if (!ep->rails) {
Expand Down

0 comments on commit 19af9ec

Please sign in to comment.