From c315084140703f11911a8f4af77107761cfae8f1 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 21 Feb 2025 17:30:52 +0000 Subject: [PATCH 1/3] freelist: Handle 0 byte entry_size Parts of the freelist assume entry_size is larger than 0 bytes, but the interface does not check for size and does not specify restrictions either way. Improve the situation by overriding a 0 byte entry size to be 8 bytes (plus any additional size increases caused by the alignment and redzone requirements). This seems better than a bunch of 0 byte checks through the rest of the code. Signed-off-by: Brian Barrett --- src/nccl_ofi_freelist.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/nccl_ofi_freelist.c b/src/nccl_ofi_freelist.c index 10ab1c2e2..3a5c06e4e 100644 --- a/src/nccl_ofi_freelist.c +++ b/src/nccl_ofi_freelist.c @@ -68,6 +68,15 @@ static int freelist_init_internal(size_t entry_size, freelist->memcheck_redzone_size = NCCL_OFI_ROUND_UP(MEMCHECK_REDZONE_SIZE, entry_alignment); + /* The rest of the freelist code doesn't deal well with a 0 byte entry + * so increase to 8 bytes in that case rather than adding a bunch of + * special cases for size == 0 in the rest of the code. This happens + * before the bump-up for entry alignment and redzone checking, which + * may further increase the size. + */ + if (entry_size == 0) { + entry_size = 8; + } freelist->entry_size = NCCL_OFI_ROUND_UP(entry_size, NCCL_OFI_MAX(entry_alignment, NCCL_OFI_MAX(8, MEMCHECK_GRANULARITY))); freelist->entry_size += freelist->memcheck_redzone_size; From d2bf45e3673b2886b1352ee3fc9d85fda41ba850 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 21 Feb 2025 18:38:27 +0000 Subject: [PATCH 2/3] rdma: Allow eager to be entirely disabled Previously, we could only set the eager limit to 0, and even then there was logic to always allow 0 byte messages to be sent eager. This patch makes -1 a valid value for the eager size, which will disable eager messages entirely (including posting buffers). Signed-off-by: Brian Barrett --- include/nccl_ofi_param.h | 2 +- include/nccl_ofi_rdma.h | 15 +++++++- src/nccl_ofi_rdma.c | 81 ++++++++++++++++++++++++---------------- src/platform-aws.c | 8 +++- 4 files changed, 69 insertions(+), 37 deletions(-) diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index 4749d6bd9..00a3eea8e 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -319,7 +319,7 @@ OFI_NCCL_PARAM_INT(net_latency, "NET_LATENCY", -1); * Eager message size limit when using RDMA protocol. Message sizes greater than * this limit will always be sent using RDMA write instead of eagerly. */ -OFI_NCCL_PARAM_UINT(eager_max_size, "EAGER_MAX_SIZE", 8192); +OFI_NCCL_PARAM_INT(eager_max_size, "EAGER_MAX_SIZE", 8192); /* * Decide whether or not mutexes should default to errorcheck mode. diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h index 5d864f833..7e317fb56 100644 --- a/include/nccl_ofi_rdma.h +++ b/include/nccl_ofi_rdma.h @@ -752,8 +752,19 @@ struct nccl_net_ofi_rdma_ep { nccl_ofi_freelist_t *conn_msg_fl; /* Size of ctrl rx buffers */ size_t ctrl_rx_buff_size; - /* Size of eager rx buffers */ - size_t eager_rx_buff_size; + /* Size of eager rx buffers. Will be -1 if eager is entirely + * disabled. */ + ssize_t eager_rx_buff_size; + /* max size of eager messages. This is only separate from + * eager_rx_buff_size because the EFA provider incorrectly throws an + * EINVAL when posting 0 byte rx buffers. To work around that, + * eager_rx_buff_size will either be -1 or positive (but not zero) and + * eager_send_size is the comparison that should be used for deciding + * whether a message is eligible for eager. eager_send_size will never + * be larger than eager_rx_buff_size. Will be -1 if eager is entirely + * disabled. + */ + ssize_t eager_send_size; /* true if the current endpoint is a endpoint_per_communicator receive communicator */ diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 19f75cd2e..36f11cff9 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -93,9 +93,6 @@ /** Global variables **/ -/* Maximum size of an eager message (see OFI_NCCL_EAGER_MAX_SIZE) */ -static size_t eager_max_size = 0; - /* List of comms undergoing deferred cleanup */ static nccl_ofi_deque_t *s_comm_cleanup_list = NULL; static nccl_ofi_deque_t *r_comm_cleanup_list = NULL; @@ -2332,6 +2329,9 @@ static inline int eager_rx_buff_req_free(nccl_net_ofi_rdma_req_t *req, assert(!dec_inflight_reqs); rdma_req_rx_buff_data_t *rx_buff_data = get_rx_buff_data(req); nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep; + + assert(ep->eager_rx_buff_size > 0); + /* Free buffer */ if (rx_buff_data->rx_buff_fl_elem) { nccl_ofi_freelist_entry_free(ep->eager_rx_buff_fl, rx_buff_data->rx_buff_fl_elem); @@ -2345,6 +2345,8 @@ static inline nccl_net_ofi_rdma_req_t *eager_rx_buff_req_alloc(nccl_net_ofi_rdma nccl_net_ofi_rdma_req_t *req = allocate_req(ep->rx_buff_reqs_fl); if (!req) return NULL; + assert(ep->eager_rx_buff_size > 0); + req->comm = NULL; req->type = NCCL_OFI_RDMA_EAGER_RX_BUFF; req->dev_id = rdma_endpoint_get_device(ep)->base.dev_id; @@ -5567,7 +5569,9 @@ static int post_rx_buffer(nccl_net_ofi_rdma_req_t *req, /* Reset memcheck guards of rx buffer freelist entry to * accessible but undefined to cover cases where the buffer * gets re-posted */ - nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep; + nccl_net_ofi_rdma_ep_t *ep = rx_buff_data->ep; + assert(req->type != NCCL_OFI_RDMA_EAGER_RX_BUFF || ep->eager_rx_buff_size > 0); + nccl_ofi_freelist_t *fl = (req->type == NCCL_OFI_RDMA_EAGER_RX_BUFF ? ep->eager_rx_buff_fl : ep->ctrl_rx_buff_fl); nccl_ofi_freelist_entry_set_undefined(fl, rx_buff_fl_elem->ptr); @@ -6010,7 +6014,7 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t /* Determine if this should be sent eagerly. */ eager = false; - if ((!have_ctrl && (size_t)size <= eager_max_size && s_comm->num_inflight_writes == 0) || (size == 0)) { + if (!have_ctrl && (ssize_t)size <= ep->eager_send_size && s_comm->num_inflight_writes == 0) { eager = true; } @@ -6258,19 +6262,19 @@ static inline int init_rx_buffers(nccl_net_ofi_rdma_ep_t *ep) return ret; } - /* Set the eager freelist buffer size to at least the maximum of EAGER_RX_BUFFER_ALIGNMENT - * and eager_rx_buff_size. This ensures the freelist maintains a minimum size equal to - * EAGER_RX_BUFFER_ALIGNMENT even when OFI_NCCL_EAGER_MAX_SIZE is set to 0. - */ - ret = nccl_ofi_freelist_init_mr(NCCL_OFI_MAX(EAGER_RX_BUFFER_ALIGNMENT, ep->eager_rx_buff_size), - ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, - freelist_regmr_host_fn, freelist_deregmr_host_fn, - ep, EAGER_RX_BUFFER_ALIGNMENT, &ep->eager_rx_buff_fl); - if (ret != 0) { - NCCL_OFI_WARN("Failed to init eager_rx_buff_size"); - nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl); - nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl); - return ret; + if (ep->eager_rx_buff_size > 0) { + ret = nccl_ofi_freelist_init_mr(ep->eager_rx_buff_size, + ofi_nccl_rdma_min_posted_bounce_buffers(), 16, 0, + freelist_regmr_host_fn, freelist_deregmr_host_fn, + ep, EAGER_RX_BUFFER_ALIGNMENT, &ep->eager_rx_buff_fl); + if (ret != 0) { + NCCL_OFI_WARN("Failed to init eager_rx_buff_size"); + nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl); + nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl); + return ret; + } + } else { + ep->eager_rx_buff_fl = NULL; } ret = nccl_ofi_freelist_init_mr(sizeof(nccl_ofi_rdma_connection_info_t), @@ -6279,7 +6283,9 @@ static inline int init_rx_buffers(nccl_net_ofi_rdma_ep_t *ep) ep, sizeof(void *), &ep->conn_msg_fl); if (ret != 0) { NCCL_OFI_WARN("Failed to init conn_msg freelist"); - nccl_ofi_freelist_fini(ep->eager_rx_buff_fl); + if (ep->eager_rx_buff_fl != NULL) { + nccl_ofi_freelist_fini(ep->eager_rx_buff_fl); + } nccl_ofi_freelist_fini(ep->ctrl_rx_buff_fl); nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl); return ret; @@ -6306,12 +6312,17 @@ static inline int init_rx_buffers(nccl_net_ofi_rdma_ep_t *ep) for (int rail_id = 0; rail_id < ep->num_rails; ++rail_id) { rail = rdma_endpoint_get_rail(ep, rail_id); - rail->min_rx_buff_posted = NCCL_OFI_DIV_CEIL( - ofi_nccl_rdma_min_posted_bounce_buffers(), ep->num_rails - ); - rail->max_rx_buff_posted = NCCL_OFI_DIV_CEIL( - ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails - ); + if (ep->eager_rx_buff_size >= 0) { + rail->min_rx_buff_posted = NCCL_OFI_DIV_CEIL( + ofi_nccl_rdma_min_posted_bounce_buffers(), ep->num_rails + ); + rail->max_rx_buff_posted = NCCL_OFI_DIV_CEIL( + ofi_nccl_rdma_max_posted_bounce_buffers(), ep->num_rails + ); + } else { + rail->min_rx_buff_posted = 0; + rail->max_rx_buff_posted = 0; + } rail->num_rx_buff_posted = 0; nccl_net_ofi_mutex_init(&rail->rx_buff_mutex, NULL); rail->rx_buff_req_alloc = eager_rx_buff_req_alloc; @@ -6340,10 +6351,12 @@ static inline int fini_rx_buffers(nccl_net_ofi_rdma_ep_t *ep) return ret; } - ret = nccl_ofi_freelist_fini(ep->eager_rx_buff_fl); - if (ret != 0) { - NCCL_OFI_WARN("Failed to fini eager_rx_buff_fl"); - return ret; + if (ep->eager_rx_buff_fl != NULL) { + ret = nccl_ofi_freelist_fini(ep->eager_rx_buff_fl); + if (ret != 0) { + NCCL_OFI_WARN("Failed to fini eager_rx_buff_fl"); + return ret; + } } ret = nccl_ofi_freelist_fini(ep->rx_buff_reqs_fl); @@ -7337,7 +7350,12 @@ static int nccl_net_ofi_rdma_domain_create_endpoint(nccl_net_ofi_domain_t *base_ NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), NCCL_OFI_MAX(sizeof(nccl_ofi_rdma_connection_info_t), sizeof(nccl_net_ofi_rdma_close_msg_t))); - ep->eager_rx_buff_size = eager_max_size; + ep->eager_send_size = ofi_nccl_eager_max_size(); + /* Work around EFA provider bug around posting 0 byte rx buffers by not + posting 0 byte rx buffers. Note that if eager_send_size is -1 + (disabled), eager_rx_buff_size will also be -1. */ + ep->eager_rx_buff_size = (ep->eager_send_size == 0) ? + EAGER_RX_BUFFER_ALIGNMENT : ep->eager_send_size; ep->is_endpoint_per_communicator_ep = false; @@ -8091,12 +8109,11 @@ int nccl_net_ofi_rdma_init(const char *provider_filter, goto error; } - if (ofi_nccl_eager_max_size() > ofi_nccl_min_stripe_size()) { + if ((ssize_t)ofi_nccl_eager_max_size() > (ssize_t)ofi_nccl_min_stripe_size()) { NCCL_OFI_WARN("Invalid value for EAGER_MAX_SIZE"); ret = ncclInvalidArgument; goto error; } - eager_max_size = (size_t) ofi_nccl_eager_max_size(); /* Create NCCL OFI topology */ topo = nccl_ofi_topo_create(provider_list); diff --git a/src/platform-aws.c b/src/platform-aws.c index dafc4ea2f..3c9c3f43c 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -332,10 +332,14 @@ static int configure_ep_max_msg_size(struct fid_ep *ep) int ret = 0; #if HAVE_DECL_FI_OPT_MAX_MSG_SIZE - size_t eager_max_size = (size_t)ofi_nccl_eager_max_size(); - size_t optval = NCCL_OFI_MAX(NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), eager_max_size), + ssize_t eager_max_size = (ssize_t)ofi_nccl_eager_max_size(); + size_t optval = NCCL_OFI_MAX(sizeof(nccl_net_ofi_rdma_ctrl_msg_t), sizeof(nccl_ofi_rdma_connection_info_t)); + if (eager_max_size > 0) { + optval = NCCL_OFI_MAX(optval, (size_t)eager_max_size); + } + ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MAX_MSG_SIZE, &optval, sizeof(optval)); NCCL_OFI_TRACE(NCCL_INIT, "fi_setopt(FI_OPT_MAX_MSG_SIZE) RC: %d", ret); From b0723225834d6b53b814ec862cf4fe5bd5a0f344 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 21 Feb 2025 18:47:21 +0000 Subject: [PATCH 3/3] rdma: Disable eager by default We don't think we need eager with the performance improvements on P5 and P5en, so set the default to -1 (disabled). Signed-off-by: Brian Barrett --- include/nccl_ofi_param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index 00a3eea8e..023f29629 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -319,7 +319,7 @@ OFI_NCCL_PARAM_INT(net_latency, "NET_LATENCY", -1); * Eager message size limit when using RDMA protocol. Message sizes greater than * this limit will always be sent using RDMA write instead of eagerly. */ -OFI_NCCL_PARAM_INT(eager_max_size, "EAGER_MAX_SIZE", 8192); +OFI_NCCL_PARAM_INT(eager_max_size, "EAGER_MAX_SIZE", -1); /* * Decide whether or not mutexes should default to errorcheck mode.