diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h index d7b468352..a57e14c8d 100644 --- a/include/nccl_ofi_rdma.h +++ b/include/nccl_ofi_rdma.h @@ -517,6 +517,9 @@ struct nccl_net_ofi_ep_rail { /* Completion Queue handle */ struct fid_cq *cq; + /* Access domain handles */ + struct fid_domain *domain; + /* * Bounce buffer management */ diff --git a/include/nccl_ofi_sendrecv.h b/include/nccl_ofi_sendrecv.h index a2cd2403d..d2366f15b 100644 --- a/include/nccl_ofi_sendrecv.h +++ b/include/nccl_ofi_sendrecv.h @@ -110,6 +110,9 @@ typedef struct nccl_net_ofi_sendrecv_ep { /* Endpoint handle to communicate to */ struct fid_ep *ofi_ep; + /* Access Domain handle */ + struct fid_domain *domain; + /* Address vector handle */ struct fid_av *av; diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 33f84bc98..ab4127c6c 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -249,6 +249,15 @@ static inline nccl_net_ofi_ep_rail_t *get_rail(nccl_net_ofi_rdma_ep_t *ep, return &ep->rails[rail_id]; } +/* + * @brief return the domain for the endpoint and rail. 
+ */ + +static inline struct fid_domain *get_domain_from_endpoint(nccl_net_ofi_rdma_ep_t *ep, int rail_id) +{ + return get_rail(ep, rail_id)->domain; +} + /* * @brief Unlink temporary NCCL topology file written by `write_topo_file()` * @@ -2428,6 +2437,7 @@ static int reg_mr_ep(nccl_net_ofi_rdma_ep_t *ep, void *data, struct fi_mr_attr mr_attr = {0}; struct iovec iov = {0}; nccl_net_ofi_rdma_mr_handle_t *ret_handle = NULL; + struct fid_domain *domain; *mhandle = NULL; assert(ep); @@ -2461,10 +2471,10 @@ static int reg_mr_ep(nccl_net_ofi_rdma_ep_t *ep, void *data, /* Register memory on each rail */ ret_handle->num_rails = num_rails; for (int rail_id = 0; rail_id != num_rails; ++rail_id) { - nccl_net_ofi_rdma_device_rail_t *dev_rail = get_device_rail(device, rail_id); nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id); + domain = get_domain_from_endpoint(ep, rail_id); - ret = register_rail_mr_buffer(dev_rail->domain, rail->ofi_ep, + ret = register_rail_mr_buffer(domain, rail->ofi_ep, dev_id, type, &mr_attr, &ret_handle->mr[rail_id]); if (OFI_UNLIKELY(ret != 0)) { @@ -5312,7 +5322,19 @@ static int ep_rail_init(nccl_net_ofi_rdma_ep_t *ep, { int ret = 0; - ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, dev_rail->domain, &ep_rail->ofi_ep, + if (domain_per_thread == 1) { + ret = fi_domain(dev_rail->fabric, dev_rail->info, + &ep_rail->domain, NULL); + if (OFI_UNLIKELY(ret != 0)) { + NCCL_OFI_WARN("Couldn't open a fabric access domain. 
RC: %d, ERROR: %s", + ret, fi_strerror(-ret)); + return ret; + } + } else { + ep_rail->domain = dev_rail->domain; + } + + ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep, &ep_rail->av, &ep_rail->cq); if (ret != 0) { return ret; @@ -5582,13 +5604,21 @@ static int init_device_rail_ofi_resources(nccl_net_ofi_rdma_device_rail_t *rail_ goto error; } - /* Create domain */ - ret = fi_domain(rail_dev->fabric, rail_dev->info, - &rail_dev->domain, NULL); - if (OFI_UNLIKELY(ret != 0)) { - NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s", - ret, fi_strerror(-ret)); - goto error; + /* + * In the domain-per-thread case, create the domain in the endpoint structure. In the + * domain-per-process case, keep it in the device structure. This is because, on some + * platforms, libfabric locks when accessing the domain, so retaining separate domains + * per thread and per endpoint reduces contention for that lock. + */ + if (domain_per_thread == 0) { + /* Create domain */ + ret = fi_domain(rail_dev->fabric, rail_dev->info, + &rail_dev->domain, NULL); + if (OFI_UNLIKELY(ret != 0)) { + NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s", + ret, fi_strerror(-ret)); + goto error; + } } return ret; diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c index a6d7e4251..3f4e2649f 100644 --- a/src/nccl_ofi_sendrecv.c +++ b/src/nccl_ofi_sendrecv.c @@ -469,6 +469,19 @@ static int post_recv_conn(nccl_net_ofi_sendrecv_listen_comm_t *l_comm, return rc; } +/* + * @brief Returns the domain, dependent on the platform. + * + * @return fid_domain for the device (P-series) or endpoint (Neuron). 
+ * + */ + +static inline struct fid_domain* get_domain_from_endpoint(nccl_net_ofi_sendrecv_ep_t *ep) +{ + return ep->domain; +} + + /* * @brief Registers memory region (both HOST and CUDA) * @@ -687,7 +700,9 @@ static int reg_mr_base_comm(nccl_net_ofi_comm_t *base_comm, void *data, int dev_id = device->base.dev_id; nccl_ofi_idpool_t *key_pool = &device->key_pool; - return reg_mr_base(device->domain, ep->ofi_ep, key_pool, + struct fid_domain *domain; + domain = get_domain_from_endpoint(ep); + return reg_mr_base(domain, ep->ofi_ep, key_pool, dev_id, data, size, type, mhandle); } @@ -1169,6 +1184,7 @@ static nccl_net_ofi_sendrecv_recv_comm_t *prepare_recv_comm(nccl_net_ofi_sendrec { int ret = 0; fi_addr_t remote_ep; + struct fid_domain *domain; nccl_net_ofi_sendrecv_recv_comm_t *r_comm = NULL; size_t req_size = sizeof(nccl_net_ofi_sendrecv_req_t); nccl_ofi_idpool_t *key_pool = &device->key_pool; @@ -1216,13 +1232,15 @@ static nccl_net_ofi_sendrecv_recv_comm_t *prepare_recv_comm(nccl_net_ofi_sendrec return NULL; } + domain = get_domain_from_endpoint(ep); + /* * Setup flush resources if using GPUDirect RDMA unless user disables * flush operations */ if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) { r_comm->flush_buff.size = NCCL_OFI_FLUSH_SIZE; - ret = alloc_and_reg_flush_buff(device->domain, ep->ofi_ep, key_pool, + ret = alloc_and_reg_flush_buff(domain, ep->ofi_ep, key_pool, &r_comm->flush_buff, dev_id); if (OFI_UNLIKELY(ret != 0)) { free(r_comm); @@ -2084,6 +2102,18 @@ static int get_ep(nccl_net_ofi_device_t *base_dev, "Unable to allocate sendrecv endpoint"); goto unlock; } + if (domain_per_thread == 1) { + ret = fi_domain(device->fabric, device->info, + &ep->domain, NULL); + if (OFI_UNLIKELY(ret != 0)) { + NCCL_OFI_WARN("Couldn't open a fabric access domain. 
RC: %d, ERROR: %s", + ret, fi_strerror(-ret)); + free(ep); + goto unlock; + } + } else { + ep->domain = device->domain; + } /* Initialize base endpoint */ ep->base.device = &device->base; @@ -2107,7 +2137,9 @@ static int get_ep(nccl_net_ofi_device_t *base_dev, } if (ep->ref_cnt == 0) { - ret = nccl_ofi_ofiutils_init_connection(selected_api_version, device->info, device->domain, &ep->ofi_ep, + struct fid_domain *domain; + domain = get_domain_from_endpoint(ep); + ret = nccl_ofi_ofiutils_init_connection(selected_api_version, device->info, domain, &ep->ofi_ep, &ep->av, &ep->cq); if (ret != 0) { goto unlock; @@ -2160,13 +2192,21 @@ static int device_prepare_for_connection(nccl_net_ofi_sendrecv_device_t *device) goto error; } - /* Create domain */ - ret = fi_domain(device->fabric, device->info, - &device->domain, NULL); - if (OFI_UNLIKELY(ret != 0)) { - NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s", - ret, fi_strerror(-ret)); - goto error; + /* + * In the domain-per-thread case, create the domain in the endpoint structure. In the + * domain-per-process case, keep it in the device structure. This is because, on some + * platforms, libfabric locks when accessing the domain, so retaining separate domains + * per thread and per endpoint reduces contention for that lock. + */ + if (domain_per_thread == 0) { + /* Create domain */ + ret = fi_domain(device->fabric, device->info, + &device->domain, NULL); + if (OFI_UNLIKELY(ret != 0)) { + NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s", + ret, fi_strerror(-ret)); + goto error; + } } return ret; diff --git a/src/platform-aws.c b/src/platform-aws.c index 3999d3049..fd056d3ea 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -543,6 +543,7 @@ int platform_init(const char **provider_filter) if (domain_per_thread == -1) { domain_per_thread = platform_data->domain_per_thread; } + NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Creating one domain per %s", domain_per_thread ? 
"thread" : "process");

 exit:
 	return ret;