Skip to content

Commit

Permalink
Separate the domain, and abstract access to the domain.
Browse files Browse the repository at this point in the history
Create two paths for creating the domain: per endpoint or per device.
This is because, on some platforms, we lock the domain when accessing the
endpoint, so retaining separate domains per thread and per endpoint reduces contention.  This
depends on the threading model used with libfabric.

Create an accessor method, get_domain_from_endpoint(), that abstracts access to the domain
by protocol and architecture.

Signed-off-by: Ryan Hankins <rqh@amazon.com>
  • Loading branch information
ryanamazon authored and rajachan committed Apr 12, 2024
1 parent 8f0a4b0 commit b62f899
Show file tree
Hide file tree
Showing 5 changed files with 97 additions and 20 deletions.
3 changes: 3 additions & 0 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,9 @@ struct nccl_net_ofi_ep_rail {
/* Completion Queue handle */
struct fid_cq *cq;

/* Access domain handles */
struct fid_domain *domain;

/*
* Bounce buffer management
*/
Expand Down
3 changes: 3 additions & 0 deletions include/nccl_ofi_sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ typedef struct nccl_net_ofi_sendrecv_ep {
/* Endpoint handle to communicate to */
struct fid_ep *ofi_ep;

/* Access Domain handle */
struct fid_domain *domain;

/* Address vector handle */
struct fid_av *av;

Expand Down
50 changes: 40 additions & 10 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,15 @@ static inline nccl_net_ofi_ep_rail_t *get_rail(nccl_net_ofi_rdma_ep_t *ep,
return &ep->rails[rail_id];
}

/*
 * @brief	Look up the libfabric access domain used by one rail of an
 *		RDMA endpoint.
 *
 * @param	ep
 *		RDMA endpoint whose rail is queried
 * @param	rail_id
 *		Index of the rail; forwarded unchanged to get_rail()
 *
 * @return	The domain handle stored in the selected endpoint rail
 */
static inline struct fid_domain *get_domain_from_endpoint(nccl_net_ofi_rdma_ep_t *ep, int rail_id)
{
	nccl_net_ofi_ep_rail_t *ep_rail = get_rail(ep, rail_id);

	return ep_rail->domain;
}

/*
* @brief Unlink temporary NCCL topology file written by `write_topo_file()`
*
Expand Down Expand Up @@ -2428,6 +2437,7 @@ static int reg_mr_ep(nccl_net_ofi_rdma_ep_t *ep, void *data,
struct fi_mr_attr mr_attr = {0};
struct iovec iov = {0};
nccl_net_ofi_rdma_mr_handle_t *ret_handle = NULL;
struct fid_domain *domain;
*mhandle = NULL;

assert(ep);
Expand Down Expand Up @@ -2461,10 +2471,10 @@ static int reg_mr_ep(nccl_net_ofi_rdma_ep_t *ep, void *data,
/* Register memory on each rail */
ret_handle->num_rails = num_rails;
for (int rail_id = 0; rail_id != num_rails; ++rail_id) {
nccl_net_ofi_rdma_device_rail_t *dev_rail = get_device_rail(device, rail_id);
nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
domain = get_domain_from_endpoint(ep, rail_id);

ret = register_rail_mr_buffer(dev_rail->domain, rail->ofi_ep,
ret = register_rail_mr_buffer(domain, rail->ofi_ep,
dev_id, type, &mr_attr,
&ret_handle->mr[rail_id]);
if (OFI_UNLIKELY(ret != 0)) {
Expand Down Expand Up @@ -5312,7 +5322,19 @@ static int ep_rail_init(nccl_net_ofi_rdma_ep_t *ep,
{
int ret = 0;

ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, dev_rail->domain, &ep_rail->ofi_ep,
if (domain_per_thread == 1) {
ret = fi_domain(dev_rail->fabric, dev_rail->info,
&ep_rail->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
return ret;
}
} else {
ep_rail->domain = dev_rail->domain;
}

ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18), dev_rail->info, ep_rail->domain, &ep_rail->ofi_ep,
&ep_rail->av, &ep_rail->cq);
if (ret != 0) {
return ret;
Expand Down Expand Up @@ -5582,13 +5604,21 @@ static int init_device_rail_ofi_resources(nccl_net_ofi_rdma_device_rail_t *rail_
goto error;
}

/* Create domain */
ret = fi_domain(rail_dev->fabric, rail_dev->info,
&rail_dev->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
goto error;
/*
* In the domain-per-thread case, create the domain in the endpoint structure. In the
* domain-per-process case, keep it in the device structure. This is because, on some
* platforms, libfabric locks when accessing the domain, so retaining separate domains
* per thread and per endpoint reduces contention for that lock.
*/
if (domain_per_thread == 0) {
/* Create domain */
ret = fi_domain(rail_dev->fabric, rail_dev->info,
&rail_dev->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
goto error;
}
}

return ret;
Expand Down
60 changes: 50 additions & 10 deletions src/nccl_ofi_sendrecv.c
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,19 @@ static int post_recv_conn(nccl_net_ofi_sendrecv_listen_comm_t *l_comm,
return rc;
}

/*
 * @brief	Fetch the libfabric access domain associated with a sendrecv
 *		endpoint.
 *
 * Callers use this accessor instead of reading a domain from the device, so
 * they do not need to know whether the endpoint owns a private domain or
 * shares the device-wide one.
 *
 * @param	ep
 *		Sendrecv endpoint to query
 *
 * @return	The domain handle stored in the endpoint
 */
static inline struct fid_domain* get_domain_from_endpoint(nccl_net_ofi_sendrecv_ep_t *ep)
{
	struct fid_domain *ep_domain = ep->domain;

	return ep_domain;
}


/*
* @brief Registers memory region (both HOST and CUDA)
*
Expand Down Expand Up @@ -687,7 +700,9 @@ static int reg_mr_base_comm(nccl_net_ofi_comm_t *base_comm, void *data,
int dev_id = device->base.dev_id;

nccl_ofi_idpool_t *key_pool = &device->key_pool;
return reg_mr_base(device->domain, ep->ofi_ep, key_pool,
struct fid_domain *domain;
domain = get_domain_from_endpoint(ep);
return reg_mr_base(domain, ep->ofi_ep, key_pool,
dev_id, data, size, type, mhandle);
}

Expand Down Expand Up @@ -1169,6 +1184,7 @@ static nccl_net_ofi_sendrecv_recv_comm_t *prepare_recv_comm(nccl_net_ofi_sendrec
{
int ret = 0;
fi_addr_t remote_ep;
struct fid_domain *domain;
nccl_net_ofi_sendrecv_recv_comm_t *r_comm = NULL;
size_t req_size = sizeof(nccl_net_ofi_sendrecv_req_t);
nccl_ofi_idpool_t *key_pool = &device->key_pool;
Expand Down Expand Up @@ -1216,13 +1232,15 @@ static nccl_net_ofi_sendrecv_recv_comm_t *prepare_recv_comm(nccl_net_ofi_sendrec
return NULL;
}

domain = get_domain_from_endpoint(ep);

/*
* Setup flush resources if using GPUDirect RDMA unless user disables
* flush operations
*/
if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) {
r_comm->flush_buff.size = NCCL_OFI_FLUSH_SIZE;
ret = alloc_and_reg_flush_buff(device->domain, ep->ofi_ep, key_pool,
ret = alloc_and_reg_flush_buff(domain, ep->ofi_ep, key_pool,
&r_comm->flush_buff, dev_id);
if (OFI_UNLIKELY(ret != 0)) {
free(r_comm);
Expand Down Expand Up @@ -2084,6 +2102,18 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
"Unable to allocate sendrecv endpoint");
goto unlock;
}
if (domain_per_thread == 1) {
ret = fi_domain(device->fabric, device->info,
&ep->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
free(ep);
goto unlock;
}
} else {
ep->domain = device->domain;
}

/* Initialize base endpoint */
ep->base.device = &device->base;
Expand All @@ -2107,7 +2137,9 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
}

if (ep->ref_cnt == 0) {
ret = nccl_ofi_ofiutils_init_connection(selected_api_version, device->info, device->domain, &ep->ofi_ep,
struct fid_domain *domain;
domain = get_domain_from_endpoint(ep);
ret = nccl_ofi_ofiutils_init_connection(selected_api_version, device->info, domain, &ep->ofi_ep,
&ep->av, &ep->cq);
if (ret != 0) {
goto unlock;
Expand Down Expand Up @@ -2160,13 +2192,21 @@ static int device_prepare_for_connection(nccl_net_ofi_sendrecv_device_t *device)
goto error;
}

/* Create domain */
ret = fi_domain(device->fabric, device->info,
&device->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
goto error;
/*
* In the domain-per-thread case, create the domain in the endpoint structure. In the
* domain-per-process case, keep it in the device structure. This is because, on some
* platforms, libfabric locks when accessing the domain, so retaining separate domains
* per thread and per endpoint reduces contention for that lock.
*/
if (domain_per_thread == 0) {
/* Create domain */
ret = fi_domain(device->fabric, device->info,
&device->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
goto error;
}
}

return ret;
Expand Down
1 change: 1 addition & 0 deletions src/platform-aws.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,7 @@ int platform_init(const char **provider_filter)
if (domain_per_thread == -1) {
domain_per_thread = platform_data->domain_per_thread;
}
/*
 * Report the selected domain model. A non-zero domain_per_thread means
 * each endpoint (and therefore each thread) opens its own domain; zero
 * means a single domain is shared through the device.
 */
NCCL_OFI_INFO(NCCL_INIT | NCCL_NET, "Creating one domain per %s", domain_per_thread ? "thread" : "process");

exit:
return ret;
Expand Down

0 comments on commit b62f899

Please sign in to comment.