Skip to content

Commit

Permalink
Separate the domain, and abstract access to the domain.
Browse files Browse the repository at this point in the history
On Neuron, create the domain in the endpoint structure.  On P-series, keep it
in the device structure.  This is because, on Neuron, we lock when accessing the
domain, so retaining separate domains per thread and per endpoint reduces contention for that lock.

Create an accessor method, get_domain(), that abstractly accesses the domain by protocol
and architecture.

Signed-off-by: Ryan Hankins <rqh@amazon.com>
  • Loading branch information
ryanamazon authored and rqh committed Mar 22, 2024
1 parent 2adb8f5 commit 5cd487c
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 7 deletions.
3 changes: 3 additions & 0 deletions include/nccl_ofi_sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ typedef struct nccl_net_ofi_sendrecv_ep {
/* Endpoint handle to communicate to */
struct fid_ep *ofi_ep;

/* Access Domain handle */
struct fid_domain *domain;

/* Address vector handle */
struct fid_av *av;

Expand Down
36 changes: 33 additions & 3 deletions src/nccl_ofi_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,22 @@ static inline nccl_net_ofi_ep_rail_t *get_rail(nccl_net_ofi_rdma_ep_t *ep,
return &ep->rails[rail_id];
}

/*
 * @brief Look up the libfabric access domain backing one rail of an
 *        RDMA endpoint.
 *
 * Domains live on the device rails; this accessor hides that detail
 * from callers that only hold the abstract endpoint.
 *
 * @param base_ep  Endpoint; must actually be a nccl_net_ofi_rdma_ep_t.
 * @param rail     Index of the device rail whose domain is requested.
 * @param domain   Output parameter receiving the rail's fid_domain.
 * @return 0 on success
 */
static int get_domain(nccl_net_ofi_ep_t *base_ep, int rail, struct fid_domain **domain)
{
	nccl_net_ofi_rdma_ep_t *rdma_ep = (nccl_net_ofi_rdma_ep_t *)base_ep;
	nccl_net_ofi_rdma_device_t *rdma_dev =
		(nccl_net_ofi_rdma_device_t *)rdma_ep->base.device;

	*domain = rdma_dev->device_rails[rail].domain;
	return 0;
}

/*
* @brief Unlink temporary NCCL topology file written by `write_topo_file()`
*
Expand Down Expand Up @@ -2527,6 +2543,7 @@ static int reg_mr_ep(nccl_net_ofi_rdma_ep_t *ep, void *data,
struct fi_mr_attr mr_attr = {0};
struct iovec iov = {0};
nccl_net_ofi_rdma_mr_handle_t *ret_handle = NULL;
struct fid_domain *domain;
*mhandle = NULL;

assert(ep);
Expand Down Expand Up @@ -2560,10 +2577,17 @@ static int reg_mr_ep(nccl_net_ofi_rdma_ep_t *ep, void *data,
/* Register memory on each rail */
ret_handle->num_rails = num_rails;
for (int rail_id = 0; rail_id != num_rails; ++rail_id) {
nccl_net_ofi_rdma_device_rail_t *dev_rail = get_device_rail(device, rail_id);
nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
ret = get_domain(&ep->base, rail_id, &domain);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Could not get domain for dev: %d rail: %d", dev_id, rail_id);
dereg_rails(ret_handle);
free(ret_handle);
ret_handle = NULL;
break;
}

ret = register_rail_mr_buffer(dev_rail->domain, rail->ofi_ep,
ret = register_rail_mr_buffer(domain, rail->ofi_ep,
dev_id, type, &mr_attr,
&ret_handle->mr[rail_id]);
if (OFI_UNLIKELY(ret != 0)) {
Expand Down Expand Up @@ -5420,10 +5444,16 @@ static int ep_rail_init(nccl_net_ofi_rdma_ep_t *ep,
nccl_net_ofi_rdma_device_rail_t *dev_rail,
nccl_net_ofi_ep_rail_t *ep_rail)
{
struct fid_domain *domain;
int ret = 0;

ret = get_domain(&ep->base, rail_id, &domain);
if (ret != 0) {
return ret;
}

ret = nccl_ofi_ofiutils_init_connection(FI_VERSION(1, 18),
dev_rail->info, dev_rail->domain,
dev_rail->info, domain,
&ep_rail->ofi_ep, &ep_rail->av, &ep_rail->cq);
if (ret != 0) {
return ret;
Expand Down
65 changes: 61 additions & 4 deletions src/nccl_ofi_sendrecv.c
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,23 @@ static int post_recv_conn(nccl_net_ofi_sendrecv_listen_comm_t *l_comm,
return rc;
}

/*
 * @brief Returns the access domain for a send/recv endpoint.
 *
 * The endpoint's domain field is populated at endpoint creation time:
 * on Neuron it owns a private domain, on P-series it aliases the
 * device-level domain. Either way, callers read it through here.
 *
 * @param base_ep  Endpoint; must actually be a nccl_net_ofi_sendrecv_ep_t.
 * @param domain   Output parameter receiving the endpoint's fid_domain.
 * @return 0 on success
 */
static int get_domain(nccl_net_ofi_ep_t *base_ep, struct fid_domain **domain)
{
	nccl_net_ofi_sendrecv_ep_t *sr_ep = (nccl_net_ofi_sendrecv_ep_t *)base_ep;

	*domain = sr_ep->domain;
	return 0;
}


/*
* @brief Registers memory region (both HOST and CUDA)
*
Expand Down Expand Up @@ -667,6 +684,7 @@ static int reg_mr_base(struct fid_domain *domain, struct fid_ep *ep,
static int reg_mr_base_comm(nccl_net_ofi_comm_t *base_comm, void *data,
size_t size, int type, void **mhandle)
{
int ret;
/* Retrieve and validate endpoint */
nccl_net_ofi_sendrecv_ep_t *ep =
(nccl_net_ofi_sendrecv_ep_t *)base_comm->ep;
Expand All @@ -685,7 +703,13 @@ static int reg_mr_base_comm(nccl_net_ofi_comm_t *base_comm, void *data,
int dev_id = device->base.dev_id;

nccl_ofi_idpool_t *key_pool = &device->key_pool;
return reg_mr_base(device->domain, ep->ofi_ep, key_pool,
struct fid_domain *domain;
ret = get_domain(base_comm->ep, &domain);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Failure retrieving domain ret=%d", ret);
return ret;
}
return reg_mr_base(domain, ep->ofi_ep, key_pool,
dev_id, data, size, type, mhandle);
}

Expand Down Expand Up @@ -1167,6 +1191,7 @@ static nccl_net_ofi_sendrecv_recv_comm_t *prepare_recv_comm(nccl_net_ofi_sendrec
{
int ret = 0;
fi_addr_t remote_ep;
struct fid_domain *domain;
nccl_net_ofi_sendrecv_recv_comm_t *r_comm = NULL;
size_t req_size = sizeof(nccl_net_ofi_sendrecv_req_t);
nccl_ofi_idpool_t *key_pool = &device->key_pool;
Expand Down Expand Up @@ -1214,13 +1239,19 @@ static nccl_net_ofi_sendrecv_recv_comm_t *prepare_recv_comm(nccl_net_ofi_sendrec
return NULL;
}

ret = get_domain(&ep->base, &domain);
if (OFI_UNLIKELY(ret != 0)) {
free(r_comm);
return NULL;
}

/*
* Setup flush resources if using GPUDirect RDMA unless user disables
* flush operations
*/
if (!ofi_nccl_gdr_flush_disable() && support_gdr == GDR_SUPPORTED && !cuda_flush) {
r_comm->flush_buff.size = NCCL_OFI_FLUSH_SIZE;
ret = alloc_and_reg_flush_buff(device->domain, ep->ofi_ep, key_pool,
ret = alloc_and_reg_flush_buff(domain, ep->ofi_ep, key_pool,
&r_comm->flush_buff, dev_id);
if (OFI_UNLIKELY(ret != 0)) {
free(r_comm);
Expand Down Expand Up @@ -2074,14 +2105,27 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
* initialize endpoint if necessary. */
nccl_net_ofi_sendrecv_ep_t *ep = pthread_getspecific(device->ep_key);
if (!ep) {

/* Allocate endpoint */
ep = calloc(1, sizeof(nccl_net_ofi_sendrecv_ep_t));
if (!ep) {
ret = -ENOMEM;
NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET,
"Unable to allocate sendrecv endpoint");
"Unable to allocate sendrecv endpoint");
goto unlock;
}
#if HAVE_NEURON
ret = fi_domain(device->fabric, device->info,
&ep->domain, NULL);
if (OFI_UNLIKELY(ret != 0)) {
NCCL_OFI_WARN("Couldn't open a fabric access domain. RC: %d, ERROR: %s",
ret, fi_strerror(-ret));
free(ep);
goto unlock;
}
#else
ep->domain = device->domain;
#endif /* HAVE_NEURON */

/* Initialize base endpoint */
ep->base.device = &device->base;
Expand All @@ -2105,8 +2149,13 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
}

if (ep->ref_cnt == 0) {
struct fid_domain *domain;
ret = get_domain(&ep->base, &domain);
if (ret) {
goto unlock;
}
ret = nccl_ofi_ofiutils_init_connection(selected_api_version, device->info,
device->domain, &ep->ofi_ep, &ep->av,
domain, &ep->ofi_ep, &ep->av,
&ep->cq);
if (ret != 0) {
goto unlock;
Expand Down Expand Up @@ -2159,6 +2208,13 @@ static int device_prepare_for_connection(nccl_net_ofi_sendrecv_device_t *device)
goto error;
}

/*
* On Neuron, create the domain in the endpoint structure. On P-series, keep it
* in the device structure. This is because, on Neuron, we lock when accessing the
* domain, so retaining separate domains per thread and per endpoint reduces contention
* for that lock.
*/
#if !HAVE_NEURON
/* Create domain */
ret = fi_domain(device->fabric, device->info,
&device->domain, NULL);
Expand All @@ -2167,6 +2223,7 @@ static int device_prepare_for_connection(nccl_net_ofi_sendrecv_device_t *device)
ret, fi_strerror(-ret));
goto error;
}
#endif /* !HAVE_NEURON */

return ret;
error:
Expand Down

0 comments on commit 5cd487c

Please sign in to comment.