Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate endpoint for control messages #543

Merged
Merged 3 commits on Sep 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,8 @@ typedef uint16_t nccl_ofi_rdma_msg_type_t;
* allocate a rdma memory registration handle with `num_rails' rails.
*/
typedef struct nccl_net_ofi_rdma_mr_handle {
struct fid_mr *control_mr;

int num_rails;

/* Array of size `num_rails' */
Expand Down Expand Up @@ -238,10 +240,6 @@ typedef struct {
typedef struct {
/* Pointer to the allocated control buffer from freelist */
nccl_net_ofi_rdma_ctrl_fl_item_t *ctrl_fl_item;
/* Schedule used to transfer the control buffer. We save the
* pointer to reference it when transferring the buffer over
* network. */
nccl_net_ofi_schedule_t *ctrl_schedule;
/* Pointer to recv parent request */
nccl_net_ofi_rdma_req_t *recv_req;
#if HAVE_NVTX_TRACING
Expand Down Expand Up @@ -398,13 +396,15 @@ typedef struct nccl_ofi_rdma_connection_info {
* on the receiver side */
uint32_t remote_comm_id;

nccl_ofi_rdma_ep_name_t control_ep_name;

/* Array of `MAX_NUM_RAILS` `nccl_ofi_rdma_ep_name_t`
* structs. The member `num_rails` indicates the number of
* entries that are in use. */
nccl_ofi_rdma_ep_name_t ep_names[MAX_NUM_RAILS];
} nccl_ofi_rdma_connection_info_t;
/* Since this is a message on the wire, check that it has the expected size */
-_Static_assert(sizeof(nccl_ofi_rdma_connection_info_t) == 272,
+_Static_assert(sizeof(nccl_ofi_rdma_connection_info_t) == 336,
"Wrong size for RDMA connect message");

/*
Expand Down Expand Up @@ -456,6 +456,8 @@ typedef struct nccl_net_ofi_rdma_send_comm {

nccl_ofi_msgbuff_t *msgbuff;

nccl_net_ofi_rdma_send_comm_rail_t control_rail;

/* Number of rails */
int num_rails;

Expand Down Expand Up @@ -538,6 +540,7 @@ typedef struct nccl_net_ofi_rdma_recv_comm {
#if HAVE_NVTX_TRACING
nvtxDomainHandle_t nvtx_domain[NCCL_OFI_N_NVTX_DOMAIN_PER_COMM];
#endif
nccl_net_ofi_rdma_recv_comm_rail_t control_rail;

/* Number of rails */
int num_rails;
Expand Down Expand Up @@ -630,6 +633,8 @@ struct nccl_net_ofi_rdma_ep {
* and its base struct. */
nccl_net_ofi_ep_t base;

nccl_net_ofi_ep_rail_t control_rail;

/* Number of rails */
int num_rails;

Expand Down
Loading
Loading