Skip to content

Commit

Permalink
Move RDMA control messages to own endpoint
Browse files Browse the repository at this point in the history
For the long message RDMA protocol, we want to make sure that we
never starve the sender of data to move, which means prioritizing
control messages from the receiver to the sender. This patch moves
both the communicator setup and recv control messages to a new
endpoint, which is always on device rail 0. Future patches will
optimize polling of the control message completion queue (CQ) in the
send path and set priority bits on the control CQ.

Signed-off-by: Brian Barrett <bbarrett@amazon.com>
  • Loading branch information
bwbarrett committed Jan 23, 2024
1 parent 02403b1 commit cfe79c6
Show file tree
Hide file tree
Showing 2 changed files with 252 additions and 184 deletions.
24 changes: 17 additions & 7 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ typedef enum nccl_net_ofi_rdma_req_type {
* allocate a rdma memory registration handle with `num_rails' rails.
*/
typedef struct nccl_net_ofi_rdma_mr_handle {
struct fid_mr *control_mr;

int num_rails;

/* Array of size `num_rails' */
Expand Down Expand Up @@ -102,6 +104,8 @@ struct nccl_net_ofi_rdma_req;
struct nccl_net_ofi_rdma_ep;
typedef struct nccl_net_ofi_rdma_req nccl_net_ofi_rdma_req_t;
typedef struct nccl_net_ofi_rdma_ep nccl_net_ofi_rdma_ep_t;
struct nccl_net_ofi_ep_rail;
typedef struct nccl_net_ofi_ep_rail nccl_net_ofi_ep_rail_t;

typedef struct {
/* Bounce buffer freelist item */
Expand All @@ -116,7 +120,7 @@ typedef struct {
* This is useful for re-posting the bounce buffer on the same rail
* when it gets completed.
*/
int bounce_rail_id;
nccl_net_ofi_ep_rail_t *rail;
/*
* Back-pointer to associated endpoint
*/
Expand Down Expand Up @@ -157,10 +161,6 @@ typedef struct {
typedef struct {
/* Pointer to the allocated control buffer from freelist */
nccl_net_ofi_rdma_ctrl_fl_item_t *ctrl_fl_item;
/* Schedule used to transfer the control buffer. We save the
* pointer to reference it when transferring the buffer over
* network. */
nccl_net_ofi_schedule_t *ctrl_schedule;
/* Pointer to recv parent request */
nccl_net_ofi_rdma_req_t *recv_req;
} rdma_req_send_ctrl_data_t;
Expand Down Expand Up @@ -297,6 +297,8 @@ typedef struct nccl_ofi_rdma_connection_info {
side. The receiver must use this tag when sending messages to sender */
uint64_t local_tag;

nccl_ofi_rdma_ep_name_t control_ep_name;

/* Number of rails */
int num_rails;

Expand Down Expand Up @@ -357,6 +359,8 @@ typedef struct nccl_net_ofi_rdma_send_comm {

nccl_ofi_msgbuff_t *msgbuff;

nccl_net_ofi_rdma_send_comm_rail_t control_rail;

/* Number of rails */
int num_rails;

Expand Down Expand Up @@ -430,6 +434,8 @@ typedef struct nccl_net_ofi_rdma_recv_comm {
/* Free list to track control buffers, for sending RDMA control messages */
nccl_ofi_freelist_t *ctrl_buff_fl;

nccl_net_ofi_rdma_recv_comm_rail_t control_rail;

/* Number of rails */
int num_rails;

Expand Down Expand Up @@ -467,7 +473,9 @@ typedef struct nccl_net_ofi_rdma_listen_comm {
* Endpoint rail encapsulates data of an endpoint for a
* specific rail.
*/
typedef struct nccl_net_ofi_ep_rail {
struct nccl_net_ofi_ep_rail {
int rail_id;

/* Local libfabric endpoint handle */
struct fid_ep *ofi_ep;

Expand All @@ -492,7 +500,7 @@ typedef struct nccl_net_ofi_ep_rail {
size_t max_bounce_posted;
/* Mutex for bounce buffer operations */
pthread_mutex_t bounce_mutex;
} nccl_net_ofi_ep_rail_t;
};

/*
* @brief RDMA Endpoint
Expand All @@ -516,6 +524,8 @@ struct nccl_net_ofi_rdma_ep {
/* Current available tag ID */
uint64_t tag;

nccl_net_ofi_ep_rail_t control_rail;

/* Number of rails */
int num_rails;

Expand Down
Loading

0 comments on commit cfe79c6

Please sign in to comment.