Skip to content

Commit

Permalink
Static asserts for size of messages on the wire
Browse files Browse the repository at this point in the history
Added static asserts to check packing of messages sent on the wire.
For SENDRECV protocol the only message is connect.
For RDMA protocol the messages are connect, connect response and control.

Signed-off-by: Amedeo Sapio <asapio@amazon.com>
  • Loading branch information
AmedeoSapio committed Mar 20, 2024
1 parent 67078a1 commit 4cdc7cc
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 3 deletions.
2 changes: 2 additions & 0 deletions include/nccl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ typedef struct nccl_ofi_connection_info {
uint64_t connect_to_self;
nccl_net_ofi_req_t* req;
} nccl_ofi_connection_info_t;
_Static_assert(sizeof(nccl_ofi_connection_info_t) == 80,
"Wrong size for SENDRECV connect message");

typedef struct nccl_net_ofi_conn_handle {
char ep_name[MAX_EP_ADDR];
Expand Down
10 changes: 7 additions & 3 deletions include/nccl_ofi_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ typedef struct nccl_net_ofi_rdma_ctrl_msg {
uint64_t buff_len;
uint64_t buff_mr_key[MAX_NUM_RAILS];
} nccl_net_ofi_rdma_ctrl_msg_t;
_Static_assert(sizeof(nccl_net_ofi_rdma_ctrl_msg_t) == 64,
"Wrong size for RDMA Control message");

/* Structure used to store control messages in a free list */
typedef struct nccl_net_ofi_rdma_ctrl_fl_item {
Expand Down Expand Up @@ -318,6 +320,9 @@ typedef struct nccl_ofi_rdma_connection_info {
*/
nccl_ofi_rdma_msg_type_t type;

/* Number of rails */
int num_rails;

/* A comm identifier that uniquely identifies the comm on the sender
side. The receiver must use this ID when sending messages to sender */
uint64_t local_comm_id;
Expand All @@ -326,14 +331,13 @@ typedef struct nccl_ofi_rdma_connection_info {
* on the receiver side */
uint64_t remote_comm_id;

/* Number of rails */
int num_rails;

/* Array of `MAX_NUM_RAILS` `nccl_ofi_rdma_ep_name_t`
* structs. The member `num_rails` indicates the number of
* entries that are in use. */
nccl_ofi_rdma_ep_name_t ep_names[MAX_NUM_RAILS];
} nccl_ofi_rdma_connection_info_t;
_Static_assert(sizeof(nccl_ofi_rdma_connection_info_t) == 248,
"Wrong size for RDMA connect message");

/*
* @brief Send communicator rail
Expand Down

0 comments on commit 4cdc7cc

Please sign in to comment.