From 4cdc7cc5b369c37cb819ecbb111db8a2da1ad16a Mon Sep 17 00:00:00 2001 From: Amedeo Sapio Date: Wed, 20 Mar 2024 00:20:42 +0000 Subject: [PATCH] Static asserts for size of messages on the wire Added static asserts to check packing of messages sent on the wire. For SENDRECV protocol the only message is connect. For RDMA protocol the messages are connect, connect response and control. Signed-off-by: Amedeo Sapio --- include/nccl_ofi.h | 2 ++ include/nccl_ofi_rdma.h | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h index 7890dbd3e..cbc05987c 100644 --- a/include/nccl_ofi.h +++ b/include/nccl_ofi.h @@ -191,6 +191,8 @@ typedef struct nccl_ofi_connection_info { uint64_t connect_to_self; nccl_net_ofi_req_t* req; } nccl_ofi_connection_info_t; +_Static_assert(sizeof(nccl_ofi_connection_info_t) == 80, + "Wrong size for SENDRECV connect message"); typedef struct nccl_net_ofi_conn_handle { char ep_name[MAX_EP_ADDR]; diff --git a/include/nccl_ofi_rdma.h b/include/nccl_ofi_rdma.h index 035d8b181..2561f7d78 100644 --- a/include/nccl_ofi_rdma.h +++ b/include/nccl_ofi_rdma.h @@ -98,6 +98,8 @@ typedef struct nccl_net_ofi_rdma_ctrl_msg { uint64_t buff_len; uint64_t buff_mr_key[MAX_NUM_RAILS]; } nccl_net_ofi_rdma_ctrl_msg_t; +_Static_assert(sizeof(nccl_net_ofi_rdma_ctrl_msg_t) == 64, + "Wrong size for RDMA Control message"); /* Structure used to store control messages in a free list */ typedef struct nccl_net_ofi_rdma_ctrl_fl_item { @@ -318,6 +320,9 @@ typedef struct nccl_ofi_rdma_connection_info { */ nccl_ofi_rdma_msg_type_t type; + /* Number of rails */ + int num_rails; + /* A comm identitifer that uniquely identifies the comm on the sender side. The receiver must use this ID when sending messages to sender */ uint64_t local_comm_id; @@ -326,14 +331,13 @@ typedef struct nccl_ofi_rdma_connection_info { * on the receiver side */ uint64_t remote_comm_id; - /* Number of rails */ - int num_rails; - /* Array of `MAX_NUM_RAILS` `nccl_ofi_rdma_ep_name_t` * structs. The member `num_rails` indicates the number of * entries that are in use. */ nccl_ofi_rdma_ep_name_t ep_names[MAX_NUM_RAILS]; } nccl_ofi_rdma_connection_info_t; +_Static_assert(sizeof(nccl_ofi_rdma_connection_info_t) == 248, + "Wrong size for RDMA connect message"); /* * @brief Send communicator rail