-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f9bb286
commit 309cea4
Showing
6 changed files
with
107 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. | ||
*/ | ||
|
||
#ifndef NCCL_NET_V9_H_ | ||
#define NCCL_NET_V9_H_ | ||
|
||
#include "net_device.h" | ||
|
||
#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 | ||
#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 | ||
typedef struct { | ||
int ndevs; | ||
int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; | ||
} ncclNetVDeviceProps_v9_t; | ||
typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; | ||
|
||
typedef struct { | ||
char* name; // Used mostly for logging. | ||
char* pciPath; // Path to the PCI device in /sys. | ||
uint64_t guid; // Unique identifier for the NIC chip. Important for | ||
// cards with multiple PCI functions (Physical or virtual). | ||
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] | ||
int regIsGlobal; // regMr is not tied to a particular comm | ||
int forceFlush; // Force a flush on receives | ||
int speed; // Port speed in Mbps. | ||
int port; // Port number. | ||
float latency; // Network latency | ||
int maxComms; // Maximum number of comms we can create | ||
int maxRecvs; // Maximum number of grouped receives. | ||
ncclNetDeviceType netDeviceType; // Network offload type | ||
int netDeviceVersion; // Version number for network offload | ||
ncclNetVDeviceProps_v9_t vProps; | ||
size_t maxP2pBytes; // Max transfer size for point-to-point operations | ||
size_t maxCollBytes; // Max transfer size for collective operations | ||
} ncclNetProperties_v9_t; | ||
|
||
typedef ncclNetProperties_v9_t ncclNetProperties_t; | ||
|
||
typedef struct { | ||
// Name of the network (mainly for logs) | ||
const char* name; | ||
// Initialize the network. | ||
ncclResult_t (*init)(ncclDebugLogger_t logFunction); | ||
// Return the number of adapters. | ||
ncclResult_t (*devices)(int* ndev); | ||
// Get various device properties. | ||
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); | ||
// Create a receiving object and provide a handle to connect to it. The | ||
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged | ||
// between ranks to create a connection. | ||
ncclResult_t (*listen)(int dev, void* handle, void** listenComm); | ||
// Connect to a handle and return a sending comm object for that peer. | ||
// This call must not block for the connection to be established, and instead | ||
// should return successfully with sendComm == NULL with the expectation that | ||
// it will be called again until sendComm != NULL. | ||
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection | ||
ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); | ||
// Finalize connection establishment after remote peer has called connect. | ||
// This call must not block for the connection to be established, and instead | ||
// should return successfully with recvComm == NULL with the expectation that | ||
// it will be called again until recvComm != NULL. | ||
// If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection | ||
ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); | ||
// Register/Deregister memory. Comm can be either a sendComm or a recvComm. | ||
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. | ||
ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); | ||
/* DMA-BUF support */ | ||
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); | ||
ncclResult_t (*deregMr)(void* comm, void* mhandle); | ||
// Asynchronous send to a peer. | ||
// May return request == NULL if the call cannot be performed (or would block) | ||
ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); | ||
// Asynchronous recv from a peer. | ||
// May return request == NULL if the call cannot be performed (or would block) | ||
ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); | ||
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is | ||
// visible to the GPU | ||
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); | ||
// Test whether a request is complete. If size is not NULL, it returns the | ||
// number of bytes sent/received. | ||
ncclResult_t (*test)(void* request, int* done, int* sizes); | ||
// Close and free send/recv comm objects | ||
ncclResult_t (*closeSend)(void* sendComm); | ||
ncclResult_t (*closeRecv)(void* recvComm); | ||
ncclResult_t (*closeListen)(void* listenComm); | ||
|
||
// Copy the given mhandle to a dptr in a format usable by this plugin's device code | ||
ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); | ||
|
||
// Notify the plugin that a recv has completed by the device | ||
ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); | ||
|
||
// Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller | ||
// what index this new vNIC exists at | ||
ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); | ||
} ncclNet_v9_t; | ||
|
||
#endif // end include guard |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters