From 4346d6ec5aafcbe111b866e1e13ad2844af9f333 Mon Sep 17 00:00:00 2001 From: Amedeo Sapio Date: Tue, 1 Oct 2024 00:04:10 +0000 Subject: [PATCH] tuner: added PAT algorithm to match NCCL interface NCCL 2.23 has introduced the PAT algorithm for AllGather and ReduceScatter. This commit updates the tuner's list of algorithms to match NCCL's. Signed-off-by: Amedeo Sapio --- include/nccl-headers/nvidia/tuner.h | 3 ++- src/tuner/nccl_ofi_tuner.c | 2 ++ tests/unit/show_tuner_decisions.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/nccl-headers/nvidia/tuner.h b/include/nccl-headers/nvidia/tuner.h index d0192fe67..e4d6c6c37 100644 --- a/include/nccl-headers/nvidia/tuner.h +++ b/include/nccl-headers/nvidia/tuner.h @@ -24,7 +24,7 @@ typedef enum { ncclNumFuncs = 8 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 @@ -32,6 +32,7 @@ typedef enum { #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 diff --git a/src/tuner/nccl_ofi_tuner.c b/src/tuner/nccl_ofi_tuner.c index 7926b3f9b..87325c8c0 100644 --- a/src/tuner/nccl_ofi_tuner.c +++ b/src/tuner/nccl_ofi_tuner.c @@ -352,6 +352,8 @@ ncclResult_t nccl_ofi_tuner_get_coll_info(void *context, protocol = nccl_ofi_tuner_ctx->regions[i].protocol; if (table[algorithm][protocol] == NCCL_ALGO_PROTO_IGNORE || algorithm >= numAlgo || protocol >= numProto) { + /* Either NCCL says this combination is not valid/applicable or the algorithm or protocol is + * not in the table, hence it is not supported by this NCCL version. 
*/ continue; } diff --git a/tests/unit/show_tuner_decisions.c b/tests/unit/show_tuner_decisions.c index ffd65b8a3..0ec1831cf 100644 --- a/tests/unit/show_tuner_decisions.c +++ b/tests/unit/show_tuner_decisions.c @@ -11,7 +11,7 @@ #include "nccl_ofi_tuner.h" -static const char *algo_names[] = { "tree", "ring", "collnet_direct", "collnet_chain", "nvls", "nvlstree" }; +static const char *algo_names[] = { "tree", "ring", "collnet_direct", "collnet_chain", "nvls", "nvlstree" , "pat" }; static const char *proto_names[] = { "ll", "ll128", "simple" }; void dummy_logger(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...) { return; };