diff --git a/include/nccl-headers/nvidia/tuner.h b/include/nccl-headers/nvidia/tuner.h index d0192fe67..e4d6c6c37 100644 --- a/include/nccl-headers/nvidia/tuner.h +++ b/include/nccl-headers/nvidia/tuner.h @@ -24,7 +24,7 @@ typedef enum { ncclNumFuncs = 8 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/NVLS*/PAT #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 @@ -32,6 +32,7 @@ typedef enum { #define NCCL_ALGO_COLLNET_CHAIN 3 #define NCCL_ALGO_NVLS 4 #define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 #define NCCL_PROTO_UNDEF -1 diff --git a/src/tuner/nccl_ofi_tuner.c b/src/tuner/nccl_ofi_tuner.c index 7926b3f9b..87325c8c0 100644 --- a/src/tuner/nccl_ofi_tuner.c +++ b/src/tuner/nccl_ofi_tuner.c @@ -352,6 +352,8 @@ ncclResult_t nccl_ofi_tuner_get_coll_info(void *context, protocol = nccl_ofi_tuner_ctx->regions[i].protocol; if (table[algorithm][protocol] == NCCL_ALGO_PROTO_IGNORE || algorithm >= numAlgo || protocol >= numProto) { + /* Skip this region: either NCCL says this algorithm/protocol combination is not valid/applicable, or the + * algorithm or protocol is not in the table, i.e. it is not supported by this NCCL version. */ continue; } diff --git a/tests/unit/show_tuner_decisions.c b/tests/unit/show_tuner_decisions.c index ffd65b8a3..0ec1831cf 100644 --- a/tests/unit/show_tuner_decisions.c +++ b/tests/unit/show_tuner_decisions.c @@ -11,7 +11,7 @@ #include "nccl_ofi_tuner.h" -static const char *algo_names[] = { "tree", "ring", "collnet_direct", "collnet_chain", "nvls", "nvlstree" }; +static const char *algo_names[] = { "tree", "ring", "collnet_direct", "collnet_chain", "nvls", "nvlstree" , "pat" }; static const char *proto_names[] = { "ll", "ll128", "simple" }; void dummy_logger(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...) { return; };