From a810c2510a97fb15cdf32b91a98e2e7be93e15df Mon Sep 17 00:00:00 2001 From: Arun Karthik Date: Tue, 3 Dec 2024 23:57:08 +0000 Subject: [PATCH] fix : Add regions for AG 0x0 and RS 0x0 Add the tuner regions for All Gather 0x0 and Reduce Scatter 0x0. These regions were removed from the region-based tuner because of a bug in the polygon extend logic. Now that the bug is fixed, add back the regions for AG 0x0 and RS 0x0. Signed-off-by: Arun Karthik --- src/tuner/nccl_ofi_regions.c | 86 ++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/tuner/nccl_ofi_regions.c b/src/tuner/nccl_ofi_regions.c index 61108db3c..274e00746 100644 --- a/src/tuner/nccl_ofi_regions.c +++ b/src/tuner/nccl_ofi_regions.c @@ -342,6 +342,92 @@ static ncclResult_t region_init_internal_p5en(nccl_ofi_tuner_region_context_t *r goto exit; } } + { + collType = ncclFuncAllGather; + nccl_ofi_tuner_point_t extended_ring_ll = + extend_region((nccl_ofi_tuner_point_t){8388608, 256}, + (nccl_ofi_tuner_point_t){33554432, 1024}, + (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS}); + nccl_ofi_tuner_point_t extended_ring_ll128 = + extend_region((nccl_ofi_tuner_point_t){8589934592, 512}, + (nccl_ofi_tuner_point_t){17179869184, 1024}, + (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS}); + + const nccl_ofi_tuner_region_t regions[] = { + {.algorithm = NCCL_ALGO_RING, + .protocol = NCCL_PROTO_LL, + .num_vertices = 6, + .vertices = {{0, 16}, {131072, 16}, {262144, 32}, {8388608, 256}, {33554432, 1024}, extended_ring_ll}}, + {.algorithm = NCCL_ALGO_RING, + .protocol = NCCL_PROTO_LL128, + .num_vertices = 10, + .vertices = {extended_ring_ll, + {33554432, 1024}, + {8388608, 256}, + {262144, 32}, + {131072, 16}, + {268435456, 16}, + {2147483648, 128}, + {8589934592, 512}, + {17179869184, 1024}, + extended_ring_ll128}}, + {.algorithm = NCCL_ALGO_RING, + .protocol = NCCL_PROTO_SIMPLE, + .num_vertices = 6, + .vertices = {extended_ring_ll128, + {17179869184, 1024}, + 
{8589934592, 512}, + {268435456, 16}, + {17179869184, 16}, + {TUNER_MAX_SIZE, 16}}}}; + ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions); + if (ret != ncclSuccess) { + goto exit; + } + } + { + collType = ncclFuncReduceScatter; + nccl_ofi_tuner_point_t extended_ring_ll = + extend_region((nccl_ofi_tuner_point_t){8388608, 256}, + (nccl_ofi_tuner_point_t){33554432, 1024}, + (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS}); + nccl_ofi_tuner_point_t extended_ring_ll128 = + extend_region((nccl_ofi_tuner_point_t){8589934592, 512}, + (nccl_ofi_tuner_point_t){17179869184, 1024}, + (nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS}); + + const nccl_ofi_tuner_region_t regions[] = { + {.algorithm = NCCL_ALGO_RING, + .protocol = NCCL_PROTO_LL, + .num_vertices = 6, + .vertices = {{0, 16}, {131072, 16}, {262144, 32}, {8388608, 256}, {33554432, 1024}, extended_ring_ll}}, + {.algorithm = NCCL_ALGO_RING, + .protocol = NCCL_PROTO_LL128, + .num_vertices = 10, + .vertices = {extended_ring_ll, + {33554432, 1024}, + {8388608, 256}, + {262144, 32}, + {131072, 16}, + {268435456, 16}, + {2147483648, 128}, + {8589934592, 512}, + {17179869184, 1024}, + extended_ring_ll128}}, + {.algorithm = NCCL_ALGO_RING, + .protocol = NCCL_PROTO_SIMPLE, + .num_vertices = 6, + .vertices = {extended_ring_ll128, + {17179869184, 1024}, + {8589934592, 512}, + {268435456, 16}, + {17179869184, 16}, + {TUNER_MAX_SIZE, 16}}}}; + ret = set_regions(region_ctx, collType, sizeof(regions) / sizeof(regions[0]), regions); + if (ret != ncclSuccess) { + goto exit; + } + } } else if (nRanks == nNodes) { { collType = ncclFuncAllReduce;