Skip to content

Commit

Permalink
tuner: add support for AllGather/ReduceScatter
Browse files Browse the repository at this point in the history
This commit adds a region to the tuner for Ring+Simple in AllGather and ReduceScatter,
but only in the one-rank-per-node case, where this option is preferable to the one picked by
NCCL's internal tuner. Outside this region we fall back to NCCL's tuner.

Signed-off-by: Amedeo Sapio <asapio@amazon.com>
  • Loading branch information
AmedeoSapio committed Oct 2, 2024
1 parent 4346d6e commit feaff3c
Show file tree
Hide file tree
Showing 3 changed files with 320 additions and 264 deletions.
5 changes: 3 additions & 2 deletions include/nccl_ofi_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@ typedef struct nccl_ofi_tuner_region {

/*
 * Tuner context holding the region tables consulted by the tuner.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so two
 * pairs of members named `num_regions`/`regions` appear together. Going by
 * the "3 additions & 2 deletions" summary for this header, the scalar pair
 * appears to be the removed pre-commit form and the NCCL_NUM_FUNCTIONS-sized
 * arrays the added form -- confirm against the repository.
 */
typedef struct nccl_ofi_tuner_context {
nccl_ofi_tuner_model_dims_t dims;
/* old (presumably removed): a single region count and a single region table */
size_t num_regions;
nccl_ofi_tuner_region_t *regions;
/* new (presumably added): one region count and one region table per
 * collective; set_regions() indexes both with its ncclFunc_t argument */
size_t num_regions[NCCL_NUM_FUNCTIONS];
nccl_ofi_tuner_region_t *regions[NCCL_NUM_FUNCTIONS];
} nccl_ofi_tuner_context_t;

/* Functions to set and test regions */
/*
 * is_inside_region: test `point` against `region`.
 * NOTE(review): the meaning of the int result is not visible in this
 * excerpt -- check the definition before relying on specific values.
 */
int is_inside_region(nccl_ofi_tuner_point_t point, nccl_ofi_tuner_region_t *region);

/*
 * set_regions: allocate a private copy of the `num_regions` entries in
 * `regions` and store it, with the count, in the context slot selected by
 * `collType`.
 *
 * Returns ncclSuccess on success, or ncclInternalError if the allocation
 * fails.
 */
ncclResult_t set_regions(nccl_ofi_tuner_context_t *nccl_ofi_tuner_ctx,
ncclFunc_t collType,
size_t num_regions,
const nccl_ofi_tuner_region_t regions[]);

Expand Down
10 changes: 6 additions & 4 deletions src/tuner/nccl_ofi_regions.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,17 +201,19 @@ int is_inside_region(nccl_ofi_tuner_point_t point, nccl_ofi_tuner_region_t *regi

/*
 * Allocate and copy regions.
 *
 * Stores `num_regions` in the context slot for `collType`, allocates a
 * zero-initialised array of that many nccl_ofi_tuner_region_t, and copies
 * the caller's `regions` into it. Returns ncclSuccess, or ncclInternalError
 * (after logging a warning) when the allocation returns NULL.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers. Going
 * by the "6 additions & 4 deletions" summary, the statements that use
 * `num_regions` / `regions` as scalars appear to be the removed pre-commit
 * lines, and the `[collType]`-indexed statements the added ones. The stray
 * opening brace of the old `if` is part of that residue.
 *
 * NOTE(review), on the indexed (new) statements:
 *  - the count is written before the allocation is checked, so on failure
 *    num_regions[collType] stays non-zero while regions[collType] is NULL;
 *  - calloc() with a zero element count may return NULL, which this code
 *    reports as an allocation failure -- confirm callers never pass 0;
 *  - any pointer already stored in regions[collType] is overwritten without
 *    being freed -- confirm set_regions() is called at most once per
 *    collective per context;
 *  - the bounds check on collType is an assert and is compiled out when
 *    NDEBUG is defined.
 */
ncclResult_t set_regions(nccl_ofi_tuner_context_t *nccl_ofi_tuner_ctx,
ncclFunc_t collType,
size_t num_regions,
const nccl_ofi_tuner_region_t regions[])
{
/* old (presumably removed): single shared table */
nccl_ofi_tuner_ctx->num_regions = num_regions;
nccl_ofi_tuner_ctx->regions = (nccl_ofi_tuner_region_t *)calloc(num_regions, sizeof(nccl_ofi_tuner_region_t));
if (nccl_ofi_tuner_ctx->regions == NULL) {
/* new (presumably added): per-collective table; collType is used as the array index */
assert(collType < NCCL_NUM_FUNCTIONS);
nccl_ofi_tuner_ctx->num_regions[collType] = num_regions;
nccl_ofi_tuner_ctx->regions[collType] = (nccl_ofi_tuner_region_t *)calloc(num_regions, sizeof(nccl_ofi_tuner_region_t));
if (nccl_ofi_tuner_ctx->regions[collType] == NULL) {
NCCL_OFI_WARN("Context regions allocation failed.");
return ncclInternalError;
}

/* old (presumably removed): copy into the single shared table */
memcpy(nccl_ofi_tuner_ctx->regions, &regions[0], num_regions * sizeof(nccl_ofi_tuner_region_t));
/* new (presumably added): copy the caller's entries into the table for collType */
memcpy(nccl_ofi_tuner_ctx->regions[collType], &regions[0], num_regions * sizeof(nccl_ofi_tuner_region_t));
return ncclSuccess;
}

Expand Down
Loading

0 comments on commit feaff3c

Please sign in to comment.