From 36f49dbdebaff024e14e2bbf6315b4974d0bf6f2 Mon Sep 17 00:00:00 2001
From: Ryan Spring
Date: Fri, 13 Dec 2024 12:13:47 -0800
Subject: [PATCH] Add reduction_unroll_factor to autotuning script (#3487)

This PR renames `unroll_factor` to `iteration_unroll_factor` and adds
`reduction_unroll_factor`, which applies an unroll factor on top of the
vectorization factor for the inner reduction domain.
---
 .../autotune_inner_reduction.py               | 53 +++++++++++++++----
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/doc/dev/python_scheduling/autotune_inner_reduction.py b/doc/dev/python_scheduling/autotune_inner_reduction.py
index c43a20f5767..260c0dab6ca 100644
--- a/doc/dev/python_scheduling/autotune_inner_reduction.py
+++ b/doc/dev/python_scheduling/autotune_inner_reduction.py
@@ -72,8 +72,10 @@ class FUSION(Enum):
 class InnerReductionConfiguration:
     # The vectorization factor for inner reduction domain.
     vectorize_factor: int = 1
+    # The unroll factor for the inner reduction domain.
+    reduction_unroll_factor: int = 1
     # The unroll factor for the outer iteration domain.
-    unroll_factor: int = 1
+    iteration_unroll_factor: int = 1
     # The grid size for the outer iteration domain.
     # If grdim > 1, then godim corresponds with y axis of the grid.
     # Otherwise, it is the x axis of the grid.
@@ -121,11 +123,16 @@ def convert_to_inner_reduction_params(self, scheduler_config, reduction_params):
         reduction_params.vectorize_inner_reduction = (
             scheduler_config.vectorize_factor > 1
         )
+        reduction_params.unroll_factor_top_of_vectorization = (
+            scheduler_config.reduction_unroll_factor
+        )
 
         if scheduler_config.bdimy > 1:
             reduction_params.block_dim_iter_dom = ParallelType.block_y
 
-        reduction_params.unroll_factor_iter_dom = scheduler_config.unroll_factor
+        reduction_params.unroll_factor_iter_dom = (
+            scheduler_config.iteration_unroll_factor
+        )
 
         gdimx = -1
         gdimy = -1
@@ -161,16 +168,27 @@ def convert_to_inner_reduction_params(self, scheduler_config, reduction_params):
     def generate_scheduler_configurations(self, input_shape):
         threads_per_cta_options = [128, 256, 512, 1024]
         vectorization_factor_options = [1, 2, 4, 8]
-        unroll_factor_options = list(range(1, 11))
+        reduction_unroll_factor_options = list(range(1, 6))
+        iteration_unroll_factor_options = list(range(1, 6))
         warp_size = 32
 
         num_iterations, num_reductions = input_shape
 
-        for threads_per_cta, vectorize_factor, unroll_factor in itertools.product(
-            threads_per_cta_options, vectorization_factor_options, unroll_factor_options
+        for (
+            threads_per_cta,
+            vectorize_factor,
+            reduction_unroll_factor,
+            iteration_unroll_factor,
+        ) in itertools.product(
+            threads_per_cta_options,
+            vectorization_factor_options,
+            reduction_unroll_factor_options,
+            iteration_unroll_factor_options,
         ):
             scheduler_config = self.InnerReductionConfiguration(
-                vectorize_factor=vectorize_factor, unroll_factor=unroll_factor
+                vectorize_factor=vectorize_factor,
+                reduction_unroll_factor=reduction_unroll_factor,
+                iteration_unroll_factor=iteration_unroll_factor,
             )
             scheduler_config.bdimx = min(
                 threads_per_cta,
@@ -184,20 +202,35 @@ def generate_scheduler_configurations(self, input_shape):
                 max(1, floor_div(threads_per_cta, scheduler_config.bdimx)),
             )
             scheduler_config.godim = ceil_div(
-                num_iterations, scheduler_config.bdimy * scheduler_config.unroll_factor
+                num_iterations, scheduler_config.bdimy * iteration_unroll_factor
             )
 
             # number of reduction elements not handled by a CTA
             remaining_reduction = ceil_div(
-                num_reductions,
-                (scheduler_config.bdimx * scheduler_config.vectorize_factor),
+                ceil_div(
+                    ceil_div(num_reductions, vectorize_factor), scheduler_config.bdimx
+                ),
+                reduction_unroll_factor,
             )
 
-            if unroll_factor == 1 and remaining_reduction > 1:
+            if iteration_unroll_factor == 1 and remaining_reduction > 1:
                 # all remaining reduction goes to grdim
                 scheduler_config.grdim = remaining_reduction
                 yield scheduler_config
 
+                # When iteration dim is small, there may be unused SMs. We need
+                # to shift work from block reduction to grid reduction to
+                # increase SM usage.
+                godim = scheduler_config.godim
+                grdim = 1
+                while (
+                    godim * grdim * 2 <= self.gpu_properties.multi_processor_count
+                    and (remaining_reduction / grdim) >= 2
+                ):
+                    grdim *= 2
+                scheduler_config.grdim = grdim
+                yield scheduler_config
+
             # grid stride across reduction iterDomain is 1
             scheduler_config.grdim = 1
             yield scheduler_config
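
For reference, the arithmetic below is a minimal standalone sketch of how the
two unroll factors combine with vectorization when the script sizes the grid.
It is not part of the patch; the shape, block, and SM values are hypothetical,
and ceil_div mirrors the helper already used in autotune_inner_reduction.py.

    import math

    def ceil_div(a: int, b: int) -> int:
        # Ceiling division, as used by autotune_inner_reduction.py.
        return math.ceil(a / b)

    # Hypothetical inner-reduction problem: [num_iterations, num_reductions].
    num_iterations, num_reductions = 64, 8192

    # One candidate configuration drawn from the search space above.
    vectorize_factor = 8          # contiguous elements per vectorized load
    reduction_unroll_factor = 2   # unroll on top of vectorization (reduction domain)
    iteration_unroll_factor = 2   # unroll of the outer iteration domain
    bdimx, bdimy = 128, 2         # CTA shape: reduction threads x iteration threads
    multi_processor_count = 108   # hypothetical SM count

    # Each CTA covers bdimx * vectorize_factor * reduction_unroll_factor
    # reduction elements per grid step; the leftover is the grid-reduction extent.
    remaining_reduction = ceil_div(
        ceil_div(ceil_div(num_reductions, vectorize_factor), bdimx),
        reduction_unroll_factor,
    )  # ceil_div(ceil_div(1024, 128), 2) == 4

    # Iteration rows mapped to the grid after unrolling the iteration domain.
    godim = ceil_div(num_iterations, bdimy * iteration_unroll_factor)  # 16

    # When godim alone cannot fill the SMs, double grdim while at least two
    # reduction steps remain per grid stride (the heuristic added by the patch).
    grdim = 1
    while (
        godim * grdim * 2 <= multi_processor_count
        and (remaining_reduction / grdim) >= 2
    ):
        grdim *= 2

    print(godim, grdim)  # 16 4

With these numbers, godim alone (16 CTAs) would underutilize the 108 SMs, so
the doubling loop settles on grdim = 4, yielding 64 CTAs that cooperate on the
grid reduction instead of leaving most of the device idle.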