From e1a110fea067c0698baaaba132fc48e6ca1542f1 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 30 Jan 2025 02:16:26 +0200 Subject: [PATCH 01/11] Implement groupreduce API --- src/KernelAbstractions.jl | 2 + src/reduce.jl | 135 ++++++++++++++++++++++++++++++++++++++ test/groupreduce.jl | 58 ++++++++++++++++ test/testsuite.jl | 81 ++++++++++++----------- 4 files changed, 238 insertions(+), 38 deletions(-) create mode 100644 src/reduce.jl create mode 100644 test/groupreduce.jl diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index b82dadc5..f99458bf 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -798,6 +798,8 @@ function __fake_compiler_job end # - LoopInfo ### +include("reduce.jl") + include("extras/extras.jl") include("reflection.jl") diff --git a/src/reduce.jl b/src/reduce.jl new file mode 100644 index 00000000..4188e511 --- /dev/null +++ b/src/reduce.jl @@ -0,0 +1,135 @@ +""" + @groupreduce algo op val neutral [groupsize] + +Perform group reduction of `val` using `op`. + +# Arguments + +- `algo` specifies which reduction algorithm to use: + - `:thread`: + Perform thread group reduction (requires `groupsize * sizeof(T)` bytes of shared memory). + Available accross all backends. + - `:warp`: + Perform warp group reduction (requires `32 * sizeof(T)` bytes of shared memory). + +- `neutral` should be a neutral w.r.t. `op`, such that `op(neutral, x) == x`. +- `groupsize` specifies size of the workgroup. + If a kernel does not specifies `groupsize` statically, then it is required to + provide `groupsize`. + Also can be used to perform reduction accross first `groupsize` threads + (if `groupsize < @groupsize()`). + +# Returns + +Result of the reduction. +""" +macro groupreduce(algo, op, val, neutral) + f = if algo.value == :thread + __groupreduce + elseif algo.value == :warp + __warp_groupreduce + else + error( + "@groupreduce supports only :thread or :warp as a reduction algorithm, " * + "but $(algo.value) was specified.") + end + quote + $f( + $(esc(:__ctx__)), + $(esc(op)), + $(esc(val)), + $(esc(neutral)), + Val(prod($groupsize($(esc(:__ctx__))))), + ) + end +end + +macro groupreduce(algo, op, val, neutral, groupsize) + f = if algo.value == :thread + __groupreduce + elseif algo.value == :warp + __warp_groupreduce + else + error( + "@groupreduce supports only :thread or :warp as a reduction algorithm, " * + "but $(algo.value) was specified.") + end + quote + $f( + $(esc(:__ctx__)), + $(esc(op)), + $(esc(val)), + $(esc(neutral)), + Val($(esc(groupsize))), + ) + end +end + +function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where {T, groupsize} + storage = @localmem T groupsize + + local_idx = @index(Local) + local_idx ≤ groupsize && (storage[local_idx] = val) + @synchronize() + + s::UInt64 = groupsize ÷ 0x2 + while s > 0x0 + if (local_idx - 0x1) < s + other_idx = local_idx + s + if other_idx ≤ groupsize + storage[local_idx] = op(storage[local_idx], storage[other_idx]) + end + end + @synchronize() + s >>= 0x1 + end + + if local_idx == 0x1 + val = storage[local_idx] + end + return val +end + +# Warp groupreduce. + +macro shfl_down(val, offset) + quote + $__shfl_down($(esc(val)), $(esc(offset))) + end +end + +# Backends should implement this. +function __shfl_down end + +@inline function __warp_reduce(val, op) + offset::UInt32 = UInt32(32) ÷ 0x2 + while offset > 0x0 + val = op(val, @shfl_down(val, offset)) + offset >>= 0x1 + end + return val +end + +# Assume warp is 32 lanes. +const __warpsize::UInt32 = 32 +# Maximum number of warps (for a groupsize = 1024). +const __warp_bins::UInt32 = 32 + +function __warp_groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where {T, groupsize} + storage = @localmem T __warp_bins + + local_idx = @index(Local) + lane = (local_idx - 0x1) % __warpsize + 0x1 + warp_id = (local_idx - 0x1) ÷ __warpsize + 0x1 + + # Each warp performs a reduction and writes results into its own bin in `storage`. + val = __warp_reduce(val, op) + lane == 0x1 && (storage[warp_id] = val) + @synchronize() + + # Final reduction of the `storage` on the first warp. + within_storage = (local_idx - 0x1) < groupsize ÷ __warpsize + val = within_storage ? storage[lane] : neutral + warp_id == 0x1 && (val = __warp_reduce(val, op)) + return val +end diff --git a/test/groupreduce.jl b/test/groupreduce.jl new file mode 100644 index 00000000..b871e08d --- /dev/null +++ b/test/groupreduce.jl @@ -0,0 +1,58 @@ +@kernel function groupreduce_thread_1!(y, x, op, neutral) + i = @index(Global) + val = i > length(x) ? neutral : x[i] + res = KernelAbstractions.@groupreduce(:thread, op, val, neutral) + i == 1 && (y[1] = res) +end + +@kernel function groupreduce_thread_2!(y, x, op, neutral, ::Val{groupsize}) where {groupsize} + i = @index(Global) + val = i > length(x) ? neutral : x[i] + res = KernelAbstractions.@groupreduce(:thread, op, val, neutral, groupsize) + i == 1 && (y[1] = res) +end + +@kernel function groupreduce_warp_1!(y, x, op, neutral) + i = @index(Global) + val = i > length(x) ? neutral : x[i] + res = KernelAbstractions.@groupreduce(:warp, op, val, neutral) + i == 1 && (y[1] = res) +end + +@kernel function groupreduce_warp_2!(y, x, op, neutral, ::Val{groupsize}) where {groupsize} + i = @index(Global) + val = i > length(x) ? neutral : x[i] + res = KernelAbstractions.@groupreduce(:warp, op, val, neutral, groupsize) + i == 1 && (y[1] = res) +end + +function groupreduce_testsuite(backend, AT) + @testset "@groupreduce" begin + @testset ":thread T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) + x = AT(ones(T, n)) + y = AT(zeros(T, 1)) + + groupreduce_thread_1!(backend(), n)(y, x, +, zero(T); ndrange=n) + @test Array(y)[1] == n + + groupreduce_thread_2!(backend())(y, x, +, zero(T), Val(128); ndrange=n) + @test Array(y)[1] == 128 + + groupreduce_thread_2!(backend())(y, x, +, zero(T), Val(64); ndrange=n) + @test Array(y)[1] == 64 + end + + @testset ":warp T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) + x = AT(ones(T, n)) + y = AT(zeros(T, 1)) + groupreduce_warp_1!(backend(), n)(y, x, +, zero(T); ndrange=n) + @test Array(y)[1] == n + + groupreduce_warp_2!(backend())(y, x, +, zero(T), Val(128); ndrange=n) + @test Array(y)[1] == 128 + + groupreduce_warp_2!(backend())(y, x, +, zero(T), Val(64); ndrange=n) + @test Array(y)[1] == 64 + end + end +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 29f78027..a63a8442 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -38,58 +38,63 @@ include("reflection.jl") include("examples.jl") include("convert.jl") include("specialfunctions.jl") +include("groupreduce.jl") function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{String}()) - @conditional_testset "Unittests" skip_tests begin - unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) - end + # @conditional_testset "Unittests" skip_tests begin + # unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) + # end - @conditional_testset "SpecialFunctions" skip_tests begin - specialfunctions_testsuite(backend) - end + # @conditional_testset "SpecialFunctions" skip_tests begin + # specialfunctions_testsuite(backend) + # end - @conditional_testset "Localmem" skip_tests begin - localmem_testsuite(backend, AT) - end + # @conditional_testset "Localmem" skip_tests begin + # localmem_testsuite(backend, AT) + # end - @conditional_testset "Private" skip_tests begin - private_testsuite(backend, AT) - end + # @conditional_testset "Private" skip_tests begin + # private_testsuite(backend, AT) + # end - @conditional_testset "Unroll" skip_tests begin - unroll_testsuite(backend, AT) - end + # @conditional_testset "Unroll" skip_tests begin + # unroll_testsuite(backend, AT) + # end - @testset "NDIteration" begin - nditeration_testsuite() - end + # @testset "NDIteration" begin + # nditeration_testsuite() + # end - @conditional_testset "copyto!" skip_tests begin - copyto_testsuite(backend, AT) - end + # @conditional_testset "copyto!" skip_tests begin + # copyto_testsuite(backend, AT) + # end - @conditional_testset "Devices" skip_tests begin - devices_testsuite(backend) - end + # @conditional_testset "Devices" skip_tests begin + # devices_testsuite(backend) + # end - @conditional_testset "Printing" skip_tests begin - printing_testsuite(backend) - end + # @conditional_testset "Printing" skip_tests begin + # printing_testsuite(backend) + # end - @conditional_testset "Compiler" skip_tests begin - compiler_testsuite(backend, AT) - end + # @conditional_testset "Compiler" skip_tests begin + # compiler_testsuite(backend, AT) + # end - @conditional_testset "Reflection" skip_tests begin - reflection_testsuite(backend, backend_str, AT) - end + # @conditional_testset "Reflection" skip_tests begin + # reflection_testsuite(backend, backend_str, AT) + # end - @conditional_testset "Convert" skip_tests begin - convert_testsuite(backend, AT) - end + # @conditional_testset "Convert" skip_tests begin + # convert_testsuite(backend, AT) + # end + + # @conditional_testset "Examples" skip_tests begin + # examples_testsuite(backend_str) + # end - @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend_str) + @conditional_testset "@groupreduce" skip_tests begin + groupreduce_testsuite(backend, AT) end return From ff4097fae7a1f620fcf0c10cea5c63500331abf9 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 30 Jan 2025 20:55:57 +0200 Subject: [PATCH 02/11] Simplify algo selection --- docs/src/api.md | 5 +-- src/reduce.jl | 61 ++++++++++++++++------------------- test/groupreduce.jl | 52 ++++++++++++------------------ test/testsuite.jl | 78 ++++++++++++++++++++++----------------------- 4 files changed, 91 insertions(+), 105 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 9373d231..49e53f4b 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -13,13 +13,14 @@ @uniform @groupsize @ndrange -synchronize -allocate +@groupreduce ``` ## Host language ```@docs +synchronize +allocate KernelAbstractions.zeros ``` diff --git a/src/reduce.jl b/src/reduce.jl index 4188e511..b24d709d 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,18 +1,28 @@ +export @groupreduce, Reduction + +module Reduction + const thread = Val(:thread) + const warp = Val(:warp) +end + """ - @groupreduce algo op val neutral [groupsize] + @groupreduce op val neutral algo [groupsize] Perform group reduction of `val` using `op`. # Arguments - `algo` specifies which reduction algorithm to use: - - `:thread`: + - `Reduction.thread`: Perform thread group reduction (requires `groupsize * sizeof(T)` bytes of shared memory). Available accross all backends. - - `:warp`: + - `Reduction.warp`: Perform warp group reduction (requires `32 * sizeof(T)` bytes of shared memory). + Potentially faster, since requires fewer writes to shared memory. + To query if backend supports warp reduction, use `supports_warp_reduction(backend)`. - `neutral` should be a neutral w.r.t. `op`, such that `op(neutral, x) == x`. + - `groupsize` specifies size of the workgroup. If a kernel does not specifies `groupsize` statically, then it is required to provide `groupsize`. @@ -23,53 +33,37 @@ Perform group reduction of `val` using `op`. Result of the reduction. """ -macro groupreduce(algo, op, val, neutral) - f = if algo.value == :thread - __groupreduce - elseif algo.value == :warp - __warp_groupreduce - else - error( - "@groupreduce supports only :thread or :warp as a reduction algorithm, " * - "but $(algo.value) was specified.") - end +macro groupreduce(op, val, neutral, algo) quote - $f( + __groupreduce( $(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), Val(prod($groupsize($(esc(:__ctx__))))), + $(esc(algo)), ) end end -macro groupreduce(algo, op, val, neutral, groupsize) - f = if algo.value == :thread - __groupreduce - elseif algo.value == :warp - __warp_groupreduce - else - error( - "@groupreduce supports only :thread or :warp as a reduction algorithm, " * - "but $(algo.value) was specified.") - end +macro groupreduce(op, val, neutral, algo, groupsize) quote - $f( + __groupreduce( $(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), Val($(esc(groupsize))), + $(esc(algo)), ) end end -function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where {T, groupsize} +function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{:thread}) where {T, groupsize} storage = @localmem T groupsize local_idx = @index(Local) - local_idx ≤ groupsize && (storage[local_idx] = val) + @inbounds local_idx ≤ groupsize && (storage[local_idx] = val) @synchronize() s::UInt64 = groupsize ÷ 0x2 @@ -77,7 +71,7 @@ function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where if (local_idx - 0x1) < s other_idx = local_idx + s if other_idx ≤ groupsize - storage[local_idx] = op(storage[local_idx], storage[other_idx]) + @inbounds storage[local_idx] = op(storage[local_idx], storage[other_idx]) end end @synchronize() @@ -85,7 +79,7 @@ function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where end if local_idx == 0x1 - val = storage[local_idx] + @inbounds val = storage[local_idx] end return val end @@ -98,8 +92,9 @@ macro shfl_down(val, offset) end end -# Backends should implement this. +# Backends should implement these two. function __shfl_down end +supports_warp_reduction(::CPU) = false @inline function __warp_reduce(val, op) offset::UInt32 = UInt32(32) ÷ 0x2 @@ -115,7 +110,7 @@ const __warpsize::UInt32 = 32 # Maximum number of warps (for a groupsize = 1024). const __warp_bins::UInt32 = 32 -function __warp_groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where {T, groupsize} +function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{:warp}) where {T, groupsize} storage = @localmem T __warp_bins local_idx = @index(Local) @@ -124,12 +119,12 @@ function __warp_groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) w # Each warp performs a reduction and writes results into its own bin in `storage`. val = __warp_reduce(val, op) - lane == 0x1 && (storage[warp_id] = val) + @inbounds lane == 0x1 && (storage[warp_id] = val) @synchronize() # Final reduction of the `storage` on the first warp. within_storage = (local_idx - 0x1) < groupsize ÷ __warpsize - val = within_storage ? storage[lane] : neutral + @inbounds val = within_storage ? storage[lane] : neutral warp_id == 0x1 && (val = __warp_reduce(val, op)) return val end diff --git a/test/groupreduce.jl b/test/groupreduce.jl index b871e08d..7fb2339c 100644 --- a/test/groupreduce.jl +++ b/test/groupreduce.jl @@ -1,58 +1,48 @@ -@kernel function groupreduce_thread_1!(y, x, op, neutral) +@kernel function groupreduce_1!(y, x, op, neutral, algo) i = @index(Global) val = i > length(x) ? neutral : x[i] - res = KernelAbstractions.@groupreduce(:thread, op, val, neutral) + res = @groupreduce(op, val, neutral, algo) i == 1 && (y[1] = res) end -@kernel function groupreduce_thread_2!(y, x, op, neutral, ::Val{groupsize}) where {groupsize} +@kernel function groupreduce_2!(y, x, op, neutral, algo, ::Val{groupsize}) where {groupsize} i = @index(Global) val = i > length(x) ? neutral : x[i] - res = KernelAbstractions.@groupreduce(:thread, op, val, neutral, groupsize) - i == 1 && (y[1] = res) -end - -@kernel function groupreduce_warp_1!(y, x, op, neutral) - i = @index(Global) - val = i > length(x) ? neutral : x[i] - res = KernelAbstractions.@groupreduce(:warp, op, val, neutral) - i == 1 && (y[1] = res) -end - -@kernel function groupreduce_warp_2!(y, x, op, neutral, ::Val{groupsize}) where {groupsize} - i = @index(Global) - val = i > length(x) ? neutral : x[i] - res = KernelAbstractions.@groupreduce(:warp, op, val, neutral, groupsize) + res = @groupreduce(op, val, neutral, algo, groupsize) i == 1 && (y[1] = res) end function groupreduce_testsuite(backend, AT) @testset "@groupreduce" begin - @testset ":thread T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) + @testset "thread reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) x = AT(ones(T, n)) y = AT(zeros(T, 1)) - groupreduce_thread_1!(backend(), n)(y, x, +, zero(T); ndrange=n) + groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.thread; ndrange=n) @test Array(y)[1] == n - groupreduce_thread_2!(backend())(y, x, +, zero(T), Val(128); ndrange=n) + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(128); ndrange=n) @test Array(y)[1] == 128 - groupreduce_thread_2!(backend())(y, x, +, zero(T), Val(64); ndrange=n) + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(64); ndrange=n) @test Array(y)[1] == 64 end - @testset ":warp T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) - x = AT(ones(T, n)) - y = AT(zeros(T, 1)) - groupreduce_warp_1!(backend(), n)(y, x, +, zero(T); ndrange=n) - @test Array(y)[1] == n + warp_reduction = KernelAbstractions.supports_warp_reduction(backend()) + if warp_reduction + @testset "warp reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) - groupreduce_warp_2!(backend())(y, x, +, zero(T), Val(128); ndrange=n) - @test Array(y)[1] == 128 + x = AT(ones(T, n)) + y = AT(zeros(T, 1)) + groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.warp; ndrange=n) + @test Array(y)[1] == n - groupreduce_warp_2!(backend())(y, x, +, zero(T), Val(64); ndrange=n) - @test Array(y)[1] == 64 + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(128); ndrange=n) + @test Array(y)[1] == 128 + + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(64); ndrange=n) + @test Array(y)[1] == 64 + end end end end diff --git a/test/testsuite.jl b/test/testsuite.jl index a63a8442..2a277184 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -41,57 +41,57 @@ include("specialfunctions.jl") include("groupreduce.jl") function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{String}()) - # @conditional_testset "Unittests" skip_tests begin - # unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) - # end + @conditional_testset "Unittests" skip_tests begin + unittest_testsuite(backend, backend_str, backend_mod, DAT; skip_tests) + end - # @conditional_testset "SpecialFunctions" skip_tests begin - # specialfunctions_testsuite(backend) - # end + @conditional_testset "SpecialFunctions" skip_tests begin + specialfunctions_testsuite(backend) + end - # @conditional_testset "Localmem" skip_tests begin - # localmem_testsuite(backend, AT) - # end + @conditional_testset "Localmem" skip_tests begin + localmem_testsuite(backend, AT) + end - # @conditional_testset "Private" skip_tests begin - # private_testsuite(backend, AT) - # end + @conditional_testset "Private" skip_tests begin + private_testsuite(backend, AT) + end - # @conditional_testset "Unroll" skip_tests begin - # unroll_testsuite(backend, AT) - # end + @conditional_testset "Unroll" skip_tests begin + unroll_testsuite(backend, AT) + end - # @testset "NDIteration" begin - # nditeration_testsuite() - # end + @testset "NDIteration" begin + nditeration_testsuite() + end - # @conditional_testset "copyto!" skip_tests begin - # copyto_testsuite(backend, AT) - # end + @conditional_testset "copyto!" skip_tests begin + copyto_testsuite(backend, AT) + end - # @conditional_testset "Devices" skip_tests begin - # devices_testsuite(backend) - # end + @conditional_testset "Devices" skip_tests begin + devices_testsuite(backend) + end - # @conditional_testset "Printing" skip_tests begin - # printing_testsuite(backend) - # end + @conditional_testset "Printing" skip_tests begin + printing_testsuite(backend) + end - # @conditional_testset "Compiler" skip_tests begin - # compiler_testsuite(backend, AT) - # end + @conditional_testset "Compiler" skip_tests begin + compiler_testsuite(backend, AT) + end - # @conditional_testset "Reflection" skip_tests begin - # reflection_testsuite(backend, backend_str, AT) - # end + @conditional_testset "Reflection" skip_tests begin + reflection_testsuite(backend, backend_str, AT) + end - # @conditional_testset "Convert" skip_tests begin - # convert_testsuite(backend, AT) - # end + @conditional_testset "Convert" skip_tests begin + convert_testsuite(backend, AT) + end - # @conditional_testset "Examples" skip_tests begin - # examples_testsuite(backend_str) - # end + @conditional_testset "Examples" skip_tests begin + examples_testsuite(backend_str) + end @conditional_testset "@groupreduce" skip_tests begin groupreduce_testsuite(backend, AT) From 6a35eb8e86cad50752cd8e9c063f7f4eace638c9 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 30 Jan 2025 21:14:11 +0200 Subject: [PATCH 03/11] Refactor --- src/reduce.jl | 32 ++++++++++++++++---------------- test/groupreduce.jl | 12 ++++++------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index b24d709d..6f4171b0 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -34,7 +34,7 @@ Perform group reduction of `val` using `op`. Result of the reduction. """ macro groupreduce(op, val, neutral, algo) - quote + return quote __groupreduce( $(esc(:__ctx__)), $(esc(op)), @@ -47,7 +47,7 @@ macro groupreduce(op, val, neutral, algo) end macro groupreduce(op, val, neutral, algo, groupsize) - quote + return quote __groupreduce( $(esc(:__ctx__)), $(esc(op)), @@ -66,19 +66,19 @@ function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{ @inbounds local_idx ≤ groupsize && (storage[local_idx] = val) @synchronize() - s::UInt64 = groupsize ÷ 0x2 - while s > 0x0 - if (local_idx - 0x1) < s + s::UInt64 = groupsize ÷ 0x02 + while s > 0x00 + if (local_idx - 0x01) < s other_idx = local_idx + s if other_idx ≤ groupsize @inbounds storage[local_idx] = op(storage[local_idx], storage[other_idx]) end end @synchronize() - s >>= 0x1 + s >>= 0x01 end - if local_idx == 0x1 + if local_idx == 0x01 @inbounds val = storage[local_idx] end return val @@ -87,7 +87,7 @@ end # Warp groupreduce. macro shfl_down(val, offset) - quote + return quote $__shfl_down($(esc(val)), $(esc(offset))) end end @@ -97,10 +97,10 @@ function __shfl_down end supports_warp_reduction(::CPU) = false @inline function __warp_reduce(val, op) - offset::UInt32 = UInt32(32) ÷ 0x2 - while offset > 0x0 + offset::UInt32 = UInt32(32) ÷ 0x02 + while offset > 0x00 val = op(val, @shfl_down(val, offset)) - offset >>= 0x1 + offset >>= 0x01 end return val end @@ -114,17 +114,17 @@ function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{ storage = @localmem T __warp_bins local_idx = @index(Local) - lane = (local_idx - 0x1) % __warpsize + 0x1 - warp_id = (local_idx - 0x1) ÷ __warpsize + 0x1 + lane = (local_idx - 0x01) % __warpsize + 0x01 + warp_id = (local_idx - 0x01) ÷ __warpsize + 0x01 # Each warp performs a reduction and writes results into its own bin in `storage`. val = __warp_reduce(val, op) - @inbounds lane == 0x1 && (storage[warp_id] = val) + @inbounds lane == 0x01 && (storage[warp_id] = val) @synchronize() # Final reduction of the `storage` on the first warp. - within_storage = (local_idx - 0x1) < groupsize ÷ __warpsize + within_storage = (local_idx - 0x01) < groupsize ÷ __warpsize @inbounds val = within_storage ? storage[lane] : neutral - warp_id == 0x1 && (val = __warp_reduce(val, op)) + warp_id == 0x01 && (val = __warp_reduce(val, op)) return val end diff --git a/test/groupreduce.jl b/test/groupreduce.jl index 7fb2339c..340b183a 100644 --- a/test/groupreduce.jl +++ b/test/groupreduce.jl @@ -18,13 +18,13 @@ function groupreduce_testsuite(backend, AT) x = AT(ones(T, n)) y = AT(zeros(T, 1)) - groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.thread; ndrange=n) + groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.thread; ndrange = n) @test Array(y)[1] == n - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(128); ndrange=n) + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(128); ndrange = n) @test Array(y)[1] == 128 - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(64); ndrange=n) + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(64); ndrange = n) @test Array(y)[1] == 64 end @@ -34,13 +34,13 @@ function groupreduce_testsuite(backend, AT) x = AT(ones(T, n)) y = AT(zeros(T, 1)) - groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.warp; ndrange=n) + groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.warp; ndrange = n) @test Array(y)[1] == n - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(128); ndrange=n) + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(128); ndrange = n) @test Array(y)[1] == 128 - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(64); ndrange=n) + groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(64); ndrange = n) @test Array(y)[1] == 64 end end From 224e8c8fe699ed5fa3b42f979d340fcf3b1f9d0e Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 30 Jan 2025 21:20:24 +0200 Subject: [PATCH 04/11] Correction --- src/reduce.jl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index 6f4171b0..6748599d 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -94,10 +94,15 @@ end # Backends should implement these two. function __shfl_down end -supports_warp_reduction(::CPU) = false +supports_warp_reduction(::Backend) = false + +# Assume warp is 32 lanes. +const __warpsize::UInt32 = 32 +# Maximum number of warps (for a groupsize = 1024). +const __warp_bins::UInt32 = 32 @inline function __warp_reduce(val, op) - offset::UInt32 = UInt32(32) ÷ 0x02 + offset::UInt32 = __warpsize ÷ 0x02 while offset > 0x00 val = op(val, @shfl_down(val, offset)) offset >>= 0x01 @@ -105,11 +110,6 @@ supports_warp_reduction(::CPU) = false return val end -# Assume warp is 32 lanes. -const __warpsize::UInt32 = 32 -# Maximum number of warps (for a groupsize = 1024). -const __warp_bins::UInt32 = 32 - function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{:warp}) where {T, groupsize} storage = @localmem T __warp_bins From 4a8e70739dd30396efd3df63bf7e15330bb4060c Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sat, 1 Feb 2025 12:18:06 +0200 Subject: [PATCH 05/11] Disable groupreduce tests for CPU --- test/testsuite.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/testsuite.jl b/test/testsuite.jl index 2a277184..528321ea 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -93,8 +93,11 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ examples_testsuite(backend_str) end - @conditional_testset "@groupreduce" skip_tests begin - groupreduce_testsuite(backend, AT) + # TODO @index(Local) only works as a top-level expression on CPU. + if backend != CPU + @conditional_testset "@groupreduce" skip_tests begin + groupreduce_testsuite(backend, AT) + end end return From 7c923fb2fb58c6093e8162020844640724b1d430 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sat, 1 Feb 2025 12:19:11 +0200 Subject: [PATCH 06/11] Strip globalvars of types --- src/reduce.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index 6748599d..cd15a850 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -97,9 +97,9 @@ function __shfl_down end supports_warp_reduction(::Backend) = false # Assume warp is 32 lanes. -const __warpsize::UInt32 = 32 +const __warpsize = UInt32(32) # Maximum number of warps (for a groupsize = 1024). -const __warp_bins::UInt32 = 32 +const __warp_bins = UInt32(32) @inline function __warp_reduce(val, op) offset::UInt32 = __warpsize ÷ 0x02 From a6479922d8beec7a37db8978e4c1cbcc9089d543 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Sat, 1 Feb 2025 14:45:40 +0200 Subject: [PATCH 07/11] Limit groupsize to 256 for oneAPI --- test/groupreduce.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/groupreduce.jl b/test/groupreduce.jl index 340b183a..40f0364b 100644 --- a/test/groupreduce.jl +++ b/test/groupreduce.jl @@ -13,8 +13,12 @@ end end function groupreduce_testsuite(backend, AT) + # TODO should be better way of querying max groupsize + groupsizes = "$backend" == "oneAPIBackend" ? + (256,) : + (256, 512, 1024) @testset "@groupreduce" begin - @testset "thread reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) + @testset "thread reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in groupsizes x = AT(ones(T, n)) y = AT(zeros(T, 1)) @@ -30,8 +34,7 @@ function groupreduce_testsuite(backend, AT) warp_reduction = KernelAbstractions.supports_warp_reduction(backend()) if warp_reduction - @testset "warp reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in (256, 512, 1024) - + @testset "warp reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in groupsizes x = AT(ones(T, n)) y = AT(zeros(T, 1)) groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.warp; ndrange = n) From cbc8bd5e6eb220d7d389f10660fd14d1138f6efb Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 4 Feb 2025 00:54:03 +0200 Subject: [PATCH 08/11] Auto-select reduction algorithm & remove at-shfl_down macro --- src/reduce.jl | 84 +++++++++++++++++++++++++-------------------- test/groupreduce.jl | 34 +++++------------- 2 files changed, 55 insertions(+), 63 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index cd15a850..7edf501b 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,4 +1,4 @@ -export @groupreduce, Reduction +export @groupreduce module Reduction const thread = Val(:thread) @@ -6,21 +6,13 @@ module Reduction end """ - @groupreduce op val neutral algo [groupsize] + @groupreduce op val neutral [groupsize] Perform group reduction of `val` using `op`. +If backend supports warp reduction, it will use it instead of thread reduction. # Arguments -- `algo` specifies which reduction algorithm to use: - - `Reduction.thread`: - Perform thread group reduction (requires `groupsize * sizeof(T)` bytes of shared memory). - Available accross all backends. - - `Reduction.warp`: - Perform warp group reduction (requires `32 * sizeof(T)` bytes of shared memory). - Potentially faster, since requires fewer writes to shared memory. - To query if backend supports warp reduction, use `supports_warp_reduction(backend)`. - - `neutral` should be a neutral w.r.t. `op`, such that `op(neutral, x) == x`. - `groupsize` specifies size of the workgroup. @@ -33,29 +25,51 @@ Perform group reduction of `val` using `op`. Result of the reduction. """ -macro groupreduce(op, val, neutral, algo) +macro groupreduce(op, val, neutral) return quote - __groupreduce( - $(esc(:__ctx__)), - $(esc(op)), - $(esc(val)), - $(esc(neutral)), - Val(prod($groupsize($(esc(:__ctx__))))), - $(esc(algo)), - ) + if __supports_warp_reduction() + __groupreduce( + $(esc(:__ctx__)), + $(esc(op)), + $(esc(val)), + $(esc(neutral)), + Val(prod($groupsize($(esc(:__ctx__))))), + $(esc(Reduction.warp)), + ) + else + __groupreduce( + $(esc(:__ctx__)), + $(esc(op)), + $(esc(val)), + $(esc(neutral)), + Val(prod($groupsize($(esc(:__ctx__))))), + $(esc(Reduction.thread)), + ) + end end end -macro groupreduce(op, val, neutral, algo, groupsize) +macro groupreduce(op, val, neutral, groupsize) return quote - __groupreduce( - $(esc(:__ctx__)), - $(esc(op)), - $(esc(val)), - $(esc(neutral)), - Val($(esc(groupsize))), - $(esc(algo)), - ) + if __supports_warp_reduction() + __groupreduce( + $(esc(:__ctx__)), + $(esc(op)), + $(esc(val)), + $(esc(neutral)), + Val($(esc(groupsize))), + $(esc(Reduction.warp)), + ) + else + __groupreduce( + $(esc(:__ctx__)), + $(esc(op)), + $(esc(val)), + $(esc(neutral)), + Val($(esc(groupsize))), + $(esc(Reduction.thread)), + ) + end end end @@ -86,15 +100,9 @@ end # Warp groupreduce. -macro shfl_down(val, offset) - return quote - $__shfl_down($(esc(val)), $(esc(offset))) - end -end - -# Backends should implement these two. +# NOTE: Backends should implement these two device functions (with `@device_override`). function __shfl_down end -supports_warp_reduction(::Backend) = false +function __supports_warp_reduction() end # Assume warp is 32 lanes. const __warpsize = UInt32(32) @@ -104,7 +112,7 @@ const __warp_bins = UInt32(32) @inline function __warp_reduce(val, op) offset::UInt32 = __warpsize ÷ 0x02 while offset > 0x00 - val = op(val, @shfl_down(val, offset)) + val = op(val, __shfl_down(val, offset)) offset >>= 0x01 end return val diff --git a/test/groupreduce.jl b/test/groupreduce.jl index 40f0364b..170f770b 100644 --- a/test/groupreduce.jl +++ b/test/groupreduce.jl @@ -1,51 +1,35 @@ -@kernel function groupreduce_1!(y, x, op, neutral, algo) +@kernel cpu=false function groupreduce_1!(y, x, op, neutral) i = @index(Global) val = i > length(x) ? neutral : x[i] - res = @groupreduce(op, val, neutral, algo) + res = @groupreduce(op, val, neutral) i == 1 && (y[1] = res) end -@kernel function groupreduce_2!(y, x, op, neutral, algo, ::Val{groupsize}) where {groupsize} +@kernel cpu=false function groupreduce_2!(y, x, op, neutral, ::Val{groupsize}) where {groupsize} i = @index(Global) val = i > length(x) ? neutral : x[i] - res = @groupreduce(op, val, neutral, algo, groupsize) + res = @groupreduce(op, val, neutral, groupsize) i == 1 && (y[1] = res) end function groupreduce_testsuite(backend, AT) - # TODO should be better way of querying max groupsize + # TODO should be a better way of querying max groupsize groupsizes = "$backend" == "oneAPIBackend" ? (256,) : (256, 512, 1024) @testset "@groupreduce" begin - @testset "thread reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in groupsizes + @testset "T=$T, n=$n" for T in (Float16, Float32, Float64, Int16, Int32, Int64), n in groupsizes x = AT(ones(T, n)) y = AT(zeros(T, 1)) - groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.thread; ndrange = n) + groupreduce_1!(backend(), n)(y, x, +, zero(T); ndrange = n) @test Array(y)[1] == n - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(128); ndrange = n) + groupreduce_2!(backend())(y, x, +, zero(T), Val(128); ndrange = n) @test Array(y)[1] == 128 - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.thread, Val(64); ndrange = n) + groupreduce_2!(backend())(y, x, +, zero(T), Val(64); ndrange = n) @test Array(y)[1] == 64 end - - warp_reduction = KernelAbstractions.supports_warp_reduction(backend()) - if warp_reduction - @testset "warp reduction T=$T, n=$n" for T in (Float16, Float32, Int32, Int64), n in groupsizes - x = AT(ones(T, n)) - y = AT(zeros(T, 1)) - groupreduce_1!(backend(), n)(y, x, +, zero(T), Reduction.warp; ndrange = n) - @test Array(y)[1] == n - - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(128); ndrange = n) - @test Array(y)[1] == 128 - - groupreduce_2!(backend())(y, x, +, zero(T), Reduction.warp, Val(64); ndrange = n) - @test Array(y)[1] == 64 - end - end end end From bb7727088b22a304b04538e2da19e14c336f6a31 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 4 Feb 2025 01:36:57 +0200 Subject: [PATCH 09/11] Fix default algo selection --- src/reduce.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/reduce.jl b/src/reduce.jl index 7edf501b..fda70796 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -102,7 +102,9 @@ end # NOTE: Backends should implement these two device functions (with `@device_override`). function __shfl_down end -function __supports_warp_reduction() end +function __supports_warp_reduction() + return false +end # Assume warp is 32 lanes. const __warpsize = UInt32(32) From db5abc5c6a93bc1da6cfe1e8cc9dc125ea324001 Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Tue, 4 Feb 2025 01:40:44 +0200 Subject: [PATCH 10/11] Don't test on Float64 --- test/groupreduce.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/groupreduce.jl b/test/groupreduce.jl index 170f770b..aec5d9dc 100644 --- a/test/groupreduce.jl +++ b/test/groupreduce.jl @@ -18,7 +18,7 @@ function groupreduce_testsuite(backend, AT) (256,) : (256, 512, 1024) @testset "@groupreduce" begin - @testset "T=$T, n=$n" for T in (Float16, Float32, Float64, Int16, Int32, Int64), n in groupsizes + @testset "T=$T, n=$n" for T in (Float16, Float32, Int16, Int32, Int64), n in groupsizes x = AT(ones(T, n)) y = AT(zeros(T, 1)) From 618c840f5251a442d81d639b73f9133ab96fd03f Mon Sep 17 00:00:00 2001 From: Anton Smirnov Date: Thu, 6 Feb 2025 01:36:58 +0200 Subject: [PATCH 11/11] Separate algorithms --- src/reduce.jl | 65 +++++++++++---------------------------------------- 1 file changed, 13 insertions(+), 52 deletions(-) diff --git a/src/reduce.jl b/src/reduce.jl index fda70796..94b58f28 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,9 +1,4 @@ -export @groupreduce - -module Reduction - const thread = Val(:thread) - const warp = Val(:warp) -end +export @groupreduce, @warp_groupreduce """ @groupreduce op val neutral [groupsize] @@ -25,55 +20,21 @@ If backend supports warp reduction, it will use it instead of thread reduction. Result of the reduction. """ -macro groupreduce(op, val, neutral) - return quote - if __supports_warp_reduction() - __groupreduce( - $(esc(:__ctx__)), - $(esc(op)), - $(esc(val)), - $(esc(neutral)), - Val(prod($groupsize($(esc(:__ctx__))))), - $(esc(Reduction.warp)), - ) - else - __groupreduce( - $(esc(:__ctx__)), - $(esc(op)), - $(esc(val)), - $(esc(neutral)), - Val(prod($groupsize($(esc(:__ctx__))))), - $(esc(Reduction.thread)), - ) - end - end +macro groupreduce(op, val) + :(__thread_groupreduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), Val(prod($groupsize($(esc(:__ctx__))))))) +end +macro groupreduce(op, val, groupsize) + :(__thread_groupreduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), Val($(esc(groupsize))))) end -macro groupreduce(op, val, neutral, groupsize) - return quote - if __supports_warp_reduction() - __groupreduce( - $(esc(:__ctx__)), - $(esc(op)), - $(esc(val)), - $(esc(neutral)), - Val($(esc(groupsize))), - $(esc(Reduction.warp)), - ) - else - __groupreduce( - $(esc(:__ctx__)), - $(esc(op)), - $(esc(val)), - $(esc(neutral)), - Val($(esc(groupsize))), - $(esc(Reduction.thread)), - ) - end - end +macro warp_groupreduce(op, val, neutral) + :(__warp_groupreduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), Val(prod($groupsize($(esc(:__ctx__))))))) +end +macro warp_groupreduce(op, val, neutral, groupsize) + :(__warp_groupreduce($(esc(:__ctx__)), $(esc(op)), $(esc(val)), $(esc(neutral)), Val($(esc(groupsize))))) end -function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{:thread}) where {T, groupsize} +function __thread_groupreduce(__ctx__, op, val::T, ::Val{groupsize}) where {T, groupsize} storage = @localmem T groupsize local_idx = @index(Local) @@ -120,7 +81,7 @@ const __warp_bins = UInt32(32) return val end -function __groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}, ::Val{:warp}) where {T, groupsize} +function __warp_groupreduce(__ctx__, op, val::T, neutral::T, ::Val{groupsize}) where {T, groupsize} storage = @localmem T __warp_bins local_idx = @index(Local)