From 77f1ee03942cfc7515ae103727c93f0e72920b82 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 15 Jan 2025 12:00:44 +0100 Subject: [PATCH] Use POCL as a CPU backend --- .github/workflows/ci.yml | 6 +- Project.toml | 35 +- src/KernelAbstractions.jl | 71 +- src/cpu.jl | 1 - src/macros.jl | 224 +---- src/nditeration.jl | 18 +- src/pocl/backend.jl | 204 +++++ src/pocl/compiler/compilation.jl | 76 ++ src/pocl/compiler/execution.jl | 217 +++++ src/pocl/compiler/reflection.jl | 75 ++ src/pocl/device/array.jl | 258 ++++++ src/pocl/device/quirks.jl | 59 ++ src/pocl/device/runtime.jl | 11 + src/pocl/nanoOpenCL.jl | 1434 ++++++++++++++++++++++++++++++ src/pocl/pocl.jl | 73 ++ test/compiler.jl | 79 -- test/convert.jl | 16 +- test/reflection.jl | 24 +- test/runtests.jl | 9 - test/test.jl | 36 +- test/testsuite.jl | 5 - 21 files changed, 2487 insertions(+), 444 deletions(-) create mode 100644 src/pocl/backend.jl create mode 100644 src/pocl/compiler/compilation.jl create mode 100644 src/pocl/compiler/execution.jl create mode 100644 src/pocl/compiler/reflection.jl create mode 100644 src/pocl/device/array.jl create mode 100644 src/pocl/device/quirks.jl create mode 100644 src/pocl/device/runtime.jl create mode 100644 src/pocl/nanoOpenCL.jl create mode 100644 src/pocl/pocl.jl delete mode 100644 test/compiler.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f80d0651b..7a4d363c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,12 +22,8 @@ jobs: fail-fast: false matrix: version: - - '1.6' - - '1.7' - - '1.8' - - '1.9' - '1.10' - - '~1.11.0-0' + - '1.11' os: - ubuntu-latest - macOS-latest diff --git a/Project.toml b/Project.toml index bdf45fdc7..184820b0c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,20 +1,32 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.9.33" +version = "0.10.0-dev" [deps] Adapt = 
"79e6a3ab-5dfb-504d-930d-738a2a938a0e" Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -Requires = "ae029012-a4dd-5104-9daa-d747884805df" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" +pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd" + +[weakdeps] +EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[extensions] +EnzymeExt = "EnzymeCore" +LinearAlgebraExt = "LinearAlgebra" +SparseArraysExt = "SparseArrays" [compat] Adapt = "0.4, 1.0, 2.0, 3.0, 4" @@ -24,23 +36,12 @@ InteractiveUtils = "1.6" LinearAlgebra = "1.6" MacroTools = "0.5" PrecompileTools = "1" -Requires = "1.3" SparseArrays = "<0.0.1, 1.6" StaticArrays = "0.12, 1.0" UUIDs = "<0.0.1, 1.6" -julia = "1.6" - -[extensions] -EnzymeExt = "EnzymeCore" -LinearAlgebraExt = "LinearAlgebra" -SparseArraysExt = "SparseArrays" +julia = "1.10" [extras] EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[weakdeps] -EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 
b82dadc54..1211d2fe5 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -65,6 +65,9 @@ This allows for two different configurations: !!! warn This is an experimental feature. + +!!! note + `cpu={true, false}` is deprecated for KernelAbstractions 1.0 """ macro kernel(ex...) if length(ex) == 1 @@ -184,6 +187,8 @@ After releasing the memory of an array, it should no longer be accessed. """ function unsafe_free! end +unsafe_free!(::AbstractArray) = return + ### # Kernel language # - @localmem @@ -248,6 +253,9 @@ For storage that only persists between `@synchronize` statements, an `MArray` ca instead. See also [`@uniform`](@ref). + +!!! note + `@private` is deprecated for KernelAbstractions 1.0 """ macro private(T, dims) if dims isa Integer @@ -263,6 +271,9 @@ end Creates a private local of `mem` per item in the workgroup. This can be safely used across [`@synchronize`](@ref) statements. + +!!! note + `@private` is deprecated for KernelAbstractions 1.0 """ macro private(expr) return esc(expr) @@ -273,6 +284,9 @@ end `expr` is evaluated outside the workitem scope. This is useful for variable declarations that span workitems, or are reused across `@synchronize` statements. + +!!! note + `@uniform` is deprecated for KernelAbstractions 1.0 """ macro uniform(value) return esc(value) @@ -316,6 +330,8 @@ Access the hidden context object used by KernelAbstractions. !!! warn Only valid to be used from a kernel with `cpu=false`. +!!! note + `@context` will be supported on all backends in KernelAbstractions 1.0 ``` function f(@context, a) I = @index(Global, Linear) @@ -464,31 +480,11 @@ Abstract type for all GPU based KernelAbstractions backends. !!! note New backend implementations **must** sub-type this abstract type. -""" -abstract type GPU <: Backend end - -""" - CPU(; static=false) - -Instantiate a CPU (multi-threaded) backend. - -## Options: - - `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code. - Defaults to false. 
-""" -struct CPU <: Backend - static::Bool - CPU(; static::Bool = false) = new(static) -end - -""" - isgpu(::Backend)::Bool -Returns true for all [`GPU`](@ref) backends. +!!! note + `GPU` will be removed in KernelAbstractions v1.0 """ -isgpu(::GPU) = true -isgpu(::CPU) = false - +abstract type GPU <: Backend end """ get_backend(A::AbstractArray)::Backend @@ -504,12 +500,9 @@ function get_backend end # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.: get_backend(A::AbstractArray) = get_backend(parent(A)) -get_backend(::Array) = CPU() - # Define: # adapt_storage(::Backend, a::Array) = adapt(BackendArray, a) # adapt_storage(::Backend, a::BackendArray) = a -Adapt.adapt_storage(::CPU, a::Array) = a """ allocate(::Backend, Type, dims...)::AbstractArray @@ -729,7 +722,7 @@ Partition a kernel for the given ndrange and workgroupsize. return iterspace, dynamic end -function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName} +function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: GPU, S <: _Size, NDRange <: _Size, XPUName} return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name) end @@ -746,6 +739,10 @@ include("compiler.jl") function __workitems_iterspace end function __validindex end +# for reflection +function mkcontext end +function launch_config end + include("macros.jl") ### @@ -815,8 +812,11 @@ end end # CPU backend +include("pocl/pocl.jl") +using .POCL +export POCLBackend -include("cpu.jl") +const CPU = POCLBackend # precompile PrecompileTools.@compile_workload begin @@ -830,19 +830,4 @@ PrecompileTools.@compile_workload begin end end -if !isdefined(Base, :get_extension) - using Requires -end - -@static if !isdefined(Base, :get_extension) - function __init__() - @require EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" include("../ext/EnzymeExt.jl") - end -end - -if !isdefined(Base, 
:get_extension) - include("../ext/LinearAlgebraExt.jl") - include("../ext/SparseArraysExt.jl") -end - end #module diff --git a/src/cpu.jl b/src/cpu.jl index ac1f970f2..e383386f7 100644 --- a/src/cpu.jl +++ b/src/cpu.jl @@ -1,4 +1,3 @@ -unsafe_free!(::AbstractArray) = return synchronize(::CPU) = nothing allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) diff --git a/src/macros.jl b/src/macros.jl index 02b93ed72..074b109fe 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -31,19 +31,6 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) constargs[i] = false end - # create two functions - # 1. GPU function - # 2. CPU function with work-group loops inserted - # - # Without the deepcopy we might accidentially modify expr shared between CPU and GPU - cpu_name = Symbol(:cpu_, name) - if generate_cpu - def_cpu = deepcopy(def) - def_cpu[:name] = cpu_name - transform_cpu!(def_cpu, constargs, force_inbounds) - cpu_function = combinedef(def_cpu) - end - def_gpu = deepcopy(def) def_gpu[:name] = gpu_name = Symbol(:gpu_, name) transform_gpu!(def_gpu, constargs, force_inbounds) @@ -56,24 +43,12 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize()) $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range)) function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S <: $_Size, NDRange <: $_Size} - if $isgpu(dev) - return $construct(dev, sz, range, $gpu_name) - else - if $generate_cpu - return $construct(dev, sz, range, $cpu_name) - else - error("This kernel is unavailable for backend CPU") - end - end + return $construct(dev, sz, range, $gpu_name) end end end - if generate_cpu - return Expr(:block, esc(cpu_function), esc(gpu_function), esc(constructors)) - else - return Expr(:block, esc(gpu_function), esc(constructors)) - end + return Expr(:block, esc(gpu_function), esc(constructors)) end # The easy case, transform the function for GPU 
execution @@ -105,198 +80,3 @@ function transform_gpu!(def, constargs, force_inbounds) ) return end - -# The hard case, transform the function for CPU execution -# - mark constant arguments by applying `constify`. -# - insert aliasscope markers -# - insert implied loop bodys -# - handle indicies -# - hoist workgroup definitions -# - hoist uniform variables -function transform_cpu!(def, constargs, force_inbounds) - let_constargs = Expr[] - for (i, arg) in enumerate(def[:args]) - if constargs[i] - push!(let_constargs, :($arg = $constify($arg))) - end - end - pushfirst!(def[:args], :__ctx__) - new_stmts = Expr[] - body = MacroTools.flatten(def[:body]) - push!(new_stmts, Expr(:aliasscope)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, true)) - end - append!(new_stmts, split(body.args)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, :pop)) - end - push!(new_stmts, Expr(:popaliasscope)) - push!(new_stmts, :(return nothing)) - def[:body] = Expr( - :let, - Expr(:block, let_constargs...), - Expr(:block, new_stmts...), - ) - return -end - -struct WorkgroupLoop - indicies::Vector{Any} - stmts::Vector{Any} - allocations::Vector{Any} - private_allocations::Vector{Any} - private::Set{Symbol} -end - -is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_)) - -function is_scope_construct(expr::Expr) - return expr.head === :block # || - # expr.head === :let -end - -function find_sync(stmt) - result = false - postwalk(stmt) do expr - result |= is_sync(expr) - expr - end - return result -end - -# TODO proper handling of LineInfo -function split( - stmts, - indicies = Any[], private = Set{Symbol}(), - ) - # 1. Split the code into blocks separated by `@synchronize` - # 2. Aggregate `@index` expressions - # 3. Hoist allocations - # 4. 
Hoist uniforms - - current = Any[] - allocations = Any[] - private_allocations = Any[] - new_stmts = Any[] - for stmt in stmts - has_sync = find_sync(stmt) - if has_sync - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - allocations = Any[] - private_allocations = Any[] - current = Any[] - is_sync(stmt) && continue - - # Recurse into scope constructs - # TODO: This currently implements hard scoping - # probably need to implemet soft scoping - # by not deepcopying the environment. - recurse(x) = x - function recurse(expr::Expr) - expr = unblock(expr) - if is_scope_construct(expr) && any(find_sync, expr.args) - new_args = unblock(split(expr.args, deepcopy(indicies), deepcopy(private))) - return Expr(expr.head, new_args...) - else - return Expr(expr.head, map(recurse, expr.args)...) - end - end - push!(new_stmts, recurse(stmt)) - continue - end - - if @capture(stmt, @uniform x_) - push!(allocations, stmt) - continue - elseif @capture(stmt, @private lhs_ = rhs_) - push!(private, lhs) - push!(private_allocations, :($lhs = $rhs)) - continue - elseif @capture(stmt, lhs_ = rhs_ | (vs__, lhs_ = rhs_)) - if @capture(rhs, @index(args__)) - push!(indicies, stmt) - continue - elseif @capture(rhs, @localmem(args__) | @uniform(args__)) - push!(allocations, stmt) - continue - elseif @capture(rhs, @private(T_, dims_)) - # Implement the legacy `mem = @private T dims` as - # mem = Scratchpad(T, Val(dims)) - - if dims isa Integer - dims = (dims,) - end - alloc = :($Scratchpad(__ctx__, $T, Val($dims))) - push!(allocations, :($lhs = $alloc)) - push!(private, lhs) - continue - end - end - - push!(current, stmt) - end - - # everything since the last `@synchronize` - if !isempty(current) - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - end - return new_stmts -end - -function emit(loop) - idx = gensym(:I) - for stmt 
in loop.indicies - # splice index into the i = @index(Cartesian, $idx) - @assert stmt.head === :(=) - rhs = stmt.args[2] - push!(rhs.args, idx) - end - stmts = Any[] - append!(stmts, loop.allocations) - - # private_allocations turn into lhs = ntuple(i->rhs, length(__workitems_iterspace())) - N = gensym(:N) - push!(stmts, :($N = length($__workitems_iterspace(__ctx__)))) - - for stmt in loop.private_allocations - if @capture(stmt, lhs_ = rhs_) - push!(stmts, :($lhs = ntuple(_ -> $rhs, $N))) - else - error("@private $stmt not an assignment") - end - end - - # don't emit empty loops - if !(isempty(loop.stmts) || all(s -> s isa LineNumberNode, loop.stmts)) - body = Expr(:block, loop.stmts...) - body = postwalk(body) do expr - if @capture(expr, lhs_ = rhs_) - if lhs in loop.private - error("Can't assign to variables marked private") - end - elseif @capture(expr, A_[i__]) - if A in loop.private - return :($A[$__index_Local_Linear(__ctx__, $(idx))][$(i...)]) - end - elseif expr isa Symbol - if expr in loop.private - return :($expr[$__index_Local_Linear(__ctx__, $(idx))]) - end - end - return expr - end - loopexpr = quote - for $idx in $__workitems_iterspace(__ctx__) - $__validindex(__ctx__, $idx) || continue - $(loop.indicies...) - $(unblock(body)) - end - end - push!(stmts, loopexpr) - end - - return unblock(Expr(:block, stmts...)) -end diff --git a/src/nditeration.jl b/src/nditeration.jl index cd05b2dd4..24ff094cf 100644 --- a/src/nditeration.jl +++ b/src/nditeration.jl @@ -138,18 +138,14 @@ needs to perform dynamic bounds-checking. 
""" @inline function partition(ndrange, __workgroupsize) @assert length(__workgroupsize) <= length(ndrange) - if length(__workgroupsize) < length(ndrange) - # pad workgroupsize with ones - workgroupsize = ntuple(Val(length(ndrange))) do I - Base.@_inline_meta - if I > length(__workgroupsize) - return 1 - else - return __workgroupsize[I] - end + # pad workgroupsize with ones + workgroupsize = ntuple(Val(length(ndrange))) do I + Base.@_inline_meta + if I > length(__workgroupsize) || __workgroupsize[I] == 0 + return 1 + else + return __workgroupsize[I] end - else - workgroupsize = __workgroupsize end let workgroupsize = workgroupsize dynamic = Ref(false) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl new file mode 100644 index 000000000..8f8acfcdf --- /dev/null +++ b/src/pocl/backend.jl @@ -0,0 +1,204 @@ +module POCLKernels + +using ..POCL +using ..POCL: @device_override, SPIRVIntrinsics, cl +using ..POCL: device + +import KernelAbstractions as KA + +import StaticArrays + +import Adapt + + +## Back-end Definition + +export POCLBackend + +struct POCLBackend <: KA.GPU +end + + +## Memory Operations + +KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) + +function KA.zeros(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T} + arr = KA.allocate(backend, T, dims) + kernel = KA.init_kernel(backend) + kernel(arr, zero, T, ndrange = length(arr)) + return arr +end +function KA.ones(backend::POCLBackend, ::Type{T}, dims::Tuple) where {T} + arr = KA.allocate(backend, T, dims) + kernel = KA.init_kernel(backend) + kernel(arr, one, T; ndrange = length(arr)) + return arr +end + +function KA.copyto!(backend::POCLBackend, A, B) + if KA.get_backend(A) == KA.get_backend(B) && KA.get_backend(A) isa POCLBackend + if length(A) != length(B) + error("Arrays must match in length") + end + if Base.mightalias(A, B) + error("Arrays may not alias") + end + kernel = KA.copy_kernel(backend) + kernel(A, B, ndrange = length(A)) + return A + else + return 
Base.copyto!(A, B) + end +end + +KA.functional(::POCLBackend) = true +KA.pagelock!(::POCLBackend, x) = nothing + +KA.get_backend(::Array) = POCLBackend() +KA.synchronize(::POCLBackend) = nothing +KA.supports_float64(::POCLBackend) = true + + +## Kernel Launch + +function KA.mkcontext(kernel::KA.Kernel{POCLBackend}, _ndrange, iterspace) + return KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace) +end +function KA.mkcontext( + kernel::KA.Kernel{POCLBackend}, I, _ndrange, iterspace, + ::Dynamic + ) where {Dynamic} + return KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace) +end + +function KA.launch_config(kernel::KA.Kernel{POCLBackend}, ndrange, workgroupsize) + if ndrange isa Integer + ndrange = (ndrange,) + end + if workgroupsize isa Integer + workgroupsize = (workgroupsize,) + end + + # partition checked that the ndrange's agreed + if KA.ndrange(kernel) <: KA.StaticSize + ndrange = nothing + end + + iterspace, dynamic = if KA.workgroupsize(kernel) <: KA.DynamicSize && + workgroupsize === nothing + # use ndrange as preliminary workgroupsize for autotuning + KA.partition(kernel, ndrange, ndrange) + else + KA.partition(kernel, ndrange, workgroupsize) + end + + return ndrange, workgroupsize, iterspace, dynamic +end + +function threads_to_workgroupsize(threads, ndrange) + total = 1 + return map(ndrange) do n + x = min(div(threads, total), n) + total *= x + return x + end +end + +function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize = nothing) + ndrange, workgroupsize, iterspace, dynamic = + KA.launch_config(obj, ndrange, workgroupsize) + + # this might not be the final context, since we may tune the workgroupsize + ctx = KA.mkcontext(obj, ndrange, iterspace) + kernel = @opencl launch = false obj.f(ctx, args...) 
+ + # figure out the optimal workgroupsize automatically + if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing + wg_info = cl.work_group_info(kernel.fun, device()) + wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange) + iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd) + ctx = KA.mkcontext(obj, ndrange, iterspace) + end + + groups = length(KA.blocks(iterspace)) + items = length(KA.workitems(iterspace)) + + if groups == 0 + return nothing + end + + # Launch kernel + global_size = groups * items + local_size = items + event = kernel(ctx, args...; global_size, local_size) + wait(event) + cl.clReleaseEvent(event) + return nothing +end + + +## Indexing Functions + +@device_override @inline function KA.__index_Local_Linear(ctx) + return get_local_id(1) +end + +@device_override @inline function KA.__index_Group_Linear(ctx) + return get_group_id(1) +end + +@device_override @inline function KA.__index_Global_Linear(ctx) + return get_global_id(1) +end + +@device_override @inline function KA.__index_Local_Cartesian(ctx) + @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] +end + +@device_override @inline function KA.__index_Group_Cartesian(ctx) + @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] +end + +@device_override @inline function KA.__index_Global_Cartesian(ctx) + return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +end + +@device_override @inline function KA.__validindex(ctx) + if KA.__dynamic_checkbounds(ctx) + I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) + return I in KA.__ndrange(ctx) + else + return true + end +end + + +## Shared and Scratch Memory + +@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} + ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims))) + CLDeviceArray(Dims, ptr) +end + +@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims} + 
StaticArrays.MArray{KA.__size(Dims), T}(undef) +end + + +## Synchronization and Printing + +@device_override @inline function KA.__synchronize() + barrier() +end + +@device_override @inline function KA.__print(args...) + SPIRVIntrinsics._print(args...) +end + + +## Other + +KA.argconvert(::KA.Kernel{POCLBackend}, arg) = clconvert(arg) + +end diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl new file mode 100644 index 000000000..375e05081 --- /dev/null +++ b/src/pocl/compiler/compilation.jl @@ -0,0 +1,76 @@ +## gpucompiler interface + +struct OpenCLCompilerParams <: AbstractCompilerParams end +const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams} +const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams} + +GPUCompiler.runtime_module(::CompilerJob{<:Any, OpenCLCompilerParams}) = POCL + +GPUCompiler.method_table(::OpenCLCompilerJob) = method_table + +# filter out OpenCL built-ins +# TODO: eagerly lower these using the translator API +GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) = + invoke( + GPUCompiler.isintrinsic, + Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)}, + job, fn +) || + in(fn, opencl_builtins) + + +## compiler implementation (cache, configure, compile, and link) + +# cache of compilation caches, per context +const _compiler_caches = Dict{cl.Context, Dict{Any, Any}}() +function compiler_cache(ctx::cl.Context) + cache = get(_compiler_caches, ctx, nothing) + if cache === nothing + cache = Dict{Any, Any}() + _compiler_caches[ctx] = cache + end + return cache +end + +# cache of compiler configurations, per device (but additionally configurable via kwargs) +const _toolchain = Ref{Any}() +const _compiler_configs = Dict{UInt, OpenCLCompilerConfig}() +function compiler_config(dev::cl.Device; kwargs...) + h = hash(dev, hash(kwargs)) + config = get(_compiler_configs, h, nothing) + if config === nothing + config = _compiler_config(dev; kwargs...) 
+ _compiler_configs[h] = config + end + return config +end +@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...) + supports_fp16 = "cl_khr_fp16" in dev.extensions + supports_fp64 = "cl_khr_fp64" in dev.extensions + + # create GPUCompiler objects + target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...) + params = OpenCLCompilerParams() + return CompilerConfig(target, params; kernel, name, always_inline) +end + +# compile to executable machine code +function compile(@nospecialize(job::CompilerJob)) + # TODO: this creates a context; cache those. + obj, meta = JuliaContext() do ctx + GPUCompiler.compile(:obj, job) + end + + return (; obj, entry = LLVM.name(meta.entry)) +end + +# link into an executable kernel +function link(@nospecialize(job::CompilerJob), compiled) + prog = if "cl_khr_il_program" in device().extensions + cl.Program(compiled.obj, context()) + else + error("Your device does not support SPIR-V, which is currently required for native execution.") + end + cl.build!(prog) + return cl.Kernel(prog, compiled.entry) +end diff --git a/src/pocl/compiler/execution.jl b/src/pocl/compiler/execution.jl new file mode 100644 index 000000000..dc47cb302 --- /dev/null +++ b/src/pocl/compiler/execution.jl @@ -0,0 +1,217 @@ +export @opencl, clfunction, clconvert + + +## high-level @opencl interface + +const MACRO_KWARGS = [:launch] +const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const LAUNCH_KWARGS = [:global_size, :local_size, :queue] + +macro opencl(ex...) 
+ call = ex[end] + kwargs = map(ex[1:(end - 1)]) do kwarg + if kwarg isa Symbol + :($kwarg = $kwarg) + elseif Meta.isexpr(kwarg, :(=)) + kwarg + else + throw(ArgumentError("Invalid keyword argument '$kwarg'")) + end + end + + # destructure the kernel call + Meta.isexpr(call, :call) || throw(ArgumentError("second argument to @opencl should be a function call")) + f = call.args[1] + args = call.args[2:end] + + code = quote end + vars, var_exprs = assign_args!(code, args) + + # group keyword argument + macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = + split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) + if !isempty(other_kwargs) + key, val = first(other_kwargs).args + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + + # handle keyword arguments that influence the macro's behavior + launch = true + for kwarg in macro_kwargs + key, val = kwarg.args + if key == :launch + isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @opencl should be a constant value")) + launch = val::Bool + else + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + end + if !launch && !isempty(call_kwargs) + error("@opencl with launch=false does not support launch-time keyword arguments; use them when calling the kernel") + end + + # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5) + # we esc() the whole thing now, necessitating gensyms... + @gensym f_var kernel_f kernel_args kernel_tt kernel + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!( + code.args, + quote + $f_var = $f + GC.@preserve $(vars...) 
$f_var begin + $kernel_f = $clconvert($f_var) + $kernel_args = map($clconvert, ($(var_exprs...),)) + $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} + $kernel = $clfunction($kernel_f, $kernel_tt; $(compiler_kwargs...)) + if $launch + $kernel($(var_exprs...); $(call_kwargs...)) + end + $kernel + end + end + ) + + return esc( + quote + let + $code + end + end + ) +end + + +## argument conversion + +struct KernelAdaptor + svm_pointers::Vector{Ptr{Cvoid}} +end + +# # assume directly-passed pointers are SVM pointers +# function Adapt.adapt_storage(to::KernelAdaptor, ptr::Ptr{T}) where {T} +# push!(to.svm_pointers, ptr) +# return ptr +# end + +# # convert SVM buffers to their GPU address +# function Adapt.adapt_storage(to::KernelAdaptor, buf::cl.SVMBuffer) +# ptr = pointer(buf) +# push!(to.svm_pointers, ptr) +# return ptr +# end + +# Base.RefValue isn't GPU compatible, so provide a compatible alternative +# TODO: port improvements from CUDA.jl +struct CLRefValue{T} <: Ref{T} + x::T +end +Base.getindex(r::CLRefValue) = r.x +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CLRefValue(adapt(to, r[])) + +# broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box. +# avoid that by using a special kind of ref that knows about the boxed type. +struct CLRefType{T} <: Ref{DataType} end +Base.getindex(r::CLRefType{T}) where {T} = T +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType, Type}}) = + CLRefType{r[]}() + +# case where type is the function being broadcasted +Adapt.adapt_structure( + to::KernelAdaptor, + bc::Broadcast.Broadcasted{Style, <:Any, Type{T}} +) where {Style, T} = + Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes) + +""" + clconvert(x, [pointers]) + +This function is called for every argument to be passed to a kernel, allowing it to be +converted to a GPU-friendly format. By default, the function does nothing and returns the +input object `x` as-is. 
+ +Do not add methods to this function, but instead extend the underlying Adapt.jl package and +register methods for the the `OpenCL.KernelAdaptor` type. + +The `pointers` argument is used to collect pointers to indirect SVM buffers, which need to +be registered with OpenCL before invoking the kernel. +""" +function clconvert(arg, pointers::Vector{Ptr{Cvoid}} = Ptr{Cvoid}[]) + return adapt(KernelAdaptor(pointers), arg) +end + + +## abstract kernel functionality + +abstract type AbstractKernel{F, TT} end + +@inline @generated function (kernel::AbstractKernel{F, TT})( + args...; + call_kwargs... + ) where {F, TT} + sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type + args = (:(kernel.f), (:(clconvert(args[$i], svm_pointers)) for i in 1:length(args))...) + + # filter out ghost arguments that shouldn't be passed + predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt) + to_pass = map(!predicate, sig.parameters) + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr, Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + + # replace non-isbits arguments (they should be unused, or compilation would have failed) + for (i, dt) in enumerate(call_t) + if !isbitstype(dt) + call_t[i] = Ptr{Any} + call_args[i] = :C_NULL + end + end + + # finalize types + call_tt = Base.to_tuple_type(call_t) + + return quote + svm_pointers = Ptr{Cvoid}[] + $cl.clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...) + end +end + + +## host-side kernels + +struct HostKernel{F, TT} <: AbstractKernel{F, TT} + f::F + fun::cl.Kernel +end + + +## host-side API + +const clfunction_lock = ReentrantLock() + +function clfunction(f::F, tt::TT = Tuple{}; kwargs...) 
where {F, TT} + ctx = context() + dev = device() + + Base.@lock clfunction_lock begin + # compile the function + cache = compiler_cache(ctx) + source = methodinstance(F, tt) + config = compiler_config(dev; kwargs...)::OpenCLCompilerConfig + fun = GPUCompiler.cached_compilation(cache, source, config, compile, link) + + # create a callable object that captures the function instance. we don't need to think + # about world age here, as GPUCompiler already does and will return a different object + h = hash(fun, hash(f, hash(tt))) + kernel = get(_kernel_instances, h, nothing) + if kernel === nothing + # create the kernel state object + kernel = HostKernel{F, tt}(f, fun) + _kernel_instances[h] = kernel + end + return kernel::HostKernel{F, tt} + end +end + +# cache of kernel instances +const _kernel_instances = Dict{UInt, Any}() diff --git a/src/pocl/compiler/reflection.jl b/src/pocl/compiler/reflection.jl new file mode 100644 index 000000000..55dd8ba66 --- /dev/null +++ b/src/pocl/compiler/reflection.jl @@ -0,0 +1,75 @@ +# code reflection entry-points + +# +# code_* replacements +# + +# function to split off certain kwargs for selective forwarding, at run time. +# `@opencl` does something similar at parse time, using `GPUCompiler.split_kwargs`. +function split_kwargs_runtime(kwargs, wanted::Vector{Symbol}) + remaining = Dict{Symbol, Any}() + extracted = Dict{Symbol, Any}() + for (key, value) in kwargs + if key in wanted + extracted[key] = value + else + remaining[key] = value + end + end + return extracted, remaining +end + +for method in (:code_typed, :code_warntype, :code_llvm, :code_native) + # only code_typed doesn't take a io argument + args = method == :code_typed ? (:job,) : (:io, :job) + + @eval begin + function $method( + io::IO, @nospecialize(func), @nospecialize(types); + kernel::Bool = false, kwargs... 
+ ) + compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS) + source = methodinstance(typeof(func), Base.to_tuple_type(types)) + config = compiler_config(device(); kernel, compiler_kwargs...) + job = CompilerJob(source, config) + return GPUCompiler.$method($(args...); kwargs...) + end + $method(@nospecialize(func), @nospecialize(types); kwargs...) = + $method(stdout, func, types; kwargs...) + end +end + + +# +# @device_code_* functions +# + +export @device_code_lowered, @device_code_typed, @device_code_warntype, @device_code_llvm, + @device_code_native, @device_code + +# forward to GPUCompiler +@eval $(Symbol("@device_code_lowered")) = $(getfield(GPUCompiler, Symbol("@device_code_lowered"))) +@eval $(Symbol("@device_code_typed")) = $(getfield(GPUCompiler, Symbol("@device_code_typed"))) +@eval $(Symbol("@device_code_warntype")) = $(getfield(GPUCompiler, Symbol("@device_code_warntype"))) +@eval $(Symbol("@device_code_llvm")) = $(getfield(GPUCompiler, Symbol("@device_code_llvm"))) +@eval $(Symbol("@device_code_native")) = $(getfield(GPUCompiler, Symbol("@device_code_native"))) +@eval $(Symbol("@device_code")) = $(getfield(GPUCompiler, Symbol("@device_code"))) + + +# +# other +# + +""" + POCL.return_type(f, tt) -> r::Type + +Return a type `r` such that `f(args...)::r` where `args::tt`. 
+""" +function return_type(@nospecialize(func), @nospecialize(tt)) + source = methodinstance(typeof(func), tt) + config = compiler_config(cl.device()) + job = CompilerJob(source, config) + interp = GPUCompiler.get_interpreter(job) + sig = Base.signature_type(func, tt) + return Core.Compiler.return_type(interp, sig) +end diff --git a/src/pocl/device/array.jl b/src/pocl/device/array.jl new file mode 100644 index 000000000..63757911e --- /dev/null +++ b/src/pocl/device/array.jl @@ -0,0 +1,258 @@ +# Contiguous on-device arrays + +export CLDeviceArray, CLDeviceVector, CLDeviceMatrix, CLLocalArray + + +## construction + +# NOTE: we can't support the typical `tuple or series of integer` style construction, +# because we're currently requiring a trailing pointer argument. + +struct CLDeviceArray{T, N, A} <: DenseArray{T, N} + ptr::LLVMPtr{T, A} + maxsize::Int + + dims::Dims{N} + len::Int + + # inner constructors, fully parameterized, exact types (ie. Int not <:Integer) + # TODO: deprecate; put `ptr` first like oneArray + CLDeviceArray{T, N, A}( + dims::Dims{N}, ptr::LLVMPtr{T, A}, + maxsize::Int = prod(dims) * sizeof(T) + ) where {T, A, N} = + new(ptr, maxsize, dims, prod(dims)) +end + +const CLDeviceVector = CLDeviceArray{T, 1, A} where {T, A} +const CLDeviceMatrix = CLDeviceArray{T, 2, A} where {T, A} + +# outer constructors, non-parameterized +CLDeviceArray(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(dims, p) +CLDeviceArray(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((len,), p) + +# outer constructors, partially parameterized +CLDeviceArray{T}(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(dims, p) +CLDeviceArray{T}(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((len,), p) +CLDeviceArray{T, N}(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(dims, p) +CLDeviceVector{T}(len::Integer, p::LLVMPtr{T, A}) where 
{T, A} = CLDeviceVector{T, A}((len,), p) + +# outer constructors, fully parameterized +CLDeviceArray{T, N, A}(dims::NTuple{N, <:Integer}, p::LLVMPtr{T, A}) where {T, A, N} = CLDeviceArray{T, N, A}(Int.(dims), p) +CLDeviceVector{T, A}(len::Integer, p::LLVMPtr{T, A}) where {T, A} = CLDeviceVector{T, A}((Int(len),), p) + + +## array interface + +Base.elsize(::Type{<:CLDeviceArray{T}}) where {T} = sizeof(T) + +Base.size(g::CLDeviceArray) = g.dims +Base.sizeof(x::CLDeviceArray) = Base.elsize(x) * length(x) + +# we store the array length too; computing prod(size) is expensive +Base.length(g::CLDeviceArray) = g.len + +Base.pointer(x::CLDeviceArray{T, <:Any, A}) where {T, A} = Base.unsafe_convert(LLVMPtr{T, A}, x) +@inline function Base.pointer(x::CLDeviceArray{T, <:Any, A}, i::Integer) where {T, A} + return Base.unsafe_convert(LLVMPtr{T, A}, x) + Base._memory_offset(x, i) +end + +typetagdata(a::CLDeviceArray{<:Any, <:Any, A}, i = 1) where {A} = + reinterpret(LLVMPtr{UInt8, A}, a.ptr + a.maxsize) + i - one(i) + + +## conversions + +Base.unsafe_convert(::Type{LLVMPtr{T, A}}, x::CLDeviceArray{T, <:Any, A}) where {T, A} = + x.ptr + + +## indexing intrinsics + +# TODO: how are allocations aligned by the level zero API? keep track of this +# because it enables optimizations like Load Store Vectorization +# (cfr. 
shared memory and its wider-than-datatype alignment) + +@generated function alignment(::CLDeviceArray{T}) where {T} + return if Base.isbitsunion(T) + _, sz, al = Base.uniontype_layout(T) + al + else + Base.datatype_alignment(T) + end +end + +@device_function @inline function arrayref(A::CLDeviceArray{T}, index::Integer) where {T} + @boundscheck checkbounds(A, index) + if isbitstype(T) + arrayref_bits(A, index) + else #if isbitsunion(T) + arrayref_union(A, index) + end +end + +@inline function arrayref_bits(A::CLDeviceArray{T}, index::Integer) where {T} + align = alignment(A) + return unsafe_load(pointer(A), index, Val(align)) +end + +@inline @generated function arrayref_union(A::CLDeviceArray{T, <:Any, AS}, index::Integer) where {T, AS} + typs = Base.uniontypes(T) + + # generate code that conditionally loads a value based on the selector value. + # lacking noreturn, we return T to avoid inference thinking this can return Nothing. + ex = :(Base.llvmcall("unreachable", $T, Tuple{})) + for (sel, typ) in Iterators.reverse(enumerate(typs)) + ex = quote + if selector == $(sel - 1) + ptr = reinterpret(LLVMPtr{$typ, AS}, data_ptr) + unsafe_load(ptr, 1, Val(align)) + else + $ex + end + end + end + + return quote + selector_ptr = typetagdata(A, index) + selector = unsafe_load(selector_ptr) + + align = alignment(A) + data_ptr = pointer(A, index) + + return $ex + end +end + +@device_function @inline function arrayset(A::CLDeviceArray{T}, x::T, index::Integer) where {T} + @boundscheck checkbounds(A, index) + if isbitstype(T) + arrayset_bits(A, x, index) + else #if isbitsunion(T) + arrayset_union(A, x, index) + end + return A +end + +@inline function arrayset_bits(A::CLDeviceArray{T}, x::T, index::Integer) where {T} + align = alignment(A) + return unsafe_store!(pointer(A), x, index, Val(align)) +end + +@inline @generated function arrayset_union(A::CLDeviceArray{T, <:Any, AS}, x::T, index::Integer) where {T, AS} + typs = Base.uniontypes(T) + sel = findfirst(isequal(x), typs) + + 
return quote + selector_ptr = typetagdata(A, index) + unsafe_store!(selector_ptr, $(UInt8(sel - 1))) + + align = alignment(A) + data_ptr = pointer(A, index) + + unsafe_store!(reinterpret(LLVMPtr{$x, AS}, data_ptr), x, 1, Val(align)) + return + end +end + +@device_function @inline function const_arrayref(A::CLDeviceArray{T}, index::Integer) where {T} + @boundscheck checkbounds(A, index) + align = alignment(A) + unsafe_cached_load(pointer(A), index, Val(align)) +end + + +## indexing + +Base.IndexStyle(::Type{<:CLDeviceArray}) = Base.IndexLinear() + +Base.@propagate_inbounds Base.getindex(A::CLDeviceArray{T}, i1::Integer) where {T} = + arrayref(A, i1) +Base.@propagate_inbounds Base.setindex!(A::CLDeviceArray{T}, x, i1::Integer) where {T} = + arrayset(A, convert(T, x)::T, i1) + +# preserve the specific integer type when indexing device arrays, +# to avoid extending 32-bit hardware indices to 64-bit. +Base.to_index(::CLDeviceArray, i::Integer) = i + +# Base doesn't like Integer indices, so we need our own ND get and setindex! routines. +# See also: https://github.com/JuliaLang/julia/pull/42289 +Base.@propagate_inbounds Base.getindex( + A::CLDeviceArray, + I::Union{Integer, CartesianIndex}... +) = + A[Base._to_linear_index(A, to_indices(A, I)...)] +Base.@propagate_inbounds Base.setindex!( + A::CLDeviceArray, x, + I::Union{Integer, CartesianIndex}... +) = + A[Base._to_linear_index(A, to_indices(A, I)...)] = x + + +## const indexing + +""" + Const(A::CLDeviceArray) + +Mark a CLDeviceArray as constant/read-only. The invariant guaranteed is that you will not +modify an CLDeviceArray for the duration of the current kernel. + +This API can only be used on devices with compute capability 3.5 or higher. + +!!! warning + Experimental API. Subject to change without deprecation. 
+""" +struct Const{T, N, AS} <: DenseArray{T, N} + a::CLDeviceArray{T, N, AS} +end +Base.Experimental.Const(A::CLDeviceArray) = Const(A) + +Base.IndexStyle(::Type{<:Const}) = IndexLinear() +Base.size(C::Const) = size(C.a) +Base.axes(C::Const) = axes(C.a) +Base.@propagate_inbounds Base.getindex(A::Const, i1::Integer) = const_arrayref(A.a, i1) + + +## other + +Base.show(io::IO, a::CLDeviceVector) = + print(io, "$(length(a))-element device array at $(pointer(a))") +Base.show(io::IO, a::CLDeviceArray) = + print(io, "$(join(a.shape, '×')) device array at $(pointer(a))") + +Base.show(io::IO, mime::MIME"text/plain", a::CLDeviceArray) = show(io, a) + +@inline function Base.iterate(A::CLDeviceArray, i = 1) + return if (i % UInt) - 1 < length(A) + (@inbounds A[i], i + 1) + else + nothing + end +end + +function Base.reinterpret(::Type{T}, a::CLDeviceArray{S, N, A}) where {T, S, N, A} + err = _reinterpret_exception(T, a) + err === nothing || throw(err) + + if sizeof(T) == sizeof(S) # fast case + return CLDeviceArray{T, N, A}(size(a), reinterpret(LLVMPtr{T, A}, a.ptr), a.maxsize) + end + + isize = size(a) + size1 = div(isize[1] * sizeof(S), sizeof(T)) + osize = tuple(size1, Base.tail(isize)...) + return CLDeviceArray{T, N, A}(osize, reinterpret(LLVMPtr{T, A}, a.ptr), a.maxsize) +end + + +## local memory + +# XXX: use OpenCL-style local memory arguments instead? + +@inline function CLLocalArray(::Type{T}, dims) where {T} + len = prod(dims) + # NOTE: this relies on const-prop to forward the literal length to the generator. + # maybe we should include the size in the type, like StaticArrays does? + ptr = emit_localmemory(T, Val(len)) + return CLDeviceArray(dims, ptr) +end diff --git a/src/pocl/device/quirks.jl b/src/pocl/device/quirks.jl new file mode 100644 index 000000000..79490eb93 --- /dev/null +++ b/src/pocl/device/quirks.jl @@ -0,0 +1,59 @@ +macro print_and_throw(args...) + return quote + @println "ERROR: " $(args...) "." 
+ throw(nothing) + end +end + +# math.jl +@device_override @noinline Base.Math.throw_complex_domainerror(f::Symbol, x) = + @print_and_throw "This operation requires a complex input to return a complex result" +@device_override @noinline Base.Math.throw_exp_domainerror(x) = + @print_and_throw "Exponentiation yielding a complex result requires a complex argument" + +# intfuncs.jl +@device_override @noinline Base.throw_domerr_powbysq(::Any, p) = + @print_and_throw "Cannot raise an integer to a negative power" +@device_override @noinline Base.throw_domerr_powbysq(::Integer, p) = + @print_and_throw "Cannot raise an integer to a negative power" +@device_override @noinline Base.throw_domerr_powbysq(::AbstractMatrix, p) = + @print_and_throw "Cannot raise an integer to a negative power" + +# checked.jl +@device_override @noinline Base.Checked.throw_overflowerr_binaryop(op, x, y) = + @print_and_throw "Binary operation overflowed" +@device_override @noinline Base.Checked.throw_overflowerr_negation(op, x, y) = + @print_and_throw "Negation overflowed" + +# boot.jl +@device_override @noinline Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} = + @print_and_throw "Inexact conversion" + +# abstractarray.jl +@device_override @noinline Base.throw_boundserror(A, I) = + @print_and_throw "Out-of-bounds array access" + +# trig.jl +@device_override @noinline Base.Math.sincos_domain_error(x) = + @print_and_throw "sincos(x) is only defined for finite x." + +# diagonal.jl +# XXX: remove when we have malloc +# import LinearAlgebra +# @device_override function Base.setindex!(D::LinearAlgebra.Diagonal, v, i::Int, j::Int) +# @boundscheck checkbounds(D, i, j) +# if i == j +# @inbounds D.diag[i] = v +# elseif !iszero(v) +# @print_and_throw "cannot set off-diagonal entry to a nonzero value" +# end +# return v +# end + +# number.jl +# XXX: remove when we have malloc +@device_override @inline function Base.getindex(x::Number, I::Integer...) 
+ @boundscheck all(isone, I) || + @print_and_throw "Out-of-bounds access of scalar value" + x +end diff --git a/src/pocl/device/runtime.jl b/src/pocl/device/runtime.jl new file mode 100644 index 000000000..cdd146a23 --- /dev/null +++ b/src/pocl/device/runtime.jl @@ -0,0 +1,11 @@ +signal_exception() = return + +malloc(sz) = C_NULL + +report_oom(sz) = return + +report_exception(ex) = return + +report_exception_name(ex) = return + +report_exception_frame(idx, func, file, line) = return diff --git a/src/pocl/nanoOpenCL.jl b/src/pocl/nanoOpenCL.jl new file mode 100644 index 000000000..5fa10ab18 --- /dev/null +++ b/src/pocl/nanoOpenCL.jl @@ -0,0 +1,1434 @@ +module nanoOpenCL + +using ..POCL: platform, device, context, queue + +import OpenCL_jll +import pocl_jll + +using Printf + +const libopencl = OpenCL_jll.libopencl # TODO directly use POCL + +""" + @checked function foo(...) + rv = ... + return rv + end + +Macro for wrapping a function definition returning a status code. Two versions of the +function will be generated: `foo`, with the function body wrapped by an invocation of the +`check` function (to be implemented by the caller of this macro), and `unchecked_foo` where no +such invocation is present and the status code is returned to the caller. +""" +macro checked(ex) + # parse the function definition + @assert Meta.isexpr(ex, :function) + sig = ex.args[1] + @assert Meta.isexpr(sig, :call) + body = ex.args[2] + @assert Meta.isexpr(body, :block) + + # we need to detect the first API call, so add an initialization check + body = quote + if !initialized[] + initialize() + end + $body + end + + # generate a "safe" version that performs a check + safe_body = quote + check() do + $body + end + end + safe_sig = Expr(:call, sig.args[1], sig.args[2:end]...) + safe_def = Expr(:function, safe_sig, safe_body) + + # generate a "unchecked" version that returns the error code instead + unchecked_sig = Expr(:call, Symbol("unchecked_", sig.args[1]), sig.args[2:end]...) 
+ unchecked_def = Expr(:function, unchecked_sig, body) + + return esc(:($safe_def, $unchecked_def)) +end + +const CL_SUCCESS = 0 + +const CL_DEVICE_NOT_FOUND = -1 + +const CL_DEVICE_NOT_AVAILABLE = -2 + +const CL_INVALID_ARG_INDEX = -49 + +const CL_INVALID_ARG_VALUE = -50 + +const CL_INVALID_ARG_SIZE = -51 + +const CL_INVALID_KERNEL_ARGS = -52 + +const CL_PLATFORM_NOT_FOUND_KHR = -1001 + +const CL_PLATFORM_PROFILE = 0x0900 + +const CL_PLATFORM_VERSION = 0x0901 + +const CL_PLATFORM_NAME = 0x0902 + +const CL_PLATFORM_VENDOR = 0x0903 + +const CL_PLATFORM_EXTENSIONS = 0x0904 + +const CL_PLATFORM_HOST_TIMER_RESOLUTION = 0x0905 + +const CL_PLATFORM_NUMERIC_VERSION = 0x0906 + +const CL_PLATFORM_EXTENSIONS_WITH_VERSION = 0x0907 + +const CL_DEVICE_TYPE_DEFAULT = 1 << 0 + +const CL_DEVICE_TYPE_CPU = 1 << 1 + +const CL_DEVICE_TYPE_GPU = 1 << 2 + +const CL_DEVICE_TYPE_ACCELERATOR = 1 << 3 + +const CL_DEVICE_TYPE_CUSTOM = 1 << 4 + +const CL_DEVICE_TYPE_ALL = 0xffffffff + +const CL_DEVICE_TYPE = 0x1000 + +const CL_DEVICE_VENDOR_ID = 0x1001 + +const CL_DEVICE_MAX_COMPUTE_UNITS = 0x1002 + +const CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS = 0x1003 + +const CL_DEVICE_MAX_WORK_GROUP_SIZE = 0x1004 + +const CL_DEVICE_MAX_WORK_ITEM_SIZES = 0x1005 + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR = 0x1006 + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT = 0x1007 + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT = 0x1008 + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG = 0x1009 + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT = 0x100a + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE = 0x100b + +const CL_DEVICE_MAX_CLOCK_FREQUENCY = 0x100c + +const CL_DEVICE_ADDRESS_BITS = 0x100d + +const CL_DEVICE_MAX_READ_IMAGE_ARGS = 0x100e + +const CL_DEVICE_MAX_WRITE_IMAGE_ARGS = 0x100f + +const CL_DEVICE_MAX_MEM_ALLOC_SIZE = 0x1010 + +const CL_DEVICE_IMAGE2D_MAX_WIDTH = 0x1011 + +const CL_DEVICE_IMAGE2D_MAX_HEIGHT = 0x1012 + +const CL_DEVICE_IMAGE3D_MAX_WIDTH = 0x1013 + +const CL_DEVICE_IMAGE3D_MAX_HEIGHT = 0x1014 + 
+const CL_DEVICE_IMAGE3D_MAX_DEPTH = 0x1015 + +const CL_DEVICE_IMAGE_SUPPORT = 0x1016 + +const CL_DEVICE_MAX_PARAMETER_SIZE = 0x1017 + +const CL_DEVICE_MAX_SAMPLERS = 0x1018 + +const CL_DEVICE_MEM_BASE_ADDR_ALIGN = 0x1019 + +const CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE = 0x101a + +const CL_DEVICE_SINGLE_FP_CONFIG = 0x101b + +const CL_DEVICE_GLOBAL_MEM_CACHE_TYPE = 0x101c + +const CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE = 0x101d + +const CL_DEVICE_GLOBAL_MEM_CACHE_SIZE = 0x101e + +const CL_DEVICE_GLOBAL_MEM_SIZE = 0x101f + +const CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE = 0x1020 + +const CL_DEVICE_MAX_CONSTANT_ARGS = 0x1021 + +const CL_DEVICE_LOCAL_MEM_TYPE = 0x1022 + +const CL_DEVICE_LOCAL_MEM_SIZE = 0x1023 + +const CL_DEVICE_ERROR_CORRECTION_SUPPORT = 0x1024 + +const CL_DEVICE_PROFILING_TIMER_RESOLUTION = 0x1025 + +const CL_DEVICE_ENDIAN_LITTLE = 0x1026 + +const CL_DEVICE_AVAILABLE = 0x1027 + +const CL_DEVICE_COMPILER_AVAILABLE = 0x1028 + +const CL_DEVICE_EXECUTION_CAPABILITIES = 0x1029 + +const CL_DEVICE_QUEUE_PROPERTIES = 0x102a + +const CL_DEVICE_QUEUE_ON_HOST_PROPERTIES = 0x102a + +const CL_DEVICE_NAME = 0x102b + +const CL_DEVICE_VENDOR = 0x102c + +const CL_DRIVER_VERSION = 0x102d + +const CL_DEVICE_PROFILE = 0x102e + +const CL_DEVICE_VERSION = 0x102f + +const CL_DEVICE_EXTENSIONS = 0x1030 + +const CL_DEVICE_PLATFORM = 0x1031 + +const CL_DEVICE_DOUBLE_FP_CONFIG = 0x1032 + +const CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF = 0x1034 + +const CL_DEVICE_HOST_UNIFIED_MEMORY = 0x1035 + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR = 0x1036 + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT = 0x1037 + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_INT = 0x1038 + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG = 0x1039 + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT = 0x103a + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE = 0x103b + +const CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF = 0x103c + +const CL_DEVICE_OPENCL_C_VERSION = 0x103d + +const CL_DEVICE_LINKER_AVAILABLE = 0x103e + +const CL_DEVICE_BUILT_IN_KERNELS = 0x103f + 
+const CL_DEVICE_IMAGE_MAX_BUFFER_SIZE = 0x1040 + +const CL_DEVICE_IMAGE_MAX_ARRAY_SIZE = 0x1041 + +const CL_DEVICE_PARENT_DEVICE = 0x1042 + +const CL_DEVICE_PARTITION_MAX_SUB_DEVICES = 0x1043 + +const CL_DEVICE_PARTITION_PROPERTIES = 0x1044 + +const CL_DEVICE_PARTITION_AFFINITY_DOMAIN = 0x1045 + +const CL_DEVICE_PARTITION_TYPE = 0x1046 + +const CL_DEVICE_REFERENCE_COUNT = 0x1047 + +const CL_DEVICE_PREFERRED_INTEROP_USER_SYNC = 0x1048 + +const CL_DEVICE_PRINTF_BUFFER_SIZE = 0x1049 + +const CL_DEVICE_IMAGE_PITCH_ALIGNMENT = 0x104a + +const CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT = 0x104b + +const CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS = 0x104c + +const CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE = 0x104d + +const CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES = 0x104e + +const CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE = 0x104f + +const CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE = 0x1050 + +const CL_DEVICE_MAX_ON_DEVICE_QUEUES = 0x1051 + +const CL_DEVICE_MAX_ON_DEVICE_EVENTS = 0x1052 + +const CL_DEVICE_SVM_CAPABILITIES = 0x1053 + +const CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE = 0x1054 + +const CL_DEVICE_MAX_PIPE_ARGS = 0x1055 + +const CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS = 0x1056 + +const CL_DEVICE_PIPE_MAX_PACKET_SIZE = 0x1057 + +const CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT = 0x1058 + +const CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT = 0x1059 + +const CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT = 0x105a + +const CL_DEVICE_IL_VERSION = 0x105b + +const CL_DEVICE_MAX_NUM_SUB_GROUPS = 0x105c + +const CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 0x105d + +const CL_DEVICE_NUMERIC_VERSION = 0x105e + +const CL_DEVICE_EXTENSIONS_WITH_VERSION = 0x1060 + +const CL_DEVICE_ILS_WITH_VERSION = 0x1061 + +const CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION = 0x1062 + +const CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES = 0x1063 + +const CL_DEVICE_ATOMIC_FENCE_CAPABILITIES = 0x1064 + +const CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT = 0x1065 + +const CL_DEVICE_OPENCL_C_ALL_VERSIONS = 0x1066 + +const 
CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 0x1067 + +const CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT = 0x1068 + +const CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT = 0x1069 + +const CL_DEVICE_OPENCL_C_FEATURES = 0x106f + +const CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES = 0x1070 + +const CL_DEVICE_PIPE_SUPPORT = 0x1071 + +const CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED = 0x1072 + +const CL_PROGRAM_REFERENCE_COUNT = 0x1160 + +const CL_PROGRAM_CONTEXT = 0x1161 + +const CL_PROGRAM_NUM_DEVICES = 0x1162 + +const CL_PROGRAM_DEVICES = 0x1163 + +const CL_PROGRAM_SOURCE = 0x1164 + +const CL_PROGRAM_BINARY_SIZES = 0x1165 + +const CL_PROGRAM_BINARIES = 0x1166 + +const CL_PROGRAM_NUM_KERNELS = 0x1167 + +const CL_PROGRAM_KERNEL_NAMES = 0x1168 + +const CL_PROGRAM_IL = 0x1169 + +const CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT = 0x116a + +const CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT = 0x116b + +const CL_PROGRAM_BUILD_STATUS = 0x1181 + +const CL_PROGRAM_BUILD_OPTIONS = 0x1182 + +const CL_PROGRAM_BUILD_LOG = 0x1183 + +const CL_PROGRAM_BINARY_TYPE = 0x1184 + +const CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE = 0x1185 + +const CL_PROGRAM_BINARY_TYPE_NONE = 0x00 + +const CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT = 0x01 + +const CL_PROGRAM_BINARY_TYPE_LIBRARY = 0x02 + +const CL_PROGRAM_BINARY_TYPE_EXECUTABLE = 0x04 + +const CL_BUILD_SUCCESS = 0 + +const CL_BUILD_NONE = -1 + +const CL_BUILD_ERROR = -2 + +const CL_BUILD_IN_PROGRESS = -3 + +const CL_KERNEL_WORK_GROUP_SIZE = 0x11b0 + +const CL_KERNEL_COMPILE_WORK_GROUP_SIZE = 0x11b1 + +const CL_KERNEL_LOCAL_MEM_SIZE = 0x11b2 + +const CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 0x11b3 + +const CL_KERNEL_PRIVATE_MEM_SIZE = 0x11b4 + +const CL_KERNEL_GLOBAL_WORK_SIZE = 0x11b5 + +const CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE = 0x2033 + +const CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE = 0x2034 + +const CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT = 0x11b8 + +const CL_KERNEL_MAX_NUM_SUB_GROUPS = 0x11b9 + +const CL_KERNEL_COMPILE_NUM_SUB_GROUPS = 0x11ba + 
+const CL_KERNEL_EXEC_INFO_SVM_PTRS = 0x11b6 + +const CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM = 0x11b7 + +struct CLError <: Exception + code::Cint +end + +@noinline function throw_api_error(res) + throw(CLError(res)) +end + +function check(f) + res = f() + + if res != CL_SUCCESS + throw_api_error(res) + end + + return +end + +const intptr_t = if sizeof(Ptr{Cvoid}) == 8 + Int64 +else + Int32 +end + +const cl_int = Int32 + +const cl_uint = UInt32 + +const cl_ulong = UInt64 + +mutable struct _cl_platform_id end + +mutable struct _cl_device_id end + +mutable struct _cl_context end + +mutable struct _cl_command_queue end + +mutable struct _cl_mem end + +mutable struct _cl_program end + +mutable struct _cl_kernel end + +mutable struct _cl_event end + +const cl_platform_id = Ptr{_cl_platform_id} + +const cl_device_id = Ptr{_cl_device_id} + +const cl_context = Ptr{_cl_context} + +const cl_command_queue = Ptr{_cl_command_queue} + +const cl_mem = Ptr{_cl_mem} + +const cl_program = Ptr{_cl_program} + +const cl_kernel = Ptr{_cl_kernel} + +const cl_event = Ptr{_cl_event} + +const cl_bitfield = cl_ulong + +const cl_device_type = cl_bitfield + +const cl_platform_info = cl_uint + +const cl_device_info = cl_uint + +const cl_context_properties = intptr_t + +const cl_context_info = cl_uint + +const cl_build_status = cl_int + +const cl_program_info = cl_uint + +const cl_program_build_info = cl_uint + +const cl_kernel_info = cl_uint + +const cl_kernel_arg_info = cl_uint + +const cl_kernel_arg_address_qualifier = cl_uint + +const cl_kernel_arg_access_qualifier = cl_uint + +const cl_kernel_arg_type_qualifier = cl_bitfield + +const cl_kernel_work_group_info = cl_uint + +const cl_kernel_sub_group_info = cl_uint + +const cl_device_svm_capabilities = cl_bitfield + +const cl_command_queue_properties = cl_bitfield + +const cl_event_info = cl_uint + +@checked function clGetPlatformIDs(num_entries, platforms, num_platforms) + @ccall libopencl.clGetPlatformIDs( + num_entries::cl_uint, 
platforms::Ptr{cl_platform_id}, + num_platforms::Ptr{cl_uint} + )::cl_int +end + +@checked function clGetPlatformInfo( + platform, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetPlatformInfo( + platform::cl_platform_id, + param_name::cl_platform_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int +end + +@checked function clGetDeviceIDs(platform, device_type, num_entries, devices, num_devices) + @ccall libopencl.clGetDeviceIDs( + platform::cl_platform_id, device_type::cl_device_type, + num_entries::cl_uint, devices::Ptr{cl_device_id}, + num_devices::Ptr{cl_uint} + )::cl_int +end + +@checked function clGetDeviceInfo( + device, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetDeviceInfo( + device::cl_device_id, param_name::cl_device_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int +end + +function clCreateContext( + properties, num_devices, devices, pfn_notify, user_data, + errcode_ret + ) + return @ccall libopencl.clCreateContext( + properties::Ptr{cl_context_properties}, + num_devices::cl_uint, devices::Ptr{cl_device_id}, + pfn_notify::Ptr{Cvoid}, user_data::Ptr{Cvoid}, + errcode_ret::Ptr{cl_int} + )::cl_context +end + +@checked function clReleaseContext(context) + @ccall libopencl.clReleaseContext(context::cl_context)::cl_int +end + +function clCreateProgramWithIL(context, il, length, errcode_ret) + return @ccall libopencl.clCreateProgramWithIL( + context::cl_context, il::Ptr{Cvoid}, + length::Csize_t, + errcode_ret::Ptr{cl_int} + )::cl_program +end + +@checked function clReleaseProgram(program) + @ccall libopencl.clReleaseProgram(program::cl_program)::cl_int +end + +@checked function clBuildProgram( + program, num_devices, device_list, options, pfn_notify, + user_data + ) + @ccall libopencl.clBuildProgram( + program::cl_program, num_devices::cl_uint, + 
device_list::Ptr{cl_device_id}, options::Ptr{Cchar}, + pfn_notify::Ptr{Cvoid}, user_data::Ptr{Cvoid} + )::cl_int +end + +@checked function clGetProgramInfo( + program, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetProgramInfo( + program::cl_program, param_name::cl_program_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int +end + +@checked function clGetProgramBuildInfo( + program, device, param_name, param_value_size, + param_value, param_value_size_ret + ) + @ccall libopencl.clGetProgramBuildInfo( + program::cl_program, device::cl_device_id, + param_name::cl_program_build_info, + param_value_size::Csize_t, + param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int +end + +function clCreateKernel(program, kernel_name, errcode_ret) + return @ccall libopencl.clCreateKernel( + program::cl_program, kernel_name::Ptr{Cchar}, + errcode_ret::Ptr{cl_int} + )::cl_kernel +end + +@checked function clReleaseKernel(kernel) + @ccall libopencl.clReleaseKernel(kernel::cl_kernel)::cl_int +end + +@checked function clSetKernelArg(kernel, arg_index, arg_size, arg_value) + @ccall libopencl.clSetKernelArg( + kernel::cl_kernel, arg_index::cl_uint, + arg_size::Csize_t, arg_value::Ptr{Cvoid} + )::cl_int +end + +@checked function clSetKernelArgSVMPointer(kernel, arg_index, arg_value) + @ccall libopencl.clSetKernelArgSVMPointer( + kernel::cl_kernel, arg_index::cl_uint, + arg_value::Ptr{Cvoid} + )::cl_int +end + +@checked function clGetKernelWorkGroupInfo( + kernel, device, param_name, param_value_size, + param_value, param_value_size_ret + ) + @ccall libopencl.clGetKernelWorkGroupInfo( + kernel::cl_kernel, device::cl_device_id, + param_name::cl_kernel_work_group_info, + param_value_size::Csize_t, + param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int +end + +@checked function clEnqueueNDRangeKernel( + command_queue, kernel, work_dim, + 
global_work_offset, global_work_size, + local_work_size, num_events_in_wait_list, + event_wait_list, event + ) + @ccall libopencl.clEnqueueNDRangeKernel( + command_queue::cl_command_queue, + kernel::cl_kernel, work_dim::cl_uint, + global_work_offset::Ptr{Csize_t}, + global_work_size::Ptr{Csize_t}, + local_work_size::Ptr{Csize_t}, + num_events_in_wait_list::cl_uint, + event_wait_list::Ptr{cl_event}, + event::Ptr{cl_event} + )::cl_int +end + +function clCreateCommandQueue(context, device, properties, errcode_ret) + return @ccall libopencl.clCreateCommandQueue( + context::cl_context, device::cl_device_id, + properties::cl_command_queue_properties, + errcode_ret::Ptr{cl_int} + )::cl_command_queue +end + +@checked function clReleaseCommandQueue(command_queue) + @ccall libopencl.clReleaseCommandQueue(command_queue::cl_command_queue)::cl_int +end + +@checked function clFinish(command_queue) + @ccall libopencl.clFinish(command_queue::cl_command_queue)::cl_int +end + +@checked function clWaitForEvents(num_events, event_list) + @ccall libopencl.clWaitForEvents(num_events::cl_uint, event_list::Ptr{cl_event})::cl_int +end + +@checked function clGetEventInfo( + event, param_name, param_value_size, param_value, + param_value_size_ret + ) + @ccall libopencl.clGetEventInfo( + event::cl_event, param_name::cl_event_info, + param_value_size::Csize_t, param_value::Ptr{Cvoid}, + param_value_size_ret::Ptr{Csize_t} + )::cl_int +end + +@checked function clReleaseEvent(event) + @ccall libopencl.clReleaseEvent(event::cl_event)::cl_int +end + +# Init + +# lazy initialization +const initialized = Ref{Bool}(false) +@noinline function initialize() + initialized[] = true + + @static if Sys.iswindows() + if is_high_integrity_level() + @warn """Running at high integrity level, preventing OpenCL.jl from loading drivers from JLLs. + + Only system drivers will be available. 
To enable JLL drivers, do not run Julia as an administrator.""" + end + end + + ocd_filenames = join(OpenCL_jll.drivers, ':') + if haskey(ENV, "OCL_ICD_FILENAMES") + ocd_filenames *= ":" * ENV["OCL_ICD_FILENAMES"] + end + + return withenv("OCL_ICD_FILENAMES" => ocd_filenames) do + num_platforms = Ref{Cuint}() + @ccall libopencl.clGetPlatformIDs( + 0::cl_uint, C_NULL::Ptr{cl_platform_id}, + num_platforms::Ptr{cl_uint} + )::cl_int + + if num_platforms[] == 0 && isempty(OpenCL_jll.drivers) + @error """No OpenCL drivers available, either system-wide or provided by a JLL. + + Please install a system-wide OpenCL driver, or load one together with OpenCL.jl, + e.g., by doing `using OpenCL, pocl_jll`.""" + end + end +end + +# Julia API + +struct Platform + id::cl_platform_id +end + +Base.unsafe_convert(::Type{cl_platform_id}, p::Platform) = p.id + +function platforms() + nplatforms = Ref{Cuint}() + res = unchecked_clGetPlatformIDs(0, C_NULL, nplatforms) + if res == CL_PLATFORM_NOT_FOUND_KHR || nplatforms[] == 0 + return Platform[] + elseif res != CL_SUCCESS + throw(CLError(res)) + end + cl_platform_ids = Vector{cl_platform_id}(undef, nplatforms[]) + clGetPlatformIDs(nplatforms[], cl_platform_ids, C_NULL) + return [Platform(id) for id in cl_platform_ids] +end + + +function Base.getproperty(p::Platform, s::Symbol) + # simple string properties + version_re = r"OpenCL (?<major>\d+)\.(?<minor>\d+)(?<vendor>.+)" + @inline function get_string(prop) + sz = Ref{Csize_t}() + clGetPlatformInfo(p, prop, 0, C_NULL, sz) + chars = Vector{Cchar}(undef, sz[]) + clGetPlatformInfo(p, prop, sz[], chars, C_NULL) + return GC.@preserve chars unsafe_string(pointer(chars)) + end + if s === :profile + return get_string(CL_PLATFORM_PROFILE) + elseif s === :version + str = get_string(CL_PLATFORM_VERSION) + m = match(version_re, str) + if m === nothing + error("Could not parse OpenCL version string: $str") + end + return strip(m["vendor"]) + elseif s === :opencl_version + str = get_string(CL_PLATFORM_VERSION) + m = 
match(version_re, str) + if m === nothing + error("Could not parse OpenCL version string: $str") + end + return VersionNumber(parse(Int, m["major"]), parse(Int, m["minor"])) + elseif s === :name + return get_string(CL_PLATFORM_NAME) + elseif s === :vendor + return get_string(CL_PLATFORM_VENDOR) + end + + if s == :extensions + size = Ref{Csize_t}() + clGetPlatformInfo(p, CL_PLATFORM_EXTENSIONS, 0, C_NULL, size) + result = Vector{Cchar}(undef, size[]) + clGetPlatformInfo(p, CL_PLATFORM_EXTENSIONS, size[], result, C_NULL) + return GC.@preserve result split(unsafe_string(pointer(result))) + end + return getfield(p, s) +end + +struct Device + id::cl_device_id +end + +Base.unsafe_convert(::Type{cl_device_id}, d::Device) = d.id + +function devices(p::Platform, dtype) + ndevices = Ref{Cuint}() + ret = unchecked_clGetDeviceIDs(p, dtype, 0, C_NULL, ndevices) + if ret == CL_DEVICE_NOT_FOUND || ndevices[] == 0 + return Device[] + elseif ret != CL_SUCCESS + throw(CLError(ret)) + end + result = Vector{cl_device_id}(undef, ndevices[]) + clGetDeviceIDs(p, dtype, ndevices[], result, C_NULL) + return Device[Device(id) for id in result] +end + +function default_device(p::Platform) + devs = devices(p, CL_DEVICE_TYPE_DEFAULT) + isempty(devs) && return nothing + # XXX: clGetDeviceIDs documents CL_DEVICE_TYPE_DEFAULT should only return one device, + # but it's been observed to return multiple devices on some platforms... 
+ return first(devs) +end + +devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL) + +@inline function Base.getproperty(d::Device, s::Symbol) + # simple string properties + version_re = r"OpenCL (?<major>\d+)\.(?<minor>\d+)(?<vendor>.+)" + @inline function get_string(prop) + sz = Ref{Csize_t}() + clGetDeviceInfo(d, prop, 0, C_NULL, sz) + chars = Vector{Cchar}(undef, sz[]) + clGetDeviceInfo(d, prop, sz[], chars, C_NULL) + return GC.@preserve chars unsafe_string(pointer(chars)) + end + if s === :profile + return get_string(CL_DEVICE_PROFILE) + elseif s === :version + str = get_string(CL_DEVICE_VERSION) + m = match(version_re, str) + if m === nothing + error("Could not parse OpenCL version string: $str") + end + return strip(m["vendor"]) + elseif s === :opencl_version + str = get_string(CL_DEVICE_VERSION) + m = match(version_re, str) + if m === nothing + error("Could not parse OpenCL version string: $str") + end + return VersionNumber(parse(Int, m["major"]), parse(Int, m["minor"])) + elseif s === :driver_version + return get_string(CL_DRIVER_VERSION) + elseif s === :name + return get_string(CL_DEVICE_NAME) + end + + # scalar values + @inline function get_scalar(prop, typ) + scalar = Ref{typ}() + clGetDeviceInfo(d, prop, sizeof(typ), scalar, C_NULL) + return Int(scalar[]) + end + if s === :vendor_id + return get_scalar(CL_DEVICE_VENDOR_ID, cl_uint) + elseif s === :max_compute_units + return get_scalar(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) + elseif s === :max_work_item_dims + return get_scalar(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) + elseif s === :max_clock_frequency + return get_scalar(CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) + elseif s === :address_bits + return get_scalar(CL_DEVICE_ADDRESS_BITS, cl_uint) + elseif s === :max_read_image_args + return get_scalar(CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) + elseif s === :max_write_image_args + return get_scalar(CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) + elseif s === :global_mem_size + return get_scalar(CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) + 
elseif s === :max_mem_alloc_size + return get_scalar(CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) + elseif s === :max_const_buffer_size + return get_scalar(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) + elseif s === :local_mem_size + return get_scalar(CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) + elseif s === :max_work_group_size + return get_scalar(CL_DEVICE_MAX_WORK_GROUP_SIZE, Csize_t) + elseif s === :max_parameter_size + return get_scalar(CL_DEVICE_MAX_PARAMETER_SIZE, Csize_t) + elseif s === :profiling_timer_resolution + return get_scalar(CL_DEVICE_PROFILING_TIMER_RESOLUTION, Csize_t) + end + + # boolean properties + @inline function get_bool(prop) + bool = Ref{cl_bool}() + clGetDeviceInfo(d, prop, sizeof(cl_bool), bool, C_NULL) + return bool[] == CL_TRUE + end + if s === :has_image_support + return get_bool(CL_DEVICE_IMAGE_SUPPORT) + elseif s === :has_local_mem + return get_bool(CL_DEVICE_LOCAL_MEM_TYPE) + elseif s === :host_unified_memory + return get_bool(CL_DEVICE_HOST_UNIFIED_MEMORY) + elseif s === :available + return get_bool(CL_DEVICE_AVAILABLE) + elseif s === :compiler_available + return get_bool(CL_DEVICE_COMPILER_AVAILABLE) + end + + if s == :extensions + size = Ref{Csize_t}() + clGetDeviceInfo(d, CL_DEVICE_EXTENSIONS, 0, C_NULL, size) + result = Vector{Cchar}(undef, size[]) + clGetDeviceInfo(d, CL_DEVICE_EXTENSIONS, size[], result, C_NULL) + bs = GC.@preserve result unsafe_string(pointer(result)) + return String[string(s) for s in split(bs)] + end + + if s == :platform + result = Ref{cl_platform_id}() + clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), result, C_NULL) + return Platform(result[]) + end + + if s == :device_type + result = Ref{cl_device_type}() + clGetDeviceInfo(d, CL_DEVICE_TYPE, sizeof(cl_device_type), result, C_NULL) + result = result[] + if result == CL_DEVICE_TYPE_GPU + return :gpu + elseif result == CL_DEVICE_TYPE_CPU + return :cpu + elseif result == CL_DEVICE_TYPE_ACCELERATOR + return :accelerator + elseif result == 
CL_DEVICE_TYPE_CUSTOM + return :custom + else + return :unknown + end + end + + if s == :max_work_item_size + result = Vector{Csize_t}(undef, d.max_work_item_dims) + clGetDeviceInfo(d, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(result), result, C_NULL) + return tuple([Int(r) for r in result]...) + end + + if s == :max_image2d_shape + width = Ref{Csize_t}() + height = Ref{Csize_t}() + clGetDeviceInfo(d, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(Csize_t), width, C_NULL) + clGetDeviceInfo(d, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(Csize_t), height, C_NULL) + return (width[], height[]) + end + + if s == :max_image3d_shape + width = Ref{Csize_t}() + height = Ref{Csize_t}() + depth = Ref{Csize_t}() + clGetDeviceInfo(d, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(Csize_t), width, C_NULL) + clGetDeviceInfo(d, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(Csize_t), height, C_NULL) + clGetDeviceInfo(d, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(Csize_t), depth, C_NULL) + return (width[], height[], depth[]) + end + + return getfield(d, s) +end + +mutable struct Context + const id::cl_context + + function Context(ctx_id::cl_context) + ctx = new(ctx_id) + finalizer(clReleaseContext, ctx) + return ctx + end +end + +Base.unsafe_convert(::Type{cl_context}, ctx::Context) = ctx.id + +function Context(device::Device) + device_id = Ref(device.id) + + err_code = Ref{Cint}() + ctx_id = clCreateContext( + C_NULL, 1, device_id, C_NULL, C_NULL, err_code + ) + if err_code[] != CL_SUCCESS + throw(CLError(err_code[])) + end + return Context(ctx_id) +end + +mutable struct Program + const id::cl_program + + function Program(program_id::cl_program) + p = new(program_id) + finalizer(clReleaseProgram, p) + return p + end +end + +Base.unsafe_convert(::Type{cl_program}, p::Program) = p.id + +function Program(il, ctx) + err_code = Ref{Cint}() + program_id = clCreateProgramWithIL(ctx, il, length(il), err_code) + if err_code[] != CL_SUCCESS + throw(CLError(err_code[])) + end + return Program(program_id) +end + +#TODO: build callback... 
+function build!(p::Program; options = "") + opts = String(options) + ndevices = 0 + device_ids = C_NULL + try + clBuildProgram(p, cl_uint(ndevices), device_ids, opts, C_NULL, C_NULL) + catch err + isa(err, CLError) || throw(err) + + for (dev, status) in p.build_status + if status == CL_BUILD_ERROR + io = IOBuffer() + println(io, "Failed to compile program") + if p.source !== nothing + println(io) + println(io, "Source code:") + for (i, line) in enumerate(split(p.source, "\n")) + println(io, @sprintf("%s%-2d: %s", " ", i, line)) + end + end + println(io) + println(io, "Build log:") + println(io, strip(p.build_log[dev])) + error(String(take!(io))) + end + end + end + return p +end + +function Base.getproperty(p::Program, s::Symbol) + if s == :reference_count + count = Ref{Cuint}() + clGetProgramInfo(p, CL_PROGRAM_REFERENCE_COUNT, sizeof(Cuint), count, C_NULL) + return Int(count[]) + elseif s == :num_devices + count = Ref{Cuint}() + clGetProgramInfo(p, CL_PROGRAM_NUM_DEVICES, sizeof(Cuint), count, C_NULL) + return Int(count[]) + elseif s == :devices + device_ids = Vector{cl_device_id}(undef, p.num_devices) + clGetProgramInfo(p, CL_PROGRAM_DEVICES, sizeof(device_ids), device_ids, C_NULL) + return [Device(id) for id in device_ids] + elseif s == :source + src_len = Ref{Csize_t}() + clGetProgramInfo(p, CL_PROGRAM_SOURCE, 0, C_NULL, src_len) + src_len[] <= 1 && return nothing + src = Vector{Cchar}(undef, src_len[]) + clGetProgramInfo(p, CL_PROGRAM_SOURCE, src_len[], src, C_NULL) + return GC.@preserve src unsafe_string(pointer(src)) + elseif s == :binary_sizes + sizes = Vector{Csize_t}(undef, p.num_devices) + clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(sizes), sizes, C_NULL) + return sizes + elseif s == :binaries + sizes = p.binary_sizes + + bins = Vector{Ptr{UInt8}}(undef, length(sizes)) + # keep a reference to the underlying binary arrays + # as storing the pointer to the array hides the additional + # reference from julia's garbage collector + bin_arrays = Any[] 
+ for (i, s) in enumerate(sizes) + if s > 0 + bin = Vector{UInt8}(undef, s) + bins[i] = pointer(bin) + push!(bin_arrays, bin) + else + bins[i] = Base.unsafe_convert(Ptr{UInt8}, C_NULL) + end + end + clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(bins), bins, C_NULL) + + binary_dict = Dict{Device, Array{UInt8}}() + bidx = 1 + for (i, d) in enumerate(p.devices) + if sizes[i] > 0 + binary_dict[d] = bin_arrays[bidx] + bidx += 1 + end + end + return binary_dict + elseif s == :context + ctx = Ref{cl_context}() + clGetProgramInfo(p, CL_PROGRAM_CONTEXT, sizeof(cl_context), ctx, C_NULL) + return Context(ctx[], retain = true) + elseif s == :build_status + status_dict = Dict{Device, cl_build_status}() + for device in p.devices + status = Ref{cl_build_status}() + clGetProgramBuildInfo(p, device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), status, C_NULL) + status_dict[device] = status[] + end + return status_dict + elseif s == :build_log + log_dict = Dict{Device, String}() + for device in p.devices + size = Ref{Csize_t}() + clGetProgramBuildInfo(p, device, CL_PROGRAM_BUILD_LOG, 0, C_NULL, size) + log = Vector{Cchar}(undef, size[]) + clGetProgramBuildInfo(p, device, CL_PROGRAM_BUILD_LOG, size[], log, C_NULL) + log_dict[device] = GC.@preserve log unsafe_string(pointer(log)) + end + return log_dict + else + return getfield(p, s) + end +end + +mutable struct Kernel + const id::cl_kernel + + function Kernel(k::cl_kernel) + kernel = new(k) + finalizer(clReleaseKernel, kernel) + return kernel + end +end + +Base.unsafe_convert(::Type{cl_kernel}, k::Kernel) = k.id + +function Kernel(p::Program, kernel_name::String) + for (dev, status) in p.build_status + if status != CL_BUILD_SUCCESS + msg = "OpenCL.Program has to be built before Kernel constructor invoked" + throw(ArgumentError(msg)) + end + end + err_code = Ref{Cint}() + kernel_id = clCreateKernel(p, kernel_name, err_code) + if err_code[] != CL_SUCCESS + throw(CLError(err_code[])) + end + return Kernel(kernel_id) +end + 
+struct LocalMem{T} + nbytes::Csize_t +end + +function LocalMem(::Type{T}, len::Integer) where {T} + @assert len > 0 + nbytes = sizeof(T) * len + return LocalMem{T}(convert(Csize_t, nbytes)) +end + +Base.ndims(l::LocalMem) = 1 +Base.eltype(l::LocalMem{T}) where {T} = T +Base.sizeof(l::LocalMem{T}) where {T} = l.nbytes +Base.length(l::LocalMem{T}) where {T} = Int(l.nbytes ÷ sizeof(T)) + +# preserve the LocalMem; it will be handled by set_arg! +# XXX: do we want set_arg!(C_NULL::Ptr) to just call clSetKernelArg? +Base.unsafe_convert(::Type{Ptr{T}}, l::LocalMem{T}) where {T} = l + +function set_arg!(k::Kernel, idx::Integer, arg::Nothing) + @assert idx > 0 + clSetKernelArg(k, cl_uint(idx - 1), sizeof(cl_mem), C_NULL) + return k +end + +# SVMBuffers +## when passing using `cl.call` +# function set_arg!(k::Kernel, idx::Integer, arg::SVMBuffer) +# clSetKernelArgSVMPointer(k, cl_uint(idx-1), arg.ptr) +# return k +# end +## when passing with `clcall`, which has pre-converted the buffer +function set_arg!(k::Kernel, idx::Integer, arg::Union{Ptr, Core.LLVMPtr}) + arg = reinterpret(Ptr{Cvoid}, arg) + if arg != C_NULL + # XXX: this assumes that the receiving argument is pointer-typed, which is not the + # case with Julia's `Ptr` ABI. Instead, one should reinterpret the pointer as a + # `Core.LLVMPtr`, which _is_ pointer-valued. We retain this handling for `Ptr` + # for users passing pointers to OpenCL C, and because `Ptr` is pointer-valued + # starting with Julia 1.12. 
+ clSetKernelArgSVMPointer(k, cl_uint(idx - 1), arg) + end + return k +end + +# regular buffers +# function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory) +# arg_boxed = Ref(arg.id) +# clSetKernelArg(k, cl_uint(idx-1), sizeof(cl_mem), arg_boxed) +# return k +# end + +function set_arg!(k::Kernel, idx::Integer, arg::LocalMem) + clSetKernelArg(k, cl_uint(idx - 1), arg.nbytes, C_NULL) + return k +end + +function set_arg!(k::Kernel, idx::Integer, arg::T) where {T} + ref = Ref(arg) + tsize = sizeof(ref) + err = unchecked_clSetKernelArg(k, cl_uint(idx - 1), tsize, ref) + if err == CL_INVALID_ARG_SIZE + error( + """Mismatch between Julia and OpenCL type for kernel argument $idx. + + Possible reasons: + - OpenCL does not support empty types. + - Vectors of length 3 (e.g., `float3`) are packed as 4-element vectors; + consider padding your tuples. + - The alignment of fields in your struct may not match the OpenCL layout. + Make sure your Julia definition matches the OpenCL layout, e.g., by + using `__attribute__((packed))` in your OpenCL struct definition.""" + ) + elseif err != CL_SUCCESS + throw(CLError(err)) + end + return k +end + +function set_args!(k::Kernel, args...) 
+ for (i, a) in enumerate(args) + set_arg!(k, i, a) + end + return +end + +function enqueue_kernel( + k::Kernel, global_work_size, local_work_size = nothing; + global_work_offset = nothing + ) + max_work_dim = device().max_work_item_dims + work_dim = length(global_work_size) + if work_dim > max_work_dim + throw(ArgumentError("global_work_size has max dim of $max_work_dim")) + end + gsize = Vector{Csize_t}(undef, work_dim) + for (i, s) in enumerate(global_work_size) + gsize[i] = s + end + + goffset = C_NULL + if global_work_offset !== nothing + if length(global_work_offset) > max_work_dim + throw(ArgumentError("global_work_offset has max dim of $max_work_dim")) + end + if length(global_work_offset) != work_dim + throw(ArgumentError("global_work_size and global_work_offset have differing dims")) + end + goffset = Vector{Csize_t}(undef, work_dim) + for (i, o) in enumerate(global_work_offset) + goffset[i] = o + end + else + # null global offset means (0, 0, 0) + end + + lsize = C_NULL + if local_work_size !== nothing + if length(local_work_size) > max_work_dim + throw(ArgumentError("local_work_offset has max dim of $max_work_dim")) + end + if length(local_work_size) != work_dim + throw(ArgumentError("global_work_size and local_work_size have differing dims")) + end + lsize = Vector{Csize_t}(undef, work_dim) + for (i, s) in enumerate(local_work_size) + lsize[i] = s + end + else + # null local size means OpenCL decides + end + + n_events = cl_uint(0) + wait_event_ids = C_NULL + ret_event = Ref{cl_event}() + + clEnqueueNDRangeKernel( + queue(), k, cl_uint(work_dim), goffset, gsize, lsize, + n_events, wait_event_ids, ret_event + ) + return Event(ret_event[]) +end + +function call( + k::Kernel, args...; global_size = (1,), local_size = nothing, + global_work_offset = nothing, + svm_pointers::Vector{Ptr{Cvoid}} = Ptr{Cvoid}[] + ) + set_args!(k, args...) 
+ if !isempty(svm_pointers) + clSetKernelExecInfo( + k, CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(svm_pointers), svm_pointers + ) + end + return enqueue_kernel(k, global_size, local_size; global_work_offset) +end + +# convert the argument values to match the kernel's signature (specified by the user) +# (this mimics `lower-ccall` in julia-syntax.scm) +@inline @generated function convert_arguments(f::Function, ::Type{tt}, args...) where {tt} + types = tt.parameters + + ex = quote end + + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + converted_args[i] = gensym() + arg_ptrs[i] = gensym() + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) + end + + append!( + ex.args, ( + quote + GC.@preserve $(converted_args...) begin + f($(arg_ptrs...)) + end + end + ).args + ) + + return ex +end + +clcall(f::F, types::Tuple, args::Vararg{Any, N}; kwargs...) where {N, F} = + clcall(f, _to_tuple_type(types), args...; kwargs...) + +function clcall(k::Kernel, types::Type{T}, args::Vararg{Any, N}; kwargs...) where {T, N} + call_closure = function (converted_args::Vararg{Any, N}) + return call(k, converted_args...; kwargs...) + end + return convert_arguments(call_closure, types, args...) 
+end + +struct KernelWorkGroupInfo + kernel::Kernel + device::Device +end +work_group_info(k::Kernel, d::Device) = KernelWorkGroupInfo(k, d) + +function Base.getproperty(ki::KernelWorkGroupInfo, s::Symbol) + k = getfield(ki, :kernel) + d = getfield(ki, :device) + + function get(val, typ) + result = Ref{typ}() + clGetKernelWorkGroupInfo(k, d, val, sizeof(typ), result, C_NULL) + return result[] + end + + return if s == :size + Int(get(CL_KERNEL_WORK_GROUP_SIZE, Csize_t)) + elseif s == :compile_size + Int.(get(CL_KERNEL_COMPILE_WORK_GROUP_SIZE, NTuple{3, Csize_t})) + elseif s == :local_mem_size + Int(get(CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong)) + elseif s == :private_mem_size + Int(get(CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong)) + elseif s == :prefered_size_multiple + Int(get(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, Csize_t)) + else + getfield(ki, s) + end +end + +mutable struct CmdQueue + const id::cl_command_queue + + function CmdQueue(q_id::cl_command_queue) + q = new(q_id) + finalizer(q) do _ + clReleaseCommandQueue(q) + end + return q + end +end + +Base.unsafe_convert(::Type{cl_command_queue}, q::CmdQueue) = q.id + +function CmdQueue() + flags = cl_command_queue_properties(0) + err_code = Ref{Cint}() + queue_id = clCreateCommandQueue(context(), device(), flags, err_code) + if err_code[] != CL_SUCCESS + if queue_id != C_NULL + clReleaseCommandQueue(queue_id) + end + throw(CLError(err_code[])) + end + return CmdQueue(queue_id) +end + +function finish(q::CmdQueue) + clFinish(q) + return q +end + +struct Event + id::cl_event +end +Base.unsafe_convert(::Type{cl_event}, e::Event) = e.id + +const CL_EVENT_COMMAND_EXECUTION_STATUS = 0x11d3 + +function Base.getproperty(evt::Event, s::Symbol) + # regular properties + if s == :status + st = Ref{Cint}() + clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(Cint), st, C_NULL) + status = st[] + return status + else + return getfield(evt, s) + end +end + +const CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST = -14 + 
+function Base.wait(evt::Event) + evt_id = Ref(evt.id) + err = unchecked_clWaitForEvents(cl_uint(1), evt_id) + if err == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST + error("Kernel execution failed") + elseif err != CL_SUCCESS + throw(CLError(err)) + end + return evt +end + +end diff --git a/src/pocl/pocl.jl b/src/pocl/pocl.jl new file mode 100644 index 000000000..9c995daff --- /dev/null +++ b/src/pocl/pocl.jl @@ -0,0 +1,73 @@ +module POCL + +function platform end +function device end +function context end +function queue end + +include("nanoOpenCL.jl") + +import .nanoOpenCL as cl + +function platform() + return get!(task_local_storage(), :POCLPlatform) do + for p in cl.platforms() + if p.vendor == "The pocl project" + return p + end + end + error("POCL not available") + end::cl.Platform +end + +function device() + return get!(task_local_storage(), :POCLDevice) do + p = platform() + return cl.default_device(p) + end::cl.Device +end + +# TODO: add a device context dict +function context() + return get!(task_local_storage(), :POCLContext) do + cl.Context(device()) + end::cl.Context +end + +function queue() + return get!(task_local_storage(), :POCLQueue) do + cl.CmdQueue() + end::cl.CmdQueue +end + +using GPUCompiler +import LLVM +using Adapt + +import SPIRVIntrinsics +SPIRVIntrinsics.@import_all +SPIRVIntrinsics.@reexport_public + +include("compiler/compilation.jl") +include("compiler/execution.jl") +include("compiler/reflection.jl") + +import Core: LLVMPtr + +include("device/array.jl") +include("device/quirks.jl") +include("device/runtime.jl") + +function Adapt.adapt_storage(to::KernelAdaptor, xs::Array{T, N}) where {T, N} + return CLDeviceArray{T, N, AS.Global}(size(xs), reinterpret(LLVMPtr{T, AS.Global}, pointer(xs))) +end + +include("backend.jl") +import .POCLKernels: POCLBackend +export POCLBackend + +import KernelAbstractions as KA + +Adapt.adapt_storage(::POCLBackend, a::Array) = a + +end diff --git a/test/compiler.jl b/test/compiler.jl deleted file mode 
100644 index 85312262e..000000000 --- a/test/compiler.jl +++ /dev/null @@ -1,79 +0,0 @@ -using KernelAbstractions -using Test - -import KernelAbstractions.NDIteration: NDRange, StaticSize, DynamicSize - -@kernel function index(A) - I = @index(Global, NTuple) - @show A[I...] -end - -@kernel function literal_pow(A) - A[1] = 2^11 -end - -@kernel function square(A, B) - A[1] = B[1]^2 -end - -@kernel function pow(A, B) - A[1] = A[1]^B[1] -end - -@kernel function checked(A, a, b) - A[1] = Base.Checked.checked_add(a, b) -end - -function check_for_overdub(stmt) - if stmt isa Expr - if stmt.head == :invoke - mi = first(stmt.args)::Core.MethodInstance - if mi.def.name === :overdub - @show stmt - return true - end - end - end - return false -end - -function compiler_testsuite(backend, ArrayT) - kernel = index(CPU(), DynamicSize(), DynamicSize()) - iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}() - ctx = KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(KernelAbstractions.NoDynamicCheck())) - @test ndims(ctx) == 1 - @test KernelAbstractions.__index_Global_NTuple(ctx, CartesianIndex(1)) == (1,) - - A = ArrayT{Int}(undef, 1) - let (CI, rt) = @ka_code_typed literal_pow(backend())(A, ndrange = 1) - # test that there is no invoke of overdub - @test !any(check_for_overdub, CI.code) - end - - A = ArrayT{Float32}(undef, 1) - let (CI, rt) = @ka_code_typed square(backend())(A, A, ndrange = 1) - # test that there is no invoke of overdub - @test !any(check_for_overdub, CI.code) - end - - A = ArrayT{Float32}(undef, 1) - B = ArrayT{Float32}(undef, 1) - let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange = 1) - # test that there is no invoke of overdub - @test !any(check_for_overdub, CI.code) - end - - A = ArrayT{Float32}(undef, 1) - B = ArrayT{Int32}(undef, 1) - let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange = 1) - # test that there is no invoke of overdub - @test !any(check_for_overdub, CI.code) - end - - A = ArrayT{Int}(undef, 1) - let (CI, 
rt) = @ka_code_typed checked(backend())(A, 1, 2, ndrange = 1) - # test that there is no invoke of overdub - @test !any(check_for_overdub, CI.code) - end - return -end diff --git a/test/convert.jl b/test/convert.jl index 7b6c24029..c8500d046 100644 --- a/test/convert.jl +++ b/test/convert.jl @@ -12,34 +12,34 @@ using KernelAbstractions, Test @inbounds B[tid, 2] = ceil(Int16, A[tid]) @inbounds B[tid, 3] = ceil(Int32, A[tid]) @inbounds B[tid, 4] = ceil(Int64, A[tid]) - @inbounds B[tid, 5] = ceil(Int128, A[tid]) + # @inbounds B[tid, 5] = ceil(Int128, A[tid]) @inbounds B[tid, 6] = ceil(UInt8, A[tid]) @inbounds B[tid, 7] = ceil(UInt16, A[tid]) @inbounds B[tid, 8] = ceil(UInt32, A[tid]) @inbounds B[tid, 9] = ceil(UInt64, A[tid]) - @inbounds B[tid, 10] = ceil(UInt128, A[tid]) + # @inbounds B[tid, 10] = ceil(UInt128, A[tid]) @inbounds B[tid, 11] = floor(Int8, A[tid]) @inbounds B[tid, 12] = floor(Int16, A[tid]) @inbounds B[tid, 13] = floor(Int32, A[tid]) @inbounds B[tid, 14] = floor(Int64, A[tid]) - @inbounds B[tid, 15] = floor(Int128, A[tid]) + # @inbounds B[tid, 15] = floor(Int128, A[tid]) @inbounds B[tid, 16] = floor(UInt8, A[tid]) @inbounds B[tid, 17] = floor(UInt16, A[tid]) @inbounds B[tid, 18] = floor(UInt32, A[tid]) @inbounds B[tid, 19] = floor(UInt64, A[tid]) - @inbounds B[tid, 20] = floor(UInt128, A[tid]) + # @inbounds B[tid, 20] = floor(UInt128, A[tid]) @inbounds B[tid, 21] = round(Int8, A[tid]) @inbounds B[tid, 22] = round(Int16, A[tid]) @inbounds B[tid, 23] = round(Int32, A[tid]) @inbounds B[tid, 24] = round(Int64, A[tid]) - @inbounds B[tid, 25] = round(Int128, A[tid]) + # @inbounds B[tid, 25] = round(Int128, A[tid]) @inbounds B[tid, 26] = round(UInt8, A[tid]) @inbounds B[tid, 27] = round(UInt16, A[tid]) @inbounds B[tid, 28] = round(UInt32, A[tid]) @inbounds B[tid, 29] = round(UInt64, A[tid]) - @inbounds B[tid, 30] = round(UInt128, A[tid]) + # @inbounds B[tid, 30] = round(UInt128, A[tid]) end @@ -58,6 +58,10 @@ function convert_testsuite(backend, ArrayT) 
synchronize(backend()) for i in 1:10 + # don't run Int128/UInt128 tests + if i == 5 || i == 10 + continue + end @test d_B[:, i] == ceil.(d_A) @test d_B[:, i + 10] == floor.(d_A) @test d_B[:, i + 20] == round.(d_A) diff --git a/test/reflection.jl b/test/reflection.jl index 6ce46b2b1..dbfef47ba 100644 --- a/test/reflection.jl +++ b/test/reflection.jl @@ -23,11 +23,7 @@ function test_typed_kernel_dynamic(backend, backend_str, ArrayT) else @ka_code_typed kernel(A, ndrange = size(A), workgroupsize = (32, 32)) end - if backend_str in ["CUDA", "ROCM", "oneAPI", "Metal", "OpenCL"] - @test_broken isa(res, Pair{Core.CodeInfo, DataType}) - else - @test isa(res, Pair{Core.CodeInfo, DataType}) - end + @test_broken isa(res, Pair{Core.CodeInfo, DataType}) @test isa(res[1].code, Array{Any, 1}) return end @@ -38,11 +34,7 @@ function test_typed_kernel_dynamic_no_info(backend, backend_str, ArrayT) C = similar(A) kernel = add3(backend()) res = @ka_code_typed kernel(A, B, C, ndrange = size(A)) - if backend_str in ["CUDA", "ROCM", "oneAPI", "Metal", "OpenCL"] - @test_broken isa(res, Pair{Core.CodeInfo, DataType}) - else - @test isa(res, Pair{Core.CodeInfo, DataType}) - end + @test_broken isa(res, Pair{Core.CodeInfo, DataType}) @test isa(res[1].code, Array{Any, 1}) return end @@ -55,11 +47,7 @@ function test_typed_kernel_static(backend, backend_str, ArrayT) mul2(backend(), (32, 32)) end res = @ka_code_typed kernel(A, ndrange = size(A)) - if backend_str in ["CUDA", "ROCM", "oneAPI", "Metal", "OpenCL"] - @test_broken isa(res, Pair{Core.CodeInfo, DataType}) - else - @test isa(res, Pair{Core.CodeInfo, DataType}) - end + @test_broken isa(res, Pair{Core.CodeInfo, DataType}) @test isa(res[1].code, Array{Any, 1}) return end @@ -87,11 +75,7 @@ function test_expr_kernel(backend, backend_str, ArrayT) addi(backend(), (32, 32)) end res = @ka_code_typed kernel(A, C, 1 + 2, ndrange = size(A)) - if backend_str in ["CUDA", "ROCM", "oneAPI", "Metal", "OpenCL"] - @test_broken isa(res, Pair{Core.CodeInfo, 
DataType}) - else - @test isa(res, Pair{Core.CodeInfo, DataType}) - end + @test_broken isa(res, Pair{Core.CodeInfo, DataType}) @test isa(res[1].code, Array{Any, 1}) return end diff --git a/test/runtests.jl b/test/runtests.jl index f992afad2..aa586ede7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -17,15 +17,6 @@ A = zeros(Int, Threads.nthreads()) kern_static(CPU(static = true), (1,))(A, ndrange = length(A)) @test A == 1:Threads.nthreads() -@kernel cpu = false function my_no_cpu_kernel(a) -end -@test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel(CPU()) - -# testing multiple configurations at the same time -@kernel cpu = false inbounds = false function my_no_cpu_kernel2(a) -end -@test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel2(CPU()) - if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 1 # testing bounds errors @kernel inbounds = false function my_bounded_kernel(a) diff --git a/test/test.jl b/test/test.jl index d86d9803b..8a88d2db7 100644 --- a/test/test.jl +++ b/test/test.jl @@ -273,30 +273,16 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk end @testset "No CPU kernel" begin - if !(Backend() isa CPU) - A = KernelAbstractions.zeros(Backend(), Int64, 1024) - context_kernel(Backend())(A, ndrange = size(A)) - synchronize(Backend()) - @test all((a) -> a == 1, Array(A)) - else - @test_throws ErrorException("This kernel is unavailable for backend CPU") context_kernel(Backend()) - end + A = KernelAbstractions.zeros(Backend(), Int64, 1024) + context_kernel(Backend())(A, ndrange = size(A)) + synchronize(Backend()) + @test all((a) -> a == 1, Array(A)) end @testset "functional" begin @test KernelAbstractions.functional(Backend()) isa Union{Missing, Bool} end - @testset "CPU default workgroupsize" begin - @test KernelAbstractions.default_cpu_workgroupsize((64,)) == (64,) - @test 
KernelAbstractions.default_cpu_workgroupsize((1024,)) == (1024,) - @test KernelAbstractions.default_cpu_workgroupsize((2056,)) == (1024,) - @test KernelAbstractions.default_cpu_workgroupsize((64, 64)) == (64, 16) - @test KernelAbstractions.default_cpu_workgroupsize((64, 64, 64, 4)) == (64, 16, 1, 1) - @test KernelAbstractions.default_cpu_workgroupsize((64, 15)) == (64, 15) - @test KernelAbstractions.default_cpu_workgroupsize((5, 7, 13, 17)) == (5, 7, 13, 2) - end - @testset "empty arrays" begin backend = Backend() @test size(allocate(backend, Float32, 0)) == (0,) @@ -315,14 +301,12 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk end end @testset "GPU kernel return statement" begin - if !(Backend() isa CPU) - A = KernelAbstractions.zeros(Backend(), Int64, 1024) - gpu_return_kernel!(Backend())(A; ndrange = length(A)) - synchronize(Backend()) - Ah = Array(A) - @test all(a -> a == 1, @view(Ah[1:(length(A) ÷ 2)])) - @test all(a -> a == 0, @view(Ah[(length(A) ÷ 2 + 1):end])) - end + A = KernelAbstractions.zeros(Backend(), Int64, 1024) + gpu_return_kernel!(Backend())(A; ndrange = length(A)) + synchronize(Backend()) + Ah = Array(A) + @test all(a -> a == 1, @view(Ah[1:(length(A) ÷ 2)])) + @test all(a -> a == 0, @view(Ah[(length(A) ÷ 2 + 1):end])) end return diff --git a/test/testsuite.jl b/test/testsuite.jl index 29f780272..f9f96fcf0 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -33,7 +33,6 @@ include("nditeration.jl") include("copyto.jl") include("devices.jl") include("print_test.jl") -include("compiler.jl") include("reflection.jl") include("examples.jl") include("convert.jl") @@ -76,10 +75,6 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ printing_testsuite(backend) end - @conditional_testset "Compiler" skip_tests begin - compiler_testsuite(backend, AT) - end - @conditional_testset "Reflection" skip_tests begin reflection_testsuite(backend, backend_str, AT) end