Skip to content

Commit

Permalink
start setting up backend
Browse files Browse the repository at this point in the history
  • Loading branch information
vchuravy committed Jan 16, 2025
1 parent baace85 commit c22dfaa
Show file tree
Hide file tree
Showing 4 changed files with 512 additions and 2 deletions.
180 changes: 180 additions & 0 deletions src/pocl/backend.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
module POCLKernels

using ..POCL
using ..POCL: @device_override, SPIRVIntrinsics, cl
using ..POCL: device

import KernelAbstractions as KA

import StaticArrays

import Adapt


## Back-end Definition

export POCLBackend

# Marker type identifying the POCL (OpenCL-on-CPU) execution back-end.
# Carries no state; dispatch on it selects the POCL code paths in
# KernelAbstractions.
struct POCLBackend <: KA.GPU end

# KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where T = CLArray{T}(undef, dims)
# KA.zeros(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
# KA.ones(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

# KA.get_backend(::CLArray) = POCLBackend()
# KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
# KA.supports_float64(::POCLBackend) = false # XXX: this is platform/device dependent

# Adapt.adapt_storage(::POCLBackend, a::Array) = Adapt.adapt(CLArray, a)
# Adapt.adapt_storage(::POCLBackend, a::CLArray) = a
# Adapt.adapt_storage(::KA.CPU, a::CLArray) = convert(Array, a)


## Memory Operations

# function KA.copyto!(::POCLBackend, A, B)
# copyto!(A, B)
# # TODO: Address device to host copies in jl being synchronizing
# end


## Kernel Launch

# Build the device-side compiler metadata for a launch, enabling dynamic
# bounds checking (used before the final workgroup size is known).
KA.mkcontext(kernel::KA.Kernel{POCLBackend}, _ndrange, iterspace) =
    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
# Variant that also carries a concrete index `I` and an explicit dynamic-check
# policy (the `Dynamic` type parameter) into the metadata.
KA.mkcontext(kernel::KA.Kernel{POCLBackend}, I, _ndrange, iterspace,
             ::Dynamic) where {Dynamic} =
    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)

# Normalize the user-supplied `ndrange`/`workgroupsize` and partition the
# iteration space for a POCL launch.
#
# Returns `(ndrange, workgroupsize, iterspace, dynamic)` where `iterspace` is
# the KA partition and `dynamic` indicates whether bounds checks are needed.
function KA.launch_config(kernel::KA.Kernel{POCLBackend}, ndrange, workgroupsize)
    # Accept scalar extents; canonicalize to 1-tuples.
    ndrange isa Integer && (ndrange = (ndrange,))
    workgroupsize isa Integer && (workgroupsize = (workgroupsize,))

    # For statically-sized kernels the ndrange is baked into the kernel type;
    # `partition` has already checked agreement, so drop the runtime value.
    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
    end

    dynamic_wgsize = KA.workgroupsize(kernel) <: KA.DynamicSize && workgroupsize === nothing
    iterspace, dynamic = if dynamic_wgsize
        # No workgroup size given: seed autotuning with the full ndrange as a
        # preliminary workgroup size.
        KA.partition(kernel, ndrange, ndrange)
    else
        KA.partition(kernel, ndrange, workgroupsize)
    end

    return ndrange, workgroupsize, iterspace, dynamic
end

"""
    threads_to_workgroupsize(threads, ndrange)

Distribute a flat budget of `threads` work-items over the dimensions of
`ndrange`, greedily filling earlier (fastest-varying) dimensions first.

Returns a tuple of the same length as `ndrange` whose product is at most
`threads` and whose entries never exceed the corresponding `ndrange` extent.

# Examples
- `threads_to_workgroupsize(256, (1024,)) == (256,)`
- `threads_to_workgroupsize(256, (16, 16, 16)) == (16, 16, 1)`
"""
function threads_to_workgroupsize(threads, ndrange)
    total = 1
    return map(ndrange) do n
        # Guard against a zero accumulator: if an earlier dimension (or the
        # thread budget itself) was 0, `div(threads, 0)` would raise a
        # DivideError; a degenerate range simply gets 0 in that dimension.
        x = total == 0 ? zero(n) : min(div(threads, total), n)
        total *= x
        return x
    end
end

# Launch entry point: calling a compiled KA kernel object executes it on the
# POCL back-end. `ndrange` / `workgroupsize` may be integers, tuples, or
# `nothing` (the latter triggers workgroup-size autotuning below).
function (obj::KA.Kernel{POCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
    ndrange, workgroupsize, iterspace, dynamic =
        KA.launch_config(obj, ndrange, workgroupsize)

    # this might not be the final context, since we may tune the workgroupsize
    ctx = KA.mkcontext(obj, ndrange, iterspace)
    kernel = @opencl launch=false obj.f(ctx, args...)

    # figure out the optimal workgroupsize automatically
    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
        # query the device's preferred work-group size for this compiled
        # kernel, spread it over the ndrange dimensions, and re-partition.
        # NOTE(review): for a statically-sized kernel, launch_config returns
        # ndrange === nothing; threads_to_workgroupsize would then map over
        # nothing — confirm this combination cannot occur.
        wg_info = cl.work_group_info(kernel.fun, device())
        wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
        iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
        # rebuild the context so the device code sees the tuned iterspace
        ctx = KA.mkcontext(obj, ndrange, iterspace)
    end

    groups = length(KA.blocks(iterspace))
    items = length(KA.workitems(iterspace))

    # empty grid: nothing to execute
    if groups == 0
        return nothing
    end

    # Launch kernel: KA's (blocks × workitems) maps onto OpenCL's flat
    # 1-D (global_size, local_size) pair.
    global_size = groups * items
    local_size = items
    kernel(ctx, args...; global_size, local_size)

    return nothing
end


## Indexing Functions

# Linear index of the current work-item within its work-group
# (first/only launch dimension — launches here are 1-D, see the launch code).
@device_override @inline function KA.__index_Local_Linear(ctx)
    return get_local_id(1)
end

# Linear index of the current work-group within the launch.
@device_override @inline function KA.__index_Group_Linear(ctx)
    return get_group_id(1)
end

# Linear index of the current work-item across the whole launch.
@device_override @inline function KA.__index_Global_Linear(ctx)
    return get_global_id(1)
end

# Cartesian (multi-dimensional) index of the work-item within its group,
# recovered by indexing the iteration space with the flat local id.
@device_override @inline function KA.__index_Local_Cartesian(ctx)
    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
end

# Cartesian index of the current work-group within the block grid.
@device_override @inline function KA.__index_Group_Cartesian(ctx)
    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
end

# Cartesian index of the work-item across the whole ndrange, expanded from
# the (group, local) pair via the iteration space.
@device_override @inline function KA.__index_Global_Cartesian(ctx)
    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
end

# Whether the current work-item maps to a valid ndrange index. When the
# workgroup size was rounded up (dynamic check enabled), trailing work-items
# fall outside the ndrange and must be masked off by the kernel prologue.
@device_override @inline function KA.__validindex(ctx)
    if KA.__dynamic_checkbounds(ctx)
        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
        return I in KA.__ndrange(ctx)
    else
        # statically known exact fit: every work-item is in range
        return true
    end
end


## Shared and Scratch Memory

# KA `@localmem`: back a statically-sized shared allocation with OpenCL local
# memory, emitted as a SPIR-V local-address-space allocation and wrapped in a
# device array of the requested shape. (`Id` distinguishes allocations but is
# unused here — presumably the intrinsic emits one allocation per call site;
# TODO confirm.)
@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
    CLDeviceArray(Dims, ptr)
end

# KA `@private` scratch storage: a per-work-item mutable static array kept in
# private (register/stack) storage rather than local memory.
@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
    StaticArrays.MArray{KA.__size(Dims), T}(undef)
end


## Synchronization and Printing

# KA `@synchronize`: a work-group-wide execution barrier.
@device_override @inline function KA.__synchronize()
    barrier()
end

# KA `@print`: forward device-side printing to the SPIR-V printf intrinsics.
@device_override @inline function KA.__print(args...)
    SPIRVIntrinsics._print(args...)
end


## Other

# Convert host-side kernel arguments to their device representation using the
# OpenCL argument-conversion machinery (`clconvert`).
KA.argconvert(::KA.Kernel{POCLBackend}, arg) = clconvert(arg)

end
4 changes: 2 additions & 2 deletions src/pocl/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ abstract type AbstractKernel{F,TT} end
args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...)

# filter out ghost arguments that shouldn't be passed
predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt)
to_pass = map(!predicate, sig.parameters)
call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]]
call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
Expand All @@ -165,7 +165,7 @@ abstract type AbstractKernel{F,TT} end

quote
svm_pointers = Ptr{Cvoid}[]
clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
$cl.clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
end
end

Expand Down
Loading

0 comments on commit c22dfaa

Please sign in to comment.