Skip to content

Commit

Permalink
start setting up backend
Browse files Browse the repository at this point in the history
  • Loading branch information
vchuravy committed Jan 16, 2025
1 parent baace85 commit c22dfaa
Show file tree
Hide file tree
Showing 4 changed files with 512 additions and 2 deletions.
180 changes: 180 additions & 0 deletions src/pocl/backend.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
module POCLKernels

using ..POCL
using ..POCL: @device_override, SPIRVIntrinsics, cl
using ..POCL: device

import KernelAbstractions as KA

import StaticArrays

import Adapt


## Back-end Definition

export POCLBackend

# Marker type identifying the POCL (OpenCL-on-CPU) execution back-end.
# Carries no state; dispatch on it selects the POCL code paths in
# KernelAbstractions.
struct POCLBackend <: KA.GPU end

# KA.allocate(::POCLBackend, ::Type{T}, dims::Tuple) where T = CLArray{T}(undef, dims)
# KA.zeros(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.zeros(T, dims)
# KA.ones(::POCLBackend, ::Type{T}, dims::Tuple) where T = OpenCL.ones(T, dims)

# KA.get_backend(::CLArray) = POCLBackend()
# KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
# KA.supports_float64(::POCLBackend) = false # XXX: this is platform/device dependent

# Adapt.adapt_storage(::POCLBackend, a::Array) = Adapt.adapt(CLArray, a)
# Adapt.adapt_storage(::POCLBackend, a::CLArray) = a
# Adapt.adapt_storage(::KA.CPU, a::CLArray) = convert(Array, a)


## Memory Operations

# function KA.copyto!(::POCLBackend, A, B)
# copyto!(A, B)
# # TODO: Address device to host copies in jl being synchronizing
# end


## Kernel Launch

# Build the device-side compiler metadata for a launch, enabling dynamic
# bounds checking (used before the final workgroup size is known).
KA.mkcontext(kernel::KA.Kernel{POCLBackend}, _ndrange, iterspace) =
    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
# Variant that also carries a concrete index `I` and an explicit dynamic-check
# policy (the `Dynamic` type parameter) into the metadata.
KA.mkcontext(kernel::KA.Kernel{POCLBackend}, I, _ndrange, iterspace,
             ::Dynamic) where {Dynamic} =
    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)

# Normalize the user-supplied `ndrange`/`workgroupsize` and partition the
# iteration space for a POCL launch.
#
# Returns `(ndrange, workgroupsize, iterspace, dynamic)` where `iterspace` is
# the KA partition and `dynamic` indicates whether bounds checks are needed.
function KA.launch_config(kernel::KA.Kernel{POCLBackend}, ndrange, workgroupsize)
    # Accept scalar extents; canonicalize to 1-tuples.
    ndrange isa Integer && (ndrange = (ndrange,))
    workgroupsize isa Integer && (workgroupsize = (workgroupsize,))

    # For statically-sized kernels the ndrange is baked into the kernel type;
    # `partition` has already checked agreement, so drop the runtime value.
    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
    end

    dynamic_wgsize = KA.workgroupsize(kernel) <: KA.DynamicSize && workgroupsize === nothing
    iterspace, dynamic = if dynamic_wgsize
        # No workgroup size given: seed autotuning with the full ndrange as a
        # preliminary workgroup size.
        KA.partition(kernel, ndrange, ndrange)
    else
        KA.partition(kernel, ndrange, workgroupsize)
    end

    return ndrange, workgroupsize, iterspace, dynamic
end

"""
    threads_to_workgroupsize(threads, ndrange)

Distribute a flat budget of `threads` work-items over the dimensions of
`ndrange`, greedily filling earlier (fastest-varying) dimensions first.

Returns a tuple of the same length as `ndrange` whose product is at most
`threads` and whose entries never exceed the corresponding `ndrange` extent.

# Examples
- `threads_to_workgroupsize(256, (1024,)) == (256,)`
- `threads_to_workgroupsize(256, (16, 16, 16)) == (16, 16, 1)`
"""
function threads_to_workgroupsize(threads, ndrange)
    total = 1
    return map(ndrange) do n
        # Guard against a zero accumulator: if an earlier dimension (or the
        # thread budget itself) was 0, `div(threads, 0)` would raise a
        # DivideError; a degenerate range simply gets 0 in that dimension.
        x = total == 0 ? zero(n) : min(div(threads, total), n)
        total *= x
        return x
    end
end

# Launch entry point: calling a compiled KA kernel object executes it on the
# POCL back-end. `ndrange` / `workgroupsize` may be integers, tuples, or
# `nothing` (the latter triggers workgroup-size autotuning below).
function (obj::KA.Kernel{POCLBackend})(args...; ndrange=nothing, workgroupsize=nothing)
    ndrange, workgroupsize, iterspace, dynamic =
        KA.launch_config(obj, ndrange, workgroupsize)

    # this might not be the final context, since we may tune the workgroupsize
    ctx = KA.mkcontext(obj, ndrange, iterspace)
    kernel = @opencl launch=false obj.f(ctx, args...)

    # figure out the optimal workgroupsize automatically
    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
        # query the device's preferred work-group size for this compiled
        # kernel, spread it over the ndrange dimensions, and re-partition.
        # NOTE(review): for a statically-sized kernel, launch_config returns
        # ndrange === nothing; threads_to_workgroupsize would then map over
        # nothing — confirm this combination cannot occur.
        wg_info = cl.work_group_info(kernel.fun, device())
        wg_size_nd = threads_to_workgroupsize(wg_info.size, ndrange)
        iterspace, dynamic = KA.partition(obj, ndrange, wg_size_nd)
        # rebuild the context so the device code sees the tuned iterspace
        ctx = KA.mkcontext(obj, ndrange, iterspace)
    end

    groups = length(KA.blocks(iterspace))
    items = length(KA.workitems(iterspace))

    # empty grid: nothing to execute
    if groups == 0
        return nothing
    end

    # Launch kernel: KA's (blocks × workitems) maps onto OpenCL's flat
    # 1-D (global_size, local_size) pair.
    global_size = groups * items
    local_size = items
    kernel(ctx, args...; global_size, local_size)

    return nothing
end


## Indexing Functions

# Linear index of the current work-item within its work-group
# (first/only launch dimension — launches here are 1-D, see the launch code).
@device_override @inline function KA.__index_Local_Linear(ctx)
    return get_local_id(1)
end

# Linear index of the current work-group within the launch.
@device_override @inline function KA.__index_Group_Linear(ctx)
    return get_group_id(1)
end

# Linear index of the current work-item across the whole launch.
@device_override @inline function KA.__index_Global_Linear(ctx)
    return get_global_id(1)
end

# Cartesian (multi-dimensional) index of the work-item within its group,
# recovered by indexing the iteration space with the flat local id.
@device_override @inline function KA.__index_Local_Cartesian(ctx)
    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
end

# Cartesian index of the current work-group within the block grid.
@device_override @inline function KA.__index_Group_Cartesian(ctx)
    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
end

# Cartesian index of the work-item across the whole ndrange, expanded from
# the (group, local) pair via the iteration space.
@device_override @inline function KA.__index_Global_Cartesian(ctx)
    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
end

# Whether the current work-item maps to a valid ndrange index. When the
# workgroup size was rounded up (dynamic check enabled), trailing work-items
# fall outside the ndrange and must be masked off by the kernel prologue.
@device_override @inline function KA.__validindex(ctx)
    if KA.__dynamic_checkbounds(ctx)
        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
        return I in KA.__ndrange(ctx)
    else
        # statically known exact fit: every work-item is in range
        return true
    end
end


## Shared and Scratch Memory

# KA `@localmem`: back a statically-sized shared allocation with OpenCL local
# memory, emitted as a SPIR-V local-address-space allocation and wrapped in a
# device array of the requested shape. (`Id` distinguishes allocations but is
# unused here — presumably the intrinsic emits one allocation per call site;
# TODO confirm.)
@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
    ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
    CLDeviceArray(Dims, ptr)
end

# KA `@private` scratch storage: a per-work-item mutable static array kept in
# private (register/stack) storage rather than local memory.
@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
    StaticArrays.MArray{KA.__size(Dims), T}(undef)
end


## Synchronization and Printing

# KA `@synchronize`: a work-group-wide execution barrier.
@device_override @inline function KA.__synchronize()
    barrier()
end

# KA `@print`: forward device-side printing to the SPIR-V printf intrinsics.
@device_override @inline function KA.__print(args...)
    SPIRVIntrinsics._print(args...)
end


## Other

# Convert host-side kernel arguments to their device representation using the
# OpenCL argument-conversion machinery (`clconvert`).
KA.argconvert(::KA.Kernel{POCLBackend}, arg) = clconvert(arg)

end
4 changes: 2 additions & 2 deletions src/pocl/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ abstract type AbstractKernel{F,TT} end
args = (:(kernel.f), (:( clconvert(args[$i], svm_pointers) ) for i in 1:length(args))...)

# filter out ghost arguments that shouldn't be passed
predicate = dt -> isghosttype(dt) || Core.Compiler.isconstType(dt)
predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt)
to_pass = map(!predicate, sig.parameters)
call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]]
call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
Expand All @@ -165,7 +165,7 @@ abstract type AbstractKernel{F,TT} end

quote
svm_pointers = Ptr{Cvoid}[]
clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
$cl.clcall(kernel.fun, $call_tt, $(call_args...); svm_pointers, call_kwargs...)
end
end

Expand Down
Loading

0 comments on commit c22dfaa

Please sign in to comment.