diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 39588d11..750c3913 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -484,35 +484,6 @@ Abstract type for all GPU based KernelAbstractions backends. """ abstract type GPU <: Backend end -""" - CPU(; static=false) - -Instantiate a CPU (multi-threaded) backend. - -## Options: - - `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code. - Defaults to false. - -!!! note - `CPU` will be aliased to `POCLBackend()` on KernelAbstractions v1.0 -""" -struct CPU <: Backend - static::Bool - CPU(; static::Bool = false) = new(static) -end - -""" - isgpu(::Backend)::Bool - -Returns true for all [`GPU`](@ref) backends. - -!!! note - `isgpu` will be removed in KernelAbstractions v1.0 -""" -isgpu(::GPU) = true -isgpu(::CPU) = false - - """ get_backend(A::AbstractArray)::Backend @@ -530,7 +501,6 @@ get_backend(A::AbstractArray) = get_backend(parent(A)) # Define: # adapt_storage(::Backend, a::Array) = adapt(BackendArray, a) # adapt_storage(::Backend, a::BackendArray) = a -Adapt.adapt_storage(::CPU, a::Array) = a """ allocate(::Backend, Type, dims...)::AbstractArray @@ -750,7 +720,7 @@ Partition a kernel for the given ndrange and workgroupsize. return iterspace, dynamic end -function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName} +function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: GPU, S <: _Size, NDRange <: _Size, XPUName} return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name) end @@ -767,6 +737,10 @@ include("compiler.jl") function __workitems_iterspace end function __validindex end +# for reflection +function mkcontext end +function launch_config end + include("macros.jl") ### @@ -836,14 +810,12 @@ end end # CPU backend - -include("cpu.jl") - -# Future-CPU backend include("pocl/pocl.jl") using .POCL export POCLBackend +const CPU = POCLBackend + # precompile PrecompileTools.@compile_workload begin @eval begin diff --git a/src/macros.jl b/src/macros.jl index 02b93ed7..9a91e0b6 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -31,19 +31,6 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) constargs[i] = false end - # create two functions - # 1. GPU function - # 2. CPU function with work-group loops inserted - # - # Without the deepcopy we might accidentially modify expr shared between CPU and GPU - cpu_name = Symbol(:cpu_, name) - if generate_cpu - def_cpu = deepcopy(def) - def_cpu[:name] = cpu_name - transform_cpu!(def_cpu, constargs, force_inbounds) - cpu_function = combinedef(def_cpu) - end - def_gpu = deepcopy(def) def_gpu[:name] = gpu_name = Symbol(:gpu_, name) transform_gpu!(def_gpu, constargs, force_inbounds) @@ -56,24 +43,12 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false) $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize()) $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range)) function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S <: $_Size, NDRange <: $_Size} - if $isgpu(dev) - return $construct(dev, sz, range, $gpu_name) - else - if $generate_cpu - return $construct(dev, sz, range, $cpu_name) - else - error("This kernel is unavailable for backend CPU") - end - end + return $construct(dev, sz, range, $gpu_name) end end end - if generate_cpu - return Expr(:block, esc(cpu_function), esc(gpu_function), esc(constructors)) - else - return Expr(:block, esc(gpu_function), esc(constructors)) - end + return Expr(:block, esc(gpu_function), esc(constructors)) end # The easy case, transform the function for GPU execution @@ -104,199 +79,4 @@ function transform_gpu!(def, constargs, force_inbounds) body, ) return -end - -# The hard case, transform the function for CPU execution -# - mark constant arguments by applying `constify`. -# - insert aliasscope markers -# - insert implied loop bodys -# - handle indicies -# - hoist workgroup definitions -# - hoist uniform variables -function transform_cpu!(def, constargs, force_inbounds) - let_constargs = Expr[] - for (i, arg) in enumerate(def[:args]) - if constargs[i] - push!(let_constargs, :($arg = $constify($arg))) - end - end - pushfirst!(def[:args], :__ctx__) - new_stmts = Expr[] - body = MacroTools.flatten(def[:body]) - push!(new_stmts, Expr(:aliasscope)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, true)) - end - append!(new_stmts, split(body.args)) - if force_inbounds - push!(new_stmts, Expr(:inbounds, :pop)) - end - push!(new_stmts, Expr(:popaliasscope)) - push!(new_stmts, :(return nothing)) - def[:body] = Expr( - :let, - Expr(:block, let_constargs...), - Expr(:block, new_stmts...), - ) - return -end - -struct WorkgroupLoop - indicies::Vector{Any} - stmts::Vector{Any} - allocations::Vector{Any} - private_allocations::Vector{Any} - private::Set{Symbol} -end - -is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_)) - -function is_scope_construct(expr::Expr) - return expr.head === :block # || - # expr.head === :let -end - -function find_sync(stmt) - result = false - postwalk(stmt) do expr - result |= is_sync(expr) - expr - end - return result -end - -# TODO proper handling of LineInfo -function split( - stmts, - indicies = Any[], private = Set{Symbol}(), - ) - # 1. Split the code into blocks separated by `@synchronize` - # 2. Aggregate `@index` expressions - # 3. Hoist allocations - # 4. Hoist uniforms - - current = Any[] - allocations = Any[] - private_allocations = Any[] - new_stmts = Any[] - for stmt in stmts - has_sync = find_sync(stmt) - if has_sync - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - allocations = Any[] - private_allocations = Any[] - current = Any[] - is_sync(stmt) && continue - - # Recurse into scope constructs - # TODO: This currently implements hard scoping - # probably need to implemet soft scoping - # by not deepcopying the environment. - recurse(x) = x - function recurse(expr::Expr) - expr = unblock(expr) - if is_scope_construct(expr) && any(find_sync, expr.args) - new_args = unblock(split(expr.args, deepcopy(indicies), deepcopy(private))) - return Expr(expr.head, new_args...) - else - return Expr(expr.head, map(recurse, expr.args)...) - end - end - push!(new_stmts, recurse(stmt)) - continue - end - - if @capture(stmt, @uniform x_) - push!(allocations, stmt) - continue - elseif @capture(stmt, @private lhs_ = rhs_) - push!(private, lhs) - push!(private_allocations, :($lhs = $rhs)) - continue - elseif @capture(stmt, lhs_ = rhs_ | (vs__, lhs_ = rhs_)) - if @capture(rhs, @index(args__)) - push!(indicies, stmt) - continue - elseif @capture(rhs, @localmem(args__) | @uniform(args__)) - push!(allocations, stmt) - continue - elseif @capture(rhs, @private(T_, dims_)) - # Implement the legacy `mem = @private T dims` as - # mem = Scratchpad(T, Val(dims)) - - if dims isa Integer - dims = (dims,) - end - alloc = :($Scratchpad(__ctx__, $T, Val($dims))) - push!(allocations, :($lhs = $alloc)) - push!(private, lhs) - continue - end - end - - push!(current, stmt) - end - - # everything since the last `@synchronize` - if !isempty(current) - loop = WorkgroupLoop(deepcopy(indicies), current, allocations, private_allocations, deepcopy(private)) - push!(new_stmts, emit(loop)) - end - return new_stmts -end - -function emit(loop) - idx = gensym(:I) - for stmt in loop.indicies - # splice index into the i = @index(Cartesian, $idx) - @assert stmt.head === :(=) - rhs = stmt.args[2] - push!(rhs.args, idx) - end - stmts = Any[] - append!(stmts, loop.allocations) - - # private_allocations turn into lhs = ntuple(i->rhs, length(__workitems_iterspace())) - N = gensym(:N) - push!(stmts, :($N = length($__workitems_iterspace(__ctx__)))) - - for stmt in loop.private_allocations - if @capture(stmt, lhs_ = rhs_) - push!(stmts, :($lhs = ntuple(_ -> $rhs, $N))) - else - error("@private $stmt not an assignment") - end - end - - # don't emit empty loops - if !(isempty(loop.stmts) || all(s -> s isa LineNumberNode, loop.stmts)) - body = Expr(:block, loop.stmts...) - body = postwalk(body) do expr - if @capture(expr, lhs_ = rhs_) - if lhs in loop.private - error("Can't assign to variables marked private") - end - elseif @capture(expr, A_[i__]) - if A in loop.private - return :($A[$__index_Local_Linear(__ctx__, $(idx))][$(i...)]) - end - elseif expr isa Symbol - if expr in loop.private - return :($expr[$__index_Local_Linear(__ctx__, $(idx))]) - end - end - return expr - end - loopexpr = quote - for $idx in $__workitems_iterspace(__ctx__) - $__validindex(__ctx__, $idx) || continue - $(loop.indicies...) - $(unblock(body)) - end - end - push!(stmts, loopexpr) - end - - return unblock(Expr(:block, stmts...)) -end +end \ No newline at end of file