Add support for dynamically-constructed opaque closures.

maleadt committed Apr 17, 2024
1 parent c4d7db3 commit 7782216

Showing 2 changed files with 200 additions and 70 deletions.
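Note: at a glance, the pre-existing `CUDA.OpaqueClosure` constructors compile a device-side closure for one fixed signature, while the new `CUDA.JITOpaqueClosure` defers IR construction until the argument types are seen at the call site. A minimal usage sketch, adapted from the tests added below (requires Julia ≥ 1.12, as the testset guards; the `builder`/`tfunc` pattern follows the test code):

```julia
using CUDA

# builder: receives the argument types, returns IRCode for the closure body
builder(Ts...) = first(only(Base.code_ircode(+, Ts)))

# tfunc: predicts the return type from the argument types
tfunc(Ts...) = Core.Compiler.return_type(+, Tuple{Ts...})

oc = CUDA.JITOpaqueClosure(builder, tfunc; nargs=2)

function kernel(oc, c, a, b)
    i = threadIdx().x
    @inbounds c[i] = oc(a[i], b[i])
    return
end

# the same closure object can be called with any types the builder supports
c, a, b = CuArray([0]), CuArray([1]), CuArray([2])
@cuda threads=1 kernel(oc, c, a, b)  # compiles the closure for Int arguments
```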
229 changes: 161 additions & 68 deletions src/compiler/compilation.jl
@@ -435,22 +435,7 @@ end
 using Core.Compiler: IRCode
 using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode

-struct OpaqueClosure{F, E, A, R} # func, env, args, ret
-    env::E
-end
-
-# XXX: because we can't call functions from other CUDA modules, we effectively need to
-#      recompile when the target function changes. this, and because of how GPUCompiler's
-#      deferred compilation mechanism currently works, is why we have `F` as a type param.
-
-# XXX: because of GPU code requiring specialized signatures, we also need to recompile
-#      when the environment or argument types change. together with the above, this
-#      negates much of the benefit of opaque closures.
-
-# TODO: support for constructing an opaque closure from source code
-
-# TODO: complete support for passing an environment. this probably requires a split into
-#       host and device structures to, e.g., root a CuArray and pass a CuDeviceArray.
+# helpers

 function compute_ir_rettype(ir::IRCode)
     rt = Union{}
@@ -463,32 +448,25 @@ function compute_ir_rettype(ir::IRCode)
     return Core.Compiler.widenconst(rt)
 end

-function compute_oc_signature(ir::IRCode, nargs::Int, isva::Bool)
+function compute_oc_signature(ir::IRCode, nargs::Int)
     argtypes = Vector{Any}(undef, nargs)
     for i = 1:nargs
         argtypes[i] = Core.Compiler.widenconst(ir.argtypes[i+1])
     end
-    if isva
-        lastarg = pop!(argtypes)
-        if lastarg <: Tuple
-            append!(argtypes, lastarg.parameters)
-        else
-            push!(argtypes, Vararg{Any})
-        end
-    end
     return Tuple{argtypes...}
 end

-function OpaqueClosure(ir::IRCode, @nospecialize env...;
-                       isva::Bool = false,
-                       slotnames::Union{Nothing,Vector{Symbol}}=nothing)
+function make_oc_codeinfo(ir::IRCode, @nospecialize env...; slotnames=nothing)
     # NOTE: we need ir.argtypes[1] == typeof(env)
     ir = Core.Compiler.copy(ir)
-    # if the user didn't specify a definition MethodInstance or filename Symbol to use for the debuginfo, set a filename now
-    ir.debuginfo.def === nothing && (ir.debuginfo.def = :var"generated IR for OpaqueClosure")
+    # if the user didn't specify a definition MethodInstance or filename Symbol to use
+    # for the debuginfo, set a filename now
+    if ir.debuginfo.def === nothing
+        ir.debuginfo.def = Symbol("IR for opaque gpu closure")
+    end
     nargtypes = length(ir.argtypes)
     nargs = nargtypes-1
-    sig = compute_oc_signature(ir, nargs, isva)
+    sig = compute_oc_signature(ir, nargs)
     rt = compute_ir_rettype(ir)
     src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ())
     if slotnames === nothing
@@ -499,61 +477,39 @@ function OpaqueClosure(ir::IRCode, @nospecialize env...;
     end
     src.slotflags = Base.fill(zero(UInt8), nargtypes)
     src.slottypes = copy(ir.argtypes)
-    src = Core.Compiler.ir_to_codeinf!(src, ir)
-    config = compiler_config(device(); kernel=false)
-    return generate_opaque_closure(config, src, sig, rt, nargs, isva, env...)
-end
-
-function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs, isva=false)
-    config = compiler_config(device(); kernel=false)
-    return generate_opaque_closure(config, src, sig, rettype, nargs, isva, env...)
+    Core.Compiler.ir_to_codeinf!(src, ir)
 end

-function generate_opaque_closure(config::CompilerConfig, src::CodeInfo,
-                                 @nospecialize(sig), @nospecialize(rt),
-                                 nargs::Int, isva::Bool, @nospecialize env...;
-                                 mod::Module=@__MODULE__,
-                                 file::Union{Nothing,Symbol}=nothing, line::Int=0)
-    # create a method (like `jl_make_opaque_closure_method`)
+# create a method (like `jl_make_oc_method`)
+function make_oc_method(nargs; file=nothing, line=0, world=GPUCompiler.tls_world_age())
     meth = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main)
     meth.sig = Tuple
-    meth.isva = isva # XXX: probably not supported?
-    meth.is_for_opaque_closure = 0 # XXX: do we want this?
+    meth.isva = false
+    meth.is_for_opaque_closure = 0
     meth.name = Symbol("opaque gpu closure")
     meth.nargs = nargs + 1
     meth.file = something(file, Symbol())
     meth.line = line
-    ccall(:jl_method_set_source, Nothing, (Any, Any), meth, src)
-
-    # look up a method instance and create a compiler job
-    full_sig = Tuple{typeof(env), sig.parameters...}
-    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
-               (Any, Any, Any), meth, full_sig, Core.svec())
-    job = CompilerJob(mi, config) # this captures the current world age
-    Base.@atomic meth.primary_world = job.world
+    Base.@atomic meth.primary_world = world
     Base.@atomic meth.deleted_world = typemax(UInt)
+    return meth
+end

-    # create a code instance and store it in the cache
-    interp = GPUCompiler.get_interpreter(job)
+function make_oc_codeinstance(mi::MethodInstance, src::CodeInfo; interp, world, rt)
     owner = Core.Compiler.cache_owner(interp)
     exctype = Any
     inferred_const = C_NULL
     inferred = src
     const_flags = Int32(0)
-    min_world = meth.primary_world
-    max_world = meth.deleted_world
+    min_world = world
+    max_world = typemax(UInt)
     ipo_effects = UInt32(0)
     effects = UInt32(0)
     analysis_results = nothing
     relocatability = UInt8(0)
-    ci = CodeInstance(mi, owner, rt, exctype, inferred_const, inferred,
-                      const_flags, min_world, max_world, ipo_effects, effects,
-                      analysis_results, relocatability, src.debuginfo)
-    Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, mi)
-
-    id = length(GPUCompiler.deferred_codegen_jobs) + 1
-    GPUCompiler.deferred_codegen_jobs[id] = job
-    return OpaqueClosure{id, typeof(env), sig, rt}(env)
+    CodeInstance(mi, owner, rt, exctype, inferred_const, inferred,
+                 const_flags, min_world, max_world, ipo_effects, effects,
+                 analysis_results, relocatability, src.debuginfo)
 end

 # generated function `ccall`, working around the restriction that ccall type
@@ -587,7 +543,60 @@ end
     return ex
 end

-# device-side call to an opaque closure
+# static opaque closures
+
+# XXX: because we can't call functions from other CUDA modules, we effectively need to
+#      recompile when the target function changes. this, and because of how GPUCompiler's
+#      deferred compilation mechanism currently works, is why we have `F` as a type param.
+
+# XXX: because of GPU code requiring specialized signatures, we also need to recompile
+#      when the environment or argument types change. together with the above, this
+#      negates much of the benefit of opaque closures.
+
+# TODO: support for constructing an opaque closure from source code
+
+# TODO: complete support for passing an environment. this probably requires a split into
+#       host and device structures to, e.g., root a CuArray and pass a CuDeviceArray.
+
+struct OpaqueClosure{F, E, A, R} # func, env, args, ret
+    env::E
+end
+
+function OpaqueClosure(ir::IRCode, @nospecialize env...;
+                       slotnames::Union{Nothing,Vector{Symbol}}=nothing)
+    nargtypes = length(ir.argtypes)
+    nargs = nargtypes-1
+    sig = compute_oc_signature(ir, nargs)
+    rt = compute_ir_rettype(ir)
+    src = make_oc_codeinfo(ir, env...; slotnames)
+    return create_static_oc(src, sig, rt, nargs, env...)
+end
+
+function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs)
+    return create_static_oc(src, sig, rettype, nargs, env...)
+end
+
+function create_static_oc(src, @nospecialize(sig), @nospecialize(rt), nargs::Int,
+                          @nospecialize env...; file=nothing, line=0)
+    config = compiler_config(device(); kernel=false)
+    meth = make_oc_method(nargs; file, line)
+
+    # look up a method instance and create a compiler job
+    full_sig = Tuple{typeof(env), sig.parameters...}
+    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
+               (Any, Any, Any), meth, full_sig, Core.svec())
+    job = CompilerJob(mi, config, meth.primary_world)
+
+    # create a callable object
+    id = length(GPUCompiler.deferred_codegen_jobs) + 1
+    GPUCompiler.deferred_codegen_jobs[id] = job
+    oc = OpaqueClosure{id, typeof(env), sig, rt}(env)
+
+    opaque_closure_jobs[job] = (; oc, src, rt)
+    return oc
+end
+
+# device-side call
 (oc::OpaqueClosure)(args...) = call(oc, args...)
 ## NOTE: split into two to make `SciML.isinplace(oc)` work.
 ##       it also resembles how kernels are called.
@@ -597,3 +606,87 @@ end
     #ccall(ptr, R, (A...), args...)
     generated_ccall(ptr, R, A, args...)
 end
+
+# dynamic opaque closures
+
+const jit_opaque_closures = Dict()
+
+struct JITOpaqueClosure{B, T}
+    builder::B
+    tfunc::T
+
+    function JITOpaqueClosure(builder, tfunc=Returns(nothing); nargs)
+        # the device and world are captured at closure construction time, but we only need
+        # them when creating the CompilerJob. as we cannot simply encode them in the
+        # JITOpaqueClosure object, we store them in a global dictionary instead.
+        config = compiler_config(device(); kernel=false)
+        meth = make_oc_method(nargs)
+
+        # create a callable object
+        oc = new{typeof(builder), typeof(tfunc)}(builder, tfunc)
+        jit_opaque_closures[typeof(oc)] = (; env=(), meth, config, oc)
+
+        return oc
+    end
+end
+
+# device-side call
+function (oc::JITOpaqueClosure)(args...)
+    rt = oc.tfunc(map(Core.Typeof, args)...)
+    call(oc, rt, args...)
+end
+@inline @generated function call(oct::JITOpaqueClosure{B,T}, ::Type{R}, args...) where {B,T,R}
+    rt = R
+    (; env, meth, config, oc) = jit_opaque_closures[oct]
+
+    # look up a method instance and create a compiler job
+    full_sig = Tuple{typeof(env), args...}
+    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
+               (Any, Any, Any), meth, full_sig, Core.svec())
+    job = CompilerJob(mi, config, meth.primary_world)
+    opaque_closure_jobs[job] = (; oc, args, rt)
+
+    # generate a deferred compilation call
+    id = length(GPUCompiler.deferred_codegen_jobs) + 1
+    GPUCompiler.deferred_codegen_jobs[id] = job
+    quote
+        ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id)
+        assume(ptr != C_NULL)
+        #ccall(ptr, R, (A...), args...)
+        generated_ccall(ptr, $rt, $(Tuple{args...}), args...)
+    end
+end
+
+# compilation of opaque closures
+
+const opaque_closure_jobs = Dict{CompilerJob,Any}()
+
+function GPUCompiler.prepare_job!(@nospecialize(job::CUDACompilerJob))
+    if haskey(opaque_closure_jobs, job)
+        rt = opaque_closure_jobs[job].rt
+        oc = opaque_closure_jobs[job].oc
+        if oc isa JITOpaqueClosure
+            args = opaque_closure_jobs[job].args
+            nargs = length(args)
+
+            src = oc.builder(args...)
+            if src isa IRCode
+                nargtypes = length(src.argtypes)
+                nargs = nargtypes-1
+                sig = compute_oc_signature(src, nargs)
+                @assert compute_ir_rettype(src) == rt "Inferred return type does not match the provided return type"
+                src = make_oc_codeinfo(src)
+            end
+        else
+            src = opaque_closure_jobs[job].src
+        end
+        @assert src isa CodeInfo

+        # create a code instance and store it in the cache
+        interp = GPUCompiler.get_interpreter(job)
+        ci = make_oc_codeinstance(job.source, src; interp, job.world, rt)
+        Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, job.source)
+    end
+
+    return
+end
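Note on the environment TODO in the static section above: a host-side `CuArray` cannot be captured directly in a device-side closure environment, so the host/device split mentioned there would presumably follow the usual Adapt.jl pattern already used for kernel arguments. A hypothetical sketch (the `ClosureEnv` wrapper is illustrative only, not part of this commit):

```julia
using CUDA, Adapt

# `A` is a CuArray on the host, and its CuDeviceArray counterpart on the device
struct ClosureEnv{A}
    data::A
end

# teach Adapt.jl how to convert the wrapper for device use
Adapt.adapt_structure(to, env::ClosureEnv) = ClosureEnv(adapt(to, env.data))

env = ClosureEnv(CUDA.zeros(Float32, 16))  # host side: keeps the CuArray rooted
dev_env = cudaconvert(env)                 # device side: isbits, safe to capture
```

Storing the host-side wrapper alongside the `OpaqueClosure` would then keep the array alive for as long as the closure is.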
41 changes: 39 additions & 2 deletions test/core/execution.jl
@@ -1099,7 +1099,7 @@ end
 if VERSION >= v"1.12-"
 @testset "opaque closures" begin

-# basic closure, constructed from IRCode
+# static closure, constructed from IRCode
 let
     ir, rettyp = only(Base.code_ircode(+, (Int, Int)))
     oc = CUDA.OpaqueClosure(ir)
@@ -1118,7 +1118,7 @@ let
     @test Array(c)[] == 3
 end

-# basic closure, constructed from CodeInfo
+# static closure, constructed from CodeInfo
 let
     ir, rettype = only(Base.code_typed(*, (Int, Int, Int)))
     oc = CUDA.OpaqueClosure(ir; sig=Tuple{Int,Int,Int}, rettype, nargs=3)
@@ -1138,6 +1138,43 @@ let
     @test Array(d)[] == 24
 end

+# dynamic closure, constructing IRCode based on argument types
+let
+    tfunc(arg1, arg2) = Core.Compiler.return_type(+, Tuple{arg1,arg2})
+    function builder(arg1, arg2)
+        ir, rettyp = only(Base.code_ircode(+, (arg1, arg2)))
+        return ir
+    end
+
+    oc = CUDA.JITOpaqueClosure(builder, tfunc; nargs=2)
+
+    function kernel(oc, c, a, b)
+        i = threadIdx().x
+        @inbounds c[i] = oc(a[i], b[i])
+        return
+    end
+
+    let
+        c = CuArray([0])
+        a = CuArray([1])
+        b = CuArray([2])
+
+        @cuda threads=1 kernel(oc, c, a, b)
+
+        @test Array(c)[] == 3
+    end
+
+    let
+        c = CuArray([3f0])
+        a = CuArray([4f0])
+        b = CuArray([5f0])
+
+        @cuda threads=1 kernel(oc, c, a, b)
+
+        @test Array(c)[] == 9f0
+    end
+end
+
 end
 end

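Note: these GPU constructors closely mirror Base's CPU-side `Core.OpaqueClosure(::IRCode)` constructor, so the first test has a direct host-side analogue (plain Julia, no CUDA required):

```julia
ir, rettyp = only(Base.code_ircode(+, (Int, Int)))
oc = Core.OpaqueClosure(ir)  # CPU counterpart of CUDA.OpaqueClosure(ir)
@assert oc(1, 2) == 3
```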