Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an experimental opaque closure type. #1853

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 265 additions & 0 deletions src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -425,3 +425,268 @@ function run_and_collect(cmd)

return proc, log
end



## opaque closures

# TODO: once stabilised, move bits of this into GPUCompiler.jl

using Core.Compiler: IRCode
using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode

# helpers

# Derive the return type of `ir` by merging the types of all value-carrying
# `ReturnNode`s. IR without such a node yields `Union{}`. The merged lattice element
# is widened to a plain Julia type before it is returned.
function compute_ir_rettype(ir::IRCode)
    merged = Union{}
    for idx = 1:length(ir.stmts)
        node = ir[Core.SSAValue(idx)][:stmt]
        # only `return` statements that actually carry a value contribute
        node isa Core.Compiler.ReturnNode || continue
        isdefined(node, :val) || continue
        valtype = Core.Compiler.argextype(node.val, ir)
        merged = Core.Compiler.tmerge(valtype, merged)
    end
    return Core.Compiler.widenconst(merged)
end

# Build the argument signature `Tuple{...}` of an opaque closure from `ir`.
#
# The first entry of `ir.argtypes` is skipped; the following `nargs` entries are
# widened to plain Julia types and become the tuple parameters.
function compute_oc_signature(ir::IRCode, nargs::Int)
    widened = Any[Core.Compiler.widenconst(ir.argtypes[pos + 1]) for pos in 1:nargs]
    return Tuple{widened...}
end

# Convert `ir` into a `CodeInfo` that can serve as the body of an opaque closure.
#
# Arguments:
# - `ir`: the IR to convert. It is copied first, so the caller's object is not mutated.
# - `env...`: accepted so the signature mirrors the `OpaqueClosure` constructors; the
#   values themselves are not used here.
# - `slotnames`: optional slot names, one per entry of `ir.argtypes`. When `nothing`,
#   every slot is named `:none`.
#
# Returns the result of `Core.Compiler.ir_to_codeinf!`. Raises an error when
# `slotnames` is given but its length differs from `length(ir.argtypes)`.
function make_oc_codeinfo(ir::IRCode, @nospecialize env...; slotnames=nothing)
    # NOTE: we need ir.argtypes[1] == typeof(env)
    ir = Core.Compiler.copy(ir)
    # if the user didn't specify a definition MethodInstance or filename Symbol to use
    # for the debuginfo, set a filename now
    if ir.debuginfo.def === nothing
        ir.debuginfo.def = Symbol("IR for opaque gpu closure")
    end

    # one slot per entry of `ir.argtypes`
    nargtypes = length(ir.argtypes)
    src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ())
    if slotnames === nothing
        src.slotnames = Base.fill(:none, nargtypes)
    else
        length(slotnames) == nargtypes || error("mismatched `argtypes` and `slotnames`")
        src.slotnames = slotnames
    end
    src.slotflags = Base.fill(zero(UInt8), nargtypes)
    src.slottypes = copy(ir.argtypes)
    return Core.Compiler.ir_to_codeinf!(src, ir)
end

# create a method (like `jl_make_oc_method`)
# Allocates an uninitialized `Method` in `Main` and fills in the fields needed for an
# opaque GPU closure taking `nargs` arguments (the method itself gets one extra slot).
# `file` and `line` set the reported source location; `world` becomes the primary world.
function make_oc_method(nargs; file=nothing, line=0, world=GPUCompiler.tls_world_age())
    method = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main)

    # identity
    method.name = Symbol("opaque gpu closure")
    method.file = something(file, Symbol())
    method.line = line

    # calling convention
    method.sig = Tuple
    method.nargs = nargs + 1
    method.isva = false
    method.is_for_opaque_closure = 0

    # world-age bounds
    Base.@atomic method.primary_world = world
    Base.@atomic method.deleted_world = typemax(UInt)

    return method
end

# Wrap `src` in a `CodeInstance` for `mi`, owned by the cache owner of `interp`.
#
# The instance is valid from `world` up to `typemax(UInt)`, reports `rt` as its return
# type and `Any` as its exception type, and carries zeroed flags and effects.
function make_oc_codeinstance(mi::MethodInstance, src::CodeInfo; interp, world, rt)
    cache_owner = Core.Compiler.cache_owner(interp)
    return CodeInstance(
        mi, cache_owner, rt,
        Any,            # exctype
        C_NULL,         # inferred_const
        src,            # inferred
        Int32(0),       # const_flags
        world,          # min_world
        typemax(UInt),  # max_world
        UInt32(0),      # ipo_effects
        UInt32(0),      # effects
        nothing,        # analysis_results
        UInt8(0),       # relocatability
        src.debuginfo)
end

# generated function `ccall`, working around the restriction that ccall type
# tuples need to be literals. this relies on ccall internals...
@inline @generated function generated_ccall(f::Ptr, _rettyp, _types, vals...)
    # in the generator, `_rettyp` and `_types` are bound to `Type{...}`: unwrap them
    rettyp = _rettyp.parameters[1]
    types = _types.parameters[1].parameters
    nvals = length(vals)

    # expressions/names for each stage of the argument conversion pipeline
    args = [:(vals[$i]) for i in 1:nvals]
    cconverted = [Symbol("cconverted_$i") for i in 1:nvals]
    unsafe_converted = [Symbol("unsafe_converted_$i") for i in 1:nvals]

    body = Expr(:block)

    # stage 1: cconvert
    for (dst, typ, src) in zip(cconverted, types, args)
        push!(body.args, :($dst = Base.cconvert($typ, $src)))
    end

    # stage 2: unsafe_convert
    for (dst, typ, src) in zip(unsafe_converted, types, cconverted)
        push!(body.args, :($dst = Base.unsafe_convert($typ, $src)))
    end

    # the raw call: the unsafe-converted values are passed as arguments, followed by
    # the cconverted values (presumably kept alive as roots, per the `foreigncall`
    # layout -- this is the part that relies on ccall internals)
    push!(body.args,
          Expr(:foreigncall, :f, rettyp, Core.svec(types...), 0,
               QuoteNode(:ccall), unsafe_converted..., cconverted...))
    return body
end

# static opaque closures

# XXX: because we can't call functions from other CUDA modules, we effectively need to
# recompile when the target function changes. this, and because of how GPUCompiler's
# deferred compilation mechanism currently works, is why we have `F` as a type param.

# XXX: because of GPU code requiring specialized signatures, we also need to recompile
# when the environment or argument types change. together with the above, this
# negates much of the benefit of opaque closures.

# TODO: support for constructing an opaque closure from source code

# TODO: complete support for passing an environment. this probably requires a split into
# host and device structures to, e.g., root a CuArray and pass a CuDeviceArray.

# Device-callable handle to a statically compiled opaque closure.
#
# Type parameters, as filled in by `create_static_oc`:
# - `F`: index of the closure's compiler job in `GPUCompiler.deferred_codegen_jobs`
# - `E`: type of the captured environment tuple
# - `A`: argument signature, a `Tuple{...}` type
# - `R`: return type
#
# The only field is the captured environment.
struct OpaqueClosure{F, E, A, R} # func, env, args, ret
env::E
end

# Construct a static opaque closure from `IRCode`.
#
# The argument signature and the return type are derived from `ir` itself. `slotnames`
# is forwarded to `make_oc_codeinfo`, and `env...` is captured as the environment.
function OpaqueClosure(ir::IRCode, @nospecialize env...;
                       slotnames::Union{Nothing,Vector{Symbol}}=nothing)
    # `ir.argtypes[1]` is reserved for the environment; the rest are real arguments
    nargs = length(ir.argtypes) - 1
    rettype = compute_ir_rettype(ir)
    argsig = compute_oc_signature(ir, nargs)
    codeinfo = make_oc_codeinfo(ir, env...; slotnames)
    return create_static_oc(codeinfo, argsig, rettype, nargs, env...)
end

# Construct a static opaque closure from an existing `CodeInfo`. Nothing is derived
# from the source here, so the caller must supply `rettype`, `sig` and `nargs`.
function OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs)
    closure = create_static_oc(src, sig, rettype, nargs, env...)
    return closure
end

# Register a static opaque closure with the compiler and return its callable handle.
#
# Creates a fresh method and method instance for the signature `(typeof(env), sig...)`,
# wraps them in a non-kernel `CompilerJob` for the current device, records that job for
# deferred codegen, and stores `src` and `rt` in `opaque_closure_jobs` under that job.
function create_static_oc(src, @nospecialize(sig), @nospecialize(rt), nargs::Int,
                          @nospecialize env...; file=nothing, line=0)
    cfg = compiler_config(device(); kernel=false)
    method = make_oc_method(nargs; file, line)

    # method instance for the full signature: the environment comes first
    envtype = typeof(env)
    full_sig = Tuple{envtype, sig.parameters...}
    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
               (Any, Any, Any), method, full_sig, Core.svec())
    job = CompilerJob(mi, cfg, method.primary_world)

    # the deferred-codegen id doubles as the closure's `F` type parameter
    deferred = GPUCompiler.deferred_codegen_jobs
    id = length(deferred) + 1
    deferred[id] = job
    closure = OpaqueClosure{id, envtype, sig, rt}(env)

    opaque_closure_jobs[job] = (oc=closure, src=src, rt=rt)
    return closure
end

# device-side call
function (oc::OpaqueClosure)(args...)
    return call(oc, args...)
end
## NOTE: split into two to make `SciML.isinplace(oc)` work.
## it also resembles how kernels are called.
@inline function call(oc::OpaqueClosure{F,E,A,R}, args...) where {F,E,A,R}
    # resolve the deferred compilation job `F` to a function pointer
    fptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), F)
    assume(fptr != C_NULL)
    # conceptually `ccall(fptr, R, (A...), args...)`; routed through a generated
    # function because the types are not literals here
    return generated_ccall(fptr, R, A, args...)
end

# dynamic opaque closures

# Registry of JIT opaque closures, keyed by the concrete closure type
# (`typeof(oc)`, always a `DataType`). Each value is a named tuple
# `(; env, meth, config, oc)` holding what is needed to build a `CompilerJob` later.
const jit_opaque_closures = Dict{DataType,Any}()

# Opaque closure whose code is produced lazily, per set of argument types.
#
# - `builder`: called with the argument types to produce the code to compile
# - `tfunc`: called with the argument types to obtain the return type
#
# NOTE(review): the default `tfunc` returns the value `nothing`, whereas the device-side
#               `call` method dispatches on `::Type{R}` -- confirm the default is usable.
struct JITOpaqueClosure{B, T}
    builder::B
    tfunc::T

    function JITOpaqueClosure(builder, tfunc=Returns(nothing); nargs)
        # device and world are fixed when the closure is constructed, yet only needed
        # once a CompilerJob is created. they cannot be encoded in the object itself,
        # so they are parked in a global dictionary keyed by the closure's type.
        cfg = compiler_config(device(); kernel=false)
        method = make_oc_method(nargs)

        closure = new{typeof(builder), typeof(tfunc)}(builder, tfunc)
        jit_opaque_closures[typeof(closure)] =
            (env=(), meth=method, config=cfg, oc=closure)

        return closure
    end
end

# device-side call
function (oc::JITOpaqueClosure)(args...)
    # ask the user-provided type function for the return type of this call
    argtypes = map(Core.Typeof, args)
    rettype = oc.tfunc(argtypes...)
    return call(oc, rettype, args...)
end
# Device-side call of a `JITOpaqueClosure` with an explicit return type `R`.
#
# This is a generated function: inside the generator, `oct` is the closure *type* and
# `args` are the argument *types*. The generator looks up the metadata registered for
# the closure type, creates a `CompilerJob` for the concrete signature, records it for
# `GPUCompiler.prepare_job!` and for deferred codegen, and emits code that resolves the
# job to a function pointer and calls it.
@inline @generated function call(oct::JITOpaqueClosure{B,T}, ::Type{R}, args...) where {B,T,R}
    rt = R
    (; env, meth, config, oc) = jit_opaque_closures[oct]

    # look up a method instance and create a compiler job
    full_sig = Tuple{typeof(env), args...}
    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
               (Any, Any, Any), meth, full_sig, Core.svec())
    job = CompilerJob(mi, config, meth.primary_world)
    opaque_closure_jobs[job] = (; oc, args, rt)

    # generate a deferred compilation call
    id = length(GPUCompiler.deferred_codegen_jobs) + 1
    GPUCompiler.deferred_codegen_jobs[id] = job
    quote
        # NOTE(review): after #582 it should be possible to just emit a
        #               `gpuc.lookup(mi, oc, args...)` or maybe
        #               `gpuc.deferred(oc, args...)` here instead.
        ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id)
        assume(ptr != C_NULL)
        # conceptually `ccall(ptr, R, (A...), args...)`, but ccall needs literal types
        generated_ccall(ptr, $rt, $(Tuple{args...}), args...)
    end
end

# compilation of opaque closures

# Maps the `CompilerJob` of each opaque closure to the data `GPUCompiler.prepare_job!`
# needs to materialize its code: `(; oc, src, rt)` for static closures and
# `(; oc, args, rt)` for JIT closures.
const opaque_closure_jobs = Dict{CompilerJob,Any}()

# Hook that runs before a job is compiled. For jobs that belong to an opaque closure,
# it materializes the closure's `CodeInfo`, wraps it in a `CodeInstance` and stores
# that in the job's code cache under `job.source`. Other jobs are left untouched.
#
# Static closures carry their `CodeInfo` in the registry. JIT closures invoke their
# `builder` with the recorded argument types; a resulting `IRCode` is checked against
# the recorded return type and converted to `CodeInfo`.
#
# Raises an error when the builder's IR infers to a different return type than the
# recorded one, or when no `CodeInfo` could be obtained.
function GPUCompiler.prepare_job!(@nospecialize(job::CUDACompilerJob))
    # single lookup; registry values are named tuples, so `nothing` means "not ours"
    entry = get(opaque_closure_jobs, job, nothing)
    entry === nothing && return

    rt = entry.rt
    oc = entry.oc
    src = if oc isa JITOpaqueClosure
        built = oc.builder(entry.args...)
        if built isa IRCode
            # the builder is user code: validate explicitly rather than via `@assert`
            compute_ir_rettype(built) == rt ||
                error("Inferred return type does not match the provided return type")
            make_oc_codeinfo(built)
        else
            built
        end
    else
        entry.src
    end
    src isa CodeInfo ||
        error("Opaque closure source must be a CodeInfo, got $(typeof(src))")

    # create a code instance and store it in the cache
    interp = GPUCompiler.get_interpreter(job)
    ci = make_oc_codeinstance(job.source, src; interp, job.world, rt)
    Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, job.source)

    return
end
84 changes: 84 additions & 0 deletions test/core/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1095,3 +1095,87 @@ end
end

############################################################################################

if VERSION >= v"1.12-"
@testset "opaque closures" begin

    # static closure built from IRCode: signature and return type are derived from the IR
    let
        ir = first(only(Base.code_ircode(+, (Int, Int))))
        oc = CUDA.OpaqueClosure(ir)

        out = CuArray([0])
        x = CuArray([1])
        y = CuArray([2])

        function add_kernel(f, out, x, y)
            idx = threadIdx().x
            @inbounds out[idx] = f(x[idx], y[idx])
            return
        end
        @cuda threads=1 add_kernel(oc, out, x, y)

        @test Array(out)[] == 3
    end

    # static closure built from CodeInfo: signature and return type are given explicitly
    let
        src, rettype = only(Base.code_typed(*, (Int, Int, Int)))
        oc = CUDA.OpaqueClosure(src; sig=Tuple{Int,Int,Int}, rettype, nargs=3)

        out = CuArray([1])
        x = CuArray([2])
        y = CuArray([3])
        z = CuArray([4])

        function mul_kernel(f, out, x, y, z)
            idx = threadIdx().x
            @inbounds out[idx] = f(x[idx], y[idx], z[idx])
            return
        end
        @cuda threads=1 mul_kernel(oc, out, x, y, z)

        @test Array(out)[] == 24
    end

    # dynamic closure: the IRCode is built on demand from the argument types
    let
        tfunc(T1, T2) = Core.Compiler.return_type(+, Tuple{T1,T2})
        function builder(T1, T2)
            return first(only(Base.code_ircode(+, (T1, T2))))
        end

        oc = CUDA.JITOpaqueClosure(builder, tfunc; nargs=2)

        function add_kernel(f, out, x, y)
            idx = threadIdx().x
            @inbounds out[idx] = f(x[idx], y[idx])
            return
        end

        # integer arguments
        let
            out = CuArray([0])
            x = CuArray([1])
            y = CuArray([2])

            @cuda threads=1 add_kernel(oc, out, x, y)

            @test Array(out)[] == 3
        end

        # same closure, now with floating-point arguments
        let
            out = CuArray([3f0])
            x = CuArray([4f0])
            y = CuArray([5f0])

            @cuda threads=1 add_kernel(oc, out, x, y)

            @test Array(out)[] == 9f0
        end
    end

end
end

############################################################################################