JuliaGPU · maleadt · Apr 4, 2023 · Apr 14, 2023 · Apr 14, 2023 · Apr 14, 2023
diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
@@ -425,3 +425,268 @@ function run_and_collect(cmd)
 
     return proc, log
 end
+
+
+
+## opaque closures
+
+# TODO: once stabilised, move bits of this into GPUCompiler.jl
+
+using Core.Compiler: IRCode
+using Core: CodeInfo, MethodInstance, CodeInstance, LineNumberNode
+
+# helpers
+
+# Merge the types of every value-carrying `ReturnNode` in `ir`, widened at the end.
+function compute_ir_rettype(ir::IRCode)
+    merged = Union{}
+    for idx in 1:length(ir.stmts)
+        node = ir[Core.SSAValue(idx)][:stmt]
+        (node isa Core.Compiler.ReturnNode && isdefined(node, :val)) || continue
+        merged = Core.Compiler.tmerge(Core.Compiler.argextype(node.val, ir), merged)
+    end
+    return Core.Compiler.widenconst(merged)
+end
+
+# Signature `Tuple{...}` of the `nargs` argument types that follow slot 1 of `ir.argtypes`.
+function compute_oc_signature(ir::IRCode, nargs::Int)
+    widened = map(1:nargs) do slot
+        Core.Compiler.widenconst(ir.argtypes[slot + 1])
+    end
+    return Tuple{widened...}
+end
+
+# Convert `ir` into a `CodeInfo` that can serve as the source of an opaque closure.
+# `env` is not inspected here. `slotnames`, when given, needs one entry per element
+# of `ir.argtypes`; otherwise every slot is named `:none`.
+function make_oc_codeinfo(ir::IRCode, @nospecialize env...; slotnames=nothing)
+    # NOTE: we need ir.argtypes[1] == typeof(env)
+    ir = Core.Compiler.copy(ir)    # work on a copy of the IR from here on
+    # if the user didn't specify a definition MethodInstance or filename Symbol to use
+    # for the debuginfo, set a filename now
+    if ir.debuginfo.def === nothing
+        ir.debuginfo.def = Symbol("IR for opaque gpu closure")
+    end
+    nslots = length(ir.argtypes)
+    src = ccall(:jl_new_code_info_uninit, Ref{CodeInfo}, ())
+    if slotnames === nothing
+        src.slotnames = Base.fill(:none, nslots)
+    else
+        length(slotnames) == nslots || error("mismatched `argtypes` and `slotnames`")
+        src.slotnames = slotnames
+    end
+    src.slotflags = Base.fill(zero(UInt8), nslots)
+    src.slottypes = copy(ir.argtypes)
+    return Core.Compiler.ir_to_codeinf!(src, ir)
+end
+
+# create a bare `Method` for an opaque closure (cf. `jl_make_opaque_closure_method`)
+function make_oc_method(nargs; file=nothing, line=0, world=GPUCompiler.tls_world_age())
+    m = ccall(:jl_new_method_uninit, Ref{Method}, (Any,), Main)
+    m.name = Symbol("opaque gpu closure")
+    m.sig = Tuple
+    m.nargs = nargs + 1
+    m.isva = false
+    m.is_for_opaque_closure = 0
+    m.file = something(file, Symbol())
+    m.line = line
+    Base.@atomic m.primary_world = world
+    Base.@atomic m.deleted_world = typemax(UInt)
+    return m
+end
+
+# Wrap `src` in a `CodeInstance` for `mi`; all flags and effects are left zeroed.
+function make_oc_codeinstance(mi::MethodInstance, src::CodeInfo; interp, world, rt)
+    return CodeInstance(
+        mi, Core.Compiler.cache_owner(interp), rt,
+        Any,             # exctype
+        C_NULL,          # inferred_const
+        src,             # inferred
+        Int32(0),        # const_flags
+        world,           # min_world
+        typemax(UInt),   # max_world
+        UInt32(0),       # ipo_effects
+        UInt32(0),       # effects
+        nothing,         # analysis_results
+        UInt8(0),        # relocatability
+        src.debuginfo)
+end
+
+# generated function `ccall` through the pointer `f`, working around the restriction that
+# ccall type tuples need to be literals. this relies on ccall internals...
+@inline @generated function generated_ccall(f::Ptr, _rettyp, _types, vals...)
+    ex = quote end    # accumulates the statements of the generated body
+
+    rettyp = _rettyp.parameters[1]    # callers pass the return type itself, so unwrap `Type{R}`
+    types = _types.parameters[1].parameters    # likewise unwrap `Type{Tuple{...}}` into its elements
+    args = [:(vals[$i]) for i in 1:length(vals)]
+
+    # cconvert: emit one `cconverted_i = Base.cconvert(type_i, vals[i])` per argument
+    cconverted = [Symbol("cconverted_$i") for i in 1:length(vals)]
+    for (dst, typ, src) in zip(cconverted, types, args)
+      append!(ex.args, (quote
+         $dst = Base.cconvert($typ, $src)
+      end).args)
+    end
+
+    # unsafe_convert: emit one `unsafe_converted_i = Base.unsafe_convert(type_i, cconverted_i)`
+    unsafe_converted = [Symbol("unsafe_converted_$i") for i in 1:length(vals)]
+    for (dst, typ, src) in zip(unsafe_converted, types, cconverted)
+      append!(ex.args, (quote
+         $dst = Base.unsafe_convert($typ, $src)
+      end).args)
+    end
+
+    call = Expr(:foreigncall, :f, rettyp, Core.svec(types...), 0,
+                QuoteNode(:ccall), unsafe_converted..., cconverted...)    # cconvert results trail the arguments (presumably as GC roots -- confirm)
+    push!(ex.args, call)    # appended last, so the foreign call's result is the value of the body
+    return ex
+end
+
+# static opaque closures
+
+# XXX: because we can't call functions from other CUDA modules, we effectively need to
+#      recompile when the target function changes. this, and because of how GPUCompiler's
+#      deferred compilation mechanism currently works, is why we have `F` as a type param.
+
+# XXX: because of GPU code requiring specialized signatures, we also need to recompile
+#      when the environment or argument types change. together with the above, this
+#      negates much of the benefit of opaque closures.
+
+# TODO: support for constructing an opaque closure from source code
+
+# TODO: complete support for passing an environment. this probably requires a split into
+#       host and device structures to, e.g., root a CuArray and pass a CuDeviceArray.
+
+struct OpaqueClosure{F, E, A, R}    # F: func (deferred-codegen job id), E: env type, A: argument tuple type, R: return type
+    env::E    # the environment values captured at construction
+end
+
+# Build a static opaque closure from `ir`; signature and return type are derived from it.
+function OpaqueClosure(ir::IRCode, @nospecialize env...;
+                       slotnames::Union{Nothing,Vector{Symbol}}=nothing)
+    nparams = length(ir.argtypes) - 1
+    argsig = compute_oc_signature(ir, nparams)
+    rettype = compute_ir_rettype(ir)
+    codeinfo = make_oc_codeinfo(ir, env...; slotnames)
+    return create_static_oc(codeinfo, argsig, rettype, nparams, env...)
+end
+
+# Build a static opaque closure from a ready-made `CodeInfo`; the caller describes the call.
+OpaqueClosure(src::CodeInfo, @nospecialize env...; rettype, sig, nargs) =
+    create_static_oc(src, sig, rettype, nargs, env...)
+
+# Register a compiler job for `src` and return the matching `OpaqueClosure` object.
+function create_static_oc(src, @nospecialize(sig), @nospecialize(rt), nargs::Int,
+                          @nospecialize env...; file=nothing, line=0)
+    cfg = compiler_config(device(); kernel=false)
+    method = make_oc_method(nargs; file, line)
+
+    # the environment tuple type occupies the first slot of the specialization signature
+    spec_sig = Tuple{typeof(env), sig.parameters...}
+    instance = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
+                     (Any, Any, Any), method, spec_sig, Core.svec())
+    job = CompilerJob(instance, cfg, method.primary_world)
+
+    # the id under which the job is registered doubles as the closure's `F` parameter
+    job_id = length(GPUCompiler.deferred_codegen_jobs) + 1
+    GPUCompiler.deferred_codegen_jobs[job_id] = job
+    oc = OpaqueClosure{job_id, typeof(env), sig, rt}(env)
+    opaque_closure_jobs[job] = (; oc, src, rt)
+    return oc
+end
+
+# device-side call: the functor only forwards to `call`
+(oc::OpaqueClosure)(args...) = call(oc, args...)
+## NOTE: split into a functor plus `call` to make `SciML.isinplace(oc)` work.
+##       it also resembles how kernels are called.
+@inline function call(oc::OpaqueClosure{F,E,A,R}, args...) where {F,E,A,R}
+    # `F` is the id under which `create_static_oc` stored this closure's compiler job
+    fptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), F)
+    assume(fptr != C_NULL)
+    return generated_ccall(fptr, R, A, args...)    # stands in for `ccall(fptr, R, (A...), args...)`
+end
+
+# dynamic opaque closures
+
+const jit_opaque_closures = Dict()    # typeof(closure) => (; env, meth, config, oc), filled by the JITOpaqueClosure constructor
+
+struct JITOpaqueClosure{B, T}
+    builder::B
+    tfunc::T
+
+    function JITOpaqueClosure(builder, tfunc=Returns(nothing); nargs)
+        # device and world are fixed when the closure is constructed, yet only consumed
+        # once a CompilerJob is created. since they cannot simply be encoded in the
+        # JITOpaqueClosure object, they are parked in a global dictionary instead.
+        config = compiler_config(device(); kernel=false)
+        meth = make_oc_method(nargs)
+
+        # the closure's type is the lookup key used by the generated `call` method
+        closure = new{typeof(builder), typeof(tfunc)}(builder, tfunc)
+        jit_opaque_closures[typeof(closure)] = (; env=(), meth, config, oc=closure)
+
+        return closure
+    end
+end
+
+# device-side call: `tfunc` maps the argument types to the return type handed to `call`
+function (oc::JITOpaqueClosure)(args...)
+    argtypes = map(Core.Typeof, args)
+    return call(oc, oc.tfunc(argtypes...), args...)
+end
+@inline @generated function call(oct::JITOpaqueClosure{B,T}, ::Type{R}, args...) where {B,T,R}
+    rt = R
+    (; env, meth, config, oc) = jit_opaque_closures[oct]    # in a generated body `oct` is the closure's type, i.e. the key the constructor stored
+
+    # look up a method instance for (environment type, argument types...) and create a compiler job
+    full_sig = Tuple{typeof(env), args...}
+    mi = ccall(:jl_specializations_get_linfo, Ref{MethodInstance},
+               (Any, Any, Any), meth, full_sig, Core.svec())
+    job = CompilerJob(mi, config, meth.primary_world)
+    opaque_closure_jobs[job] = (; oc, args, rt)    # read back by `GPUCompiler.prepare_job!`; note this runs while the body is generated
+
+    # generate a deferred compilation call, registering the job under a fresh id first
+    id = length(GPUCompiler.deferred_codegen_jobs) + 1
+    GPUCompiler.deferred_codegen_jobs[id] = job
+    quote
+        ptr = ccall("extern deferred_codegen", llvmcall, Ptr{Cvoid}, (Int,), $id)
+        assume(ptr != C_NULL)
+        # stands in for: ccall(ptr, R, (A...), args...)
+        generated_ccall(ptr, $rt, $(Tuple{args...}), args...)
+    end
+end
+
+# compilation of opaque closures
+
+const opaque_closure_jobs = Dict{CompilerJob,Any}()    # job => (; oc, src, rt) for static closures, (; oc, args, rt) for JIT closures
+
+# Extends `GPUCompiler.prepare_job!` for CUDA jobs: if the job belongs to an opaque
+# closure, materialize its `CodeInfo` and seed the job's code-instance cache with it.
+# Jobs that are not registered in `opaque_closure_jobs` are left untouched. JIT closures
+# call their `builder` here, which may return either `IRCode` or a ready-made `CodeInfo`.
+function GPUCompiler.prepare_job!(@nospecialize(job::CUDACompilerJob))
+    entry = get(opaque_closure_jobs, job, nothing)
+    entry === nothing && return
+    (; oc, rt) = entry
+
+    if oc isa JITOpaqueClosure
+        # JIT closures build their source now, from the argument types recorded by `call`
+        src = oc.builder(entry.args...)
+        if src isa IRCode
+            @assert compute_ir_rettype(src) == rt "Inferred return type does not match the provided return type"
+            src = make_oc_codeinfo(src)
+        end
+    else
+        # static closures captured their source at construction time
+        src = entry.src
+    end
+    @assert src isa CodeInfo
+
+    # create a code instance and store it in the cache
+    interp = GPUCompiler.get_interpreter(job)
+    ci = make_oc_codeinstance(job.source, src; interp, job.world, rt)
+    Core.Compiler.setindex!(GPUCompiler.ci_cache(job), ci, job.source)
+
+    return
+end
diff --git a/test/core/execution.jl b/test/core/execution.jl
@@ -1095,3 +1095,87 @@ end
 end
 
 ############################################################################################
+
+if VERSION >= v"1.12-"
+@testset "opaque closures" begin
+
+# static closure, constructed from IRCode
+let
+    ir, _ = only(Base.code_ircode(+, (Int, Int)))
+    closure = CUDA.OpaqueClosure(ir)
+
+    out = CuArray([0])
+    x = CuArray([1])
+    y = CuArray([2])
+
+    function add_kernel(f, out, x, y)
+        tid = threadIdx().x
+        @inbounds out[tid] = f(x[tid], y[tid])
+        return
+    end
+    @cuda threads=1 add_kernel(closure, out, x, y)
+
+    @test Array(out)[] == 3
+end
+
+# static closure, constructed from CodeInfo
+let
+    src, rettype = only(Base.code_typed(*, (Int, Int, Int)))
+    closure = CUDA.OpaqueClosure(src; sig=Tuple{Int,Int,Int}, rettype, nargs=3)
+
+    out = CuArray([1])
+    x = CuArray([2])
+    y = CuArray([3])
+    z = CuArray([4])
+
+    function mul_kernel(f, out, x, y, z)
+        tid = threadIdx().x
+        @inbounds out[tid] = f(x[tid], y[tid], z[tid])
+        return
+    end
+    @cuda threads=1 mul_kernel(closure, out, x, y, z)
+
+    @test Array(out)[] == 24
+end
+
+# dynamic closure, constructing IRCode based on argument types
+let
+    infer_rt(T1, T2) = Core.Compiler.return_type(+, Tuple{T1,T2})
+    function build_ir(T1, T2)
+        ir, _ = only(Base.code_ircode(+, (T1, T2)))
+        return ir
+    end
+
+    closure = CUDA.JITOpaqueClosure(build_ir, infer_rt; nargs=2)
+
+    function add_kernel(f, out, x, y)
+        tid = threadIdx().x
+        @inbounds out[tid] = f(x[tid], y[tid])
+        return
+    end
+
+    let    # Int inputs
+        out = CuArray([0])
+        x = CuArray([1])
+        y = CuArray([2])
+
+        @cuda threads=1 add_kernel(closure, out, x, y)
+
+        @test Array(out)[] == 3
+    end
+
+    let    # Float32 inputs, same closure object
+        out = CuArray([3f0])
+        x = CuArray([4f0])
+        y = CuArray([5f0])
+
+        @cuda threads=1 add_kernel(closure, out, x, y)
+
+        @test Array(out)[] == 9f0
+    end
+end
+
+end
+end
+
+############################################################################################