Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement 3D async copy #447

Merged
merged 5 commits into from
Jul 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/hip/libhip.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,11 @@ function hipPointerGetAttributes(attribute, ptr)
(Ptr{hipPointerAttribute_t}, Ptr{Cvoid}), attribute, ptr)
end

function hipMemcpy3DAsync(p, stream)
ccall((:hipMemcpy3DAsync, libhip), hipError_t,
(Ptr{hipMemcpy3DParms}, hipStream_t), p, stream)
end

function hipMallocAsync(ptr, sz, stream)
ccall((:hipMallocAsync, libhip), hipError_t,
(Ptr{Ptr{Cvoid}}, Csize_t, hipStream_t),
Expand Down
64 changes: 47 additions & 17 deletions src/hip/libhip_common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,6 @@ const HIP_LAUNCH_PARAM_BUFFER_POINTER = Ptr{Cvoid}(1)
const HIP_LAUNCH_PARAM_BUFFER_SIZE = Ptr{Cvoid}(2)
const HIP_LAUNCH_PARAM_END = Ptr{Cvoid}(3)

@cenum hipMemoryType begin
hipMemoryTypeHost
hipMemoryTypeDevice
hipMemoryTypeArray
hipMemoryTypeUnified
hipMemoryTypeManaged
end

struct hipPointerAttribute_t
memoryType::hipMemoryType
device::Cint
devicePointer::Ptr{Cvoid}
hostPointer::Ptr{Cvoid}
isManaged::Cint
allocationFlags::Cuint
end

@cenum hipMemcpyKind begin
hipMemcpyHostToHost
hipMemcpyHostToDevice
Expand All @@ -27,6 +10,14 @@ end
hipMemcpyDefault
end

@cenum hipMemoryType begin
hipMemoryTypeHost
hipMemoryTypeDevice
hipMemoryTypeArray
hipMemoryTypeUnified
hipMemoryTypeManaged
end

@cenum hiprtcResult::UInt32 begin
HIPRTC_SUCCESS = 0
HIPRTC_ERROR_OUT_OF_MEMORY = 1
Expand Down Expand Up @@ -206,6 +197,45 @@ end
hipErrorTbd
end

struct hipPos
x::Csize_t
y::Csize_t
z::Csize_t
end

struct hipPitchedPtr
ptr::Ptr{Cvoid}
pitch::Csize_t
xsize::Csize_t
ysize::Csize_t
end

struct hipExtent
width::Csize_t
height::Csize_t
depth::Csize_t
end

struct hipMemcpy3DParms
stcArray::Ptr{Cvoid}
srcPos::hipPos
srcPtr::hipPitchedPtr
dstArray::Ptr{Cvoid}
dstPos::hipPos
dstPtr::hipPitchedPtr
extent::hipExtent
kind::hipMemcpyKind
end

struct hipPointerAttribute_t
memoryType::hipMemoryType
device::Cint
devicePointer::Ptr{Cvoid}
hostPointer::Ptr{Cvoid}
isManaged::Cint
allocationFlags::Cuint
end

hipContext_t = Ptr{Cvoid}

hipDevice_t = Ptr{Cvoid}
Expand Down
54 changes: 54 additions & 0 deletions src/runtime/memory/hip.jl
Original file line number Diff line number Diff line change
Expand Up @@ -246,3 +246,57 @@ function attributes(ptr::Ptr{Cvoid})
st = HIP.hipPointerGetAttributes(data, ptr)
st, data[]
end

"""
Asynchronous 3D array copy.

# Arguments:

- `width::Integer`: Width of 3D memory copy.
- `height::Integer = 1`: Height of 3D memory copy.
- `depth::Integer = 1`: Depth of 3D memory copy.
- `dstPos::ROCDim3 = (1, 1, 1)`:
Starting position of the destination data for the copy.
- `srcPos::ROCDim3 = (1, 1, 1)`:
Starting position of the source data for the copy.
- `dstPitch::Integer = 0`: Pitch in bytes of the destination data for the copy.
- `srcPitch::Integer = 0`: Pitch in bytes of the source data for the copy.
- `async::Bool = false`: If `false`, then synchronize `stream` immediately.
- `stream::HIP.HIPStream = AMDGPU.stream()`: Stream on which to perform the copy.
"""
function unsafe_copy3d!(
dst::Ptr{T}, dstTyp::Type{D},
src::Ptr{T}, srcTyp::Type{S},
width::Integer, height::Integer = 1, depth::Integer = 1;
dstPos::ROCDim = (1, 1, 1), srcPos::ROCDim = (1, 1, 1),
dstPitch::Integer = 0, dstWidth::Integer = 0, dstHeight::Integer = 0,
srcPitch::Integer = 0, srcWidth::Integer = 0, srcHeight::Integer = 0,
async::Bool = false, stream::HIP.HIPStream = AMDGPU.stream(),
) where {T, D, S}
(width == 0 || height == 0 || depth == 0) && return dst

srcPos, dstPos = ROCDim3(srcPos), ROCDim3(dstPos)
srcPos = HIP.hipPos((srcPos[1] - 1) * sizeof(T), srcPos[2] - 1, srcPos[3] - 1)
dstPos = HIP.hipPos((dstPos[1] - 1) * sizeof(T), dstPos[2] - 1, dstPos[3] - 1)

extent = HIP.hipExtent(width * sizeof(T), height, depth)
kind = if D <: HIPBuffer && S <: HIPBuffer
HIP.hipMemcpyDeviceToDevice
elseif D <: HIPBuffer && S <: HostBuffer
HIP.hipMemcpyHostToDevice
elseif D <: HostBuffer && S <: HIPBuffer
HIP.hipMemcpyDeviceToHost
elseif D <: HostBuffer && S <: HostBuffer
HIP.hipMemcpyHostToHost
end

srcPtr = HIP.hipPitchedPtr(src, srcPitch, srcWidth, srcHeight)
dstPtr = HIP.hipPitchedPtr(dst, dstPitch, dstWidth, dstHeight)
params = Ref(HIP.hipMemcpy3DParms(
C_NULL, srcPos, srcPtr,
C_NULL, dstPos, dstPtr, extent, kind))

HIP.hipMemcpy3DAsync(params, stream) |> HIP.check
async || AMDGPU.synchronize(stream)
return dst
end
40 changes: 0 additions & 40 deletions src/runtime/memory/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ Returns the used amount of memory (in bytes), allocated on the device.
"""
used() = total() - free()

const ALL_ALLOCS = Threads.Atomic{Int64}(0)

function parse_memory_limit(limit_str::String)
limit_str == "none" && return typemax(UInt64)

Expand Down Expand Up @@ -117,41 +115,3 @@ function alloc_or_retry!(f)
check(status)
return
end

"""
Allocate linear memory on the device and return a buffer to the allocated memory. The
allocated memory is suitably aligned for any kind of variable. The memory will not be freed
automatically, use [`free(::Buffer)`](@ref) for that.
"""
function alloc end

"""
Free device memory.
"""
function free end

"""
Initialize device memory with a repeating value.
"""
function set! end

"""
Upload memory from host to device.
Executed asynchronously on `queue` if `async` is true.
"""
function upload end
@doc (@doc upload) upload!

"""
Download memory from device to host.
Executed asynchronously on `queue` if `async` is true.
"""
function download end
@doc (@doc download) download!

"""
Transfer memory from device to device.
Executed asynchronously on `queue` if `async` is true.
"""
function transfer end
@doc (@doc transfer) transfer!
101 changes: 101 additions & 0 deletions test/rocarray/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -222,4 +222,105 @@ end
@test refcount_live(A) == (0, false)
end

@testset "unsafe_copy3d!" begin
@testset "Full copy" begin
T = Int32
src = ROCArray(ones(T, 4, 4, 4))
dst = ROCArray(zeros(T, 4, 4, 4))
Mem.unsafe_copy3d!(
pointer(dst), typeof(dst.buf),
pointer(src), typeof(src.buf),
length(src))
@test Array(src) == Array(dst)
end

@testset "3D Copy middle part of y-z planes, each dimension is different in size" begin
nx, ny, nz = 4, 6, 8
src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz)))
dst = ROCArray(zeros(Int, nx, ny, nz))
Mem.unsafe_copy3d!(
pointer(dst), typeof(dst.buf),
pointer(src), typeof(src.buf),
1, 4, 4;
dstPos=(1, 2, 3), srcPos=(1, 2, 3),
dstPitch=nx * sizeof(Int), dstHeight=ny,
srcPitch=nx * sizeof(Int), srcHeight=ny)
@test Array(src)[1, 2:5, 3:6] == Array(dst)[1, 2:5, 3:6]
end

@testset "3D Copy middle part of x-y-z planes, each dimension is different in size" begin
nx, ny, nz = 4, 6, 8
src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz)))
dst = ROCArray(zeros(Int, nx, ny, nz))
Mem.unsafe_copy3d!(
pointer(dst), typeof(dst.buf),
pointer(src), typeof(src.buf),
2, 4, 4;
dstPos=(2, 2, 3), srcPos=(2, 2, 3),
dstPitch=nx * sizeof(Int), dstHeight=ny,
srcPitch=nx * sizeof(Int), srcHeight=ny)
@test Array(src)[2:3, 2:5, 3:6] == Array(dst)[2:3, 2:5, 3:6]
end

@testset "3D -> 2D -> 3D copy" begin
nx, ny, nz = 2, 3, 4
T = Int
P = ROCArray(reshape(1:(2 * 3 * 4), nx, ny, nz))

for dim in 1:3
if dim == 1
ranges = [2:2, 1:size(P,2), 1:size(P,3)]
buf = zeros(T, size(P,2), size(P,3))
buf_view_shape = (1, size(P,2), size(P,3))
elseif dim == 2
ranges = [1:size(P,1), 3:3, 1:size(P,3)]
buf = zeros(T, size(P,1), size(P,3))
buf_view_shape = (size(P,1), 1, size(P,3))
elseif dim == 3
ranges = [1:size(P,1), 1:size(P,2), 3:3]
buf = zeros(T, size(P,1), size(P,2))
buf_view_shape = (size(P,1), size(P,2), 1)
end

# Reshape 2D to 3D for simplicity.
buf_view = reshape(buf, buf_view_shape)

AMDGPU.Mem.unsafe_copy3d!(
pointer(buf), AMDGPU.Mem.HostBuffer,
pointer(P), typeof(P.buf),
length(ranges[1]), length(ranges[2]), length(ranges[3]);
srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]),
dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2),
srcPitch=sizeof(T) * size(P, 1), srcHeight=size(P, 2))

if dim == 1
@assert buf == Array(P)[2, :, :]
elseif dim == 2
@assert buf == Array(P)[:, 3, :]
elseif dim == 3
@assert buf == Array(P)[:, :, 3]
end

# host to device
P2 = similar(P)

AMDGPU.Mem.unsafe_copy3d!(
pointer(P2), typeof(P2.buf),
pointer(buf), AMDGPU.Mem.HostBuffer,
length(ranges[1]), length(ranges[2]), length(ranges[3]);
dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]),
dstPitch=sizeof(T) * size(P2,1), dstHeight=size(P2, 2),
srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2))

if dim == 1
@assert Array(P2)[2, :, :] == Array(P)[2, :, :]
elseif dim == 2
@assert Array(P2)[:, 3, :] == Array(P)[:, 3, :]
elseif dim == 3
@assert Array(P2)[:, :, 3] == Array(P)[:, :, 3]
end
end
end
end

end
7 changes: 4 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ if "core" in TARGET_TESTS
include("codegen/synchronization.jl")
include("codegen/trap.jl")
end)
push!(tests, "Multitasking" => ()->include("tls.jl"))
push!(tests, "ROCArray - Base" => ()->include("rocarray/base.jl"))
push!(tests, "ROCArray - Broadcast" => ()->include("rocarray/broadcast.jl"))

# if AMDGPU.Runtime.LOGGING_STATIC_ENABLED
# push!(tests, "Logging" => ()->include("logging.jl"))
# else
Expand All @@ -145,9 +149,6 @@ if "core" in TARGET_TESTS
# """
# @test_skip "Logging"
# end
push!(tests, "Multitasking" => ()->include("tls.jl"))
push!(tests, "ROCArray - Base" => ()->include("rocarray/base.jl"))
push!(tests, "ROCArray - Broadcast" => ()->include("rocarray/broadcast.jl"))
end

if "hip" in TARGET_TESTS
Expand Down
Loading