Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add cms caching cuda allocator #79

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ vecmem_add_library( vecmem_cuda cuda SHARED
"src/utils/cuda_wrappers.hpp"
"src/utils/cuda_wrappers.cpp"
"src/utils/select_device.hpp"
"src/utils/select_device.cpp" )
"src/utils/select_device.cpp"
# not-cub caching allocator
"src/memory/cuda/notcub/allocate_device.cpp"
"src/memory/cuda/notcub/allocate_host.cpp"
)
target_link_libraries( vecmem_cuda
PUBLIC vecmem::core
PRIVATE CUDA::cudart )
747 changes: 747 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/CachingDeviceAllocator.h

Large diffs are not rendered by default.

648 changes: 648 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/CachingHostAllocator.h

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/ScopedSetDevice.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef HeterogeneousCore_CUDAUtilities_ScopedSetDevice_h
#define HeterogeneousCore_CUDAUtilities_ScopedSetDevice_h

#include "../../../../../src/utils/cuda_error_handling.hpp"

#include <cuda_runtime.h>

namespace vecmem {
namespace cuda {
namespace notcub {
  /// RAII guard that makes @a newDevice the current CUDA device for the
  /// lifetime of the guard and restores the previously current device on
  /// destruction.
  class ScopedSetDevice {
  public:
    /// Remember the currently active device, then switch to @a newDevice.
    /// Errors are reported through VECMEM_CUDA_ERROR_CHECK.
    explicit ScopedSetDevice(int newDevice) {
      VECMEM_CUDA_ERROR_CHECK(cudaGetDevice(&prevDevice_));
      VECMEM_CUDA_ERROR_CHECK(cudaSetDevice(newDevice));
    }

    // A scope guard must be unique: a copy would restore the saved device
    // a second time from an unrelated scope.
    ScopedSetDevice(const ScopedSetDevice&) = delete;
    ScopedSetDevice& operator=(const ScopedSetDevice&) = delete;

    ~ScopedSetDevice() {
      // Intentionally don't check the return value to avoid
      // exceptions to be thrown. If this call fails, the process is
      // doomed anyway.
      cudaSetDevice(prevDevice_);
    }

  private:
    int prevDevice_;  // device that was current when the guard was created
  };
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
18 changes: 18 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/allocate_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef HeterogeneousCore_CUDAUtilities_allocate_device_h
#define HeterogeneousCore_CUDAUtilities_allocate_device_h

#include <cuda_runtime.h>

#include <cstddef>  // size_t — do not rely on cuda_runtime.h pulling it in

namespace vecmem {
namespace cuda {
namespace notcub {
  /// Allocate @a nbytes of device memory on device @a device, associated
  /// with @a stream. Depending on the compile-time allocator policy the
  /// memory may come from a caching allocator (see allocate_device.cpp).
  void *allocate_device(int device, size_t nbytes, cudaStream_t stream);

  /// Free device memory previously obtained from allocate_device()
  /// (to be called from unique_ptr).
  void free_device(int device, void *ptr, cudaStream_t stream);
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
18 changes: 18 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/allocate_host.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef HeterogeneousCore_CUDAUtilities_allocate_host_h
#define HeterogeneousCore_CUDAUtilities_allocate_host_h

#include <cuda_runtime.h>

#include <cstddef>  // size_t — do not rely on cuda_runtime.h pulling it in

namespace vecmem {
namespace cuda {
namespace notcub {
  /// Allocate @a nbytes of pinned host memory, associated with @a stream
  /// (to be called from unique_ptr). Depending on the compile-time
  /// allocator policy the memory may come from a caching allocator.
  void *allocate_host(size_t nbytes, cudaStream_t stream);

  /// Free pinned host memory previously obtained from allocate_host()
  /// (to be called from unique_ptr).
  void free_host(void *ptr);
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
18 changes: 18 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/cuda_assert.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// The omission of #include guards is on purpose: it does make sense to #include
// this file multiple times, setting a different value of GPU_DEBUG beforehand.
//
// In device code (__CUDA_ARCH__ defined), the state of NDEBUG — and hence
// whether assert() is compiled in by <cassert> below — is controlled by
// GPU_DEBUG instead of the host build configuration. Host code is left
// untouched.
#ifdef __CUDA_ARCH__
#ifndef GPU_DEBUG
// disable asserts
#ifndef NDEBUG
#define NDEBUG
#endif
#else
// enable asserts
#ifdef NDEBUG
#undef NDEBUG
#endif
#endif
#endif // __CUDA_ARCH__

// <cassert> re-evaluates NDEBUG on every inclusion, which is what makes
// the re-include pattern above work.
#include <cassert>
23 changes: 23 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/deviceAllocatorStatus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h

#include <cstddef>  // size_t — previously relied on <map> providing it transitively
#include <map>

namespace vecmem {
namespace cuda {
namespace allocator {
  /// Per-device byte counters maintained by the caching device allocator.
  /// In-class initializers replace the old assigning default constructor;
  /// a default-constructed TotalBytes is still all zeros.
  struct TotalBytes {
    size_t free = 0;           ///< bytes held in the cache, currently unused
    size_t live = 0;           ///< bytes handed out (allocated size)
    size_t liveRequested = 0;  ///< bytes handed out (requested size); CMS: monitor also requested amount
  };
  /// Map type of device ordinals to the number of cached bytes cached by each device
  using GpuCachedBytes = std::map<int, TotalBytes>;
} // namespace allocator

/// Snapshot of the caching device allocator's per-device byte counters.
allocator::GpuCachedBytes deviceAllocatorStatus();
} // namespace cuda
} // namespace vecmem

#endif
20 changes: 20 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/deviceCount.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef HeterogenousCore_CUDAUtilities_deviceCount_h
#define HeterogenousCore_CUDAUtilities_deviceCount_h

#include "../../../../../src/utils/cuda_error_handling.hpp"

#include <cuda_runtime.h>

namespace vecmem {
namespace cuda {
namespace notcub {
// Number of CUDA devices visible to this process, as reported by
// cudaGetDeviceCount(). Errors are routed through VECMEM_CUDA_ERROR_CHECK.
inline int deviceCount() {
  int ndevices;
  VECMEM_CUDA_ERROR_CHECK(cudaGetDeviceCount(&ndevices));
  return ndevices;
}
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
86 changes: 86 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/getCachingDeviceAllocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator
#define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator

#include <algorithm>  // std::min (was missing)
#include <iomanip>
#include <iostream>
#include <limits>     // std::numeric_limits (was missing)

#include <cuda_runtime.h>

#include "../../../../../src/utils/cuda_error_handling.hpp"
#include "deviceCount.h"
#include "CachingDeviceAllocator.h"

namespace vecmem::cuda::allocator {
  // Use caching or not
  enum class Policy { Synchronous = 0, Asynchronous = 1, Caching = 2 };
#ifndef CUDA_DISABLE_CACHING_ALLOCATOR
  constexpr Policy policy = Policy::Caching;
#elif CUDA_VERSION >= 11020 && !defined CUDA_DISABLE_ASYNC_ALLOCATOR
  constexpr Policy policy = Policy::Asynchronous;
#else
  constexpr Policy policy = Policy::Synchronous;
#endif
  // Growth factor (bin_growth in cub::CachingDeviceAllocator)
  constexpr unsigned int binGrowth = 2;
  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
  constexpr unsigned int minBin = 8;
  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in
  // cub::CachingDeviceAllocator). Note that unlike in cub, allocations
  // larger than binGrowth^maxBin are set to fail.
  constexpr unsigned int maxBin = 30;
  // Total storage for the allocator. 0 means no limit.
  constexpr size_t maxCachedBytes = 0;
  // Fraction of total device memory taken for the allocator. In case there
  // are multiple devices with different amounts of memory, the smallest of
  // them is taken. If maxCachedBytes is non-zero, the smaller of the two
  // resulting limits is used.
  constexpr double maxCachedFraction = 0.8;
  constexpr bool debug = false;

  // Cache limit actually applied: the minimum over all devices of
  // maxCachedFraction * (free memory at call time), optionally capped by
  // maxCachedBytes. The current device is temporarily switched while
  // querying and restored before returning.
  inline size_t minCachedBytes() {
    size_t ret = std::numeric_limits<size_t>::max();
    int currentDevice;
    VECMEM_CUDA_ERROR_CHECK(cudaGetDevice(&currentDevice));
    const int numberOfDevices = vecmem::cuda::notcub::deviceCount();
    for (int i = 0; i < numberOfDevices; ++i) {
      size_t freeMemory, totalMemory;
      VECMEM_CUDA_ERROR_CHECK(cudaSetDevice(i));
      VECMEM_CUDA_ERROR_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
    }
    VECMEM_CUDA_ERROR_CHECK(cudaSetDevice(currentDevice));
    if (maxCachedBytes > 0) {
      ret = std::min(ret, maxCachedBytes);
    }
    return ret;
  }

  // Function-local-static accessor for the process-wide caching device
  // allocator; initialization is thread safe, and the allocator's public
  // interface is thread safe as well.
  inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
    if (debug) {
      std::cout << "cub::CachingDeviceAllocator settings\n"
                << " bin growth " << binGrowth << "\n"
                << " min bin " << minBin << "\n"
                << " max bin " << maxBin << "\n"
                << " resulting bins:\n";
      for (auto bin = minBin; bin <= maxBin; ++bin) {
        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
        // Report each bin size in the largest unit that divides it evenly.
        if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
          std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n";
        } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
          std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n";
        } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) {
          std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n";
        } else {
          std::cout << " " << std::setw(9) << binSize << " B\n";
        }
      }
      std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n";
    }

    // the public interface is thread safe
    static notcub::CachingDeviceAllocator allocator{binGrowth,
                                                    minBin,
                                                    maxBin,
                                                    minCachedBytes(),
                                                    false, // do not skip cleanup
                                                    debug};
    return allocator;
  }
} // namespace vecmem::cuda::allocator

#endif
45 changes: 45 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/getCachingHostAllocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator
#define HeterogeneousCore_CUDACore_src_getCachingHostAllocator

#include <iomanip>
#include <iostream>

#include "../../../../../src/utils/cuda_error_handling.hpp"
#include "CachingHostAllocator.h"
#include "getCachingDeviceAllocator.h"  // provides binGrowth/minBin/maxBin, minCachedBytes(), debug

namespace vecmem::cuda::allocator {
// Function-local-static accessor for the process-wide caching pinned-host
// allocator. The bin configuration and the cache-size limit are shared
// with the device allocator (constants in getCachingDeviceAllocator.h).
inline notcub::CachingHostAllocator& getCachingHostAllocator() {
  if (debug) {
    std::cout << "cub::CachingHostAllocator settings\n"
              << " bin growth " << binGrowth << "\n"
              << " min bin " << minBin << "\n"
              << " max bin " << maxBin << "\n"
              << " resulting bins:\n";
    // Print each bin size in the largest unit that divides it evenly.
    for (auto bin = minBin; bin <= maxBin; ++bin) {
      auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
      if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
        std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n";
      } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
        std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n";
      } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) {
        std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n";
      } else {
        std::cout << " " << std::setw(9) << binSize << " B\n";
      }
    }
    std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n";
  }

  // the public interface is thread safe
  static notcub::CachingHostAllocator allocator{binGrowth,
                                                minBin,
                                                maxBin,
                                                minCachedBytes(),
                                                false, // do not skip cleanup
                                                debug};
  return allocator;
}
} // namespace vecmem::cuda::allocator

#endif
52 changes: 52 additions & 0 deletions cuda/src/memory/cuda/notcub/allocate_device.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <cassert>
#include <limits>
#include <stdexcept>  // std::runtime_error (was missing, used below)
#include <string>     // std::to_string (was missing, used below)

#include <cuda_runtime.h>

#include "../../../utils/cuda_error_handling.hpp"

#include "vecmem/memory/cuda/notcub/ScopedSetDevice.h"
#include "vecmem/memory/cuda/notcub/allocate_device.h"
#include "vecmem/memory/cuda/notcub/getCachingDeviceAllocator.h"

namespace {
  // Hard upper limit for a single caching allocation: the size of the
  // largest bin. Unlike cub, larger requests are rejected, not passed
  // through to cudaMalloc.
  const size_t maxAllocationSize =
      vecmem::cuda::notcub::CachingDeviceAllocator::IntPow(vecmem::cuda::allocator::binGrowth,
                                                           vecmem::cuda::allocator::maxBin);
}

namespace vecmem::cuda::notcub {
  // Allocate nbytes of device memory on device `dev`, tied to `stream`,
  // using the compile-time selected policy: caching allocator,
  // stream-ordered cudaMallocAsync (CUDA >= 11.2), or plain cudaMalloc.
  // Throws std::runtime_error if a caching request exceeds the largest bin.
  void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) {
    void *ptr = nullptr;
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      if (nbytes > maxAllocationSize) {
        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
      }
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
#if CUDA_VERSION >= 11020
    } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
      ScopedSetDevice setDeviceForThisScope(dev);
      VECMEM_CUDA_ERROR_CHECK(cudaMallocAsync(&ptr, nbytes, stream));
#endif
    } else {
      ScopedSetDevice setDeviceForThisScope(dev);
      VECMEM_CUDA_ERROR_CHECK(cudaMalloc(&ptr, nbytes));
    }
    return ptr;
  }

  // Release memory obtained from allocate_device(), dispatching on the
  // same policy: return it to the cache, cudaFreeAsync, or cudaFree.
  void free_device(int device, void *ptr, cudaStream_t stream) {
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
#if CUDA_VERSION >= 11020
    } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
      ScopedSetDevice setDeviceForThisScope(device);
      VECMEM_CUDA_ERROR_CHECK(cudaFreeAsync(ptr, stream));
#endif
    } else {
      ScopedSetDevice setDeviceForThisScope(device);
      VECMEM_CUDA_ERROR_CHECK(cudaFree(ptr));
    }
  }

} // namespace vecmem::cuda::notcub
36 changes: 36 additions & 0 deletions cuda/src/memory/cuda/notcub/allocate_host.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include <limits>
#include <stdexcept>  // std::runtime_error (was missing, used below)
#include <string>     // std::to_string (was missing, used below)

#include "../../../utils/cuda_error_handling.hpp"
#include "vecmem/memory/cuda/notcub/allocate_host.h"
#include "vecmem/memory/cuda/notcub/getCachingDeviceAllocator.h"
#include "vecmem/memory/cuda/notcub/getCachingHostAllocator.h"

namespace {
  // Hard upper limit for a single caching allocation: the size of the
  // largest bin (shared bin configuration with the device allocator).
  const size_t maxAllocationSize =
      vecmem::cuda::notcub::CachingDeviceAllocator::IntPow(vecmem::cuda::allocator::binGrowth,
                                                           vecmem::cuda::allocator::maxBin);
}

namespace vecmem::cuda::notcub {
  // Allocate nbytes of pinned host memory tied to `stream`, either from
  // the caching host allocator or via plain cudaMallocHost, depending on
  // the compile-time policy. Throws std::runtime_error if a caching
  // request exceeds the largest bin.
  void *allocate_host(size_t nbytes, cudaStream_t stream) {
    void *ptr = nullptr;
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      if (nbytes > maxAllocationSize) {
        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
      }
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
    } else {
      VECMEM_CUDA_ERROR_CHECK(cudaMallocHost(&ptr, nbytes));
    }
    return ptr;
  }

  // Release pinned host memory obtained from allocate_host(): return it
  // to the cache, or cudaFreeHost under the non-caching policies.
  void free_host(void *ptr) {
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingHostAllocator().HostFree(ptr));
    } else {
      VECMEM_CUDA_ERROR_CHECK(cudaFreeHost(ptr));
    }
  }

} // namespace vecmem::cuda::notcub