Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add cms caching cuda allocator #79

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ vecmem_add_library( vecmem_cuda cuda SHARED
"src/utils/cuda_wrappers.hpp"
"src/utils/cuda_wrappers.cpp"
"src/utils/select_device.hpp"
"src/utils/select_device.cpp" )
"src/utils/select_device.cpp"
# not-cub caching allocator
"src/memory/cuda/notcub/allocate_device.cpp"
"src/memory/cuda/notcub/allocate_host.cpp"
)
target_link_libraries( vecmem_cuda
PUBLIC vecmem::core
PRIVATE CUDA::cudart )
747 changes: 747 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/CachingDeviceAllocator.h

Large diffs are not rendered by default.

648 changes: 648 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/CachingHostAllocator.h

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/ScopedSetDevice.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef HeterogeneousCore_CUDAUtilities_ScopedSetDevice_h
#define HeterogeneousCore_CUDAUtilities_ScopedSetDevice_h

#include "../../../../../src/utils/cuda_error_handling.hpp"

#include <cuda_runtime.h>

namespace vecmem {
namespace cuda {
namespace notcub {
  /// RAII guard that makes @a newDevice the current CUDA device for the
  /// lifetime of the guard and restores the previously current device on
  /// destruction.
  class ScopedSetDevice {
  public:
    /// Remember the currently active device, then switch to @a newDevice.
    /// Errors are reported through VECMEM_CUDA_ERROR_CHECK.
    explicit ScopedSetDevice(int newDevice) {
      VECMEM_CUDA_ERROR_CHECK(cudaGetDevice(&prevDevice_));
      VECMEM_CUDA_ERROR_CHECK(cudaSetDevice(newDevice));
    }

    // A scope guard must be unique: a copy would restore the saved device
    // a second time from an unrelated scope.
    ScopedSetDevice(const ScopedSetDevice&) = delete;
    ScopedSetDevice& operator=(const ScopedSetDevice&) = delete;

    ~ScopedSetDevice() {
      // Intentionally don't check the return value to avoid
      // exceptions to be thrown. If this call fails, the process is
      // doomed anyway.
      cudaSetDevice(prevDevice_);
    }

  private:
    int prevDevice_;  // device that was current when the guard was created
  };
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
18 changes: 18 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/allocate_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef HeterogeneousCore_CUDAUtilities_allocate_device_h
#define HeterogeneousCore_CUDAUtilities_allocate_device_h

#include <cuda_runtime.h>

#include <cstddef>  // size_t — do not rely on cuda_runtime.h pulling it in

namespace vecmem {
namespace cuda {
namespace notcub {
  /// Allocate @a nbytes of device memory on device @a device, associated
  /// with @a stream. Depending on the compile-time allocator policy the
  /// memory may come from a caching allocator (see allocate_device.cpp).
  void *allocate_device(int device, size_t nbytes, cudaStream_t stream);

  /// Free device memory previously obtained from allocate_device()
  /// (to be called from unique_ptr).
  void free_device(int device, void *ptr, cudaStream_t stream);
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
18 changes: 18 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/allocate_host.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef HeterogeneousCore_CUDAUtilities_allocate_host_h
#define HeterogeneousCore_CUDAUtilities_allocate_host_h

#include <cuda_runtime.h>

#include <cstddef>  // size_t — do not rely on cuda_runtime.h pulling it in

namespace vecmem {
namespace cuda {
namespace notcub {
  /// Allocate @a nbytes of pinned host memory, associated with @a stream
  /// (to be called from unique_ptr). Depending on the compile-time
  /// allocator policy the memory may come from a caching allocator.
  void *allocate_host(size_t nbytes, cudaStream_t stream);

  /// Free pinned host memory previously obtained from allocate_host()
  /// (to be called from unique_ptr).
  void free_host(void *ptr);
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
18 changes: 18 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/cuda_assert.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// The omission of #include guards is on purpose: it does make sense to #include
// this file multiple times, setting a different value of GPU_DEBUG beforehand.
//
// In device code (__CUDA_ARCH__ defined), the state of NDEBUG — and hence
// whether assert() is compiled in by <cassert> below — is controlled by
// GPU_DEBUG instead of the host build configuration. Host code is left
// untouched.
#ifdef __CUDA_ARCH__
#ifndef GPU_DEBUG
// disable asserts
#ifndef NDEBUG
#define NDEBUG
#endif
#else
// enable asserts
#ifdef NDEBUG
#undef NDEBUG
#endif
#endif
#endif // __CUDA_ARCH__

// <cassert> re-evaluates NDEBUG on every inclusion, which is what makes
// the re-include pattern above work.
#include <cassert>
23 changes: 23 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/deviceAllocatorStatus.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#ifndef HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h
#define HeterogeneousCore_CUDAUtilities_deviceAllocatorStatus_h

#include <cstddef>  // size_t — previously relied on <map> providing it transitively
#include <map>

namespace vecmem {
namespace cuda {
namespace allocator {
  /// Per-device byte counters maintained by the caching device allocator.
  /// In-class initializers replace the old assigning default constructor;
  /// a default-constructed TotalBytes is still all zeros.
  struct TotalBytes {
    size_t free = 0;           ///< bytes held in the cache, currently unused
    size_t live = 0;           ///< bytes handed out (allocated size)
    size_t liveRequested = 0;  ///< bytes handed out (requested size); CMS: monitor also requested amount
  };
  /// Map type of device ordinals to the number of cached bytes cached by each device
  using GpuCachedBytes = std::map<int, TotalBytes>;
} // namespace allocator

/// Snapshot of the caching device allocator's per-device byte counters.
allocator::GpuCachedBytes deviceAllocatorStatus();
} // namespace cuda
} // namespace vecmem

#endif
20 changes: 20 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/deviceCount.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef HeterogenousCore_CUDAUtilities_deviceCount_h
#define HeterogenousCore_CUDAUtilities_deviceCount_h

#include "../../../../../src/utils/cuda_error_handling.hpp"

#include <cuda_runtime.h>

namespace vecmem {
namespace cuda {
namespace notcub {
// Number of CUDA devices visible to this process, as reported by
// cudaGetDeviceCount(). Errors are routed through VECMEM_CUDA_ERROR_CHECK.
inline int deviceCount() {
  int ndevices;
  VECMEM_CUDA_ERROR_CHECK(cudaGetDeviceCount(&ndevices));
  return ndevices;
}
} // namespace notcub
} // namespace cuda
} // namespace vecmem

#endif
86 changes: 86 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/getCachingDeviceAllocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#ifndef HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator
#define HeterogeneousCore_CUDACore_src_getCachingDeviceAllocator

#include <algorithm>  // std::min (was missing)
#include <iomanip>
#include <iostream>
#include <limits>     // std::numeric_limits (was missing)

#include <cuda_runtime.h>

#include "../../../../../src/utils/cuda_error_handling.hpp"
#include "deviceCount.h"
#include "CachingDeviceAllocator.h"

namespace vecmem::cuda::allocator {
  // Use caching or not
  enum class Policy { Synchronous = 0, Asynchronous = 1, Caching = 2 };
#ifndef CUDA_DISABLE_CACHING_ALLOCATOR
  constexpr Policy policy = Policy::Caching;
#elif CUDA_VERSION >= 11020 && !defined CUDA_DISABLE_ASYNC_ALLOCATOR
  constexpr Policy policy = Policy::Asynchronous;
#else
  constexpr Policy policy = Policy::Synchronous;
#endif
  // Growth factor (bin_growth in cub::CachingDeviceAllocator)
  constexpr unsigned int binGrowth = 2;
  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
  constexpr unsigned int minBin = 8;
  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in
  // cub::CachingDeviceAllocator). Note that unlike in cub, allocations
  // larger than binGrowth^maxBin are set to fail.
  constexpr unsigned int maxBin = 30;
  // Total storage for the allocator. 0 means no limit.
  constexpr size_t maxCachedBytes = 0;
  // Fraction of total device memory taken for the allocator. In case there
  // are multiple devices with different amounts of memory, the smallest of
  // them is taken. If maxCachedBytes is non-zero, the smaller of the two
  // resulting limits is used.
  constexpr double maxCachedFraction = 0.8;
  constexpr bool debug = false;

  // Cache limit actually applied: the minimum over all devices of
  // maxCachedFraction * (free memory at call time), optionally capped by
  // maxCachedBytes. The current device is temporarily switched while
  // querying and restored before returning.
  inline size_t minCachedBytes() {
    size_t ret = std::numeric_limits<size_t>::max();
    int currentDevice;
    VECMEM_CUDA_ERROR_CHECK(cudaGetDevice(&currentDevice));
    const int numberOfDevices = vecmem::cuda::notcub::deviceCount();
    for (int i = 0; i < numberOfDevices; ++i) {
      size_t freeMemory, totalMemory;
      VECMEM_CUDA_ERROR_CHECK(cudaSetDevice(i));
      VECMEM_CUDA_ERROR_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
    }
    VECMEM_CUDA_ERROR_CHECK(cudaSetDevice(currentDevice));
    if (maxCachedBytes > 0) {
      ret = std::min(ret, maxCachedBytes);
    }
    return ret;
  }

  // Function-local-static accessor for the process-wide caching device
  // allocator; initialization is thread safe, and the allocator's public
  // interface is thread safe as well.
  inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
    if (debug) {
      std::cout << "cub::CachingDeviceAllocator settings\n"
                << " bin growth " << binGrowth << "\n"
                << " min bin " << minBin << "\n"
                << " max bin " << maxBin << "\n"
                << " resulting bins:\n";
      for (auto bin = minBin; bin <= maxBin; ++bin) {
        auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
        // Report each bin size in the largest unit that divides it evenly.
        if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
          std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n";
        } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
          std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n";
        } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) {
          std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n";
        } else {
          std::cout << " " << std::setw(9) << binSize << " B\n";
        }
      }
      std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n";
    }

    // the public interface is thread safe
    static notcub::CachingDeviceAllocator allocator{binGrowth,
                                                    minBin,
                                                    maxBin,
                                                    minCachedBytes(),
                                                    false, // do not skip cleanup
                                                    debug};
    return allocator;
  }
} // namespace vecmem::cuda::allocator

#endif
45 changes: 45 additions & 0 deletions cuda/include/vecmem/memory/cuda/notcub/getCachingHostAllocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_CUDACore_src_getCachingHostAllocator
#define HeterogeneousCore_CUDACore_src_getCachingHostAllocator

#include <iomanip>
#include <iostream>

#include "../../../../../src/utils/cuda_error_handling.hpp"
#include "CachingHostAllocator.h"
#include "getCachingDeviceAllocator.h"  // provides binGrowth/minBin/maxBin, minCachedBytes(), debug

namespace vecmem::cuda::allocator {
// Function-local-static accessor for the process-wide caching pinned-host
// allocator. The bin configuration and the cache-size limit are shared
// with the device allocator (constants in getCachingDeviceAllocator.h).
inline notcub::CachingHostAllocator& getCachingHostAllocator() {
  if (debug) {
    std::cout << "cub::CachingHostAllocator settings\n"
              << " bin growth " << binGrowth << "\n"
              << " min bin " << minBin << "\n"
              << " max bin " << maxBin << "\n"
              << " resulting bins:\n";
    // Print each bin size in the largest unit that divides it evenly.
    for (auto bin = minBin; bin <= maxBin; ++bin) {
      auto binSize = notcub::CachingDeviceAllocator::IntPow(binGrowth, bin);
      if (binSize >= (1 << 30) and binSize % (1 << 30) == 0) {
        std::cout << " " << std::setw(8) << (binSize >> 30) << " GB\n";
      } else if (binSize >= (1 << 20) and binSize % (1 << 20) == 0) {
        std::cout << " " << std::setw(8) << (binSize >> 20) << " MB\n";
      } else if (binSize >= (1 << 10) and binSize % (1 << 10) == 0) {
        std::cout << " " << std::setw(8) << (binSize >> 10) << " kB\n";
      } else {
        std::cout << " " << std::setw(9) << binSize << " B\n";
      }
    }
    std::cout << " maximum amount of cached memory: " << (minCachedBytes() >> 20) << " MB\n";
  }

  // the public interface is thread safe
  static notcub::CachingHostAllocator allocator{binGrowth,
                                                minBin,
                                                maxBin,
                                                minCachedBytes(),
                                                false, // do not skip cleanup
                                                debug};
  return allocator;
}
} // namespace vecmem::cuda::allocator

#endif
52 changes: 52 additions & 0 deletions cuda/src/memory/cuda/notcub/allocate_device.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include <cassert>
#include <limits>
#include <stdexcept>  // std::runtime_error (was missing, used below)
#include <string>     // std::to_string (was missing, used below)

#include <cuda_runtime.h>

#include "../../../utils/cuda_error_handling.hpp"

#include "vecmem/memory/cuda/notcub/ScopedSetDevice.h"
#include "vecmem/memory/cuda/notcub/allocate_device.h"
#include "vecmem/memory/cuda/notcub/getCachingDeviceAllocator.h"

namespace {
  // Hard upper limit for a single caching allocation: the size of the
  // largest bin. Unlike cub, larger requests are rejected, not passed
  // through to cudaMalloc.
  const size_t maxAllocationSize =
      vecmem::cuda::notcub::CachingDeviceAllocator::IntPow(vecmem::cuda::allocator::binGrowth,
                                                           vecmem::cuda::allocator::maxBin);
}

namespace vecmem::cuda::notcub {
  // Allocate nbytes of device memory on device `dev`, tied to `stream`,
  // using the compile-time selected policy: caching allocator,
  // stream-ordered cudaMallocAsync (CUDA >= 11.2), or plain cudaMalloc.
  // Throws std::runtime_error if a caching request exceeds the largest bin.
  void *allocate_device(int dev, size_t nbytes, cudaStream_t stream) {
    void *ptr = nullptr;
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      if (nbytes > maxAllocationSize) {
        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
      }
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingDeviceAllocator().DeviceAllocate(dev, &ptr, nbytes, stream));
#if CUDA_VERSION >= 11020
    } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
      ScopedSetDevice setDeviceForThisScope(dev);
      VECMEM_CUDA_ERROR_CHECK(cudaMallocAsync(&ptr, nbytes, stream));
#endif
    } else {
      ScopedSetDevice setDeviceForThisScope(dev);
      VECMEM_CUDA_ERROR_CHECK(cudaMalloc(&ptr, nbytes));
    }
    return ptr;
  }

  // Release memory obtained from allocate_device(), dispatching on the
  // same policy: return it to the cache, cudaFreeAsync, or cudaFree.
  void free_device(int device, void *ptr, cudaStream_t stream) {
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingDeviceAllocator().DeviceFree(device, ptr));
#if CUDA_VERSION >= 11020
    } else if constexpr (allocator::policy == allocator::Policy::Asynchronous) {
      ScopedSetDevice setDeviceForThisScope(device);
      VECMEM_CUDA_ERROR_CHECK(cudaFreeAsync(ptr, stream));
#endif
    } else {
      ScopedSetDevice setDeviceForThisScope(device);
      VECMEM_CUDA_ERROR_CHECK(cudaFree(ptr));
    }
  }

} // namespace vecmem::cuda::notcub
36 changes: 36 additions & 0 deletions cuda/src/memory/cuda/notcub/allocate_host.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include <limits>
#include <stdexcept>  // std::runtime_error (was missing, used below)
#include <string>     // std::to_string (was missing, used below)

#include "../../../utils/cuda_error_handling.hpp"
#include "vecmem/memory/cuda/notcub/allocate_host.h"
#include "vecmem/memory/cuda/notcub/getCachingDeviceAllocator.h"
#include "vecmem/memory/cuda/notcub/getCachingHostAllocator.h"

namespace {
  // Hard upper limit for a single caching allocation: the size of the
  // largest bin (shared bin configuration with the device allocator).
  const size_t maxAllocationSize =
      vecmem::cuda::notcub::CachingDeviceAllocator::IntPow(vecmem::cuda::allocator::binGrowth,
                                                           vecmem::cuda::allocator::maxBin);
}

namespace vecmem::cuda::notcub {
  // Allocate nbytes of pinned host memory tied to `stream`, either from
  // the caching host allocator or via plain cudaMallocHost, depending on
  // the compile-time policy. Throws std::runtime_error if a caching
  // request exceeds the largest bin.
  void *allocate_host(size_t nbytes, cudaStream_t stream) {
    void *ptr = nullptr;
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      if (nbytes > maxAllocationSize) {
        throw std::runtime_error("Tried to allocate " + std::to_string(nbytes) +
                                 " bytes, but the allocator maximum is " + std::to_string(maxAllocationSize));
      }
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingHostAllocator().HostAllocate(&ptr, nbytes, stream));
    } else {
      VECMEM_CUDA_ERROR_CHECK(cudaMallocHost(&ptr, nbytes));
    }
    return ptr;
  }

  // Release pinned host memory obtained from allocate_host(): return it
  // to the cache, or cudaFreeHost under the non-caching policies.
  void free_host(void *ptr) {
    if constexpr (allocator::policy == allocator::Policy::Caching) {
      VECMEM_CUDA_ERROR_CHECK(allocator::getCachingHostAllocator().HostFree(ptr));
    } else {
      VECMEM_CUDA_ERROR_CHECK(cudaFreeHost(ptr));
    }
  }

} // namespace vecmem::cuda::notcub