From 38da65b2e78da3316436c3bec8f630249ee68afe Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 4 Sep 2024 15:40:36 -0400 Subject: [PATCH] rmm dynamic_load_runtime can now detect static linking of cudart. No longer needs RMM_STATIC_CUDART to be set for static cudart usages --- CMakeLists.txt | 1 - include/rmm/detail/dynamic_load_runtime.hpp | 80 ++++++++++++--------- tests/CMakeLists.txt | 17 ++++- 3 files changed, 61 insertions(+), 37 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26fcf1fd0..6b911c85f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,7 +90,6 @@ target_include_directories(rmm INTERFACE "$ static std::optional function(const char* func_name) { - auto* runtime = get_cuda_runtime_handle(); - auto* handle = ::dlsym(runtime, func_name); - if (!handle) { return std::nullopt; } + // query if the function has already been loaded by the program + auto* handle = ::dlsym(RTLD_DEFAULT, func_name); + auto* error = dlerror(); + + // throw rmm::logic_error{std::string{"dlysm: "} + error}; + if (error != nullptr) { + // function hasn't been loaded already, load it from CUDA runtime + auto* runtime = get_cuda_runtime_handle(); + handle = ::dlsym(runtime, func_name); + error = dlerror(); + } + if (error != nullptr) { return std::nullopt; } auto* function_ptr = reinterpret_cast(handle); return std::optional(function_ptr); } }; -#if defined(RMM_STATIC_CUDART) -// clang-format off -#define RMM_CUDART_API_WRAPPER(name, signature) \ - template \ - static cudaError_t name(Args... args) \ - { \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Waddress\"") \ - static_assert(static_cast(::name), \ - "Failed to find #name function with arguments #signature"); \ - _Pragma("GCC diagnostic pop") \ - return ::name(args...); \ - } -// clang-format on -#else -#define RMM_CUDART_API_WRAPPER(name, signature) \ - template \ - static cudaError_t name(Args... args) \ - { \ - static const auto func = dynamic_load_runtime::function(#name); \ - if (func) { return (*func)(args...); } \ - RMM_FAIL("Failed to find #name function in libcudart.so"); \ +#define RMM_CUDART_API_WRAPPER(name, signature) \ + template \ + static cudaError_t name(Args... args) \ + { \ + auto* p = static_cast(::name); \ + if (p != nullptr) { \ + return (*p)(args...); \ + } else { \ + static const auto func = dynamic_load_runtime::function(#name); \ + if (func) { return (*func)(args...); } \ + RMM_FAIL("Failed to find #name function in libcudart.so"); \ + } \ } -#endif #if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync /** @@ -110,17 +108,31 @@ struct dynamic_load_runtime { * This allows RMM users to compile/link against CUDA 11.2+ and run with * < CUDA 11.2 runtime as these functions are found at call time. */ + + +extern "C" { +cudaError_t cudaMemPoolCreate(cudaMemPool_t*, const cudaMemPoolProps*) __attribute((weak)); +cudaError_t cudaMemPoolSetAttribute(cudaMemPool_t, cudaMemPoolAttr, void*) __attribute((weak)); +cudaError_t cudaMemPoolDestroy(cudaMemPool_t) __attribute((weak)); +cudaError_t cudaMallocFromPoolAsync(void**, size_t, cudaMemPool_t, cudaStream_t) + __attribute((weak)); +cudaError_t cudaFreeAsync(void*, cudaStream_t) __attribute((weak)); +cudaError_t cudaDeviceGetDefaultMemPool_sig(cudaMemPool_t*, int) __attribute((weak)); +} + struct async_alloc { static bool is_supported() { -#if defined(RMM_STATIC_CUDART) - static bool runtime_supports_pool = (CUDART_VERSION >= 11020); -#else - static bool runtime_supports_pool = - dynamic_load_runtime::function>( - "cudaFreeAsync") - .has_value(); -#endif + static bool runtime_supports_pool{[] { + using cuda_free_async_sig = dynamic_load_runtime::function_sig; + bool cuda_free_async_supported = true; + auto* p = static_cast(::cudaFreeAsync); + if (p == nullptr) { + cuda_free_async_supported = + dynamic_load_runtime::function("cudaFreeAsync").has_value(); + } + return cuda_free_async_supported; + }()}; static auto driver_supports_pool{[] { int cuda_pool_supported{}; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0258c59c5..5c78b85d0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -85,7 +85,7 @@ endfunction() function(ConfigureTest TEST_NAME) set(options) - set(one_value GPUS PERCENT) + set(one_value CUDART GPUS PERCENT) set(multi_value) cmake_parse_arguments(_RMM_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _RMM_TEST_GPUS AND NOT DEFINED _RMM_TEST_PERCENT) @@ -99,13 +99,23 @@ function(ConfigureTest TEST_NAME) set(_RMM_TEST_PERCENT 100) endif() + if(_RMM_TEST_CUDART STREQUAL SHARED) + set(cudart_link_libs $ CUDA::cudart) + elseif(_RMM_TEST_CUDART STREQUAL STATIC) + set(cudart_link_libs $ CUDA::cudart_static) + else() + set(cudart_link_libs rmm) + endif() + # Test with legacy default stream. ConfigureTestInternal(${TEST_NAME} ${_RMM_TEST_UNPARSED_ARGUMENTS}) + target_link_libraries(${TEST_NAME} ${cudart_link_libs}) # Test with per-thread default stream. string(REGEX REPLACE "_TEST$" "_PTDS_TEST" PTDS_TEST_NAME "${TEST_NAME}") ConfigureTestInternal("${PTDS_TEST_NAME}" ${_RMM_TEST_UNPARSED_ARGUMENTS}) target_compile_definitions("${PTDS_TEST_NAME}" PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + target_link_libraries(${PTDS_TEST_NAME} ${cudart_link_libs}) foreach(name ${TEST_NAME} ${PTDS_TEST_NAME} ${NS_TEST_NAME}) rapids_test_add( @@ -131,7 +141,10 @@ ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 100) # cuda_async mr tests -ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60) +ConfigureTest(CUDA_ASYNC_MR_STATIC_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60 + CUDART STATIC) +ConfigureTest(CUDA_ASYNC_MR_SHARED_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60 + CUDART SHARED) # thrust allocator tests ConfigureTest(THRUST_ALLOCATOR_TEST mr/device/thrust_allocator_tests.cu GPUS 1 PERCENT 60)