From 90adef2d6d55f324067b1f750742a75a0efcf2af Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:22:49 +0200 Subject: [PATCH 01/30] Add cccl repo to build dependencies --- CMakeLists.txt | 1 + cmake/thirdparty/get_libcudacxx.cmake | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 cmake/thirdparty/get_libcudacxx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c883bc4c..808e52ded 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ rapids_cpm_init() include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) +include(cmake/thirdparty/get_libcudacxx.cmake) include(cmake/thirdparty/get_thrust.cmake) # ################################################################################################## diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake new file mode 100644 index 000000000..b1f990d39 --- /dev/null +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone libcudacxx +function(find_and_configure_libcudacxx) + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) + +endfunction() + +find_and_configure_libcudacxx() From a49fe9c352b4609f011895ed59d936abdb898915 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:29:12 +0200 Subject: [PATCH 02/30] PoC for the new design of `cuda::mr::{async_}resource_ref` --- CMakeLists.txt | 1 + include/rmm/cuda_stream_view.hpp | 16 + .../mr/device/callback_memory_resource.hpp | 4 +- .../rmm/mr/device/device_memory_resource.hpp | 150 +++++++ include/rmm/mr/host/host_memory_resource.hpp | 35 ++ tests/CMakeLists.txt | 7 + tests/mr/device/adaptor_tests.cpp | 26 ++ tests/mr/device/cuda_async_mr_tests.cpp | 2 + tests/mr/device/cuda_async_view_mr_tests.cpp | 4 + .../mr/device/mr_ref_multithreaded_tests.cpp | 221 ++++++++++ tests/mr/device/mr_ref_test.hpp | 412 ++++++++++++++++++ tests/mr/device/mr_ref_tests.cpp | 109 +++++ tests/mr/host/mr_ref_tests.cpp | 263 +++++++++++ tests/mr/host/mr_tests.cpp | 5 + 14 files changed, 1253 insertions(+), 2 deletions(-) create mode 100644 tests/mr/device/mr_ref_multithreaded_tests.cpp create mode 100644 tests/mr/device/mr_ref_test.hpp create mode 100644 tests/mr/device/mr_ref_tests.cpp create mode 100644 tests/mr/host/mr_ref_tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 808e52ded..9455c4267 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,7 @@ target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) target_compile_features(rmm INTERFACE cxx_std_17 $) 
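# The definition added below opts RMM into libcudacxx's experimental <cuda/memory_resource>
# machinery (cuda::mr::resource_ref / cuda::mr::async_resource_ref and the property system),
# which the later patches in this series build on.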
+target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # ################################################################################################## # * tests and benchmarks --------------------------------------------------------------------------- diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index fe07fa1b9..ad2768535 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -58,6 +60,13 @@ class cuda_stream_view { */ constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} + /** + * @brief Implicit conversion from cuda::stream_ref. + * + * @param stream The underlying stream for this view + */ + constexpr cuda_stream_view(cuda::stream_ref stream) noexcept : stream_{stream.get()} {} + /** * @brief Get the wrapped stream. * @@ -72,6 +81,13 @@ class cuda_stream_view { */ constexpr operator cudaStream_t() const noexcept { return value(); } + /** + * @brief Implicit conversion to stream_ref. + * + * @return cuda::stream_ref The underlying stream referenced by this cuda_stream_view + */ + constexpr operator cuda::stream_ref() const noexcept { return value(); } + /** * @briefreturn{true if the wrapped stream is the CUDA per-thread default stream} */ diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c6519ed5c..36802c83a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -143,8 +143,8 @@ class callback_memory_resource final : public device_memory_resource { throw std::runtime_error("cannot get free / total memory"); } - [[nodiscard]] virtual bool supports_streams() const noexcept { return false; } - [[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; } + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } allocate_callback_t allocate_callback_; deallocate_callback_t deallocate_callback_; diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 8ad84644b..cc1ae2c3f 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -18,6 +18,9 @@ #include #include +#include +#include + #include #include @@ -157,6 +160,150 @@ class device_memory_resource { return do_is_equal(other); } + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate(std::size_t bytes, std::size_t alignment) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Deallocate memory pointed to by \p p. 
+ * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate(void* ptr, std::size_t bytes, std::size_t alignment) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, cuda_stream_view stream) + { + return do_allocate(bytes, stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda_stream_view stream) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. 
+ * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream) + { + do_deallocate(ptr, bytes, stream); + } + + [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + /** * @brief Query whether the resource supports use of non-null CUDA streams for * allocation/deallocation. @@ -185,6 +332,8 @@ class device_memory_resource { return do_get_mem_info(stream); } + friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} + private: /** * @brief Allocates memory of size at least \p bytes. @@ -243,5 +392,6 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index 8a5739b2a..aa40d57cd 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -114,6 +116,37 @@ class host_memory_resource { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ + [[nodiscard]] bool operator==(host_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ + [[nodiscard]] bool operator!=(host_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + + /** + * @brief Enables the `cuda::mr::host_accessible` property + * + * This property declares that a `host_memory_resource` provides host accessible memory + */ + friend void get_property(host_memory_resource const&, cuda::mr::host_accessible) noexcept {} + private: /** * @brief Allocates memory on the host of size at least `bytes` bytes. 
@@ -166,5 +199,7 @@ class host_memory_resource { return this == &other; } }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group + } // namespace rmm::mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 36c3aa043..80e2b846b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,6 +126,10 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) +# device mr tests +ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp + mr/device/mr_ref_multithreaded_tests.cpp) + # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) @@ -162,6 +166,9 @@ ConfigureTest(LIMITING_TEST mr/device/limiting_mr_tests.cpp) # host mr tests ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) +# host mr_ref tests +ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 44c14240b..3e00dd208 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include @@ -64,6 +66,30 @@ using adaptors = ::testing::Types, thread_safe_resource_adaptor, tracking_resource_adaptor>; +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); + template struct AdaptorTest : public ::testing::Test { using adaptor_type = MemoryResourceType; diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 37ed5c306..90c7b0ff9 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -24,6 +24,8 @@ namespace rmm::test { namespace { using cuda_async_mr = rmm::mr::cuda_async_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); class AsyncMRTest : public ::testing::Test { protected: diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index 209429b4b..fe82431a9 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -18,12 +18,16 @@ #include #include +#include + #include namespace rmm::test { namespace { using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); #if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp new file mode 100644 index 000000000..dd39d1e1d --- /dev/null +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace rmm::test { +namespace { + +struct mr_ref_test_mt : public mr_ref_test {}; + +INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); + +template +void spawn_n(std::size_t num_threads, Task task, Arguments&&... args) +{ + std::vector threads; + threads.reserve(num_threads); + for (std::size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(std::thread(task, std::forward(args)...)); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +template +void spawn(Task task, Arguments&&... args) +{ + spawn_n(4, task, std::forward(args)...); +} + +TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } + +TEST_P(mr_ref_test_mt, AllocateDefaultStream) +{ + spawn(test_various_async_allocations, this->ref, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, AllocateOnStream) +{ + spawn(test_various_async_allocations, this->ref, this->stream.view()); +} + +TEST_P(mr_ref_test_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->ref, default_num_allocations, default_max_size); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsDefaultStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + this->stream.view()); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->ref, default_max_size); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeDefaultStream) +{ + spawn( + test_mixed_random_async_allocation_free, this->ref, default_max_size, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_async_allocation_free, this->ref, default_max_size, this->stream.view()); +} + +void allocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t size = size_distribution(generator); + void* ptr = ref.allocate_async(size, stream); + { + std::lock_guard lock(mtx); + RMM_CUDA_TRY(cudaEventRecord(event, stream.value())); + allocations.emplace_back(ptr, size); + } + } +} + +void deallocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& 
allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + for (std::size_t i = 0; i < num_allocations;) { + std::lock_guard lock(mtx); + if (allocations.empty()) { continue; } + i++; + RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); + allocation alloc = allocations.front(); + allocations.pop_front(); + ref.deallocate_async(alloc.ptr, alloc.size, stream); + } +} + +void test_allocate_async_free_different_threads(async_resource_ref ref, + rmm::cuda_stream_view streamA, + rmm::cuda_stream_view streamB) +{ + constexpr std::size_t num_allocations{100}; + + std::mutex mtx; + std::list allocations; + cudaEvent_t event; + + RMM_CUDA_TRY(cudaEventCreate(&event)); + + std::thread producer(allocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamA); + + std::thread consumer(deallocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamB); + + producer.join(); + consumer.join(); + + RMM_CUDA_TRY(cudaEventDestroy(event)); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_default, rmm::cuda_stream_default); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsPerThreadDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_per_thread, rmm::cuda_stream_per_thread); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_async_free_different_threads(this->ref, this->stream, this->stream); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDifferentStream) +{ + rmm::cuda_stream streamB; + test_allocate_async_free_different_threads(this->ref, this->stream, streamB); + streamB.synchronize(); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp new file mode 100644 index 000000000..ecb13c2a7 --- /dev/null +++ b/tests/mr/device/mr_ref_test.hpp @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +using resource_ref = cuda::mr::resource_ref; +using async_resource_ref = cuda::mr::async_resource_ref; + +namespace rmm::test { + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. 
+ */ +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +enum size_in_bytes : size_t {}; + +constexpr auto default_num_allocations{100}; +constexpr size_in_bytes default_max_size{5_MiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; + +// Various test functions, shared between single-threaded and multithreaded tests. +inline void test_allocate(resource_ref ref, std::size_t bytes) +{ + try { + void* ptr = ref.allocate(bytes); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate(ptr, bytes); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +inline void test_allocate_async(async_resource_ref ref, + std::size_t bytes, + cuda_stream_view stream = {}) +{ + try { + void* ptr = ref.allocate_async(bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate_async(ptr, bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +inline void concurrent_allocations_are_different(resource_ref ref) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate(size); + void* ptr2 = ref.allocate(size); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate(ptr1, size); + ref.deallocate(ptr2, size); +} + +inline void concurrent_async_allocations_are_different(async_resource_ref ref, + cuda_stream_view stream) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate_async(size, stream); + void* ptr2 = ref.allocate_async(size, stream); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate_async(ptr1, size, stream); + ref.deallocate_async(ptr2, size, stream); +} + +inline void test_various_allocations(resource_ref ref) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate(0); + EXPECT_NO_THROW(ref.deallocate(ptr, 0)); + } + + test_allocate(ref, 4_B); + test_allocate(ref, 1_KiB); + test_allocate(ref, 1_MiB); + test_allocate(ref, 1_GiB); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate(1_PiB), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate(1_PiB); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_various_async_allocations(async_resource_ref ref, cuda_stream_view stream) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate_async(0, stream); + stream.synchronize(); + EXPECT_NO_THROW(ref.deallocate_async(ptr, 0, stream)); + stream.synchronize(); + } + + test_allocate_async(ref, 4_B, stream); + test_allocate_async(ref, 1_KiB, stream); + test_allocate_async(ref, 1_MiB, stream); + 
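  // The 1 GiB request below may exceed free memory on smaller devices; test_allocate_async
  // treats rmm::out_of_memory as an acceptable outcome (see its try/catch above), so an OOM
  // here does not fail the test.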
test_allocate_async(ref, 1_GiB, stream); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate_async(1_PiB, stream), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate_async(1_PiB, stream); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_random_allocations(resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, &ref](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [&ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +inline void test_random_async_allocations(async_resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each(allocations.begin(), + allocations.end(), + [&generator, &distribution, &ref, stream](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [stream, &ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + }); +} + +inline void test_mixed_random_allocation_free(resource_ref ref, + size_in_bytes max_size = default_max_size) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate(size), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + 
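      // Free a randomly chosen live allocation and drop it from the bookkeeping vector.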
active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate(to_free.ptr, to_free.size)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +inline void test_mixed_random_async_allocation_free(async_resource_ref ref, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate_async(size, stream), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate_async(to_free.ptr, to_free.size, stream)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +using MRFactoryFunc = std::function()>; + +/// Encapsulates a `device_memory_resource` factory function and associated name +struct mr_factory { + mr_factory(std::string name, MRFactoryFunc factory) + : name{std::move(name)}, factory{std::move(factory)} + { + } + + std::string name; ///< Name to associate with tests that use this factory + MRFactoryFunc factory; ///< Factory function that returns shared_ptr to `device_memory_resource` + ///< instance to use in test +}; + +/// Test fixture class value-parameterized on different `mr_factory`s +struct mr_ref_test : public ::testing::TestWithParam { + void SetUp() override + { + auto factory = GetParam().factory; + mr = factory(); + if (mr == nullptr) { + GTEST_SKIP() << "Skipping tests since the memory resource is not supported with this CUDA " + << "driver/runtime version"; + } + ref = async_resource_ref{*mr}; + } + + std::shared_ptr mr; ///< Pointer to resource to use in tests + async_resource_ref ref{*mr}; + rmm::cuda_stream stream{}; +}; + +struct mr_ref_allocation_test : public mr_ref_test {}; + +/// MR factory functions +inline auto make_cuda() { return std::make_shared(); } + +inline auto make_cuda_async() +{ + if (rmm::detail::async_alloc::is_supported()) { + return std::make_shared(); + } + return std::shared_ptr{nullptr}; +} + +inline auto make_managed() { return std::make_shared(); } + +inline auto make_pool() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_arena() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_fixed_size() +{ + return 
rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_binning() +{ + auto pool = make_pool(); + // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB + // Larger allocations will use the pool resource + auto const bin_range_start{18}; + auto const bin_range_end{22}; + + auto mr = rmm::mr::make_owning_wrapper( + pool, bin_range_start, bin_range_end); + return mr; +} + +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp new file mode 100644 index 000000000..2507d468a --- /dev/null +++ b/tests/mr/device/mr_ref_tests.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include + +#include + +namespace rmm::test { +namespace { + +INSTANTIATE_TEST_SUITE_P(ResourceTests, + mr_ref_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}, + mr_factory{"Fixed_Size", &make_fixed_size}), + [](auto const& info) { return info.param.name; }); + +// Leave out fixed-size MR here because it can't handle the dynamic allocation sizes +INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests, + mr_ref_allocation_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); +TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +TEST_P(mr_ref_test, AllocationsAreDifferent) { concurrent_allocations_are_different(this->ref); } + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferentDefaultStream) +{ + concurrent_async_allocations_are_different(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferent) +{ + concurrent_async_allocations_are_different(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, AllocateDefault) { test_various_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, AllocateDefaultStream) +{ + test_various_async_allocations(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, AllocateOnStream) +{ + test_various_async_allocations(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, RandomAllocations) { test_random_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, RandomAllocationsDefaultStream) +{ + test_random_async_allocations( + this->ref, default_num_allocations, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, RandomAllocationsStream) +{ + 
test_random_async_allocations(this->ref, default_num_allocations, default_max_size, this->stream); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFree) +{ + test_mixed_random_allocation_free(this->ref, default_max_size); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeDefaultStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, this->stream); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp new file mode 100644 index 000000000..568e18576 --- /dev/null +++ b/tests/mr/host/mr_ref_tests.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace rmm::test { +namespace { +inline bool is_aligned(void* ptr, std::size_t alignment = alignof(std::max_align_t)) +{ + return rmm::detail::is_pointer_aligned(ptr, alignment); +} + +// Returns true if a pointer points to a device memory or managed memory allocation. +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +/** + * @brief Returns if a pointer `p` points to pinned host memory. 
+ */ +inline bool is_pinned_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } + return attributes.type == cudaMemoryTypeHost; +} + +constexpr std::size_t size_word{4_B}; +constexpr std::size_t size_kb{1_KiB}; +constexpr std::size_t size_mb{1_MiB}; +constexpr std::size_t size_gb{1_GiB}; +constexpr std::size_t size_pb{1_PiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; +} // namespace + +template +struct MRRefTest : public ::testing::Test { + MemoryResourceType mr; + cuda::mr::resource_ref ref; + + MRRefTest() : mr{}, ref{mr} {} +}; + +using resources = ::testing::Types; +static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with, + ""); + +TYPED_TEST_CASE(MRRefTest, resources); + +TYPED_TEST(MRRefTest, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +TYPED_TEST(MRRefTest, AllocateZeroBytes) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(0)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, 0)); +} + +TYPED_TEST(MRRefTest, AllocateWord) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_word)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_word)); +} + +TYPED_TEST(MRRefTest, AllocateKB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_kb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_kb)); +} + +TYPED_TEST(MRRefTest, AllocateMB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_mb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_mb)); +} + +TYPED_TEST(MRRefTest, AllocateGB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_gb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_gb)); +} + +TYPED_TEST(MRRefTest, AllocateTooMuch) +{ + void* ptr{nullptr}; + EXPECT_THROW(ptr = this->ref.allocate(size_pb), std::bad_alloc); + EXPECT_EQ(nullptr, ptr); +} + +TYPED_TEST(MRRefTest, RandomAllocations) +{ + constexpr std::size_t num_allocations{100}; + std::vector allocations(num_allocations); + + constexpr std::size_t MAX_ALLOCATION_SIZE{5 * size_mb}; + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, MAX_ALLOCATION_SIZE); + + // 100 allocations from [0,5MB) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, this](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = this->ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(is_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [this](allocation& alloc) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +TYPED_TEST(MRRefTest, MixedRandomAllocationFree) +{ + std::default_random_engine generator; + + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + // How often a free will occur. 
For example, if `1`, then every allocation + // will immediately be free'd. Or, if 4, on average, a free will occur after + // every 4th allocation + constexpr std::size_t FREE_FREQUENCY{4}; + std::uniform_int_distribution free_distribution(1, FREE_FREQUENCY); + + std::deque allocations; + + constexpr std::size_t num_allocations{100}; + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t allocation_size = size_distribution(generator); + EXPECT_NO_THROW(allocations.emplace_back(this->ref.allocate(allocation_size), allocation_size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(is_aligned(new_allocation.ptr)); + + bool const free_front{free_distribution(generator) == free_distribution.max()}; + + if (free_front) { + auto front = allocations.front(); + EXPECT_NO_THROW(this->ref.deallocate(front.ptr, front.size)); + allocations.pop_front(); + } + } + // free any remaining allocations + for (auto alloc : allocations) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + allocations.pop_front(); + } +} + +static constexpr std::size_t MinTestedAlignment{16}; +static constexpr std::size_t MaxTestedAlignment{4096}; +static constexpr std::size_t TestedAlignmentMultiplier{2}; +static constexpr std::size_t NUM_TRIALS{100}; + +TYPED_TEST(MRRefTest, AlignmentTest) +{ + std::default_random_engine generator(0); + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, alignment)); + EXPECT_TRUE(is_aligned(ptr, alignment)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, alignment)); + } + } +} + +TYPED_TEST(MRRefTest, UnsupportedAlignmentTest) +{ + std::default_random_engine generator(0); + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + // An unsupported alignment (like an odd number) should result in an + // alignment of `alignof(std::max_align_t)` + auto const bad_alignment = alignment + 1; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment)); + EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t))); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment)); + } + } +} + +TEST(PinnedResource, isPinned) +{ + rmm::mr::pinned_memory_resource mr; + cuda::mr::resource_ref ref{mr}; + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = ref.allocate(100)); + EXPECT_TRUE(is_pinned_memory(ptr)); + EXPECT_NO_THROW(ref.deallocate(ptr, 100)); +} +} // namespace rmm::test diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp index 1cd59f5a6..ff976ff96 100644 --- a/tests/mr/host/mr_tests.cpp +++ b/tests/mr/host/mr_tests.cpp @@ -23,6 +23,8 @@ #include +#include + #include #include @@ -80,6 +82,9 @@ struct MRTest : public ::testing::Test { }; using resources = ::testing::Types; 
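// The static_asserts added below verify at compile time that the existing host resources model
// the new cuda::mr resource concept together with the host_accessible property, i.e. they can be
// bound to a non-owning cuda::mr::resource_ref without further changes. A minimal usage sketch
// (illustration only, not part of the patch):
//   rmm::mr::new_delete_resource mr;
//   cuda::mr::resource_ref<cuda::mr::host_accessible> ref{mr};
//   void* ptr = ref.allocate(64);
//   ref.deallocate(ptr, 64);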
+static_assert(cuda::mr::resource_with, "");
+static_assert(cuda::mr::resource_with,
+              "");
 TYPED_TEST_CASE(MRTest, resources);

From 0e78d9e336a4103ea62fbd9e398b67c2c5d3fee6 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Fri, 8 Sep 2023 14:14:48 +0200
Subject: [PATCH 03/30] Make `pinned_memory_resource` usable for `pool_memory_resource`

---
 .../rmm/mr/device/pool_memory_resource.hpp | 69 ++++++++++++-
 .../rmm/mr/host/pinned_memory_resource.hpp | 74 ++++++++++++++
 tests/CMakeLists.txt | 3 +
 tests/mr/device/pool_mr_tests.cpp | 51 ++++++++++
 tests/mr/host/pinned_pool_mr_tests.cpp | 96 +++++++++++++++++++
 5 files changed, 290 insertions(+), 3 deletions(-)
 create mode 100644 tests/mr/host/pinned_pool_mr_tests.cpp

diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index f6d3710e9..7cca60613 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -51,6 +51,35 @@ namespace rmm::mr {
 * @{
 * @file
 */
+namespace detail {
+/**
+ * @brief A helper class to remove the device_accessible property
+ *
+ * We want to be able to use the pool_memory_resource with an upstream that may not
+ * be device accessible. To avoid rewriting the world, we conditionally remove
+ * the cuda::mr::device_accessible property.
+ *
+ * @tparam PoolResource the pool_memory_resource class
+ * @tparam Upstream memory_resource to use for allocating the pool.
+ */
+template 
+struct maybe_remove_property {};
+
+/**
+ * @brief Specialization of maybe_remove_property that does not propagate non-existent properties
+ */
+template 
+struct maybe_remove_property<
+  PoolResource,
+  Upstream,
+  cuda::std::enable_if_t>> {
+  /**
+   * @brief Explicit removal of the friend function so we do not pretend to provide device
+   * accessible memory
+   */
+  friend void get_property(const PoolResource&, cuda::mr::device_accessible) = delete;
+};
+} // namespace detail

 /**
 * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from
@@ -64,8 +93,10 @@ namespace rmm::mr {
 */
 template 
 class pool_memory_resource final
-  : public detail::stream_ordered_memory_resource,
-    detail::coalescing_free_list> {
+  : public detail::maybe_remove_property, Upstream>,
+    public detail::stream_ordered_memory_resource,
+      detail::coalescing_free_list>,
+    public cuda::forward_property, Upstream> {
  public:
  friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>;
@@ -104,6 +135,31 @@ class pool_memory_resource final
     initialize_pool(initial_pool_size, maximum_pool_size);
   }
+  /**
+   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
+   * `upstream_mr`.
+   *
+   * @throws rmm::logic_error if `upstream_mr == nullptr`
+   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
+   * multiple of pool_memory_resource::allocation_alignment bytes.
+   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
+   * multiple of pool_memory_resource::allocation_alignment bytes.
+   *
+   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
+   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to half of the
+   * available memory on the current device.
+   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
+   * of the available memory on the current device.
+   */
+  template , int> = 0>
+  explicit pool_memory_resource(Upstream2& upstream_mr,
+                                thrust::optional initial_pool_size = thrust::nullopt,
+                                thrust::optional maximum_pool_size = thrust::nullopt)
+    : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)
+  {
+  }
+
 /**
 * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using
 * the upstream resource.
@@ -131,6 +187,13 @@ class pool_memory_resource final
 */
 [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; }
+  /**
+   * @brief Get the upstream memory_resource object.
+   *
+   * @return const reference to the upstream memory resource.
+   */
+  [[nodiscard]] const Upstream& upstream_resource() const noexcept { return *upstream_mr_; }
+
 /**
 * @brief Get the upstream memory_resource object.
 *
@@ -296,7 +359,7 @@ class pool_memory_resource final
 if (size == 0) { return {}; }
 try {
-  void* ptr = get_upstream()->allocate(size, stream);
+  void* ptr = get_upstream()->allocate_async(size, stream);
 return thrust::optional{
 *upstream_blocks_.emplace(static_cast(ptr), size, true).first};
 } catch (std::exception const& e) {

diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp
index 514cc1664..5da2d8b7b 100644
--- a/include/rmm/mr/host/pinned_memory_resource.hpp
+++ b/include/rmm/mr/host/pinned_memory_resource.hpp
@@ -15,6 +15,7 @@
 */
 #pragma once
+#include
 #include
 #include
 #include
@@ -46,6 +47,78 @@ class pinned_memory_resource final : public host_memory_resource {
 pinned_memory_resource& operator=(pinned_memory_resource&&) = default; ///< @default_move_assignment{pinned_memory_resource}
+  /**
+   * @brief Query whether the pinned_memory_resource supports use of non-null CUDA streams for
+   * allocation/deallocation.
+   *
+   * @returns bool false.
+   */
+  [[nodiscard]] bool supports_streams() const noexcept { return false; }
+
+  /**
+   * @brief Query whether the resource supports the get_mem_info API.
+   *
+   * @return bool false.
+   */
+  [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; }
+
+  /**
+   * @brief Queries the amount of free and total memory for the resource.
+   *
+   * @param stream the stream whose memory manager we want to retrieve
+   *
+   * @returns a pair containing the free memory in bytes in .first and total amount of memory in
+   * .second
+   */
+  [[nodiscard]] std::pair get_mem_info(cuda_stream_view stream) const
+  {
+    return std::make_pair(0, 0);
+  }
+
+  /**
+   * @brief Pretend to support the allocate_async interface, falling back to stream 0
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @param alignment The expected alignment of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  [[nodiscard]] void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view)
+  {
+    return do_allocate(bytes, alignment);
+  }
+
+  /**
+   * @brief Pretend to support the allocate_async interface, falling back to stream 0
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  [[nodiscard]] void* allocate_async(std::size_t bytes, cuda_stream_view)
+  {
+    return do_allocate(bytes);
+  }
+
+  /**
+   * @brief Pretend to support the deallocate_async interface, falling back to stream 0
+   *
+   * @throws Nothing.
+   *
+   * @param ptr Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param alignment The alignment that was passed to the `allocate` call that returned `p`
+   */
+  void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda_stream_view)
+  {
+    do_deallocate(ptr, rmm::detail::align_up(bytes, alignment));
+  }
+
 private:
 /**
 * @brief Allocates pinned memory on the host of size at least `bytes` bytes.
@@ -101,5 +174,6 @@ class pinned_memory_resource final : public host_memory_resource {
 ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); });
 }
 };
+static_assert(cuda::mr::async_resource);
 /** @} */ // end of group
 } // namespace rmm::mr

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 80e2b846b..4e9866905 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -169,6 +169,9 @@ ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp)
 # host mr_ref tests
 ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp)
+# pinned pool mr tests
+ConfigureTest(PINNED_POOL_MR_TEST mr/host/pinned_pool_mr_tests.cpp)
+
 # cuda stream tests
 ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp)

diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index 4a234d2f9..2f32889d0 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -55,6 +55,18 @@ TEST(PoolTest, ThrowMaxLessThanInitial)
 EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
 }
+TEST(PoolTest, ReferenceThrowMaxLessThanInitial)
+{
+  // Make sure first argument is enough larger than the second that alignment rounding doesn't
+  // make them equal
+  auto max_less_than_initial = []() {
+    const auto initial{1024};
+    const auto maximum{256};
+    pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum};
+  };
+  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+}
+
 TEST(PoolTest, AllocateNinetyPercent)
 {
 auto allocate_ninety = []() {
@@ -190,4 +202,43 @@ TEST(PoolTest, MultidevicePool)
 }
 } // namespace
+
+namespace test_properties {
+class fake_async_resource {
+ public:
+  // To model `async_resource`
+  void* allocate(std::size_t, std::size_t) { return nullptr; }
+  void deallocate(void* ptr, std::size_t, std::size_t) {}
+  void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; }
+  void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {}
+
+  bool operator==(const fake_async_resource& other) const { return true; }
+  bool operator!=(const fake_async_resource& other) const { return false; }
+
+  // To model stream_resource
+  [[nodiscard]] bool supports_streams() const noexcept { return false; }
+  [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; }
+
+ private:
+  void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; }
+  void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {}
+  [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; }
+};
+static_assert(!cuda::has_property);
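// fake_async_resource intentionally does not define the device_accessible property; the checks
// here verify that a pool built on such an upstream does not advertise device accessibility,
// while the fake_async_resource_device_accessible type below shows the property being forwarded
// when the upstream does provide it.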
+static_assert(!cuda::has_property, + cuda::mr::device_accessible>); + +// Ensure that we forward the property if it is there +class fake_async_resource_device_accessible : public fake_async_resource { + friend void get_property(const fake_async_resource_device_accessible&, + cuda::mr::device_accessible) + { + } +}; +static_assert( + cuda::has_property); +static_assert( + cuda::has_property, + cuda::mr::device_accessible>); +} // namespace test_properties } // namespace rmm::test diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp new file mode 100644 index 000000000..8e202ada3 --- /dev/null +++ b/tests/mr/host/pinned_pool_mr_tests.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +// explicit instantiation for test coverage purposes +template class rmm::mr::pool_memory_resource; + +namespace rmm::test { +namespace { +using pool_mr = rmm::mr::pool_memory_resource; + +TEST(PinnedPoolTest, ThrowOnNullUpstream) +{ + auto construct_nullptr = []() { pool_mr mr{nullptr}; }; + EXPECT_THROW(construct_nullptr(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{&pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ReferenceThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +// Issue #527 +TEST(PinnedPoolTest, InitialAndMaxPoolSizeEqual) +{ + EXPECT_NO_THROW([]() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000192); + mr.allocate(1000); + }()); +} + +TEST(PinnedPoolTest, NonAlignedPoolSize) +{ + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000031, 1000192); + mr.allocate(1000); + }(), + rmm::logic_error); + + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000200); + mr.allocate(1000); + }(), + rmm::logic_error); +} + +} // namespace +} // namespace rmm::test From 4d25841ad29c94a9b8d6f0602b3bdfc85c04528b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 14:15:09 +0200 Subject: [PATCH 04/30] Port `device_buffer` and `device_uvector` to the new interface --- include/rmm/device_buffer.hpp | 34 +++++++++----- 
include/rmm/device_uvector.hpp | 27 +++++++---- .../rmm/mr/device/device_memory_resource.hpp | 25 +++++++++-- include/rmm/mr/device/per_device_resource.hpp | 29 ++++++++++++ tests/device_buffer_tests.cu | 45 ++++++++----------- tests/device_uvector_tests.cpp | 5 ++- 6 files changed, 112 insertions(+), 53 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index a49f9caa9..811c8d9cb 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,6 +27,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -38,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_memory_resource`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -79,6 +81,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -92,7 +96,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -106,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -134,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -164,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -238,7 +242,6 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -399,16 +402,23 @@ class device_buffer { /** * @briefreturn{Pointer to the memory resource used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } + + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_buffer` provides device accessible memory + */ + friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} private: void* _data{nullptr}; ///< Pointer to device 
memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + async_resource_ref _mr{ + mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -423,7 +433,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -437,7 +447,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 930cda157..cfa6eebd4 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } @@ -526,11 +527,19 @@ class device_uvector { /** * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] cuda::mr::async_resource_ref memory_resource() + const noexcept { return _storage.memory_resource(); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_uvector` provides device accessible memory + */ + friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} + /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 
cc1ae2c3f..92baf3f0a 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -193,7 +193,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -257,7 +257,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -284,7 +284,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform allocation @@ -294,11 +294,25 @@ class device_memory_resource { do_deallocate(ptr, bytes, stream); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept { return !do_is_equal(other); @@ -332,6 +346,11 @@ class device_memory_resource { return do_get_mem_info(stream); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_memory_resource` provides device accessible memory + */ friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} private: diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index aa7217758..1c4108a25 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -24,6 +24,8 @@ #include #include +#include + /** * @file per_device_resource.hpp * @brief Management of per-device `device_memory_resource`s @@ -233,4 +235,31 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc { return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } + +/** + * @brief Get the memory resource for the current device as a `cuda::mr::async_resource_ref`. + * + * Returns a `cuda::mr::async_resource_ref` from the memory_resource set for the current device. + * The initial resource is a `cuda_memory_resource`. + * + * The "current device" is the device returned by `cudaGetDevice`. 
+ * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, + * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. + * Concurrent calls to any of these functions will result in a valid state, but the order of + * execution is undefined. + * + * @note The returned `cuda::mr::async_resource_ref` should only be used with the current CUDA + * device. Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned + * resource can result in undefined behavior. The behavior of a device_memory_resource is undefined + * if used while the active CUDA device is a different device from the one that was active when the + * device_memory_resource was created. + * + * @return cuda::mr::async_resource_ref of the resource for the current device + */ +inline cuda::mr::async_resource_ref get_current_device_resource_ref() +{ + return cuda::mr::async_resource_ref{get_current_device_resource()}; +} + } // namespace rmm::mr diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index d4c34385e..56992a06c 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,6 +57,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -73,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -84,30 +85,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -119,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + 
EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -135,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -148,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -174,8 +173,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -187,7 +185,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -217,8 +215,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -243,7 +240,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -275,7 +271,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); 
EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -291,7 +286,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -309,7 +304,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -319,7 +313,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -338,7 +332,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -347,7 +340,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -366,7 +359,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -375,7 +367,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -394,7 +386,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -403,7 +394,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..46023d519 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); 
TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource_ref()); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From de71dcc6a58a2d3a6000c060305b79d78eee3ff0 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:22:49 +0200 Subject: [PATCH 05/30] Add cccl repo to build dependencies --- CMakeLists.txt | 2 ++ cmake/thirdparty/get_libcudacxx.cmake | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 cmake/thirdparty/get_libcudacxx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c883bc4c..79119e4fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ rapids_cpm_init() include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) +include(cmake/thirdparty/get_libcudacxx.cmake) include(cmake/thirdparty/get_thrust.cmake) # ################################################################################################## @@ -89,6 +90,7 @@ else() target_link_libraries(rmm INTERFACE CUDA::cudart) endif() +target_link_libraries(rmm INTERFACE libcudacxx::libcudacxx) target_link_libraries(rmm INTERFACE rmm::Thrust) target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake new file mode 100644 index 000000000..b1f990d39 --- /dev/null +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# Use CPM to find or clone libcudacxx +function(find_and_configure_libcudacxx) + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) + +endfunction() + +find_and_configure_libcudacxx() From 13a990fdaa45b5de18958835c7daee6fbaf520f1 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:29:12 +0200 Subject: [PATCH 06/30] PoC for the new design of `cuda::mr::{async_}resource_ref` --- CMakeLists.txt | 1 + include/rmm/cuda_stream_view.hpp | 16 + .../mr/device/callback_memory_resource.hpp | 4 +- .../rmm/mr/device/device_memory_resource.hpp | 150 +++++++ include/rmm/mr/host/host_memory_resource.hpp | 35 ++ tests/CMakeLists.txt | 7 + tests/mr/device/adaptor_tests.cpp | 26 ++ tests/mr/device/cuda_async_mr_tests.cpp | 2 + tests/mr/device/cuda_async_view_mr_tests.cpp | 4 + .../mr/device/mr_ref_multithreaded_tests.cpp | 221 ++++++++++ tests/mr/device/mr_ref_test.hpp | 412 ++++++++++++++++++ tests/mr/device/mr_ref_tests.cpp | 109 +++++ tests/mr/host/mr_ref_tests.cpp | 263 +++++++++++ tests/mr/host/mr_tests.cpp | 5 + 14 files changed, 1253 insertions(+), 2 deletions(-) create mode 100644 tests/mr/device/mr_ref_multithreaded_tests.cpp create mode 100644 tests/mr/device/mr_ref_test.hpp create mode 100644 tests/mr/device/mr_ref_tests.cpp create mode 100644 tests/mr/host/mr_ref_tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 79119e4fe..93fcdabe9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) target_compile_features(rmm INTERFACE cxx_std_17 $) +target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # ################################################################################################## # * tests and benchmarks --------------------------------------------------------------------------- diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index fe07fa1b9..ad2768535 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -58,6 +60,13 @@ class cuda_stream_view { */ constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} + /** + * @brief Implicit conversion from cuda::stream_ref. + * + * @param stream The underlying stream for this view + */ + constexpr cuda_stream_view(cuda::stream_ref stream) noexcept : stream_{stream.get()} {} + /** * @brief Get the wrapped stream. * @@ -72,6 +81,13 @@ class cuda_stream_view { */ constexpr operator cudaStream_t() const noexcept { return value(); } + /** + * @brief Implicit conversion to stream_ref. 
+ * + * @return cuda::stream_ref The underlying stream referenced by this cuda_stream_view + */ + constexpr operator cuda::stream_ref() const noexcept { return value(); } + /** * @briefreturn{true if the wrapped stream is the CUDA per-thread default stream} */ diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c6519ed5c..36802c83a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -143,8 +143,8 @@ class callback_memory_resource final : public device_memory_resource { throw std::runtime_error("cannot get free / total memory"); } - [[nodiscard]] virtual bool supports_streams() const noexcept { return false; } - [[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; } + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } allocate_callback_t allocate_callback_; deallocate_callback_t deallocate_callback_; diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 8ad84644b..cc1ae2c3f 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -18,6 +18,9 @@ #include #include +#include +#include + #include #include @@ -157,6 +160,150 @@ class device_memory_resource { return do_is_equal(other); } + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate(std::size_t bytes, std::size_t alignment) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate(void* ptr, std::size_t bytes, std::size_t alignment) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. 
+ * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, cuda_stream_view stream) + { + return do_allocate(bytes, stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda_stream_view stream) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream) + { + do_deallocate(ptr, bytes, stream); + } + + [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + /** * @brief Query whether the resource supports use of non-null CUDA streams for * allocation/deallocation. @@ -185,6 +332,8 @@ class device_memory_resource { return do_get_mem_info(stream); } + friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} + private: /** * @brief Allocates memory of size at least \p bytes. 
@@ -243,5 +392,6 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index 8a5739b2a..aa40d57cd 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -114,6 +116,37 @@ class host_memory_resource { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ + [[nodiscard]] bool operator==(host_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ + [[nodiscard]] bool operator!=(host_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + + /** + * @brief Enables the `cuda::mr::host_accessible` property + * + * This property declares that a `host_memory_resource` provides host accessible memory + */ + friend void get_property(host_memory_resource const&, cuda::mr::host_accessible) noexcept {} + private: /** * @brief Allocates memory on the host of size at least `bytes` bytes. @@ -166,5 +199,7 @@ class host_memory_resource { return this == &other; } }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group + } // namespace rmm::mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 36c3aa043..80e2b846b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,6 +126,10 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) +# device mr tests +ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp + mr/device/mr_ref_multithreaded_tests.cpp) + # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) @@ -162,6 +166,9 @@ ConfigureTest(LIMITING_TEST mr/device/limiting_mr_tests.cpp) # host mr tests ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) +# host mr_ref tests +ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 44c14240b..3e00dd208 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include @@ -64,6 +66,30 @@ using adaptors = ::testing::Types, thread_safe_resource_adaptor, tracking_resource_adaptor>; +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); 
+static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); + template struct AdaptorTest : public ::testing::Test { using adaptor_type = MemoryResourceType; diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 37ed5c306..90c7b0ff9 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -24,6 +24,8 @@ namespace rmm::test { namespace { using cuda_async_mr = rmm::mr::cuda_async_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); class AsyncMRTest : public ::testing::Test { protected: diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index 209429b4b..fe82431a9 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -18,12 +18,16 @@ #include #include +#include + #include namespace rmm::test { namespace { using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); #if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp new file mode 100644 index 000000000..dd39d1e1d --- /dev/null +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace rmm::test { +namespace { + +struct mr_ref_test_mt : public mr_ref_test {}; + +INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); + +template +void spawn_n(std::size_t num_threads, Task task, Arguments&&... args) +{ + std::vector threads; + threads.reserve(num_threads); + for (std::size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(std::thread(task, std::forward(args)...)); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +template +void spawn(Task task, Arguments&&... 
args) +{ + spawn_n(4, task, std::forward(args)...); +} + +TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } + +TEST_P(mr_ref_test_mt, AllocateDefaultStream) +{ + spawn(test_various_async_allocations, this->ref, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, AllocateOnStream) +{ + spawn(test_various_async_allocations, this->ref, this->stream.view()); +} + +TEST_P(mr_ref_test_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->ref, default_num_allocations, default_max_size); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsDefaultStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + this->stream.view()); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->ref, default_max_size); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeDefaultStream) +{ + spawn( + test_mixed_random_async_allocation_free, this->ref, default_max_size, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_async_allocation_free, this->ref, default_max_size, this->stream.view()); +} + +void allocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t size = size_distribution(generator); + void* ptr = ref.allocate_async(size, stream); + { + std::lock_guard lock(mtx); + RMM_CUDA_TRY(cudaEventRecord(event, stream.value())); + allocations.emplace_back(ptr, size); + } + } +} + +void deallocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + for (std::size_t i = 0; i < num_allocations;) { + std::lock_guard lock(mtx); + if (allocations.empty()) { continue; } + i++; + RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); + allocation alloc = allocations.front(); + allocations.pop_front(); + ref.deallocate_async(alloc.ptr, alloc.size, stream); + } +} + +void test_allocate_async_free_different_threads(async_resource_ref ref, + rmm::cuda_stream_view streamA, + rmm::cuda_stream_view streamB) +{ + constexpr std::size_t num_allocations{100}; + + std::mutex mtx; + std::list allocations; + cudaEvent_t event; + + RMM_CUDA_TRY(cudaEventCreate(&event)); + + std::thread producer(allocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamA); + + std::thread consumer(deallocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamB); + + producer.join(); + consumer.join(); + + RMM_CUDA_TRY(cudaEventDestroy(event)); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_default, rmm::cuda_stream_default); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsPerThreadDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_per_thread, 
rmm::cuda_stream_per_thread); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_async_free_different_threads(this->ref, this->stream, this->stream); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDifferentStream) +{ + rmm::cuda_stream streamB; + test_allocate_async_free_different_threads(this->ref, this->stream, streamB); + streamB.synchronize(); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp new file mode 100644 index 000000000..ecb13c2a7 --- /dev/null +++ b/tests/mr/device/mr_ref_test.hpp @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +using resource_ref = cuda::mr::resource_ref; +using async_resource_ref = cuda::mr::async_resource_ref; + +namespace rmm::test { + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. + */ +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +enum size_in_bytes : size_t {}; + +constexpr auto default_num_allocations{100}; +constexpr size_in_bytes default_max_size{5_MiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; + +// Various test functions, shared between single-threaded and multithreaded tests. 
+inline void test_allocate(resource_ref ref, std::size_t bytes) +{ + try { + void* ptr = ref.allocate(bytes); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate(ptr, bytes); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +inline void test_allocate_async(async_resource_ref ref, + std::size_t bytes, + cuda_stream_view stream = {}) +{ + try { + void* ptr = ref.allocate_async(bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate_async(ptr, bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +inline void concurrent_allocations_are_different(resource_ref ref) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate(size); + void* ptr2 = ref.allocate(size); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate(ptr1, size); + ref.deallocate(ptr2, size); +} + +inline void concurrent_async_allocations_are_different(async_resource_ref ref, + cuda_stream_view stream) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate_async(size, stream); + void* ptr2 = ref.allocate_async(size, stream); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate_async(ptr1, size, stream); + ref.deallocate_async(ptr2, size, stream); +} + +inline void test_various_allocations(resource_ref ref) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate(0); + EXPECT_NO_THROW(ref.deallocate(ptr, 0)); + } + + test_allocate(ref, 4_B); + test_allocate(ref, 1_KiB); + test_allocate(ref, 1_MiB); + test_allocate(ref, 1_GiB); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate(1_PiB), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate(1_PiB); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_various_async_allocations(async_resource_ref ref, cuda_stream_view stream) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate_async(0, stream); + stream.synchronize(); + EXPECT_NO_THROW(ref.deallocate_async(ptr, 0, stream)); + stream.synchronize(); + } + + test_allocate_async(ref, 4_B, stream); + test_allocate_async(ref, 1_KiB, stream); + test_allocate_async(ref, 1_MiB, stream); + test_allocate_async(ref, 1_GiB, stream); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate_async(1_PiB, stream), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate_async(1_PiB, stream); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_random_allocations(resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each( + allocations.begin(), 
allocations.end(), [&generator, &distribution, &ref](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [&ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +inline void test_random_async_allocations(async_resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each(allocations.begin(), + allocations.end(), + [&generator, &distribution, &ref, stream](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [stream, &ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + }); +} + +inline void test_mixed_random_allocation_free(resource_ref ref, + size_in_bytes max_size = default_max_size) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate(size), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate(to_free.ptr, to_free.size)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +inline void test_mixed_random_async_allocation_free(async_resource_ref ref, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 
1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate_async(size, stream), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate_async(to_free.ptr, to_free.size, stream)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +using MRFactoryFunc = std::function()>; + +/// Encapsulates a `device_memory_resource` factory function and associated name +struct mr_factory { + mr_factory(std::string name, MRFactoryFunc factory) + : name{std::move(name)}, factory{std::move(factory)} + { + } + + std::string name; ///< Name to associate with tests that use this factory + MRFactoryFunc factory; ///< Factory function that returns shared_ptr to `device_memory_resource` + ///< instance to use in test +}; + +/// Test fixture class value-parameterized on different `mr_factory`s +struct mr_ref_test : public ::testing::TestWithParam { + void SetUp() override + { + auto factory = GetParam().factory; + mr = factory(); + if (mr == nullptr) { + GTEST_SKIP() << "Skipping tests since the memory resource is not supported with this CUDA " + << "driver/runtime version"; + } + ref = async_resource_ref{*mr}; + } + + std::shared_ptr mr; ///< Pointer to resource to use in tests + async_resource_ref ref{*mr}; + rmm::cuda_stream stream{}; +}; + +struct mr_ref_allocation_test : public mr_ref_test {}; + +/// MR factory functions +inline auto make_cuda() { return std::make_shared(); } + +inline auto make_cuda_async() +{ + if (rmm::detail::async_alloc::is_supported()) { + return std::make_shared(); + } + return std::shared_ptr{nullptr}; +} + +inline auto make_managed() { return std::make_shared(); } + +inline auto make_pool() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_arena() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_fixed_size() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_binning() +{ + auto pool = make_pool(); + // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB + // Larger allocations will use the pool resource + auto const bin_range_start{18}; + auto const bin_range_end{22}; + + auto mr = rmm::mr::make_owning_wrapper( + pool, bin_range_start, bin_range_end); + return mr; +} + +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp new file mode 100644 index 000000000..2507d468a --- /dev/null +++ b/tests/mr/device/mr_ref_tests.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include + +#include + +namespace rmm::test { +namespace { + +INSTANTIATE_TEST_SUITE_P(ResourceTests, + mr_ref_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}, + mr_factory{"Fixed_Size", &make_fixed_size}), + [](auto const& info) { return info.param.name; }); + +// Leave out fixed-size MR here because it can't handle the dynamic allocation sizes +INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests, + mr_ref_allocation_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); +TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +TEST_P(mr_ref_test, AllocationsAreDifferent) { concurrent_allocations_are_different(this->ref); } + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferentDefaultStream) +{ + concurrent_async_allocations_are_different(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferent) +{ + concurrent_async_allocations_are_different(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, AllocateDefault) { test_various_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, AllocateDefaultStream) +{ + test_various_async_allocations(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, AllocateOnStream) +{ + test_various_async_allocations(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, RandomAllocations) { test_random_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, RandomAllocationsDefaultStream) +{ + test_random_async_allocations( + this->ref, default_num_allocations, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, RandomAllocationsStream) +{ + test_random_async_allocations(this->ref, default_num_allocations, default_max_size, this->stream); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFree) +{ + test_mixed_random_allocation_free(this->ref, default_max_size); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeDefaultStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, this->stream); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp new file mode 100644 index 000000000..568e18576 --- /dev/null +++ b/tests/mr/host/mr_ref_tests.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2021, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace rmm::test { +namespace { +inline bool is_aligned(void* ptr, std::size_t alignment = alignof(std::max_align_t)) +{ + return rmm::detail::is_pointer_aligned(ptr, alignment); +} + +// Returns true if a pointer points to a device memory or managed memory allocation. +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +/** + * @brief Returns if a pointer `p` points to pinned host memory. + */ +inline bool is_pinned_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } + return attributes.type == cudaMemoryTypeHost; +} + +constexpr std::size_t size_word{4_B}; +constexpr std::size_t size_kb{1_KiB}; +constexpr std::size_t size_mb{1_MiB}; +constexpr std::size_t size_gb{1_GiB}; +constexpr std::size_t size_pb{1_PiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; +} // namespace + +template +struct MRRefTest : public ::testing::Test { + MemoryResourceType mr; + cuda::mr::resource_ref ref; + + MRRefTest() : mr{}, ref{mr} {} +}; + +using resources = ::testing::Types; +static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with, + ""); + +TYPED_TEST_CASE(MRRefTest, resources); + +TYPED_TEST(MRRefTest, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +TYPED_TEST(MRRefTest, AllocateZeroBytes) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(0)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, 0)); +} + +TYPED_TEST(MRRefTest, AllocateWord) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_word)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_word)); +} + +TYPED_TEST(MRRefTest, AllocateKB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_kb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_kb)); +} + +TYPED_TEST(MRRefTest, AllocateMB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_mb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_mb)); +} + +TYPED_TEST(MRRefTest, AllocateGB) +{ + void* ptr{nullptr}; + 
EXPECT_NO_THROW(ptr = this->ref.allocate(size_gb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_gb)); +} + +TYPED_TEST(MRRefTest, AllocateTooMuch) +{ + void* ptr{nullptr}; + EXPECT_THROW(ptr = this->ref.allocate(size_pb), std::bad_alloc); + EXPECT_EQ(nullptr, ptr); +} + +TYPED_TEST(MRRefTest, RandomAllocations) +{ + constexpr std::size_t num_allocations{100}; + std::vector allocations(num_allocations); + + constexpr std::size_t MAX_ALLOCATION_SIZE{5 * size_mb}; + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, MAX_ALLOCATION_SIZE); + + // 100 allocations from [0,5MB) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, this](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = this->ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(is_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [this](allocation& alloc) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +TYPED_TEST(MRRefTest, MixedRandomAllocationFree) +{ + std::default_random_engine generator; + + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + // How often a free will occur. For example, if `1`, then every allocation + // will immediately be free'd. Or, if 4, on average, a free will occur after + // every 4th allocation + constexpr std::size_t FREE_FREQUENCY{4}; + std::uniform_int_distribution free_distribution(1, FREE_FREQUENCY); + + std::deque allocations; + + constexpr std::size_t num_allocations{100}; + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t allocation_size = size_distribution(generator); + EXPECT_NO_THROW(allocations.emplace_back(this->ref.allocate(allocation_size), allocation_size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(is_aligned(new_allocation.ptr)); + + bool const free_front{free_distribution(generator) == free_distribution.max()}; + + if (free_front) { + auto front = allocations.front(); + EXPECT_NO_THROW(this->ref.deallocate(front.ptr, front.size)); + allocations.pop_front(); + } + } + // free any remaining allocations + for (auto alloc : allocations) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + allocations.pop_front(); + } +} + +static constexpr std::size_t MinTestedAlignment{16}; +static constexpr std::size_t MaxTestedAlignment{4096}; +static constexpr std::size_t TestedAlignmentMultiplier{2}; +static constexpr std::size_t NUM_TRIALS{100}; + +TYPED_TEST(MRRefTest, AlignmentTest) +{ + std::default_random_engine generator(0); + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, alignment)); + EXPECT_TRUE(is_aligned(ptr, alignment)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, alignment)); + } + } +} + +TYPED_TEST(MRRefTest, UnsupportedAlignmentTest) +{ + std::default_random_engine generator(0); + 
constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + // An unsupported alignment (like an odd number) should result in an + // alignment of `alignof(std::max_align_t)` + auto const bad_alignment = alignment + 1; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment)); + EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t))); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment)); + } + } +} + +TEST(PinnedResource, isPinned) +{ + rmm::mr::pinned_memory_resource mr; + cuda::mr::resource_ref ref{mr}; + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = ref.allocate(100)); + EXPECT_TRUE(is_pinned_memory(ptr)); + EXPECT_NO_THROW(ref.deallocate(ptr, 100)); +} +} // namespace rmm::test diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp index 1cd59f5a6..ff976ff96 100644 --- a/tests/mr/host/mr_tests.cpp +++ b/tests/mr/host/mr_tests.cpp @@ -23,6 +23,8 @@ #include +#include + #include #include @@ -80,6 +82,9 @@ struct MRTest : public ::testing::Test { }; using resources = ::testing::Types; +static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with, + ""); TYPED_TEST_CASE(MRTest, resources); From 9ceab835b23a1b30f91a0adf4dcf709cc7312a44 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 14:14:48 +0200 Subject: [PATCH 07/30] Make `pinned_memory_resource` usable for `pool_memory_resource` --- .../rmm/mr/device/pool_memory_resource.hpp | 69 ++++++++++++- .../rmm/mr/host/pinned_memory_resource.hpp | 74 ++++++++++++++ tests/CMakeLists.txt | 3 + tests/mr/device/pool_mr_tests.cpp | 51 ++++++++++ tests/mr/host/pinned_pool_mr_tests.cpp | 96 +++++++++++++++++++ 5 files changed, 290 insertions(+), 3 deletions(-) create mode 100644 tests/mr/host/pinned_pool_mr_tests.cpp diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index f6d3710e9..7cca60613 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -51,6 +51,35 @@ namespace rmm::mr { * @{ * @file */ +namespace detail { +/** + * @brief A helper class to remove the device_accessible property + * + * We want to be able to use the pool_memory_resource with an upstream that may not + * be device accessible. To avoid rewriting the world, we allow to conditionally remove + * the cuda::mr::device_accessible property. + * + * @tparam PoolResource the pool_memory_resource class + * @tparam Upstream memory_resource to use for allocating the pool. 
+ */ +template +struct maybe_remove_property {}; + +/** + * @brief Specialization of maybe_remove_property to not propagate non existing properties + */ +template +struct maybe_remove_property< + PoolResource, + Upstream, + cuda::std::enable_if_t>> { + /** + * @brief Explicit removal of the friend function so we do not pretent to provide device + * accessible memory + */ + friend void get_property(const PoolResource&, cuda::mr::device_accessible) = delete; +}; +} // namespace detail /** * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from @@ -64,8 +93,10 @@ namespace rmm::mr { */ template class pool_memory_resource final - : public detail::stream_ordered_memory_resource, - detail::coalescing_free_list> { + : public detail::maybe_remove_property, Upstream>, + public detail::stream_ordered_memory_resource, + detail::coalescing_free_list>, + public cuda::forward_property, Upstream> { public: friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; @@ -104,6 +135,31 @@ class pool_memory_resource final initialize_pool(initial_pool_size, maximum_pool_size); } + /** + * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using + * `upstream_mr`. + * + * @throws rmm::logic_error if `upstream_mr == nullptr` + * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * + * @param upstream_mr The memory_resource from which to allocate blocks for the pool. + * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to half of the + * available memory on the current device. + * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all + * of the available memory on the current device. + */ + template , int> = 0> + explicit pool_memory_resource(Upstream2& upstream_mr, + thrust::optional initial_pool_size = thrust::nullopt, + thrust::optional maximum_pool_size = thrust::nullopt) + : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size) + { + } + /** * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using * the upstream resource. @@ -131,6 +187,13 @@ class pool_memory_resource final */ [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } + /** + * @brief Get the upstream memory_resource object. + * + * @return const reference to the upstream memory resource. + */ + [[nodiscard]] const Upstream& upstream_resource() const noexcept { return *upstream_mr_; } + /** * @brief Get the upstream memory_resource object. 
* @@ -296,7 +359,7 @@ class pool_memory_resource final if (size == 0) { return {}; } try { - void* ptr = get_upstream()->allocate(size, stream); + void* ptr = get_upstream()->allocate_async(size, stream); return thrust::optional{ *upstream_blocks_.emplace(static_cast(ptr), size, true).first}; } catch (std::exception const& e) { diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp index 514cc1664..5da2d8b7b 100644 --- a/include/rmm/mr/host/pinned_memory_resource.hpp +++ b/include/rmm/mr/host/pinned_memory_resource.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -46,6 +47,78 @@ class pinned_memory_resource final : public host_memory_resource { pinned_memory_resource& operator=(pinned_memory_resource&&) = default; ///< @default_move_assignment{pinned_memory_resource} + /** + * @brief Query whether the pinned_memory_resource supports use of non-null CUDA streams for + * allocation/deallocation. + * + * @returns bool false. + */ + [[nodiscard]] bool supports_streams() const noexcept { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return bool false. + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; } + + /** + * @brief Queries the amount of free and total memory for the resource. + * + * @param stream the stream whose memory manager we want to retrieve + * + * @returns a pair containing the free memory in bytes in .first and total amount of memory in + * .second + */ + [[nodiscard]] std::pair get_mem_info(cuda_stream_view stream) const + { + return std::make_pair(0, 0); + } + + /** + * @brief Pretend to support the allocate_async interface, falling back to stream 0 + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + [[nodiscard]] void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view) + { + return do_allocate(bytes, alignment); + } + + /** + * @brief Pretend to support the allocate_async interface, falling back to stream 0 + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @return void* Pointer to the newly allocated memory + */ + [[nodiscard]] void* allocate_async(std::size_t bytes, cuda_stream_view) + { + return do_allocate(bytes); + } + + /** + * @brief Pretend to support the deallocate_async interface, falling back to stream 0 + * + * @throws Nothing. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda_stream_view) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment)); + } + private: /** * @brief Allocates pinned memory on the host of size at least `bytes` bytes. 
@@ -101,5 +174,6 @@ class pinned_memory_resource final : public host_memory_resource { ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); }); } }; +static_assert(cuda::mr::async_resource); /** @} */ // end of group } // namespace rmm::mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 80e2b846b..4e9866905 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -169,6 +169,9 @@ ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) # host mr_ref tests ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) +# pinned pool mr tests +ConfigureTest(PINNED_POOL_MR_TEST mr/host/pinned_pool_mr_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index 4a234d2f9..2f32889d0 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -55,6 +55,18 @@ TEST(PoolTest, ThrowMaxLessThanInitial) EXPECT_THROW(max_less_than_initial(), rmm::logic_error); } +TEST(PoolTest, ReferenceThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + TEST(PoolTest, AllocateNinetyPercent) { auto allocate_ninety = []() { @@ -190,4 +202,43 @@ TEST(PoolTest, MultidevicePool) } } // namespace + +namespace test_properties { +class fake_async_resource { + public: + // To model `async_resource` + void* allocate(std::size_t, std::size_t) { return nullptr; } + void deallocate(void* ptr, std::size_t, std::size_t) {} + void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } + void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} + + bool operator==(const fake_async_resource& other) const { return true; } + bool operator!=(const fake_async_resource& other) const { return false; } + + // To model stream_resource + [[nodiscard]] bool supports_streams() const noexcept { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; } + + private: + void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } + void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} + [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; } +}; +static_assert(!cuda::has_property); +static_assert(!cuda::has_property, + cuda::mr::device_accessible>); + +// Ensure that we forward the property if it is there +class fake_async_resource_device_accessible : public fake_async_resource { + friend void get_property(const fake_async_resource_device_accessible&, + cuda::mr::device_accessible) + { + } +}; +static_assert( + cuda::has_property); +static_assert( + cuda::has_property, + cuda::mr::device_accessible>); +} // namespace test_properties } // namespace rmm::test diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp new file mode 100644 index 000000000..8e202ada3 --- /dev/null +++ b/tests/mr/host/pinned_pool_mr_tests.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +// explicit instantiation for test coverage purposes +template class rmm::mr::pool_memory_resource; + +namespace rmm::test { +namespace { +using pool_mr = rmm::mr::pool_memory_resource; + +TEST(PinnedPoolTest, ThrowOnNullUpstream) +{ + auto construct_nullptr = []() { pool_mr mr{nullptr}; }; + EXPECT_THROW(construct_nullptr(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{&pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ReferenceThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +// Issue #527 +TEST(PinnedPoolTest, InitialAndMaxPoolSizeEqual) +{ + EXPECT_NO_THROW([]() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000192); + mr.allocate(1000); + }()); +} + +TEST(PinnedPoolTest, NonAlignedPoolSize) +{ + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000031, 1000192); + mr.allocate(1000); + }(), + rmm::logic_error); + + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000200); + mr.allocate(1000); + }(), + rmm::logic_error); +} + +} // namespace +} // namespace rmm::test From c0df290ba4394cbab81b44da3bb6e2f00d6be17f Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 14:15:09 +0200 Subject: [PATCH 08/30] Port `device_buffer` and `device_uvector` to the new interface --- include/rmm/device_buffer.hpp | 34 +++++++++----- include/rmm/device_uvector.hpp | 27 +++++++---- .../rmm/mr/device/device_memory_resource.hpp | 25 +++++++++-- include/rmm/mr/device/per_device_resource.hpp | 29 ++++++++++++ tests/device_buffer_tests.cu | 45 ++++++++----------- tests/device_uvector_tests.cpp | 5 ++- 6 files changed, 112 insertions(+), 53 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index a49f9caa9..811c8d9cb 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,6 +27,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -38,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_memory_resource`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. 
* * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -79,6 +81,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -92,7 +96,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -106,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -134,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -164,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -238,7 +242,6 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -399,16 +402,23 @@ class device_buffer { /** * @briefreturn{Pointer to the memory resource used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } + + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_buffer` provides device accessible memory + */ + friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + async_resource_ref _mr{ + mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -423,7 +433,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? 
_mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -437,7 +447,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 930cda157..cfa6eebd4 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } @@ -526,11 +527,19 @@ class device_uvector { /** * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] cuda::mr::async_resource_ref memory_resource() + const noexcept { return _storage.memory_resource(); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_uvector` provides device accessible memory + */ + friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} + /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index cc1ae2c3f..92baf3f0a 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -193,7 +193,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -257,7 +257,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. 
This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -284,7 +284,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform allocation @@ -294,11 +294,25 @@ class device_memory_resource { do_deallocate(ptr, bytes, stream); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept { return !do_is_equal(other); @@ -332,6 +346,11 @@ class device_memory_resource { return do_get_mem_info(stream); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_memory_resource` provides device accessible memory + */ friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} private: diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index aa7217758..1c4108a25 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -24,6 +24,8 @@ #include #include +#include + /** * @file per_device_resource.hpp * @brief Management of per-device `device_memory_resource`s @@ -233,4 +235,31 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc { return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } + +/** + * @brief Get the memory resource for the current device as a `cuda::mr::async_resource_ref`. + * + * Returns a `cuda::mr::async_resource_ref` from the memory_resource set for the current device. + * The initial resource is a `cuda_memory_resource`. + * + * The "current device" is the device returned by `cudaGetDevice`. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, + * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. + * Concurrent calls to any of these functions will result in a valid state, but the order of + * execution is undefined. + * + * @note The returned `cuda::mr::async_resource_ref` should only be used with the current CUDA + * device. Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned + * resource can result in undefined behavior. The behavior of a device_memory_resource is undefined + * if used while the active CUDA device is a different device from the one that was active when the + * device_memory_resource was created. 
+ * + * @return cuda::mr::async_resource_ref of the resource for the current device + */ +inline cuda::mr::async_resource_ref get_current_device_resource_ref() +{ + return cuda::mr::async_resource_ref{get_current_device_resource()}; +} + } // namespace rmm::mr diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index d4c34385e..56992a06c 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,6 +57,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -73,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -84,30 +85,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -119,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -135,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -148,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, 
buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -174,8 +173,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -187,7 +185,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -217,8 +215,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -243,7 +240,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -275,7 +271,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -291,7 +286,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -309,7 +304,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -319,7 
+313,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -338,7 +332,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -347,7 +340,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -366,7 +359,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -375,7 +367,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -394,7 +386,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -403,7 +394,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..46023d519 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource_ref()); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 8b1bc04979a42799b236cdf10f007dff320fd75a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 17 Oct 2023 10:29:29 -0700 Subject: [PATCH 09/30] Use upstream patch for memory resource. --- fetch_rapids.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 899db0467..3aa4f8be0 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -11,6 +11,8 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -set(rapids-cmake-repo bdice/rapids-cmake) -set(rapids-cmake-branch libcudacxx-memory-resource-patch) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) From 6e610a33cf3e524fe5f3b889cf80a78c8b60b135 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 18 Oct 2023 06:11:23 +0000 Subject: [PATCH 12/30] Add restrictions on memory usage for tests --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4e9866905..3071b89b0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -128,7 +128,7 @@ ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_t # device mr tests ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp - mr/device/mr_ref_multithreaded_tests.cpp) + mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 50) # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) From edb8f78f2be34bb1fcdda06664882079aaaf17a5 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 18 Oct 2023 06:34:26 +0000 Subject: [PATCH 13/30] Remove `get_current_device_resource_ref` It is not really useful, as we are constrained to only using `device_accessible` and we can easily construct any resource_ref from it anyway --- include/rmm/device_buffer.hpp | 16 ++++++------ include/rmm/device_uvector.hpp | 6 ++--- include/rmm/mr/device/per_device_resource.hpp | 26 ------------------- tests/device_buffer_tests.cu | 16 +++++++----- tests/device_uvector_tests.cpp | 2 +- 5 files changed, 21 insertions(+), 45 deletions(-) diff --git a/include/rmm/device_buffer.hpp 
b/include/rmm/device_buffer.hpp index 811c8d9cb..3b715ff5c 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -40,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_memory_resource`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource_ref()` is used. + * returned from `get_current_device_resource()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -96,7 +96,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -110,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource_ref()) + async_resource_ref mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -138,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -168,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -400,7 +400,7 @@ class device_buffer { void set_stream(cuda_stream_view stream) noexcept { _stream = stream; } /** - * @briefreturn{Pointer to the memory resource used to allocate and deallocate} + * @briefreturn{The async_resource_ref used to allocate and deallocate} */ [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } @@ -417,8 +417,8 @@ class device_buffer { std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation async_resource_ref _mr{ - mr::get_current_device_resource_ref()}; ///< The memory resource used to - ///< allocate/deallocate device memory + rmm::mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. 
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index cfa6eebd4..43370588c 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -126,7 +126,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -142,7 +142,7 @@ class device_uvector { */ explicit device_uvector(device_uvector const& other, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -525,7 +525,7 @@ class device_uvector { [[nodiscard]] bool is_empty() const noexcept { return size() == 0; } /** - * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} + * @briefreturn{The async_resource_ref used to allocate and deallocate the device storage} */ [[nodiscard]] cuda::mr::async_resource_ref memory_resource() const noexcept diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 1c4108a25..c0a26d2ec 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -236,30 +236,4 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } -/** - * @brief Get the memory resource for the current device as a `cuda::mr::async_resource_ref`. - * - * Returns a `cuda::mr::async_resource_ref` from the memory_resource set for the current device. - * The initial resource is a `cuda_memory_resource`. - * - * The "current device" is the device returned by `cudaGetDevice`. - * - * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, - * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. - * Concurrent calls to any of these functions will result in a valid state, but the order of - * execution is undefined. - * - * @note The returned `cuda::mr::async_resource_ref` should only be used with the current CUDA - * device. Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned - * resource can result in undefined behavior. The behavior of a device_memory_resource is undefined - * if used while the active CUDA device is a different device from the one that was active when the - * device_memory_resource was created. 
- * - * @return cuda::mr::async_resource_ref of the resource for the current device - */ -inline cuda::mr::async_resource_ref get_current_device_resource_ref() -{ - return cuda::mr::async_resource_ref{get_current_device_resource()}; -} - } // namespace rmm::mr diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index 56992a06c..c414c86c4 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -74,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -85,7 +85,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -118,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -134,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -147,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -173,7 +173,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -215,7 +216,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); 
EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 46023d519..3c042a437 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -38,7 +38,7 @@ TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource_ref()); + EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 4ea703ecc043f9594b1aa52ef84c67989e1c4c81 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 18 Oct 2023 11:47:13 +0000 Subject: [PATCH 14/30] Make CI happy --- python/rmm/tests/test_rmm.py | 6 +++--- tests/CMakeLists.txt | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index fd537749b..b2ce8fe6d 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5000, + "current_bytes": 5040, "current_count": 5, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 10000, + "total_bytes": 10080, "total_count": 10, } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3071b89b0..bf4fac1f9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,9 +126,9 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) -# device mr tests +# device mr_ref tests ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp - mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 50) + mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 100) # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) From 8e7c91b39befc93f0e6e809d945161cac251a2c5 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 1 Nov 2023 09:01:22 +0000 Subject: [PATCH 15/30] Address review comments --- .../rmm/mr/device/device_memory_resource.hpp | 3 +-- include/rmm/mr/host/host_memory_resource.hpp | 2 +- tests/mr/device/adaptor_tests.cpp | 27 +++++++------------ tests/mr/host/mr_ref_tests.cpp | 5 ++-- tests/mr/host/mr_tests.cpp | 5 ++-- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 636c98c44..6681e598c 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -409,6 +408,6 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; -static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index f5f1ddf58..ce870287c 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -195,7 +195,7 @@ class host_memory_resource { return this == &other; } }; -static_assert(cuda::mr::resource_with, ""); 
+static_assert(cuda::mr::resource_with); /** @} */ // end of group } // namespace rmm::mr diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 3e00dd208..98fc3a429 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -67,28 +67,21 @@ using adaptors = ::testing::Types, tracking_resource_adaptor>; static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); + cuda::mr::resource_with, cuda::mr::device_accessible>); static_assert(cuda::mr::resource_with, - cuda::mr::device_accessible>, - ""); + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); + cuda::mr::resource_with, cuda::mr::device_accessible>); static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); -static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, ""); + cuda::mr::resource_with, cuda::mr::device_accessible>); static_assert(cuda::mr::resource_with, - cuda::mr::device_accessible>, - ""); + cuda::mr::device_accessible>); static_assert(cuda::mr::resource_with, - cuda::mr::device_accessible>, - ""); -static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); template struct AdaptorTest : public ::testing::Test { diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 568e18576..1439ebabe 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -83,9 +83,8 @@ struct MRRefTest : public ::testing::Test { }; using resources = ::testing::Types; -static_assert(cuda::mr::resource_with, ""); -static_assert(cuda::mr::resource_with, - ""); +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::resource_with); TYPED_TEST_CASE(MRRefTest, resources); diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp index ff976ff96..1b95bea5d 100644 --- a/tests/mr/host/mr_tests.cpp +++ b/tests/mr/host/mr_tests.cpp @@ -82,9 +82,8 @@ struct MRTest : public ::testing::Test { }; using resources = ::testing::Types; -static_assert(cuda::mr::resource_with, ""); -static_assert(cuda::mr::resource_with, - ""); +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::resource_with); TYPED_TEST_CASE(MRTest, resources); From 849c88045ac203504413cc498b9e3881e355342c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 1 Nov 2023 09:06:09 +0000 Subject: [PATCH 16/30] Address alignment differences with `cuda::mr::resource_ref` Our implementation of `cuda::mr::{async_}resource_ref` currently aligns to `alignof(max_align_t)` This is a minor difference to the current device_memory_resource. We intent to expand the interface of `cuda::mr::{async_}resource_ref` to take desired alignment into account. 
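For illustration only (a sketch, not part of this patch, assuming alignof(std::max_align_t) == 16 on the target platform and the rmm::detail::align_up helper used elsewhere in RMM):

    #include <rmm/detail/aligned.hpp>
    #include <cassert>
    #include <cstddef>

    // A 1000-byte request is padded to the next multiple of 16 before it
    // reaches the upstream resource, which is why the allocation statistics
    // in the Python tests below move from multiples of 1000 to multiples of 1008.
    std::size_t padded = rmm::detail::align_up(std::size_t{1000}, alignof(std::max_align_t));
    assert(padded == 1008);  // 5 buffers -> 5040 bytes, 10 buffers -> 10080 bytes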
However, this is out of scope of this PR and will be done in the cccl repository --- python/rmm/tests/test_rmm.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index b2ce8fe6d..b5dc81c1f 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2000, + "current_bytes": 2016, "current_count": 2, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7000, + "current_bytes": 7056, "current_count": 7, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5000 + assert mr.get_allocated_bytes() == 5040 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2000 - assert mr.get_allocated_bytes() == 7000 + assert mr2.get_allocated_bytes() == 2016 + assert mr.get_allocated_bytes() == 7056 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 From 2cf03a782a809a070e6fa5c0c2131b92eff9b42e Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 2 Nov 2023 08:12:17 +0000 Subject: [PATCH 17/30] Roll back the changes to `device_u{buffer, vector}` Those are potentially breaking and we want to do that in a followup --- include/rmm/device_buffer.hpp | 30 ++++++++-------------- include/rmm/device_uvector.hpp | 27 +++++++------------ python/rmm/tests/test_rmm.py | 32 +++++++++++------------ tests/device_buffer_tests.cu | 47 +++++++++++++++++++--------------- tests/device_uvector_tests.cpp | 5 ++-- 5 files changed, 64 insertions(+), 77 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 688f218b9..5762d2bbe 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,8 +27,6 @@ #include #include -#include - namespace rmm { /** * @addtogroup data_containers @@ -81,8 +79,6 @@ namespace rmm { *``` */ class device_buffer { - using async_resource_ref = cuda::mr::async_resource_ref; - public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -110,7 +106,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource()) + mr::device_memory_resource* mr = mr::get_current_device_resource()) 
: _stream{stream}, _mr{mr} { allocate_async(size); @@ -138,7 +134,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + mr::device_memory_resource* mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -168,7 +164,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -240,6 +236,7 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); + _mr = nullptr; _stream = cuda_stream_view{}; } @@ -400,23 +397,16 @@ class device_buffer { /** * @briefreturn{The async_resource_ref used to allocate and deallocate} */ - [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } - - /** - * @brief Enables the `cuda::mr::device_accessible` property - * - * This property declares that a `device_buffer` provides device accessible memory - */ - friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} + [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - async_resource_ref _mr{ - rmm::mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + mr::device_memory_resource* _mr{ + mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -431,7 +421,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; + _data = (bytes > 0) ? 
memory_resource()->allocate(bytes, stream()) : nullptr; } /** @@ -445,7 +435,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } + if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 43370588c..840da8ca8 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,8 +26,6 @@ #include #include -#include - namespace rmm { /** * @addtogroup data_containers @@ -74,7 +72,6 @@ namespace rmm { */ template class device_uvector { - using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -124,9 +121,10 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector(std::size_t size, - cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + explicit device_uvector( + std::size_t size, + cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -140,9 +138,10 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector(device_uvector const& other, - cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + explicit device_uvector( + device_uvector const& other, + cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -527,19 +526,11 @@ class device_uvector { /** * @briefreturn{The async_resource_ref used to allocate and deallocate the device storage} */ - [[nodiscard]] cuda::mr::async_resource_ref memory_resource() - const noexcept + [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _storage.memory_resource(); } - /** - * @brief Enables the `cuda::mr::device_accessible` property - * - * This property declares that a `device_uvector` provides device accessible memory - */ - friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} - /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index b5dc81c1f..fd537749b 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5040, + "current_bytes": 5000, "current_count": 5, - "peak_bytes": 10080, + "peak_bytes": 10000, "peak_count": 10, - "total_bytes": 10080, + "total_bytes": 10000, "total_count": 10, } @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2016, + "current_bytes": 2000, "current_count": 2, - "peak_bytes": 2016, + "peak_bytes": 2000, "peak_count": 2, - "total_bytes": 2016, + "total_bytes": 2000, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7056, + "current_bytes": 7000, "current_count": 
7, - "peak_bytes": 10080, + "peak_bytes": 10000, "peak_count": 10, - "total_bytes": 12096, + "total_bytes": 12000, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2016, + "peak_bytes": 2000, "peak_count": 2, - "total_bytes": 2016, + "total_bytes": 2000, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10080, + "peak_bytes": 10000, "peak_count": 10, - "total_bytes": 12096, + "total_bytes": 12000, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5040 + assert mr.get_allocated_bytes() == 5000 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2016 - assert mr.get_allocated_bytes() == 7056 + assert mr2.get_allocated_bytes() == 2000 + assert mr.get_allocated_bytes() == 7000 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index c414c86c4..d4c34385e 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,7 +57,6 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; -using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -74,7 +73,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -85,28 +84,30 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); + EXPECT_EQ(&this->mr, buff.memory_resource()); + EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, this->mr); + rmm::device_buffer buff(this->size, this->stream, &this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); + EXPECT_EQ(&this->mr, buff.memory_resource()); + EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); 
EXPECT_EQ(this->stream, buff.stream()); } @@ -118,7 +119,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -134,7 +135,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -147,7 +148,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -173,8 +174,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), - async_resource_ref{rmm::mr::get_current_device_resource()}); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -186,7 +187,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); + EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -216,8 +217,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), - async_resource_ref{rmm::mr::get_current_device_resource()}); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -242,6 +243,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); + EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ 
-273,6 +275,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); + EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -288,7 +291,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto mr = buff.memory_resource(); + auto* mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -306,6 +309,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) + EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -315,7 +319,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto mr = buff.memory_resource(); + auto* mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -334,6 +338,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) + EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -342,7 +347,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto mr = src.memory_resource(); + auto* mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -361,6 +366,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); + EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -369,7 +375,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto mr = src.memory_resource(); + auto* mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -388,6 +394,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); + EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -396,7 +403,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto mr = buff.memory_resource(); + auto* mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 3c042a437..69d89e305 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp 
@@ -30,15 +30,14 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; -using async_resource_ref = cuda::mr::async_resource_ref; +using TestTypes = ::testing::Types; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); + EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 76d55fa29b74667de174c71b48be2f374b4e1cd2 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 2 Nov 2023 08:44:19 +0000 Subject: [PATCH 18/30] Address review comments --- .../rmm/mr/device/device_memory_resource.hpp | 18 ----------------- include/rmm/mr/device/per_device_resource.hpp | 2 -- .../rmm/mr/device/pool_memory_resource.hpp | 20 ++++++++++--------- .../rmm/mr/host/pinned_memory_resource.hpp | 11 +++++++++- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 6681e598c..2f81a117f 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -162,9 +162,6 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on * the specified `stream`. * @@ -185,9 +182,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws Nothing. * * @param ptr Pointer to be deallocated @@ -205,9 +199,6 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on * the specified `stream`. * @@ -226,9 +217,6 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on * the specified `stream`. * @@ -249,9 +237,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws Nothing. * * @param ptr Pointer to be deallocated @@ -276,9 +261,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws Nothing. 
* * @param ptr Pointer to be deallocated diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index c0a26d2ec..cbdb71532 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -24,8 +24,6 @@ #include #include -#include - /** * @file per_device_resource.hpp * @brief Management of per-device `device_memory_resource`s diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index 7cca60613..face43993 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -61,23 +61,24 @@ namespace detail { * * @tparam PoolResource the pool_memory_resource class * @tparam Upstream memory_resource to use for allocating the pool. + * @tparam Property The property we want to potentially remove. */ -template +template struct maybe_remove_property {}; /** * @brief Specialization of maybe_remove_property to not propagate non existing properties */ -template -struct maybe_remove_property< - PoolResource, - Upstream, - cuda::std::enable_if_t>> { +template +struct maybe_remove_property>> { /** - * @brief Explicit removal of the friend function so we do not pretent to provide device + * @brief Explicit removal of the friend function so we do not pretend to provide device * accessible memory */ - friend void get_property(const PoolResource&, cuda::mr::device_accessible) = delete; + friend void get_property(const PoolResource&, Property) = delete; }; } // namespace detail @@ -93,7 +94,8 @@ struct maybe_remove_property< */ template class pool_memory_resource final - : public detail::maybe_remove_property, Upstream>, + : public detail:: + maybe_remove_property, Upstream, cuda::mr::device_accessible>, public detail::stream_ordered_memory_resource, detail::coalescing_free_list>, public cuda::forward_property, Upstream> { diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp index b4551af60..8ea259263 100644 --- a/include/rmm/mr/host/pinned_memory_resource.hpp +++ b/include/rmm/mr/host/pinned_memory_resource.hpp @@ -119,6 +119,13 @@ class pinned_memory_resource final : public host_memory_resource { do_deallocate(ptr, rmm::detail::align_up(bytes, alignment)); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `pinned_memory_resource` provides device accessible memory + */ + friend void get_property(pinned_memory_resource const&, cuda::mr::device_accessible) noexcept {} + private: /** * @brief Allocates pinned memory on the host of size at least `bytes` bytes. @@ -172,6 +179,8 @@ class pinned_memory_resource final : public host_memory_resource { ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); }); } }; -static_assert(cuda::mr::async_resource); +static_assert(cuda::mr::async_resource_with); /** @} */ // end of group } // namespace rmm::mr From 3390fb841b113acd3ea7679626b2cdccbc029a0b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 Nov 2023 08:23:25 +0000 Subject: [PATCH 19/30] Revert "Roll back the changes to `device_u{buffer, vector}`" This reverts commit 2cf03a782a809a070e6fa5c0c2131b92eff9b42e. 
--- include/rmm/device_buffer.hpp | 30 ++++++++++++++-------- include/rmm/device_uvector.hpp | 27 ++++++++++++------- python/rmm/tests/test_rmm.py | 32 +++++++++++------------ tests/device_buffer_tests.cu | 47 +++++++++++++++------------------- tests/device_uvector_tests.cpp | 5 ++-- 5 files changed, 77 insertions(+), 64 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 5762d2bbe..688f218b9 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,6 +27,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -79,6 +81,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -106,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -134,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -164,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -236,7 +240,6 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -397,16 +400,23 @@ class device_buffer { /** * @briefreturn{The async_resource_ref used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } + + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_buffer` provides device accessible memory + */ + friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + async_resource_ref _mr{ + rmm::mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -421,7 +431,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? 
_mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -435,7 +445,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 840da8ca8..43370588c 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -526,11 +527,19 @@ class device_uvector { /** * @briefreturn{The async_resource_ref used to allocate and deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] cuda::mr::async_resource_ref memory_resource() + const noexcept { return _storage.memory_resource(); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_uvector` provides device accessible memory + */ + friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} + /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index fd537749b..b5dc81c1f 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5000, + "current_bytes": 5040, "current_count": 5, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 10000, + "total_bytes": 10080, "total_count": 10, } @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2000, + "current_bytes": 2016, "current_count": 2, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7000, + "current_bytes": 7056, "current_count": 7, - 
"peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5000 + assert mr.get_allocated_bytes() == 5040 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2000 - assert mr.get_allocated_bytes() == 7000 + assert mr2.get_allocated_bytes() == 2016 + assert mr.get_allocated_bytes() == 7056 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index d4c34385e..c414c86c4 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,6 +57,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -73,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -84,30 +85,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); 
EXPECT_EQ(this->stream, buff.stream()); } @@ -119,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -135,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -148,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -174,8 +173,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -187,7 +186,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -217,8 +216,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -243,7 +242,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ 
-275,7 +273,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -291,7 +288,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -309,7 +306,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -319,7 +315,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -338,7 +334,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -347,7 +342,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -366,7 +361,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -375,7 +369,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -394,7 +388,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -403,7 +396,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..3c042a437 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp 
@@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 559a6744b3e9638705dd5bfa9a44c96823f4f553 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 08:00:05 +0000 Subject: [PATCH 20/30] Add a check that we do not store unnecessary data inside a `async_resource_ref ` While we are at it also properly test that a `device_memory_resource` is a `async_resource` --- include/rmm/mr/device/device_memory_resource.hpp | 5 ++++- include/rmm/mr/host/host_memory_resource.hpp | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index ab03a24cd..710db8eac 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -390,6 +390,9 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; -static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); +static_assert( + sizeof(cuda::mr::async_resource_ref) == + 2 * sizeof(void*)); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index ce870287c..e6403ad2b 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -196,6 +196,8 @@ class host_memory_resource { } }; static_assert(cuda::mr::resource_with); +static_assert(sizeof(cuda::mr::resource_ref) == + 2 * sizeof(void*)); /** @} */ // end of group } // namespace rmm::mr From c1f36d8fa015ba71e2e2934fe242ab0dc8db2fc6 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 08:03:58 +0000 Subject: [PATCH 21/30] Properly test that a `device_memory_resource` is an `async_resource` --- include/rmm/mr/device/device_memory_resource.hpp | 3 --- include/rmm/mr/host/host_memory_resource.hpp | 2 -- 2 files changed, 5 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 710db8eac..b5c1fbbb9 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -391,8 +391,5 @@ class device_memory_resource { cuda_stream_view stream) const = 0; }; static_assert(cuda::mr::async_resource_with); -static_assert( - sizeof(cuda::mr::async_resource_ref) == - 2 * sizeof(void*)); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index e6403ad2b..ce870287c 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -196,8 +196,6 @@ class host_memory_resource { } }; static_assert(cuda::mr::resource_with); -static_assert(sizeof(cuda::mr::resource_ref) == - 2 * sizeof(void*)); /** @} */ // end 
of group } // namespace rmm::mr

From 1d422b5a902c4f1e644ca286058df7207848199a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Tue, 7 Nov 2023 08:14:45 +0000
Subject: [PATCH 22/30] Also port `rmm::mr::thrust_allocator` to `async_resource_ref`
---
.../mr/device/thrust_allocator_adaptor.hpp | 26 +++++++++++++------
tests/mr/device/thrust_allocator_tests.cu | 4 ++-
2 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
index a1386a842..562a0d79e 100644
--- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp
+++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -24,6 +24,8 @@
#include
#include
+#include
+
namespace rmm::mr {
/**
* @addtogroup device_resource_adaptors
@@ -42,6 +44,8 @@ namespace rmm::mr {
*/
template
class thrust_allocator : public thrust::device_malloc_allocator {
+ using async_resource_ref = cuda::mr::async_resource_ref;
+
public:
using Base = thrust::device_malloc_allocator; ///< The base type of this allocator
using pointer = typename Base::pointer; ///< The pointer type
@@ -79,9 +83,7 @@ class thrust_allocator : public thrust::device_malloc_allocator {
* @param mr The resource to be used for device memory allocation
* @param stream The stream to be used for device memory (de)allocation
*/
- thrust_allocator(cuda_stream_view stream, device_memory_resource* mr) : _stream{stream}, _mr(mr)
- {
- }
+ thrust_allocator(cuda_stream_view stream, async_resource_ref mr) : _stream{stream}, _mr(mr) {}
/**
* @brief Copy constructor. Copies the resource pointer and stream.
@@ -102,7 +104,8 @@ class thrust_allocator : public thrust::device_malloc_allocator {
*/
pointer allocate(size_type num)
{
- return thrust::device_pointer_cast(static_cast(_mr->allocate(num * sizeof(T), _stream)));
+ return thrust::device_pointer_cast(
+ static_cast(_mr.allocate_async(num * sizeof(T), _stream)));
}
/**
@@ -114,22 +117,29 @@ class thrust_allocator : public thrust::device_malloc_allocator {
*/
void deallocate(pointer ptr, size_type num)
{
- return _mr->deallocate(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
+ return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
}
/**
- * @briefreturn{The device memory resource used by this}
+ * @briefreturn{The async_resource_ref used to allocate and deallocate}
*/
- [[nodiscard]] device_memory_resource* resource() const noexcept { return _mr; }
+ [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; }
/**
* @briefreturn{The stream used by this allocator}
*/
[[nodiscard]] cuda_stream_view stream() const noexcept { return _stream; }
+ /**
+ * @brief Enables the `cuda::mr::device_accessible` property
+ *
+ * This property declares that a `thrust_allocator` provides device accessible memory
+ */
+ friend void get_property(thrust_allocator const&, cuda::mr::device_accessible) noexcept {}
+
private:
cuda_stream_view _stream{};
- device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
+ async_resource_ref _mr{rmm::mr::get_current_device_resource()};
};
/** @} */ // end of group
} // namespace rmm::mr
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index 41fb15973..ed8875cbe 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -32,6 +32,7 @@ namespace rmm::test {
namespace {
struct allocator_test : public mr_test {};
+using async_resource_ref =
cuda::mr::async_resource_ref; TEST_P(allocator_test, first) { @@ -44,7 +45,8 @@ TEST_P(allocator_test, defaults) { rmm::mr::thrust_allocator allocator(rmm::cuda_stream_default); EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default); - EXPECT_EQ(allocator.resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(allocator.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); } INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests, From e867442eacf7b281ded7fcf120dbf59c335c0128 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 10:30:32 +0000 Subject: [PATCH 23/30] Do not put `rmm::bad_alloc` in quotes Also `Nothing` seems to be an invalid type --- include/rmm/mr/device/device_memory_resource.hpp | 12 +++--------- include/rmm/mr/host/pinned_memory_resource.hpp | 6 ++---- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index b5c1fbbb9..9c29a2ad7 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -162,7 +162,7 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -182,8 +182,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. @@ -199,7 +197,7 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -217,7 +215,7 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -237,8 +235,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. @@ -261,8 +257,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. 
diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp index 8ea259263..ad650800c 100644 --- a/include/rmm/mr/host/pinned_memory_resource.hpp +++ b/include/rmm/mr/host/pinned_memory_resource.hpp @@ -78,7 +78,7 @@ class pinned_memory_resource final : public host_memory_resource { /** * @brief Pretent to support the allocate_async interface, falling back to stream 0 * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -93,7 +93,7 @@ class pinned_memory_resource final : public host_memory_resource { /** * @brief Pretent to support the allocate_async interface, falling back to stream 0 * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -107,8 +107,6 @@ class pinned_memory_resource final : public host_memory_resource { /** * @brief Pretent to support the deallocate_async interface, falling back to stream 0 * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. From 73efc2e5ef1c76199ddb6f5f641615a26f44ee04 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 11:10:11 +0000 Subject: [PATCH 24/30] Try to avoid namespaces in comments --- include/rmm/cuda_stream_view.hpp | 4 ++-- include/rmm/device_uvector.hpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index 082275ea9..f8564b16b 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -61,7 +61,7 @@ class cuda_stream_view { constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} /** - * @brief Implicit conversion from cuda::stream_ref. + * @brief Implicit conversion from stream_ref. * * @param stream The underlying stream for this view */ @@ -84,7 +84,7 @@ class cuda_stream_view { /** * @brief Implicit conversion to stream_ref. 
*
- * @return cuda::stream_ref The underlying stream referenced by this cuda_stream_view
+ * @return stream_ref The underlying stream referenced by this cuda_stream_view
*/
constexpr operator cuda::stream_ref() const noexcept { return value(); }
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp
index 022d9dc02..3bd6a1c25 100644
--- a/include/rmm/device_uvector.hpp
+++ b/include/rmm/device_uvector.hpp
@@ -527,8 +527,7 @@ class device_uvector {
/**
* @briefreturn{The async_resource_ref used to allocate and deallocate the device storage}
*/
- [[nodiscard]] cuda::mr::async_resource_ref memory_resource()
- const noexcept
+ [[nodiscard]] async_resource_ref memory_resource() const noexcept
{
return _storage.memory_resource();
}

From c298fbc937fd8c2656b36ebce4315e082ea12ec7 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Tue, 7 Nov 2023 18:11:51 +0000
Subject: [PATCH 25/30] Tell sphinx to skip documenting libcu++ names
---
python/docs/conf.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/python/docs/conf.py b/python/docs/conf.py
index a063b52eb..ec6ddc70a 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -216,6 +216,18 @@ def on_missing_reference(app, env, node, contnode):
"cudaStreamPerThread",
"thrust",
"spdlog",
+ "stream_ref",
+ # libcu++ names
+ "cuda",
+ "cuda::mr",
+ "resource",
+ "resource_ref",
+ "async_resource",
+ "async_resource_ref",
+ "device_accessible",
+ "host_accessible",
+ "forward_property",
+ "enable_if_t",
# Unknown types
"int64_t",
"int8_t",

From 80201bd11bf7e1a681a8296ba22dddf4d30a8282 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Wed, 15 Nov 2023 17:59:14 -0600
Subject: [PATCH 26/30] Update copyright years.
---
cmake/thirdparty/get_libcudacxx.cmake | 2 +-
tests/mr/device/mr_ref_multithreaded_tests.cpp | 2 +-
tests/mr/device/mr_ref_test.hpp | 2 +-
tests/mr/device/mr_ref_tests.cpp | 2 +-
tests/mr/host/mr_ref_tests.cpp | 2 +-
tests/mr/host/pinned_pool_mr_tests.cpp | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake
index b1f990d39..14b0d492f 100644
--- a/cmake/thirdparty/get_libcudacxx.cmake
+++ b/cmake/thirdparty/get_libcudacxx.cmake
@@ -1,5 +1,5 @@
# =============================================================================
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index dd39d1e1d..0dea4f503 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp
index ecb13c2a7..126a4fde8 100644
--- a/tests/mr/device/mr_ref_test.hpp
+++ b/tests/mr/device/mr_ref_test.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp index 2507d468a..a9a94696a 100644 --- a/tests/mr/device/mr_ref_tests.cpp +++ b/tests/mr/device/mr_ref_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 1439ebabe..9f12159de 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp index 8e202ada3..dcdae37fa 100644 --- a/tests/mr/host/pinned_pool_mr_tests.cpp +++ b/tests/mr/host/pinned_pool_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 5937e3ea172b680418c924c146c2474a05d8079d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Nov 2023 17:59:30 -0600 Subject: [PATCH 27/30] Fix typos and grammar. --- include/rmm/mr/device/device_memory_resource.hpp | 8 ++++---- include/rmm/mr/device/pool_memory_resource.hpp | 4 ++-- include/rmm/mr/host/pinned_memory_resource.hpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 9c29a2ad7..63e5f39a4 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -121,7 +121,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -177,7 +177,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -230,7 +230,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -252,7 +252,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. 
* - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index face43993..c85408359 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -56,7 +56,7 @@ namespace detail { * @brief A helper class to remove the device_accessible property * * We want to be able to use the pool_memory_resource with an upstream that may not - * be device accessible. To avoid rewriting the world, we allow to conditionally remove + * be device accessible. To avoid rewriting the world, we allow conditionally removing * the cuda::mr::device_accessible property. * * @tparam PoolResource the pool_memory_resource class @@ -67,7 +67,7 @@ template struct maybe_remove_property {}; /** - * @brief Specialization of maybe_remove_property to not propagate non existing properties + * @brief Specialization of maybe_remove_property to not propagate nonexistent properties */ template struct maybe_remove_property Date: Wed, 15 Nov 2023 17:59:49 -0600 Subject: [PATCH 28/30] Drop support for CUDA older than CUDA 11 in tests. --- tests/mr/device/mr_ref_test.hpp | 4 ---- tests/mr/device/mr_test.hpp | 4 ---- tests/mr/host/mr_ref_tests.cpp | 4 ---- tests/mr/host/mr_tests.cpp | 4 ---- 4 files changed, 16 deletions(-) diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp index 126a4fde8..804c710a5 100644 --- a/tests/mr/device/mr_ref_test.hpp +++ b/tests/mr/device/mr_ref_test.hpp @@ -57,11 +57,7 @@ inline bool is_device_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif } enum size_in_bytes : size_t {}; diff --git a/tests/mr/device/mr_test.hpp b/tests/mr/device/mr_test.hpp index 8c69df215..03f880e72 100644 --- a/tests/mr/device/mr_test.hpp +++ b/tests/mr/device/mr_test.hpp @@ -52,11 +52,7 @@ inline bool is_device_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif } enum size_in_bytes : size_t {}; diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 9f12159de..6563eb635 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -43,11 +43,7 @@ inline bool is_device_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif } /** diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp 
index 1b95bea5d..678d6aeb8 100644
--- a/tests/mr/host/mr_tests.cpp
+++ b/tests/mr/host/mr_tests.cpp
@@ -43,11 +43,7 @@ inline bool is_device_memory(void* ptr)
{
cudaPointerAttributes attributes{};
if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
-#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10
- return attributes.memoryType == cudaMemoryTypeDevice;
-#else
return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
-#endif
}
/**

From af127339220e7e2999900366c29efa05a52e3b6a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Thu, 16 Nov 2023 07:48:18 +0000
Subject: [PATCH 29/30] Do not propagate the `device_accessible` property for containers
We currently have no design on how we want to propagate the properties of the memory_resource, so drop this for now until we have a proper design ready
---
include/rmm/device_buffer.hpp | 7 -------
include/rmm/device_uvector.hpp | 7 -------
2 files changed, 14 deletions(-)
diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp
index c94736d2d..4a780018e 100644
--- a/include/rmm/device_buffer.hpp
+++ b/include/rmm/device_buffer.hpp
@@ -414,13 +414,6 @@ class device_buffer {
*/
[[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; }
- /**
- * @brief Enables the `cuda::mr::device_accessible` property
- *
- * This property declares that a `device_buffer` provides device accessible memory
- */
- friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {}
-
private:
void* _data{nullptr}; ///< Pointer to device memory allocation
std::size_t _size{}; ///< Requested size of the device memory allocation
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp
index 3bd6a1c25..3f77f59f7 100644
--- a/include/rmm/device_uvector.hpp
+++ b/include/rmm/device_uvector.hpp
@@ -532,13 +532,6 @@ class device_uvector {
return _storage.memory_resource();
}
- /**
- * @brief Enables the `cuda::mr::device_accessible` property
- *
- * This property declares that a `device_uvector` provides device accessible memory
- */
- friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {}
-
/**
* @briefreturn{Stream most recently specified for allocation/deallocation}
*/

From 37c1e3f70a032b6d9af1c32bd16226de8c205ec1 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Thu, 16 Nov 2023 21:59:41 +0000
Subject: [PATCH 30/30] Fix deadlock in new tests using fix from #1097
---
.../mr/device/mr_ref_multithreaded_tests.cpp | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index dd39d1e1d..5e5bf0c6b 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -122,6 +122,7 @@ void allocate_async_loop(async_resource_ref ref,
std::size_t num_allocations,
std::list& allocations,
std::mutex& mtx,
+ std::condition_variable& allocations_ready,
cudaEvent_t& event,
rmm::cuda_stream_view stream)
{
@@ -138,25 +139,32 @@ void allocate_async_loop(async_resource_ref ref,
RMM_CUDA_TRY(cudaEventRecord(event, stream.value()));
allocations.emplace_back(ptr, size);
}
+ allocations_ready.notify_one();
}
+
+ // Work around for threads going away before cudaEvent has finished async processing
+ cudaEventSynchronize(event);
}
void deallocate_async_loop(async_resource_ref ref,
std::size_t num_allocations, std::list& allocations, std::mutex& mtx, + std::condition_variable& allocations_ready, cudaEvent_t& event, rmm::cuda_stream_view stream) { - for (std::size_t i = 0; i < num_allocations;) { - std::lock_guard lock(mtx); - if (allocations.empty()) { continue; } - i++; + for (std::size_t i = 0; i < num_allocations; i++) { + std::unique_lock lock(mtx); + allocations_ready.wait(lock, [&allocations] { return !allocations.empty(); }); RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); allocation alloc = allocations.front(); allocations.pop_front(); ref.deallocate_async(alloc.ptr, alloc.size, stream); } + + // Work around for threads going away before cudaEvent has finished async processing + cudaEventSynchronize(event); } void test_allocate_async_free_different_threads(async_resource_ref ref, @@ -166,6 +174,7 @@ void test_allocate_async_free_different_threads(async_resource_ref ref, constexpr std::size_t num_allocations{100}; std::mutex mtx; + std::condition_variable allocations_ready; std::list allocations; cudaEvent_t event; @@ -176,6 +185,7 @@ void test_allocate_async_free_different_threads(async_resource_ref ref, num_allocations, std::ref(allocations), std::ref(mtx), + std::ref(allocations_ready), std::ref(event), streamA); @@ -184,6 +194,7 @@ void test_allocate_async_free_different_threads(async_resource_ref ref, num_allocations, std::ref(allocations), std::ref(mtx), + std::ref(allocations_ready), std::ref(event), streamB);