diff --git a/README.md b/README.md index e90398c2b..1250d094b 100644 --- a/README.md +++ b/README.md @@ -771,8 +771,8 @@ of 1 GiB and a maximum size of 4 GiB. The pool uses >>> import rmm >>> pool = rmm.mr.PoolMemoryResource( ... rmm.mr.CudaMemoryResource(), -... initial_pool_size=2**30, -... maximum_pool_size=2**32 +... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30 +... maximum_pool_size="4GiB" ... ) >>> rmm.mr.set_current_device_resource(pool) ``` diff --git a/benchmarks/device_uvector/device_uvector_bench.cu b/benchmarks/device_uvector/device_uvector_bench.cu index 36c9183f9..0eddb1d92 100644 --- a/benchmarks/device_uvector/device_uvector_bench.cu +++ b/benchmarks/device_uvector/device_uvector_bench.cu @@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_uvector vec(state.range(0), rmm::cuda_stream_view{}); @@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state) state.SetItemsProcessed(static_cast(state.iterations())); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK(BM_UvectorSizeConstruction) @@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) rmm::mr::cuda_memory_resource cuda_mr{}; rmm::mr::pool_memory_resource mr{ &cuda_mr, rmm::percent_of_free_device_memory(50)}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores) rmm::device_vector vec(state.range(0)); @@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state) state.SetItemsProcessed(static_cast(state.iterations())); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK(BM_ThrustVectorSizeConstruction) @@ -140,7 +140,7 @@ template void BM_VectorWorkflow(benchmark::State& state) { rmm::mr::cuda_async_memory_resource cuda_async_mr{}; - rmm::mr::set_current_device_resource(&cuda_async_mr); + rmm::mr::set_current_device_resource_ref(cuda_async_mr); rmm::cuda_stream input_stream; std::vector streams(4); @@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state) auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses; state.SetBytesProcessed(static_cast(state.iterations() * bytes)); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT @@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT ->Unit(benchmark::kMicrosecond) ->UseManualTime(); -// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()` -// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use -// `cuda_async_memory_resource`, which is faster. +// The only difference here is that `rmm::device_vector` uses +// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses +// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster. 
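Taken together, the benchmark hunks above migrate from the pointer-based setter to the new reference-based API. A minimal sketch of the new pattern (illustrative only; assumes a CUDA-capable device, and reuses the pool-sizing helper from the benchmarks):

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource pool_mr{&cuda_mr, rmm::percent_of_free_device_memory(50)};

  // New style: pass the resource itself, not a pointer
  rmm::mr::set_current_device_resource_ref(pool_mr);
  {
    rmm::device_uvector<int> vec(100, rmm::cuda_stream_view{});  // draws from pool_mr
  }
  // New style: restore the initial resource instead of set_current_device_resource(nullptr)
  rmm::mr::reset_current_device_resource_ref();
  return 0;
}
```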
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT ->RangeMultiplier(10) // NOLINT ->Range(100'000, 100'000'000) // NOLINT diff --git a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu index b73ef54f8..86e761c80 100644 --- a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu +++ b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu @@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con { auto mr = factory(); - rmm::mr::set_current_device_resource(mr.get()); + rmm::mr::set_current_device_resource_ref(mr.get()); auto num_streams = state.range(0); auto num_kernels = state.range(1); @@ -92,7 +92,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_kernels)); - rmm::mr::set_current_device_resource(nullptr); + rmm::mr::reset_current_device_resource_ref(); } inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); } diff --git a/doxygen/Doxyfile b/doxygen/Doxyfile index 7e13fa95b..dbd4194ab 100644 --- a/doxygen/Doxyfile +++ b/doxygen/Doxyfile @@ -2176,7 +2176,11 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = RMM_NAMESPACE=rmm RMM_EXPORT +# These need to be kept in sync with those set in rmm/detail/export.hpp +# Since we are excluding detail files in EXCLUDE_PATTERNS there +# appears to be no way of getting doxygen to still parse that file and +# make the definitions available via the preprocessor :( +PREDEFINED = RMM_EXPORT RMM_HIDDEN RMM_NAMESPACE=rmm # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2185,7 +2189,7 @@ PREDEFINED = RMM_NAMESPACE=rmm RMM_EXPORT # definition found in the source code. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_AS_DEFINED = RMM_NAMESPACE +EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will # remove all references to function-like macros that are alone on a line, have diff --git a/include/rmm/detail/export.hpp b/include/rmm/detail/export.hpp index 90fa8d173..9587a6009 100644 --- a/include/rmm/detail/export.hpp +++ b/include/rmm/detail/export.hpp @@ -24,4 +24,5 @@ #else #define RMM_EXPORT #define RMM_HIDDEN +#define RMM_NAMESPACE rmm #endif diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index e6664e0f6..3ddd37415 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -41,7 +41,7 @@ namespace RMM_NAMESPACE { * * This class allocates untyped and *uninitialized* device memory using a * `device_async_resource_ref`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -95,7 +95,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function.
This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -109,7 +109,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -138,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { cuda_set_device_raii dev{_device}; @@ -169,7 +169,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -419,8 +419,8 @@ class device_buffer { cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation rmm::device_async_resource_ref _mr{ - rmm::mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory cuda_device_id _device{get_current_cuda_device()}; /** diff --git a/include/rmm/device_scalar.hpp b/include/rmm/device_scalar.hpp index 95388eca9..62b004e2f 100644 --- a/include/rmm/device_scalar.hpp +++ b/include/rmm/device_scalar.hpp @@ -95,7 +95,7 @@ class device_scalar { * @param mr Optional, resource with which to allocate. 
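 * A usage sketch (illustrative only; not from the original header):
 * @code{.cpp}
 * rmm::cuda_stream stream{};
 * // Uninitialized device storage for one int, allocated from the current device resource
 * rmm::device_scalar<int> scalar{stream.view()};
 * @endcode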
*/ explicit device_scalar(cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{1, stream, mr} { } @@ -118,7 +118,7 @@ class device_scalar { */ explicit device_scalar(value_type const& initial_value, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{1, stream, mr} { set_value_async(initial_value, stream); @@ -138,7 +138,7 @@ class device_scalar { */ device_scalar(device_scalar const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index e1610a73a..13f566150 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -48,7 +48,7 @@ namespace RMM_NAMESPACE { * * Example: * @code{.cpp} - * rmm::mr::device_memory_resource * mr = new my_custom_resource(); + * auto mr = new my_custom_resource(); * rmm::cuda_stream_view s{}; * * // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the @@ -126,7 +126,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -142,7 +142,7 @@ class device_uvector { */ explicit device_uvector(device_uvector const& other, cuda_stream_view stream, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } diff --git a/include/rmm/exec_policy.hpp b/include/rmm/exec_policy.hpp index 1c9a07abd..019a8245a 100644 --- a/include/rmm/exec_policy.hpp +++ b/include/rmm/exec_policy.hpp @@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t { * @param mr The resource to use for allocating temporary memory */ explicit exec_policy(cuda_stream_view stream = cuda_stream_default, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : thrust_exec_policy_t( thrust::cuda::par(mr::thrust_allocator(stream, mr)).on(stream.value())) { @@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t = class exec_policy_nosync : public thrust_exec_policy_nosync_t { public: explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default, - device_async_resource_ref mr = mr::get_current_device_resource()) + device_async_resource_ref mr = mr::get_current_device_resource_ref()) : thrust_exec_policy_nosync_t( thrust::cuda::par_nosync(mr::thrust_allocator(stream, mr)).on(stream.value())) { diff --git a/include/rmm/logger.hpp b/include/rmm/logger.hpp index e60501d2b..326385f16 100644 --- a/include/rmm/logger.hpp +++ b/include/rmm/logger.hpp @@ -107,7 +107,7 @@ struct bytes { * * @return spdlog::logger& The logger. 
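 * An illustrative call (assumes spdlog's standard logger interface):
 * @code{.cpp}
 * rmm::logger().info("memory pool created");
 * @endcode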
*/ -inline spdlog::logger& logger() +RMM_EXPORT inline spdlog::logger& logger() { static detail::logger_wrapper wrapped{}; return wrapped.logger_; diff --git a/include/rmm/mr/device/aligned_resource_adaptor.hpp b/include/rmm/mr/device/aligned_resource_adaptor.hpp index 85eddb427..4df2c4d2d 100644 --- a/include/rmm/mr/device/aligned_resource_adaptor.hpp +++ b/include/rmm/mr/device/aligned_resource_adaptor.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -59,7 +60,6 @@ class aligned_resource_adaptor final : public device_memory_resource { /** * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. * - * @throws rmm::logic_error if `upstream == nullptr` * @throws rmm::logic_error if `allocation_alignment` is not a power of 2 * * @param upstream The resource used for allocating/deallocating device memory. @@ -67,12 +67,33 @@ class aligned_resource_adaptor final : public device_memory_resource { * @param alignment_threshold Only allocations with a size larger than or equal to this threshold * are aligned. */ - explicit aligned_resource_adaptor(Upstream* upstream, + explicit aligned_resource_adaptor(device_async_resource_ref upstream, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, std::size_t alignment_threshold = default_alignment_threshold) : upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + RMM_EXPECTS(rmm::is_supported_alignment(alignment), + "Allocation alignment is not a power of 2."); + } + + /** + * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests. + * + * @throws rmm::logic_error if `upstream == nullptr` + * @throws rmm::logic_error if `alignment` is not a power of 2 + * + * @param upstream The resource used for allocating/deallocating device memory. + * @param alignment The size used for allocation alignment. + * @param alignment_threshold Only allocations with a size larger than or equal to this threshold + * are aligned. + */ + explicit aligned_resource_adaptor(Upstream* upstream, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT, + std::size_t alignment_threshold = default_alignment_threshold) + : upstream_{to_device_async_resource_ref_checked(upstream)}, + alignment_{alignment}, + alignment_threshold_{alignment_threshold} + { RMM_EXPECTS(rmm::is_supported_alignment(alignment), "Allocation alignment is not a power of 2."); } @@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief The default alignment used by the adaptor. */ @@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource { using lock_guard = std::lock_guard; /** - * @brief Allocates memory of size at least `bytes` using the upstream resource with the specified - * alignment. + * @brief Allocates memory of size at least `bytes` using the upstream resource with the + * specified alignment. * * @throws rmm::bad_alloc if the requested allocation could not be fulfilled * by the upstream resource. 
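 * For context, constructing the adaptor might look like this (an illustrative
 * sketch; the sizes are assumptions, not from the original header):
 * @code{.cpp}
 * rmm::mr::cuda_memory_resource cuda_mr{};
 * // Align allocations of 64 KiB or more to 4 KiB boundaries
 * rmm::mr::aligned_resource_adaptor<rmm::mr::cuda_memory_resource> mr{&cuda_mr, 4096, 65536};
 * @endcode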
@@ -119,10 +135,10 @@ class aligned_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, 1, stream); } auto const size = upstream_allocation_size(bytes); - void* pointer = upstream_->allocate(size, stream); + void* pointer = get_upstream_resource().allocate_async(size, 1, stream); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const address = reinterpret_cast(pointer); auto const aligned_address = rmm::align_up(address, alignment_); @@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, 1, stream); } else { { lock_guard lock(mtx_); @@ -155,7 +171,7 @@ class aligned_resource_adaptor final : public device_memory_resource { pointers_.erase(iter); } } - upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream); + get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream); } } @@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource { } /** - * @brief Calculate the allocation size needed from upstream to account for alignments of both the - * size and the base pointer. + * @brief Calculate the allocation size needed from upstream to account for alignments of both + * the size and the base pointer. * * @param bytes The requested allocation size. * @return Allocation size needed from upstream to align both the size and the base pointer. @@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource { return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT; } - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + /// The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; std::unordered_map pointers_; ///< Map of aligned pointers to upstream pointers. std::size_t alignment_; ///< The size used for allocation alignment std::size_t alignment_threshold_; ///< The size above which allocations should be aligned diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 388182e6a..417b7d2b4 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -82,6 +83,26 @@ namespace mr { template class arena_memory_resource final : public device_memory_resource { public: + /** + * @brief Construct an `arena_memory_resource`. + * + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available + * memory on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. 
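 * A construction sketch (illustrative size; not part of the original documentation):
 * @code{.cpp}
 * rmm::mr::cuda_memory_resource cuda_mr{};
 * // 1 GiB global arena; pass std::nullopt to default to half of free device memory
 * rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr, 1ULL << 30};
 * @endcode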
+ */ + explicit arena_memory_resource(device_async_resource_ref upstream_mr, + std::optional arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + { + if (dump_log_on_failure_) { + logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output. + logger_->set_level(spdlog::level::info); + } + } + /** * @brief Construct an `arena_memory_resource`. * @@ -95,7 +116,8 @@ class arena_memory_resource final : public device_memory_resource { explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, bool dump_log_on_failure = false) - : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} + : global_arena_{to_device_async_resource_ref_checked(upstream_mr), arena_size}, + dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); @@ -113,8 +135,8 @@ class arena_memory_resource final : public device_memory_resource { arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete; private: - using global_arena = rmm::mr::detail::arena::global_arena; - using arena = rmm::mr::detail::arena::arena; + using global_arena = rmm::mr::detail::arena::global_arena; + using arena = rmm::mr::detail::arena::arena; /** * @brief Allocates memory of size at least `bytes`. @@ -274,7 +296,7 @@ class arena_memory_resource final : public device_memory_resource { std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); - thread_local detail::arena::arena_cleaner cleaner{thread_arena}; + thread_local detail::arena::arena_cleaner cleaner{thread_arena}; return *thread_arena; } } diff --git a/include/rmm/mr/device/binning_memory_resource.hpp b/include/rmm/mr/device/binning_memory_resource.hpp index 773035231..a5ef64665 100644 --- a/include/rmm/mr/device/binning_memory_resource.hpp +++ b/include/rmm/mr/device/binning_memory_resource.hpp @@ -27,6 +27,7 @@ #include #include #include +#include #include namespace RMM_NAMESPACE { @@ -52,16 +53,48 @@ class binning_memory_resource final : public device_memory_resource { * Initially has no bins, so simply uses the upstream_resource until bin resources are added * with `add_bin`. * - * @throws rmm::logic_error if size_base is not a power of two. + * @param upstream_resource The upstream memory resource used to allocate bin pools. + */ + explicit binning_memory_resource(device_async_resource_ref upstream_resource) + : upstream_mr_{upstream_resource} + { + } + + /** + * @brief Construct a new binning memory resource object. + * + * Initially has no bins, so simply uses the upstream_resource until bin resources are added + * with `add_bin`. + * + * @throws rmm::logic_error if upstream_resource is nullptr * * @param upstream_resource The upstream memory resource used to allocate bin pools. */ explicit binning_memory_resource(Upstream* upstream_resource) - : upstream_mr_{[upstream_resource]() { - RMM_EXPECTS(nullptr != upstream_resource, "Unexpected null upstream pointer."); - return upstream_resource; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_resource)} + { + } + + /** + * @brief Construct a new binning memory resource object with a range of initial bins. 
+ * + * Constructs a new binning memory resource and adds bins backed by `fixed_size_memory_resource` + * in the range [2^min_size_exponent, 2^max_size_exponent]. For example if `min_size_exponent==18` + * and `max_size_exponent==22`, creates bins of sizes 256KiB, 512KiB, 1024KiB, 2048KiB and + * 4096KiB. + * + * @param upstream_resource The upstream memory resource used to allocate bin pools. + * @param min_size_exponent The minimum base-2 exponent bin size. + * @param max_size_exponent The maximum base-2 exponent bin size. + */ + binning_memory_resource(device_async_resource_ref upstream_resource, + int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) + int8_t max_size_exponent) + : upstream_mr_{upstream_resource} { + for (auto i = min_size_exponent; i <= max_size_exponent; i++) { + add_bin(1 << i); + } } /** @@ -72,6 +105,8 @@ class binning_memory_resource final : public device_memory_resource { * and `max_size_exponent==22`, creates bins of sizes 256KiB, 512KiB, 1024KiB, 2048KiB and * 4096KiB. * + * @throws rmm::logic_error if upstream_resource is nullptr + * * @param upstream_resource The upstream memory resource used to allocate bin pools. * @param min_size_exponent The minimum base-2 exponent bin size. * @param max_size_exponent The maximum base-2 exponent bin size. @@ -79,10 +114,7 @@ class binning_memory_resource final : public device_memory_resource { binning_memory_resource(Upstream* upstream_resource, int8_t min_size_exponent, // NOLINT(bugprone-easily-swappable-parameters) int8_t max_size_exponent) - : upstream_mr_{[upstream_resource]() { - RMM_EXPECTS(nullptr != upstream_resource, "Unexpected null upstream pointer."); - return upstream_resource; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_resource)} { for (auto i = min_size_exponent; i <= max_size_exponent; i++) { add_bin(1 << i); @@ -102,23 +134,17 @@ class binning_memory_resource final : public device_memory_resource { binning_memory_resource& operator=(binning_memory_resource&&) = delete; /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Add a bin allocator to this resource * - * Adds `bin_resource` if it is not null; otherwise constructs and adds a - * fixed_size_memory_resource. + * Adds `bin_resource` if provided; otherwise constructs and adds a fixed_size_memory_resource. * * This bin will be used for any allocation smaller than `allocation_size` that is larger than * the next smaller bin's allocation size. 
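 * An illustrative sketch of the bin layout described above (names assumed):
 * @code{.cpp}
 * rmm::mr::cuda_memory_resource cuda_mr{};
 * // Fixed-size bins for 256 KiB (2^18) through 4 MiB (2^22); larger requests go upstream
 * rmm::mr::binning_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr, 18, 22};
 * // An additional explicit 8 MiB bin
 * mr.add_bin(8ULL << 20);
 * @endcode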
@@ -130,14 +156,14 @@ class binning_memory_resource final : public device_memory_resource { * @param allocation_size The maximum size that this bin allocates * @param bin_resource The memory resource for the bin */ - void add_bin(std::size_t allocation_size, device_memory_resource* bin_resource = nullptr) + void add_bin(std::size_t allocation_size, + std::optional bin_resource = std::nullopt) { - allocation_size = rmm::align_up(allocation_size, rmm::CUDA_ALLOCATION_ALIGNMENT); + allocation_size = align_up(allocation_size, CUDA_ALLOCATION_ALIGNMENT); - if (nullptr != bin_resource) { - resource_bins_.insert({allocation_size, bin_resource}); + if (bin_resource.has_value()) { + resource_bins_.insert({allocation_size, bin_resource.value()}); } else if (resource_bins_.count(allocation_size) == 0) { // do nothing if bin already exists - owned_bin_resources_.push_back( std::make_unique>(upstream_mr_, allocation_size)); resource_bins_.insert({allocation_size, owned_bin_resources_.back().get()}); @@ -153,11 +179,10 @@ class binning_memory_resource final : public device_memory_resource { * @param bytes Requested allocation size in bytes * @return Get the resource reference for the requested size. */ - rmm::device_async_resource_ref get_resource_ref(std::size_t bytes) + device_async_resource_ref get_resource_ref(std::size_t bytes) { auto iter = resource_bins_.lower_bound(bytes); - return (iter != resource_bins_.cend()) ? rmm::device_async_resource_ref{iter->second} - : get_upstream_resource(); + return (iter != resource_bins_.cend()) ? iter->second : get_upstream_resource(); } /** @@ -188,11 +213,12 @@ class binning_memory_resource final : public device_memory_resource { get_resource_ref(bytes).deallocate_async(ptr, bytes, stream); } - Upstream* upstream_mr_; // The upstream memory_resource from which to allocate blocks. + device_async_resource_ref + upstream_mr_; // The upstream memory_resource from which to allocate blocks. std::vector>> owned_bin_resources_; - std::map resource_bins_; + std::map resource_bins_; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c569f7dd6..fa2d8056d 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -86,12 +86,13 @@ class callback_memory_resource final : public device_memory_resource { * It is the caller's responsibility to maintain the lifetime of the pointed-to data * for the duration of the lifetime of the `callback_memory_resource`. 
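 * A minimal pass-through sketch (the callbacks and names are illustrative, not
 * from the original documentation):
 * @code{.cpp}
 * auto alloc_cb = [](std::size_t bytes, rmm::cuda_stream_view stream, void*) {
 *   return rmm::mr::get_current_device_resource_ref().allocate_async(bytes, stream);
 * };
 * auto dealloc_cb = [](void* ptr, std::size_t bytes, rmm::cuda_stream_view stream, void*) {
 *   rmm::mr::get_current_device_resource_ref().deallocate_async(ptr, bytes, stream);
 * };
 * rmm::mr::callback_memory_resource mr{alloc_cb, dealloc_cb};
 * @endcode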
*/ - callback_memory_resource(allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg = nullptr, - void* deallocate_callback_arg = nullptr) noexcept - : allocate_callback_(allocate_callback), - deallocate_callback_(deallocate_callback), + callback_memory_resource( + allocate_callback_t allocate_callback, + deallocate_callback_t deallocate_callback, + void* allocate_callback_arg = nullptr, // NOLINT(bugprone-easily-swappable-parameters) + void* deallocate_callback_arg = nullptr) noexcept + : allocate_callback_(std::move(allocate_callback)), + deallocate_callback_(std::move(deallocate_callback)), allocate_callback_arg_(allocate_callback_arg), deallocate_callback_arg_(deallocate_callback_arg) { diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 3d24cfebf..6f8303c83 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -494,22 +495,18 @@ inline auto max_free_size(std::set const& superblocks) * @tparam Upstream Memory resource to use for allocating the arena. Implements * rmm::mr::device_memory_resource interface. */ -template class global_arena final { public: /** * @brief Construct a global arena. * - * @throws rmm::logic_error if `upstream_mr == nullptr`. - * * @param upstream_mr The memory resource from which to allocate blocks for the pool * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory * on the current device. */ - global_arena(Upstream* upstream_mr, std::optional arena_size) + global_arena(device_async_resource_ref upstream_mr, std::optional arena_size) : upstream_mr_{upstream_mr} { - RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); auto const size = rmm::align_down(arena_size.value_or(default_size()), rmm::CUDA_ALLOCATION_ALIGNMENT); RMM_EXPECTS(size >= superblock::minimum_size, @@ -530,7 +527,7 @@ class global_arena final { ~global_arena() { std::lock_guard lock(mtx_); - upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + upstream_mr_.deallocate(upstream_block_.pointer(), upstream_block_.size()); } /** @@ -539,7 +536,7 @@ class global_arena final { * @param size The size in bytes of the allocation. * @return bool True if the allocation should be handled by the global arena. */ - bool handles(std::size_t size) const { return size > superblock::minimum_size; } + static bool handles(std::size_t size) { return size > superblock::minimum_size; } /** * @brief Acquire a superblock that can fit a block of the given size. @@ -610,7 +607,7 @@ class global_arena final { * @param stream Stream on which to perform deallocation. * @return bool true if the allocation is found, false otherwise. */ - bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + bool deallocate_async(void* ptr, std::size_t size, cuda_stream_view stream) { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); @@ -692,7 +689,7 @@ class global_arena final { * @brief Default size of the global arena if unspecified. * @return the default global arena size. 
*/ - constexpr std::size_t default_size() const + static std::size_t default_size() { auto const [free, total] = rmm::available_device_memory(); return free / 2; } @@ -705,7 +702,7 @@ */ void initialize(std::size_t size) { - upstream_block_ = {upstream_mr_->allocate(size), size}; + upstream_block_ = {upstream_mr_.allocate(size), size}; superblocks_.emplace(upstream_block_.pointer(), size); } @@ -777,7 +774,7 @@ } /// The upstream resource to allocate memory from. - Upstream* upstream_mr_; + device_async_resource_ref upstream_mr_; /// Block allocated from upstream so that it can be quickly freed. block upstream_block_; /// Address-ordered set of superblocks. @@ -795,7 +792,6 @@ * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template <typename Upstream> class arena { public: /** * @brief Construct an `arena`. * * @param global_arena The global arena from which to allocate superblocks. */ - explicit arena(global_arena<Upstream>& global_arena) : global_arena_{global_arena} {} + explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} // Disable copy (and move) semantics. arena(arena const&) = delete; @@ -837,7 +833,9 @@ */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + if (global_arena::handles(size) && global_arena_.deallocate_async(ptr, size, stream)) { + return true; + } return deallocate(ptr, size); } @@ -961,7 +959,7 @@ } /// The global arena to allocate superblocks from. - global_arena<Upstream>& global_arena_; + global_arena& global_arena_; /// Acquired superblocks. std::set<superblock> superblocks_; /// Mutex for exclusive lock. @@ -976,10 +974,9 @@ * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. */ -template <typename Upstream> class arena_cleaner { public: - explicit arena_cleaner(std::shared_ptr<arena<Upstream>> const& arena) : arena_(arena) {} + explicit arena_cleaner(std::shared_ptr<arena> const& arena) : arena_(arena) {} // Disable copy (and move) semantics. arena_cleaner(arena_cleaner const&) = delete; @@ -997,7 +994,7 @@ private: /// A non-owning pointer to the arena that may need cleaning.
- std::weak_ptr<arena<Upstream>> arena_; + std::weak_ptr<arena> arena_; }; } // namespace mr::detail::arena diff --git a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp index eeebaac3b..fdb385748 100644 --- a/include/rmm/mr/device/failure_callback_resource_adaptor.hpp +++ b/include/rmm/mr/device/failure_callback_resource_adaptor.hpp @@ -18,6 +18,7 @@ #include #include #include +#include <rmm/resource_ref.hpp> #include #include @@ -81,9 +82,9 @@ using failure_callback_t = std::function<bool(std::size_t, void*)>; * { * bool retried{false}; * failure_callback_adaptor mr{ - * rmm::mr::get_current_device_resource(), failure_handler, &retried + * rmm::mr::get_current_device_resource_ref(), failure_handler, &retried * }; - * rmm::mr::set_current_device_resource(&mr); + * rmm::mr::set_current_device_resource_ref(mr); * } * @endcode * @@ -95,6 +96,21 @@ class failure_callback_resource_adaptor final : public device_memory_resource { public: using exception_type = ExceptionType; ///< The type of exception this object catches/throws + /** + * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource used for allocating/deallocating device memory + * @param callback Callback function @see failure_callback_t + * @param callback_arg Extra argument passed to `callback` + */ + failure_callback_resource_adaptor(device_async_resource_ref upstream, + failure_callback_t callback, + void* callback_arg) + : upstream_{upstream}, callback_{std::move(callback)}, callback_arg_{callback_arg} + { + } + /** * @brief Construct a new `failure_callback_resource_adaptor` using `upstream` to satisfy * allocation requests. * @@ -108,9 +124,10 @@ failure_callback_resource_adaptor(Upstream* upstream, failure_callback_t callback, void* callback_arg) - : upstream_{upstream}, callback_{std::move(callback)}, callback_arg_{callback_arg} + : upstream_{to_device_async_resource_ref_checked(upstream)}, + callback_{std::move(callback)}, + callback_arg_{callback_arg} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } failure_callback_resource_adaptor() = delete; @@ -130,11 +147,6 @@ class failure_callback_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream * @@ -153,7 +165,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { while (true) { try { - ret = upstream_->allocate(bytes, stream); + ret = get_upstream_resource().allocate_async(bytes, stream); break; } catch (exception_type const& e) { if (!callback_(bytes, callback_arg_)) { throw; } @@ -171,7 +183,7 @@ class failure_callback_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -185,11 +197,12 @@ { if (this == &other) { return true; } auto cast = dynamic_cast<failure_callback_resource_adaptor<Upstream, ExceptionType> const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() ==
cast->get_upstream_resource(); } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; failure_callback_t callback_; void* callback_arg_; }; diff --git a/include/rmm/mr/device/fixed_size_memory_resource.hpp b/include/rmm/mr/device/fixed_size_memory_resource.hpp index 84cb3b0c6..249af77dd 100644 --- a/include/rmm/mr/device/fixed_size_memory_resource.hpp +++ b/include/rmm/mr/device/fixed_size_memory_resource.hpp @@ -65,7 +65,31 @@ class fixed_size_memory_resource /** * @brief Construct a new `fixed_size_memory_resource` that allocates memory from - * `upstream_resource`. + * `upstream_mr`. + * + * When the pool of blocks is all allocated, grows the pool by allocating + * `blocks_to_preallocate` more blocks from `upstream_mr`. + * + * @param upstream_mr The device_async_resource_ref from which to allocate blocks for the pool. + * @param block_size The size of blocks to allocate. + * @param blocks_to_preallocate The number of blocks to allocate to initialize the pool. + */ + explicit fixed_size_memory_resource( + device_async_resource_ref upstream_mr, + // NOLINTNEXTLINE bugprone-easily-swappable-parameters + std::size_t block_size = default_block_size, + std::size_t blocks_to_preallocate = default_blocks_to_preallocate) + : upstream_mr_{upstream_mr}, + block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)}, + upstream_chunk_size_{block_size_ * blocks_to_preallocate} + { + // allocate initial blocks and insert into free list + this->insert_blocks(std::move(blocks_from_upstream(cuda_stream_legacy)), cuda_stream_legacy); + } + + /** + * @brief Construct a new `fixed_size_memory_resource` that allocates memory from + * `upstream_mr`. * * When the pool of blocks is all allocated, grows the pool by allocating * `blocks_to_preallocate` more blocks from `upstream_mr`. @@ -76,11 +100,12 @@ class fixed_size_memory_resource */ explicit fixed_size_memory_resource( Upstream* upstream_mr, + // NOLINTNEXTLINE bugprone-easily-swappable-parameters std::size_t block_size = default_block_size, std::size_t blocks_to_preallocate = default_blocks_to_preallocate) - : upstream_mr_{upstream_mr}, - block_size_{rmm::align_up(block_size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, - upstream_chunk_size_{block_size * blocks_to_preallocate} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)}, + block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)}, + upstream_chunk_size_{block_size_ * blocks_to_preallocate} { // allocate initial blocks and insert into free list this->insert_blocks(std::move(blocks_from_upstream(cuda_stream_legacy)), cuda_stream_legacy); @@ -99,18 +124,13 @@ class fixed_size_memory_resource fixed_size_memory_resource& operator=(fixed_size_memory_resource&&) = delete; /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Get the size of blocks allocated by this memory resource. 
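 * For reference, constructing this resource might look like the following
 * (illustrative parameters, not from the original header):
 * @code{.cpp}
 * rmm::mr::cuda_memory_resource cuda_mr{};
 * // 1 MiB blocks, preallocating 128 of them
 * rmm::mr::fixed_size_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr, 1ULL << 20, 128};
 * @endcode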
* @@ -200,7 +220,7 @@ class fixed_size_memory_resource { // Deallocating a fixed-size block just inserts it in the free list, which is // handled by the parent class - RMM_LOGGING_ASSERT(rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT) <= block_size_); + RMM_LOGGING_ASSERT(align_up(size, CUDA_ALLOCATION_ALIGNMENT) <= block_size_); return block_type{ptr}; } @@ -254,10 +274,10 @@ class fixed_size_memory_resource } private: - Upstream* upstream_mr_; // The resource from which to allocate new blocks + device_async_resource_ref upstream_mr_; // The resource from which to allocate new blocks - std::size_t const block_size_; // size of blocks this MR allocates - std::size_t const upstream_chunk_size_; // size of chunks allocated from heap MR + std::size_t block_size_; // size of blocks this MR allocates + std::size_t upstream_chunk_size_; // size of chunks allocated from heap MR // blocks allocated from heap: so they can be easily freed std::vector upstream_blocks_; diff --git a/include/rmm/mr/device/limiting_resource_adaptor.hpp b/include/rmm/mr/device/limiting_resource_adaptor.hpp index c3ef72e09..d19fa3a0a 100644 --- a/include/rmm/mr/device/limiting_resource_adaptor.hpp +++ b/include/rmm/mr/device/limiting_resource_adaptor.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,24 @@ namespace mr { template class limiting_resource_adaptor final : public device_memory_resource { public: + /** + * @brief Construct a new limiting resource adaptor using `upstream` to satisfy + * allocation requests and limiting the total allocation amount possible. + * + * @param upstream The resource used for allocating/deallocating device memory + * @param allocation_limit Maximum memory allowed for this allocator + * @param alignment Alignment in bytes for the start of each allocated buffer + */ + limiting_resource_adaptor(device_async_resource_ref upstream, + std::size_t allocation_limit, + std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) + : upstream_{upstream}, + allocation_limit_{allocation_limit}, + allocated_bytes_(0), + alignment_(alignment) + { + } + /** * @brief Construct a new limiting resource adaptor using `upstream` to satisfy * allocation requests and limiting the total allocation amount possible. 
@@ -57,13 +76,12 @@ class limiting_resource_adaptor final : public device_memory_resource { */ limiting_resource_adaptor(Upstream* upstream, std::size_t allocation_limit, - std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) - : allocation_limit_{allocation_limit}, + std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) + : upstream_{to_device_async_resource_ref_checked(upstream)}, + allocation_limit_{allocation_limit}, allocated_bytes_(0), - alignment_(alignment), - upstream_{upstream} + alignment_(alignment) { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } limiting_resource_adaptor() = delete; @@ -76,18 +94,13 @@ class limiting_resource_adaptor final : public device_memory_resource { default; ///< @default_move_assignment{limiting_resource_adaptor} /** - * @briefreturn{rmm::device_async_resource_ref to the upstream resource} + * @briefreturn{device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Query the number of bytes that have been allocated. Note that * this can not be used to know how large of an allocation is possible due @@ -124,11 +137,11 @@ class limiting_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - auto const proposed_size = rmm::align_up(bytes, alignment_); + auto const proposed_size = align_up(bytes, alignment_); auto const old = allocated_bytes_.fetch_add(proposed_size); if (old + proposed_size <= allocation_limit_) { try { - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } catch (...) { allocated_bytes_ -= proposed_size; throw; @@ -148,8 +161,8 @@ class limiting_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - std::size_t allocated_size = rmm::align_up(bytes, alignment_); - upstream_->deallocate(ptr, bytes, stream); + std::size_t allocated_size = align_up(bytes, alignment_); + get_upstream_resource().deallocate_async(ptr, bytes, stream); allocated_bytes_ -= allocated_size; } @@ -164,10 +177,13 @@ class limiting_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } + // The upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; + // maximum bytes this allocator is allowed to allocate. std::size_t allocation_limit_; @@ -176,9 +192,6 @@ class limiting_resource_adaptor final : public device_memory_resource { // todo: should be some way to ask the upstream... 
std::size_t alignment_; - - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests }; /** diff --git a/include/rmm/mr/device/logging_resource_adaptor.hpp b/include/rmm/mr/device/logging_resource_adaptor.hpp index 6f6cd816a..595ab2e4e 100644 --- a/include/rmm/mr/device/logging_resource_adaptor.hpp +++ b/include/rmm/mr/device/logging_resource_adaptor.hpp @@ -77,10 +77,8 @@ class logging_resource_adaptor final : public device_memory_resource { logging_resource_adaptor(Upstream* upstream, std::string const& filename = get_default_filename(), bool auto_flush = false) - : logger_{make_logger(filename)}, upstream_{upstream} + : logger_{make_logger(filename)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - init_logger(auto_flush); } @@ -99,10 +97,8 @@ class logging_resource_adaptor final : public device_memory_resource { * performance. */ logging_resource_adaptor(Upstream* upstream, std::ostream& stream, bool auto_flush = false) - : logger_{make_logger(stream)}, upstream_{upstream} + : logger_{make_logger(stream)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); - init_logger(auto_flush); } @@ -123,10 +119,76 @@ class logging_resource_adaptor final : public device_memory_resource { logging_resource_adaptor(Upstream* upstream, spdlog::sinks_init_list sinks, bool auto_flush = false) - : logger_{make_logger(sinks)}, upstream_{upstream} + : logger_{make_logger(sinks)}, upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + init_logger(auto_flush); + } + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the file specified by `filename`. + * + * The logfile will be written using CSV formatting. + * + * Clears the contents of `filename` if it already exists. + * + * Creating multiple `logging_resource_adaptor`s with the same `filename` will + * result in undefined behavior. + * + * @throws spdlog::spdlog_ex if opening `filename` failed + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param filename Name of file to write log info. If not specified, retrieves + * the file name from the environment variable "RMM_LOG_FILE". + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ + logging_resource_adaptor(device_async_resource_ref upstream, + std::string const& filename = get_default_filename(), + bool auto_flush = false) + : logger_{make_logger(filename)}, upstream_{upstream} + { + init_logger(auto_flush); + } + + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the ostream specified by `stream`. + * + * The logfile will be written using CSV formatting. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param stream The ostream to write log info. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. 
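 * A usage sketch (illustrative; writes CSV records to std::cerr):
 * @code{.cpp}
 * rmm::mr::cuda_memory_resource cuda_mr{};
 * rmm::mr::logging_resource_adaptor<rmm::mr::cuda_memory_resource> log_mr{&cuda_mr, std::cerr};
 * @endcode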
+ */ + logging_resource_adaptor(device_async_resource_ref upstream, + std::ostream& stream, + bool auto_flush = false) + : logger_{make_logger(stream)}, upstream_{upstream} + { + init_logger(auto_flush); + } + + /** + * @brief Construct a new logging resource adaptor using `upstream` to satisfy + * allocation requests and logging information about each allocation/free to + * the ostream specified by `stream`. + * + * The logfile will be written using CSV formatting. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + * @param sinks A list of logging sinks to which log output will be written. + * @param auto_flush If true, flushes the log for every (de)allocation. Warning, this will degrade + * performance. + */ + logging_resource_adaptor(device_async_resource_ref upstream, + spdlog::sinks_init_list sinks, + bool auto_flush = false) + : logger_{make_logger(sinks)}, upstream_{upstream} + { init_logger(auto_flush); } @@ -147,11 +209,6 @@ class logging_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Flush logger contents. */ @@ -239,7 +296,7 @@ class logging_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { try { - auto const ptr = upstream_->allocate(bytes, stream); + auto const ptr = get_upstream_resource().allocate_async(bytes, stream); logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); return ptr; } catch (...) { @@ -265,7 +322,7 @@ class logging_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { logger_->info("free,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -279,24 +336,14 @@ class logging_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto const* cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } - // make_logging_adaptor needs access to private get_default_filename - template - // NOLINTNEXTLINE(readability-redundant-declaration) - [[deprecated( - "make_logging_adaptor is deprecated in RMM 24.10. 
Use the logging_resource_adaptor constructor " - "instead.")]] - friend logging_resource_adaptor make_logging_adaptor(T* upstream, - std::string const& filename, - bool auto_flush); - std::shared_ptr logger_; ///< spdlog logger object - Upstream* upstream_; ///< The upstream resource used for satisfying - ///< allocation requests + device_async_resource_ref upstream_; ///< The upstream resource used for satisfying + ///< allocation requests }; /** diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index b8291c237..855d4c0bf 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -100,6 +100,15 @@ namespace mr { namespace detail { +// These symbols must have default visibility so that when they are +// referenced in multiple different DSOs the linker correctly +// determines that there is only a single unique reference to the +// function symbols (and hence they return unique static references +// across different DSOs). See also +// https://github.com/rapidsai/rmm/issues/826 +// Although currently the entire RMM namespace is RMM_EXPORT, we +// explicitly mark these functions as exported in case the namespace +// export changes. /** * @brief Returns a pointer to the initial resource. * @@ -122,7 +131,6 @@ RMM_EXPORT inline std::mutex& map_lock() return map_lock; } -// This symbol must have default visibility, see: https://github.com/rapidsai/rmm/issues/826 /** * @briefreturn{Reference to the map from device id -> resource} */ @@ -185,6 +193,31 @@ inline device_memory_resource* get_per_device_resource(cuda_device_id device_id) : found->second; } +namespace detail { + +// The non-thread-safe implementation of `set_per_device_resource_ref`. This exists because +// we need to call this function from two places: the thread-safe version of +// `set_per_device_resource_ref` and the thread-safe version of `set_per_device_resource`, +// both of which take the lock, so we need an implementation that doesn't take the lock. +/// @private +inline device_async_resource_ref set_per_device_resource_ref_unsafe( + cuda_device_id device_id, device_async_resource_ref new_resource_ref) +{ + auto& map = detail::get_ref_map(); + auto const old_itr = map.find(device_id.value()); + // If a resource didn't previously exist for `device_id`, return pointer to initial_resource + // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] + if (old_itr == map.end()) { + map.insert({device_id.value(), new_resource_ref}); + return device_async_resource_ref{detail::initial_resource()}; + } + + auto old_resource_ref = old_itr->second; + old_itr->second = new_resource_ref; // update map directly via iterator + return old_resource_ref; +} +} // namespace detail + /** * @brief Set the `device_memory_resource` for the specified device. * @@ -216,6 +249,14 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id, device_memory_resource* new_mr) { std::lock_guard lock{detail::map_lock()}; + + // Note: even though set_per_device_resource() and set_per_device_resource_ref() are not + // interchangeable, we call the latter from the former to maintain resource_ref + // state consistent with the resource pointer state. This is necessary because the + // Python API still uses the raw pointer API. Once the Python API is updated to use + // resource_ref, this call can be removed. 
+ detail::set_per_device_resource_ref_unsafe(device_id, new_mr); + auto& map = detail::get_map(); auto const old_itr = map.find(device_id.value()); // If a resource didn't previously exist for `id`, return pointer to initial_resource @@ -342,18 +383,7 @@ inline device_async_resource_ref set_per_device_resource_ref( cuda_device_id device_id, device_async_resource_ref new_resource_ref) { std::lock_guard lock{detail::ref_map_lock()}; - auto& map = detail::get_ref_map(); - auto const old_itr = map.find(device_id.value()); - // If a resource didn't previously exist for `device_id`, return pointer to initial_resource - // Note: because resource_ref is not default-constructible, we can't use std::map::operator[] - if (old_itr == map.end()) { - map.insert({device_id.value(), new_resource_ref}); - return device_async_resource_ref{detail::initial_resource()}; - } - - auto old_resource_ref = old_itr->second; - old_itr->second = new_resource_ref; // update map directly via iterator - return old_resource_ref; + return detail::set_per_device_resource_ref_unsafe(device_id, new_resource_ref); } /** diff --git a/include/rmm/mr/device/polymorphic_allocator.hpp b/include/rmm/mr/device/polymorphic_allocator.hpp index 6fb068410..442632d4f 100644 --- a/include/rmm/mr/device/polymorphic_allocator.hpp +++ b/include/rmm/mr/device/polymorphic_allocator.hpp @@ -52,7 +52,7 @@ class polymorphic_allocator { using value_type = T; ///< T, the value type of objects allocated by this allocator /** * @brief Construct a `polymorphic_allocator` using the return value of - * `rmm::mr::get_current_device_resource()` as the underlying memory resource. + * `rmm::mr::get_current_device_resource_ref()` as the underlying memory resource. * */ polymorphic_allocator() = default; @@ -116,7 +116,7 @@ class polymorphic_allocator { private: rmm::device_async_resource_ref mr_{ - get_current_device_resource()}; ///< Underlying resource used for (de)allocation + get_current_device_resource_ref()}; ///< Underlying resource used for (de)allocation }; /** diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index d22b53404..f63de21ff 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -114,6 +115,33 @@ class pool_memory_resource final friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; + /** + * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using + * `upstream_mr`. + * + * @throws rmm::logic_error if `initial_pool_size` is not aligned to a multiple of + * pool_memory_resource::allocation_alignment bytes. + * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * + * @param upstream_mr The memory_resource from which to allocate blocks for the pool. + * @param initial_pool_size Minimum size, in bytes, of the initial pool. + * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all + * of the available from the upstream resource. 
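 * For example (a sketch mirroring the benchmark usage above):
 * @code{.cpp}
 * rmm::mr::cuda_memory_resource cuda_mr{};
 * // Start the pool at 50% of currently free device memory
 * rmm::mr::pool_memory_resource mr{&cuda_mr, rmm::percent_of_free_device_memory(50)};
 * @endcode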
+ */ + explicit pool_memory_resource(device_async_resource_ref upstream_mr, + std::size_t initial_pool_size, + std::optional maximum_pool_size = std::nullopt) + : upstream_mr_{upstream_mr} + { + RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Initial pool size required to be a multiple of 256 bytes"); + RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT), + "Error, Maximum pool size required to be a multiple of 256 bytes"); + + initialize_pool(initial_pool_size, maximum_pool_size); + } + /** * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using * `upstream_mr`. @@ -132,10 +160,7 @@ class pool_memory_resource final explicit pool_memory_resource(Upstream* upstream_mr, std::size_t initial_pool_size, std::optional maximum_pool_size = std::nullopt) - : upstream_mr_{[upstream_mr]() { - RMM_EXPECTS(nullptr != upstream_mr, "Unexpected null upstream pointer."); - return upstream_mr; - }()} + : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)} { RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT), "Error, Initial pool size required to be a multiple of 256 bytes"); @@ -184,16 +209,11 @@ class pool_memory_resource final /** * @briefreturn{rmm::device_async_resource_ref to the upstream resource} */ - [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept + [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept { return upstream_mr_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_mr_; } - /** * @brief Computes the size of the current pool * @@ -466,7 +486,8 @@ class pool_memory_resource final } private: - Upstream* upstream_mr_; // The "heap" to allocate the pool from + // The "heap" to allocate the pool from + device_async_resource_ref upstream_mr_; std::size_t current_pool_size_{}; std::optional maximum_pool_size_{}; diff --git a/include/rmm/mr/device/prefetch_resource_adaptor.hpp b/include/rmm/mr/device/prefetch_resource_adaptor.hpp index 59ce8e036..d3a4c676a 100644 --- a/include/rmm/mr/device/prefetch_resource_adaptor.hpp +++ b/include/rmm/mr/device/prefetch_resource_adaptor.hpp @@ -41,6 +41,14 @@ namespace mr { template class prefetch_resource_adaptor final : public device_memory_resource { public: + /** + * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource_ref used for allocating/deallocating device memory + */ + prefetch_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new prefetch resource adaptor using `upstream` to satisfy * allocation requests. 
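With this change, `pool_memory_resource` stores a type-erased `device_async_resource_ref` no matter which constructor is used; the raw pointer overload now funnels through `to_device_async_resource_ref_checked()` (added to `rmm/resource_ref.hpp` later in this diff), so a null upstream still throws `rmm::logic_error`. A rough usage sketch of the two forms; the sizes are illustrative, and the `<Upstream>` argument must be spelled out for the ref form because the ref erases the concrete upstream type:

```
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};

  // Pointer form: Upstream is deduced, and a null pointer now throws
  // rmm::logic_error via to_device_async_resource_ref_checked().
  rmm::mr::pool_memory_resource from_ptr{&cuda_mr, 1u << 22};  // 4 MiB, 256-byte aligned

  // resource_ref form: the concrete upstream type is erased, so the
  // template argument is named explicitly.
  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> from_ref{
    rmm::device_async_resource_ref{cuda_mr}, 1u << 22};

  void* ptr = from_ref.allocate(256);
  from_ref.deallocate(ptr, 256);
  return 0;
}
```

The pointer overload keeps class template argument deduction working for existing callers, while new code can pass any resource_ref.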
@@ -49,9 +57,9 @@ class prefetch_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory */ - prefetch_resource_adaptor(Upstream* upstream) : upstream_{upstream} + prefetch_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } prefetch_resource_adaptor() = delete; @@ -71,11 +79,6 @@ class prefetch_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -92,7 +95,7 @@ class prefetch_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); rmm::prefetch(ptr, bytes, rmm::get_current_cuda_device(), stream); return ptr; } @@ -106,7 +109,7 @@ class prefetch_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -120,11 +123,12 @@ class prefetch_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/statistics_resource_adaptor.hpp b/include/rmm/mr/device/statistics_resource_adaptor.hpp index bf78c669a..025c51aa7 100644 --- a/include/rmm/mr/device/statistics_resource_adaptor.hpp +++ b/include/rmm/mr/device/statistics_resource_adaptor.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -113,17 +114,25 @@ class statistics_resource_adaptor final : public device_memory_resource { } }; + /** + * @brief Construct a new statistics resource adaptor using `upstream` to satisfy + * allocation requests. + * + * @param upstream The resource_ref used for allocating/deallocating device memory. + */ + statistics_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new statistics resource adaptor using `upstream` to satisfy * allocation requests. * * @throws rmm::logic_error if `upstream == nullptr` * - * @param upstream The resource used for allocating/deallocating device memory + * @param upstream The resource used for allocating/deallocating device memory. 
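For the prefetch adaptor above, allocation now goes through `get_upstream_resource().allocate_async()` followed by `rmm::prefetch()` toward the current device on the same stream. A short usage sketch, assuming a managed memory upstream since prefetching only makes sense for migratable allocations:

```
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/prefetch_resource_adaptor.hpp>

int main()
{
  rmm::mr::managed_memory_resource managed_mr{};

  // Every do_allocate() allocates via the upstream resource_ref and then
  // prefetches the new range to the current device on the same stream.
  rmm::mr::prefetch_resource_adaptor<rmm::mr::managed_memory_resource> mr{&managed_mr};

  void* ptr = mr.allocate(4096);  // allocated, then prefetched
  mr.deallocate(ptr, 4096);
  return 0;
}
```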
*/ - statistics_resource_adaptor(Upstream* upstream) : upstream_{upstream} + statistics_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } statistics_resource_adaptor() = delete; @@ -143,11 +152,6 @@ class statistics_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Returns a `counter` struct for this adaptor containing the current, * peak, and total number of allocated bytes for this @@ -226,7 +230,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); + void* ptr = get_upstream_resource().allocate_async(bytes, stream); // increment the stats { @@ -249,7 +253,7 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -271,7 +275,7 @@ class statistics_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -279,14 +283,14 @@ class statistics_resource_adaptor final : public device_memory_resource { // Invariant: the stack always contains at least one entry std::stack> counter_stack_{{std::make_pair(counter{}, counter{})}}; std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; }; /** * @brief Convenience factory to return a `statistics_resource_adaptor` around the * upstream resource `upstream`. * - * @tparam Upstream Type of the upstream `device_memory_resource`. * @param upstream Pointer to the upstream resource * @return The new statistics resource adaptor */ @@ -297,7 +301,7 @@ template "instead.")]] statistics_resource_adaptor make_statistics_adaptor(Upstream* upstream) { - return statistics_resource_adaptor{upstream}; + return statistics_resource_adaptor{upstream}; } /** @} */ // end of group diff --git a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp index 9979d1e08..6881aa19e 100644 --- a/include/rmm/mr/device/thread_safe_resource_adaptor.hpp +++ b/include/rmm/mr/device/thread_safe_resource_adaptor.hpp @@ -45,6 +45,16 @@ class thread_safe_resource_adaptor final : public device_memory_resource { public: using lock_t = std::lock_guard; ///< Type of lock used to synchronize access + /** + * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy + * allocation requests. + * + * All allocations and frees are protected by a mutex lock + * + * @param upstream The resource used for allocating/deallocating device memory. 
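The statistics adaptor follows the same migration: a `device_async_resource_ref` constructor, allocation routed through `get_upstream_resource().allocate_async()`, and a `do_is_equal` that now returns `false` on a type mismatch instead of forwarding the comparison to the upstream. A minimal sketch of the ref-based construction, mirroring how the tests later in this diff use it (the byte counter fields are `value`, `peak`, and `total`):

```
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

int main()
{
  using stats_mr =
    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>;
  // No raw pointer: construct straight from the current device resource_ref.
  stats_mr mr{rmm::mr::get_current_device_resource_ref()};

  void* ptr = mr.allocate(1 << 20);
  mr.deallocate(ptr, 1 << 20);

  // value drops back to zero on free; peak/total retain the high-water mark.
  auto const bytes = mr.get_bytes_counter();
  return (bytes.value == 0 && bytes.peak == (1 << 20)) ? 0 : 1;
}
```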
+ */ + thread_safe_resource_adaptor(device_async_resource_ref upstream) : upstream_{upstream} {} + /** * @brief Construct a new thread safe resource adaptor using `upstream` to satisfy * allocation requests. @@ -55,9 +65,9 @@ class thread_safe_resource_adaptor final : public device_memory_resource { * * @param upstream The resource used for allocating/deallocating device memory. */ - thread_safe_resource_adaptor(Upstream* upstream) : upstream_{upstream} + thread_safe_resource_adaptor(Upstream* upstream) + : upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } thread_safe_resource_adaptor() = delete; @@ -75,11 +85,6 @@ class thread_safe_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - private: /** * @brief Allocates memory of size at least `bytes` using the upstream @@ -95,7 +100,7 @@ class thread_safe_resource_adaptor final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { lock_t lock(mtx); - return upstream_->allocate(bytes, stream); + return get_upstream_resource().allocate_async(bytes, stream); } /** @@ -108,7 +113,7 @@ class thread_safe_resource_adaptor final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { lock_t lock(mtx); - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); } /** @@ -122,12 +127,13 @@ class thread_safe_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } std::mutex mutable mtx; // mutex for thread safe access to upstream - Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests + device_async_resource_ref + upstream_; ///< The upstream resource used for satisfying allocation requests }; /** @} */ // end of group diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp index 2055a0633..b7b990c3d 100644 --- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp +++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp @@ -150,7 +150,7 @@ class thrust_allocator : public thrust::device_malloc_allocator { private: cuda_stream_view _stream{}; - rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource_ref()}; cuda_device_id _device{get_current_cuda_device()}; }; /** @} */ // end of group diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 3d3188b23..6a5916e5c 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -83,6 +84,18 @@ class tracking_resource_adaptor final : public device_memory_resource { allocation_size{size} {}; }; + /** + * @brief Construct a new tracking resource adaptor using `upstream` to satisfy + * allocation requests. 
+ * + * @param upstream The resource used for allocating/deallocating device memory + * @param capture_stacks If true, capture stacks for allocation calls + */ + tracking_resource_adaptor(device_async_resource_ref upstream, bool capture_stacks = false) + : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream} + { + } + /** * @brief Construct a new tracking resource adaptor using `upstream` to satisfy * allocation requests. @@ -93,9 +106,10 @@ class tracking_resource_adaptor final : public device_memory_resource { * @param capture_stacks If true, capture stacks for allocation calls */ tracking_resource_adaptor(Upstream* upstream, bool capture_stacks = false) - : capture_stacks_{capture_stacks}, allocated_bytes_{0}, upstream_{upstream} + : capture_stacks_{capture_stacks}, + allocated_bytes_{0}, + upstream_{to_device_async_resource_ref_checked(upstream)} { - RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } tracking_resource_adaptor() = delete; @@ -115,11 +129,6 @@ class tracking_resource_adaptor final : public device_memory_resource { return upstream_; } - /** - * @briefreturn{Upstream* to the upstream memory resource} - */ - [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; } - /** * @brief Get the outstanding allocations map * @@ -199,8 +208,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { - void* ptr = upstream_->allocate(bytes, stream); - + void* ptr = get_upstream_resource().allocate_async(bytes, stream); // track it. { write_lock_t lock(mtx_); @@ -220,7 +228,7 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - upstream_->deallocate(ptr, bytes, stream); + get_upstream_resource().deallocate_async(ptr, bytes, stream); { write_lock_t lock(mtx_); @@ -265,7 +273,7 @@ class tracking_resource_adaptor final : public device_memory_resource { { if (this == &other) { return true; } auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -273,7 +281,8 @@ class tracking_resource_adaptor final : public device_memory_resource { std::map allocations_; // map of active allocations std::atomic allocated_bytes_; // number of bytes currently allocated std::shared_mutex mutable mtx_; // mutex for thread safe access to allocations_ - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + device_async_resource_ref upstream_; // the upstream resource used for satisfying + // allocation requests }; /** diff --git a/include/rmm/resource_ref.hpp b/include/rmm/resource_ref.hpp index 08942a040..56049522f 100644 --- a/include/rmm/resource_ref.hpp +++ b/include/rmm/resource_ref.hpp @@ -65,5 +65,21 @@ using host_device_resource_ref = using host_device_async_resource_ref = cuda::mr::async_resource_ref; +/** + * @brief Convert pointer to memory resource into `device_async_resource_ref`, checking for + * `nullptr` + * + * @tparam Resource The type of the memory resource. + * @param res A pointer to the memory resource. + * @return A `device_async_resource_ref` to the memory resource. + * @throws std::logic_error if the memory resource pointer is null. 
+ */ +template +device_async_resource_ref to_device_async_resource_ref_checked(Resource* res) +{ + RMM_EXPECTS(res, "Unexpected null resource pointer."); + return device_async_resource_ref{*res}; +} + /** @} */ // end of group } // namespace RMM_NAMESPACE diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md index 911073b5d..22c0dc023 100644 --- a/python/rmm/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -139,8 +139,8 @@ of 1 GiB and a maximum size of 4 GiB. The pool uses >>> import rmm >>> pool = rmm.mr.PoolMemoryResource( ... rmm.mr.CudaMemoryResource(), -... initial_pool_size=2**30, -... maximum_pool_size=2**32 +... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30 +... maximum_pool_size="4GiB" ... ) >>> rmm.mr.set_current_device_resource(pool) ``` @@ -151,8 +151,8 @@ Similarly, to use a pool of managed memory: >>> import rmm >>> pool = rmm.mr.PoolMemoryResource( ... rmm.mr.ManagedMemoryResource(), -... initial_pool_size=2**30, -... maximum_pool_size=2**32 +... initial_pool_size="1GiB", +... maximum_pool_size="4GiB" ... ) >>> rmm.mr.set_current_device_resource(pool) ``` diff --git a/python/rmm/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/_lib/CMakeLists.txt index 1e629a402..7cdfed971 100644 --- a/python/rmm/rmm/_lib/CMakeLists.txt +++ b/python/rmm/rmm/_lib/CMakeLists.txt @@ -12,7 +12,8 @@ # the License. # ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx) +set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx + helper.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/_lib/_torch_allocator.cpp index dc92e4639..bfe94c2d0 100644 --- a/python/rmm/rmm/_lib/_torch_allocator.cpp +++ b/python/rmm/rmm/_lib/_torch_allocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,8 +39,9 @@ extern "C" void* allocate(std::size_t size, int device, void* stream) { rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; - auto mr = rmm::mr::get_per_device_resource(device_id); - return mr->allocate(size, rmm::cuda_stream_view{static_cast(stream)}); + auto mr = rmm::mr::get_per_device_resource_ref(device_id); + return mr.allocate_async( + size, rmm::CUDA_ALLOCATION_ALIGNMENT, rmm::cuda_stream_view{static_cast(stream)}); } /** @@ -55,6 +56,9 @@ extern "C" void deallocate(void* ptr, std::size_t size, int device, void* stream { rmm::cuda_device_id const device_id{device}; rmm::cuda_set_device_raii with_device{device_id}; - auto mr = rmm::mr::get_per_device_resource(device_id); - mr->deallocate(ptr, size, rmm::cuda_stream_view{static_cast(stream)}); + auto mr = rmm::mr::get_per_device_resource_ref(device_id); + mr.deallocate_async(ptr, + size, + rmm::CUDA_ALLOCATION_ALIGNMENT, + rmm::cuda_stream_view{static_cast(stream)}); } diff --git a/python/rmm/rmm/_lib/helper.pxd b/python/rmm/rmm/_lib/helper.pxd new file mode 100644 index 000000000..8ca151c00 --- /dev/null +++ b/python/rmm/rmm/_lib/helper.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+cdef object parse_bytes(object s) except *
diff --git a/python/rmm/rmm/_lib/helper.pyx b/python/rmm/rmm/_lib/helper.pyx
new file mode 100644
index 000000000..d442ee341
--- /dev/null
+++ b/python/rmm/rmm/_lib/helper.pyx
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper functions for rmm"""
+
+import re
+
+
+cdef dict BYTE_SIZES = {
+    'b': 1,
+    '': 1,
+    'kb': 1000,
+    'mb': 1000**2,
+    'gb': 1000**3,
+    'tb': 1000**4,
+    'pb': 1000**5,
+    'kib': 1024,
+    'mib': 1024**2,
+    'gib': 1024**3,
+    'tib': 1024**4,
+    'pib': 1024**5,
+}
+
+
+pattern = re.compile(r"^([0-9]+(?:\.[0-9]*)?)[\t ]*((?i:(?:[kmgtp]i?)?b))?$")
+
+
+cdef object parse_bytes(object s):
+    """Parse a string or integer into a number of bytes.
+
+    Parameters
+    ----------
+    s : int | str
+        Size in bytes. If an integer is provided, it is returned as-is.
+        A string is parsed as a floating point number with an (optional,
+        case-insensitive) byte-specifier; both SI prefixes (kb, mb, ..., pb)
+        and binary prefixes (kib, mib, ..., pib) are supported.
+
+    Returns
+    -------
+    Requested size in bytes as an integer.
+
+    Raises
+    ------
+    ValueError
+        If it is not possible to parse the input as a byte specification.
+    """
+    cdef str suffix
+    cdef double n
+    cdef int multiplier
+
+    if isinstance(s, int):
+        return s
+
+    match = pattern.match(s)
+
+    if match is None:
+        raise ValueError(f"Could not parse {s} as a byte specification")
+
+    n = float(match.group(1))
+
+    suffix = match.group(2)
+    if suffix is None:
+        suffix = ""
+
+    multiplier = BYTE_SIZES[suffix.lower()]
+
+    return int(n*multiplier)
diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/_lib/memory_resource.pyx
index 7cd8a05b7..843331215 100644
--- a/python/rmm/rmm/_lib/memory_resource.pyx
+++ b/python/rmm/rmm/_lib/memory_resource.pyx
@@ -38,6 +38,7 @@ from rmm._cuda.stream cimport Stream
 from rmm._cuda.stream import DEFAULT_STREAM

 from rmm._lib.cuda_stream_view cimport cuda_stream_view
+from rmm._lib.helper cimport parse_bytes
 from rmm._lib.memory_resource cimport (
     available_device_memory as c_available_device_memory,
     percent_of_free_device_memory as c_percent_of_free_device_memory,
@@ -332,9 +333,9 @@ cdef class CudaAsyncMemoryResource(DeviceMemoryResource):

     Parameters
     ----------
-    initial_pool_size : int, optional
+    initial_pool_size : int | str, optional
         Initial pool size in bytes. By default, half the available memory
-        on the device is used.
+        on the device is used. A string argument is parsed using `parse_bytes`.
     release_threshold: int, optional
         Release threshold in bytes.
         If the pool size grows beyond this value, unused memory held by
         the pool will be released at the
@@ -352,7 +353,7 @@ cdef class CudaAsyncMemoryResource(DeviceMemoryResource):
         cdef optional[size_t] c_initial_pool_size = (
             optional[size_t]()
             if initial_pool_size is None
-            else optional[size_t](<size_t> initial_pool_size)
+            else optional[size_t](<size_t> parse_bytes(initial_pool_size))
         )

         cdef optional[size_t] c_release_threshold = (
@@ -444,12 +445,12 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor):
         c_initial_pool_size = (
             c_percent_of_free_device_memory(50)
             if initial_pool_size is None
-            else initial_pool_size
+            else parse_bytes(initial_pool_size)
         )
         c_maximum_pool_size = (
             optional[size_t]()
             if maximum_pool_size is None
-            else optional[size_t](<size_t> maximum_pool_size)
+            else optional[size_t](<size_t> parse_bytes(maximum_pool_size))
         )
         self.c_obj.reset(
             new pool_memory_resource[device_memory_resource](
@@ -474,10 +475,10 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor):
         upstream_mr : DeviceMemoryResource
             The DeviceMemoryResource from which to allocate
             blocks for the pool.
-        initial_pool_size : int, optional
+        initial_pool_size : int | str, optional
             Initial pool size in bytes. By default, half the available memory
             on the device is used.
-        maximum_pool_size : int, optional
+        maximum_pool_size : int | str, optional
             Maximum size, in bytes, that the pool can grow to.
         """
         pass
@@ -1149,8 +1150,10 @@ cpdef void _initialize(
         typ = PoolMemoryResource
         args = (upstream(),)
         kwargs = dict(
-            initial_pool_size=initial_pool_size,
-            maximum_pool_size=maximum_pool_size
+            initial_pool_size=None if initial_pool_size is None
+            else parse_bytes(initial_pool_size),
+            maximum_pool_size=None if maximum_pool_size is None
+            else parse_bytes(maximum_pool_size)
         )
     else:
         typ = upstream
diff --git a/python/rmm/rmm/rmm.py b/python/rmm/rmm/rmm.py
index e5290905c..bac04b477 100644
--- a/python/rmm/rmm/rmm.py
+++ b/python/rmm/rmm/rmm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -45,14 +45,16 @@ def reinitialize(
         performance.
     managed_memory : bool, default False
         If True, use managed memory for device memory allocation
-    initial_pool_size : int, default None
+    initial_pool_size : int | str, default None
         When `pool_allocator` is True, this indicates the initial pool size
         in bytes. By default, 1/2 of the total GPU memory is used.
         When `pool_allocator` is False, this argument is ignored if provided.
-    maximum_pool_size : int, default None
+        A string argument is parsed using `parse_bytes`.
+    maximum_pool_size : int | str, default None
         When `pool_allocator` is True, this indicates the maximum pool size
         in bytes. By default, the total available memory on the GPU is used.
         When `pool_allocator` is False, this argument is ignored if provided.
+        A string argument is parsed using `parse_bytes`.
     devices : int or List[int], default 0
         GPU device IDs to register. By default registers only GPU 0.
logging : bool, default False diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 9379c71e8..8c978e6f1 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -432,8 +432,8 @@ def test_rmm_pool_cupy_allocator_stream_lifetime(): def test_pool_memory_resource(dtype, nelem, alloc): mr = rmm.mr.PoolMemoryResource( rmm.mr.CudaMemoryResource(), - initial_pool_size=1 << 22, - maximum_pool_size=1 << 23, + initial_pool_size="4MiB", + maximum_pool_size="8MiB", ) rmm.mr.set_current_device_resource(mr) assert rmm.mr.get_current_device_resource_type() is type(mr) @@ -507,7 +507,7 @@ def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr): def test_reinitialize_max_pool_size(): rmm.reinitialize( - pool_allocator=True, initial_pool_size=0, maximum_pool_size=1 << 23 + pool_allocator=True, initial_pool_size=0, maximum_pool_size="8MiB" ) rmm.DeviceBuffer().resize((1 << 23) - 1) @@ -530,6 +530,24 @@ def test_reinitialize_initial_pool_size_gt_max(): assert "Initial pool size exceeds the maximum pool size" in str(e.value) +def test_reinitialize_with_valid_str_arg_pool_size(): + rmm.reinitialize( + pool_allocator=True, + initial_pool_size="2kib", + maximum_pool_size="8kib", + ) + + +def test_reinitialize_with_invalid_str_arg_pool_size(): + with pytest.raises(ValueError) as e: + rmm.reinitialize( + pool_allocator=True, + initial_pool_size="2k", # 2kb valid, not 2k + maximum_pool_size="8k", + ) + assert "Could not parse" in str(e.value) + + @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d1195a070..551727fe8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,6 +40,7 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror -Wno-error=deprecated-declarations>) + target_compile_options(${TEST_NAME} PUBLIC "$<$:-O0>") if(DISABLE_DEPRECATION_WARNING) target_compile_options( @@ -125,7 +126,7 @@ ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) # pool mr tests -ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 60) +ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 100) # cuda_async mr tests ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60) @@ -140,10 +141,10 @@ ConfigureTest(POLYMORPHIC_ALLOCATOR_TEST mr/device/polymorphic_allocator_tests.c ConfigureTest(STREAM_ADAPTOR_TEST mr/device/stream_allocator_adaptor_tests.cpp) # statistics adaptor tests -ConfigureTest(STATISTICS_TEST mr/device/statistics_mr_tests.cpp GPUS 1 PERCENT 100) +ConfigureTest(STATISTICS_TEST mr/device/statistics_mr_tests.cpp) # tracking adaptor tests -ConfigureTest(TRACKING_TEST mr/device/tracking_mr_tests.cpp GPUS 1 PERCENT 100) +ConfigureTest(TRACKING_TEST mr/device/tracking_mr_tests.cpp) # failure callback adaptor tests ConfigureTest(FAILURE_CALLBACK_TEST mr/device/failure_callback_mr_tests.cpp) @@ -185,7 +186,7 @@ ConfigureTest(PREFETCH_TEST prefetch_tests.cpp) ConfigureTest(LOGGER_TEST logger_tests.cpp) # arena MR tests -ConfigureTest(ARENA_MR_TEST mr/device/arena_mr_tests.cpp GPUS 1 PERCENT 60) +ConfigureTest(ARENA_MR_TEST mr/device/arena_mr_tests.cpp GPUS 1 PERCENT 100) # binning MR tests ConfigureTest(BINNING_MR_TEST mr/device/binning_mr_tests.cpp) diff --git 
a/tests/container_multidevice_tests.cu b/tests/container_multidevice_tests.cu index e58ba53a2..55432feb0 100644 --- a/tests/container_multidevice_tests.cu +++ b/tests/container_multidevice_tests.cu @@ -42,9 +42,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); { if constexpr (std::is_same_v>) { @@ -57,7 +57,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -69,9 +69,9 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); { auto buf_1 = []() { @@ -97,7 +97,7 @@ TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -109,9 +109,9 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -120,7 +120,7 @@ TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } @@ -132,9 +132,9 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) // only run on multidevice systems if (num_devices >= 2) { rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}}; - auto* orig_mr = rmm::mr::get_current_device_resource(); + auto orig_mr = rmm::mr::get_current_device_resource_ref(); auto check_mr = device_check_resource_adaptor{orig_mr}; - rmm::mr::set_current_device_resource(&check_mr); + rmm::mr::set_current_device_resource_ref(check_mr); if constexpr (not std::is_same_v>) { auto buf = TypeParam(128, rmm::cuda_stream_view{}); @@ -144,6 +144,6 @@ TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice) } RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0)); - rmm::mr::set_current_device_resource(orig_mr); + rmm::mr::set_current_device_resource_ref(orig_mr); } } diff --git a/tests/cuda_stream_tests.cpp b/tests/cuda_stream_tests.cpp index 1cc068434..ec7e6c3e9 100644 --- a/tests/cuda_stream_tests.cpp +++ b/tests/cuda_stream_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, 
NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,6 +96,6 @@ TEST_F(CudaStreamDeathTest, TestSyncNoThrow) // should assert here or in `~cuda_stream()` stream_a.synchronize_no_throw(); }; - EXPECT_DEATH(test(), "Assertion"); + EXPECT_DEATH(test(), ""); } #endif diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index c095eecf8..5e48504d6 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -75,7 +75,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -87,7 +87,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -121,7 +121,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); @@ -138,7 +138,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); @@ -152,7 +152,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}, + EXPECT_EQ(rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -180,7 +180,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff_copy.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -223,7 +223,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_EQ(buff_copy.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + 
rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), diff --git a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp index fcb578fdf..6780f56d7 100644 --- a/tests/device_check_resource_adaptor.hpp +++ b/tests/device_check_resource_adaptor.hpp @@ -17,13 +17,14 @@ #include #include #include +#include #include #include class device_check_resource_adaptor final : public rmm::mr::device_memory_resource { public: - device_check_resource_adaptor(rmm::mr::device_memory_resource* upstream) + device_check_resource_adaptor(rmm::device_async_resource_ref upstream) : device_id{rmm::get_current_cuda_device()}, upstream_(upstream) { } @@ -36,11 +37,6 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour return upstream_; } - /** - * @briefreturn{device_memory_resource* to the upstream memory resource} - */ - [[nodiscard]] device_memory_resource* get_upstream() const noexcept { return upstream_; } - private: [[nodiscard]] bool check_device_id() const { return device_id == rmm::get_current_cuda_device(); } @@ -48,7 +44,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { return upstream_->allocate(bytes, stream); } + if (is_correct_device) { return get_upstream_resource().allocate_async(bytes, stream); } return nullptr; } @@ -56,7 +52,7 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { bool const is_correct_device = check_device_id(); EXPECT_TRUE(is_correct_device); - if (is_correct_device) { upstream_->deallocate(ptr, bytes, stream); } + if (is_correct_device) { get_upstream_resource().deallocate_async(ptr, bytes, stream); } } [[nodiscard]] bool do_is_equal( @@ -64,10 +60,10 @@ class device_check_resource_adaptor final : public rmm::mr::device_memory_resour { if (this == &other) { return true; } auto const* cast = dynamic_cast(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } rmm::cuda_device_id device_id; - rmm::mr::device_memory_resource* upstream_{}; + rmm::device_async_resource_ref upstream_; }; diff --git a/tests/device_scalar_tests.cpp b/tests/device_scalar_tests.cpp index 6f80a5de1..323894a6a 100644 --- a/tests/device_scalar_tests.cpp +++ b/tests/device_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ struct DeviceScalarTest : public ::testing::Test { std::default_random_engine generator{}; T value{}; rmm::cuda_stream stream{}; - rmm::device_async_resource_ref mr{rmm::mr::get_current_device_resource()}; + rmm::device_async_resource_ref mr{rmm::mr::get_current_device_resource_ref()}; DeviceScalarTest() : value{random_value()} {} diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 1c93ef138..90955c24c 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -39,7 +39,7 @@ TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); EXPECT_EQ(vec.memory_resource(), - rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()}); + rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) diff --git a/tests/mock_resource.hpp b/tests/mock_resource.hpp index e06148d3a..555cf0d74 100644 --- a/tests/mock_resource.hpp +++ b/tests/mock_resource.hpp @@ -25,7 +25,12 @@ class mock_resource : public rmm::mr::device_memory_resource { public: MOCK_METHOD(void*, do_allocate, (std::size_t, cuda_stream_view), (override)); MOCK_METHOD(void, do_deallocate, (void*, std::size_t, cuda_stream_view), (override)); + bool operator==(mock_resource const&) const noexcept { return true; } + bool operator!=(mock_resource const&) const { return false; } + friend void get_property(mock_resource const&, cuda::mr::device_accessible) noexcept {} using size_pair = std::pair; }; +static_assert(cuda::mr::async_resource_with); + } // namespace rmm::test diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index a757a78b0..286daa841 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -129,8 +129,7 @@ TYPED_TEST(AdaptorTest, Equality) } { - rmm::mr::device_memory_resource* device_mr = &this->cuda; - auto other_mr = aligned_resource_adaptor{device_mr}; + auto other_mr = aligned_resource_adaptor{&this->cuda}; EXPECT_FALSE(this->mr->is_equal(other_mr)); } } diff --git a/tests/mr/device/aligned_mr_tests.cpp b/tests/mr/device/aligned_mr_tests.cpp index b9ecbc8ca..9b90bf751 100644 --- a/tests/mr/device/aligned_mr_tests.cpp +++ b/tests/mr/device/aligned_mr_tests.cpp @@ -59,13 +59,13 @@ TEST(AlignedTest, ThrowOnInvalidAllocationAlignment) TEST(AlignedTest, SupportsGetMemInfo) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; } TEST(AlignedTest, DefaultAllocationAlignmentPassthrough) { mock_resource mock; - aligned_mock mr{&mock}; + aligned_mock mr{mock}; cuda_stream_view stream; void* const pointer = int_to_address(123); @@ -204,7 +204,7 @@ TEST(AlignedTest, AlignRealPointer) { auto const alignment{4096}; auto const threshold{65536}; - aligned_real mr{rmm::mr::get_current_device_resource(), alignment, threshold}; + aligned_real mr{rmm::mr::get_current_device_resource_ref(), alignment, threshold}; void* alloc = mr.allocate(threshold); EXPECT_TRUE(rmm::is_pointer_aligned(alloc, alignment)); mr.deallocate(alloc, threshold); diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 1eb38888e..bdc0f2438 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -23,6 +23,9 @@ #include #include #include +#include + +#include #include #include @@ -37,15 +40,22 @@ namespace { class mock_memory_resource { public: - MOCK_METHOD(void*, allocate, (std::size_t)); - MOCK_METHOD(void, deallocate, (void*, 
std::size_t)); + MOCK_METHOD(void*, allocate, (std::size_t, std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t, std::size_t)); + MOCK_METHOD(void*, allocate_async, (std::size_t, std::size_t, cuda::stream_ref)); + MOCK_METHOD(void, deallocate_async, (void*, std::size_t, std::size_t, cuda::stream_ref)); + bool operator==(mock_memory_resource const&) const noexcept { return true; } + bool operator!=(mock_memory_resource const&) const { return false; } + friend void get_property(mock_memory_resource const&, cuda::mr::device_accessible) noexcept {} }; +static_assert(cuda::mr::async_resource_with); + using rmm::mr::detail::arena::block; using rmm::mr::detail::arena::byte_span; using rmm::mr::detail::arena::superblock; -using global_arena = rmm::mr::detail::arena::global_arena; -using arena = rmm::mr::detail::arena::arena; +using global_arena = rmm::mr::detail::arena::global_arena; +using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; @@ -59,9 +69,10 @@ auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2) struct ArenaTest : public ::testing::Test { void SetUp() override { - EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); - global = std::make_unique(&mock_mr, arena_size); + EXPECT_CALL(mock_mr, allocate(arena_size, ::testing::_)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size, ::testing::_)); + + global = std::make_unique(mock_mr, arena_size); per_thread = std::make_unique(*global); } @@ -293,13 +304,6 @@ TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull) // NOLINT /** * Test global_arena. */ - -TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT -{ - auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; }; - EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) -} - TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { auto const sblk = global->acquire(256); @@ -378,7 +382,7 @@ TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - global->deallocate(ptr, superblock::minimum_size * 2, {}); + global->deallocate_async(ptr, superblock::minimum_size * 2, {}); ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } @@ -387,8 +391,8 @@ TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { auto* ptr = global->allocate(superblock::minimum_size + 256); auto* ptr2 = global->allocate(superblock::minimum_size + 512); - global->deallocate(ptr, superblock::minimum_size + 256, {}); - global->deallocate(ptr2, superblock::minimum_size + 512, {}); + global->deallocate_async(ptr, superblock::minimum_size + 256, {}); + global->deallocate_async(ptr2, superblock::minimum_size + 512, {}); EXPECT_EQ(global->allocate(arena_size), fake_address3); } @@ -479,7 +483,7 @@ TEST_F(ArenaTest, ThrowOnNullUpstream) // NOLINT TEST_F(ArenaTest, SizeSmallerThanSuperblockSize) // NOLINT { - auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource(), 256}; }; + auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource_ref(), 256}; }; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_THROW(construct_small(), rmm::logic_error); } @@ -490,14 +494,14 @@ TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT auto const free = rmm::available_device_memory().first; auto const 
ninety_percent = rmm::align_up( static_cast(static_cast(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT); - arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), ninety_percent); }()); } TEST_F(ArenaTest, SmallMediumLarge) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) - arena_mr mr(rmm::mr::get_current_device_resource()); + arena_mr mr(rmm::mr::get_current_device_resource_ref()); auto* small = mr.allocate(256); auto* medium = mr.allocate(64_MiB); auto const free = rmm::available_device_memory().first; @@ -512,7 +516,7 @@ TEST_F(ArenaTest, Defragment) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) auto const arena_size = superblock::minimum_size * 4; - arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), arena_size); std::vector threads; std::size_t num_threads{4}; threads.reserve(num_threads); @@ -539,7 +543,7 @@ TEST_F(ArenaTest, PerThreadToStreamDealloc) // NOLINT // arena that then moved to global arena during a defragmentation // and then moved to a stream arena. auto const arena_size = superblock::minimum_size * 2; - arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + arena_mr mr(rmm::mr::get_current_device_resource_ref(), arena_size); // Create an allocation from a per thread arena void* thread_ptr = mr.allocate(256, rmm::cuda_stream_per_thread); // Create an allocation in a stream arena to force global arena @@ -565,7 +569,7 @@ TEST_F(ArenaTest, PerThreadToStreamDealloc) // NOLINT TEST_F(ArenaTest, DumpLogOnFailure) // NOLINT { - arena_mr mr{rmm::mr::get_current_device_resource(), 1_MiB, true}; + arena_mr mr{rmm::mr::get_current_device_resource_ref(), 1_MiB, true}; { // make the log interesting std::vector threads; diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp index 34a2cc8cc..a56efa60c 100644 --- a/tests/mr/device/callback_mr_tests.cpp +++ b/tests/mr/device/callback_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,47 +36,50 @@ using ::testing::_; TEST(CallbackTest, TestCallbacksAreInvoked) { - auto base_mr = mock_resource(); + auto base_mr = mock_resource(); + auto base_ref = device_async_resource_ref{base_mr}; EXPECT_CALL(base_mr, do_allocate(10_MiB, cuda_stream_view{})).Times(1); EXPECT_CALL(base_mr, do_deallocate(_, 10_MiB, cuda_stream_view{})).Times(1); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_ref, &base_ref); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); } TEST(CallbackTest, LoggingTest) { testing::internal::CaptureStdout(); - auto base_mr = rmm::mr::get_current_device_resource(); + auto base_mr = rmm::mr::get_current_device_resource_ref(); auto allocate_callback = [](std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Allocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - return base_mr->allocate(size, stream); + auto base_mr = *static_cast(arg); + return base_mr.allocate_async(size, stream); }; auto deallocate_callback = [](void* ptr, std::size_t size, cuda_stream_view stream, void* arg) { std::cout << "Deallocating " << size << " bytes" << std::endl; - auto base_mr = static_cast(arg); - base_mr->deallocate(ptr, size, stream); + auto base_mr = *static_cast(arg); + base_mr.deallocate_async(ptr, size, stream); }; auto mr = - rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, base_mr, base_mr); - auto ptr = mr.allocate(10_MiB); - mr.deallocate(ptr, 10_MiB); + rmm::mr::callback_memory_resource(allocate_callback, deallocate_callback, &base_mr, &base_mr); + auto const size = std::size_t{10_MiB}; + auto* ptr = mr.allocate(size); + mr.deallocate(ptr, size); std::string output = testing::internal::GetCapturedStdout(); - std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", 10_MiB, 10_MiB); + std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", size, size); ASSERT_EQ(expect, output); } diff --git a/tests/mr/device/failure_callback_mr_tests.cpp b/tests/mr/device/failure_callback_mr_tests.cpp index 683aee86e..4b3d084d5 100644 --- a/tests/mr/device/failure_callback_mr_tests.cpp +++ b/tests/mr/device/failure_callback_mr_tests.cpp @@ -47,7 +47,8 @@ bool failure_handler(std::size_t /*bytes*/, void* arg) TEST(FailureCallbackTest, RetryAllocationOnce) { bool retried{false}; - failure_callback_adaptor<> mr{rmm::mr::get_current_device_resource(), failure_handler, &retried}; + failure_callback_adaptor<> mr{ + rmm::mr::get_current_device_resource_ref(), failure_handler, &retried}; EXPECT_EQ(retried, false); EXPECT_THROW(mr.allocate(512_GiB), std::bad_alloc); EXPECT_EQ(retried, true); diff --git a/tests/mr/device/limiting_mr_tests.cpp b/tests/mr/device/limiting_mr_tests.cpp index 777ce9428..e6cc97029 100644 --- 
a/tests/mr/device/limiting_mr_tests.cpp +++ b/tests/mr/device/limiting_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,19 +25,19 @@ namespace rmm::test { namespace { -using Limiting_adaptor = rmm::mr::limiting_resource_adaptor; +using limiting_adaptor = rmm::mr::limiting_resource_adaptor; TEST(LimitingTest, ThrowOnNullUpstream) { auto const max_size{5_MiB}; - auto construct_nullptr = []() { Limiting_adaptor mr{nullptr, max_size}; }; + auto construct_nullptr = []() { limiting_adaptor mr{nullptr, max_size}; }; EXPECT_THROW(construct_nullptr(), rmm::logic_error); } TEST(LimitingTest, TooBig) { auto const max_size{5_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; EXPECT_THROW(mr.allocate(max_size + 1), rmm::out_of_memory); } @@ -45,15 +45,15 @@ TEST(LimitingTest, UpstreamFailure) { auto const max_size_1{2_MiB}; auto const max_size_2{5_MiB}; - Limiting_adaptor mr1{rmm::mr::get_current_device_resource(), max_size_1}; - Limiting_adaptor mr2{&mr1, max_size_2}; + limiting_adaptor mr1{rmm::mr::get_current_device_resource_ref(), max_size_1}; + limiting_adaptor mr2{&mr1, max_size_2}; EXPECT_THROW(mr2.allocate(4_MiB), rmm::out_of_memory); } TEST(LimitingTest, UnderLimitDueToFrees) { auto const max_size{10_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; @@ -81,7 +81,7 @@ TEST(LimitingTest, UnderLimitDueToFrees) TEST(LimitingTest, OverLimit) { auto const max_size{10_MiB}; - Limiting_adaptor mr{rmm::mr::get_current_device_resource(), max_size}; + limiting_adaptor mr{rmm::mr::get_current_device_resource_ref(), max_size}; auto const size1{4_MiB}; auto* ptr1 = mr.allocate(size1); auto allocated_bytes = size1; diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index c63a61844..9db63eb1b 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -49,19 +49,7 @@ TEST(PoolTest, ThrowMaxLessThanInitial) auto max_less_than_initial = []() { const auto initial{1024}; const auto maximum{256}; - pool_mr mr{rmm::mr::get_current_device_resource(), initial, maximum}; - }; - EXPECT_THROW(max_less_than_initial(), rmm::logic_error); -} - -TEST(PoolTest, ReferenceThrowMaxLessThanInitial) -{ - // Make sure first argument is enough larger than the second that alignment rounding doesn't - // make them equal - auto max_less_than_initial = []() { - const auto initial{1024}; - const auto maximum{256}; - pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), initial, maximum}; }; EXPECT_THROW(max_less_than_initial(), rmm::logic_error); } @@ -72,7 +60,7 @@ TEST(PoolTest, AllocateNinetyPercent) auto const [free, total] = rmm::available_device_memory(); (void)total; auto const ninety_percent_pool = rmm::percent_of_free_device_memory(90); - pool_mr mr{rmm::mr::get_current_device_resource(), ninety_percent_pool}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), ninety_percent_pool}; }; EXPECT_NO_THROW(allocate_ninety()); } @@ -81,7 +69,7 @@ TEST(PoolTest, TwoLargeBuffers) { 
auto two_large = []() { [[maybe_unused]] auto const [free, total] = rmm::available_device_memory(); - pool_mr mr{rmm::mr::get_current_device_resource(), rmm::percent_of_free_device_memory(50)}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), rmm::percent_of_free_device_memory(50)}; auto* ptr1 = mr.allocate(free / 4); auto* ptr2 = mr.allocate(free / 4); mr.deallocate(ptr1, free / 4); @@ -116,7 +104,7 @@ TEST(PoolTest, ForceGrowth) TEST(PoolTest, DeletedStream) { - pool_mr mr{rmm::mr::get_current_device_resource(), 0}; + pool_mr mr{rmm::mr::get_current_device_resource_ref(), 0}; cudaStream_t stream{}; // we don't use rmm::cuda_stream here to make destruction more explicit const int size = 10000; EXPECT_EQ(cudaSuccess, cudaStreamCreate(&stream)); @@ -129,7 +117,7 @@ TEST(PoolTest, DeletedStream) TEST(PoolTest, InitialAndMaxPoolSizeEqual) { EXPECT_NO_THROW([]() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000192, 1000192); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000192, 1000192); mr.allocate(1000); }()); } @@ -138,14 +126,14 @@ TEST(PoolTest, NonAlignedPoolSize) { EXPECT_THROW( []() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000031, 1000192); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000031, 1000192); mr.allocate(1000); }(), rmm::logic_error); EXPECT_THROW( []() { - pool_mr mr(rmm::mr::get_current_device_resource(), 1000192, 1000200); + pool_mr mr(rmm::mr::get_current_device_resource_ref(), 1000192, 1000200); mr.allocate(1000); }(), rmm::logic_error); @@ -203,18 +191,18 @@ namespace test_properties { class fake_async_resource { public: // To model `async_resource` - void* allocate(std::size_t, std::size_t) { return nullptr; } - void deallocate(void* ptr, std::size_t, std::size_t) {} - void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } - void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} + static void* allocate(std::size_t, std::size_t) { return nullptr; } + static void deallocate(void* ptr, std::size_t, std::size_t) {} + static void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } + static void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} bool operator==(const fake_async_resource& other) const { return true; } bool operator!=(const fake_async_resource& other) const { return false; } private: - void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } - void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} - [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; } + static void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } + static void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} + [[nodiscard]] static bool do_is_equal(fake_async_resource const& other) noexcept { return true; } }; static_assert(!cuda::has_property); static_assert(!cuda::has_property, diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp index 6c5700f0b..f796a4c00 100644 --- a/tests/mr/device/statistics_mr_tests.cpp +++ b/tests/mr/device/statistics_mr_tests.cpp @@ -40,7 +40,7 @@ TEST(StatisticsTest, ThrowOnNullUpstream) TEST(StatisticsTest, Empty) { - statistics_adaptor mr{rmm::mr::get_current_device_resource()}; + statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()}; EXPECT_EQ(mr.get_bytes_counter().peak, 0); EXPECT_EQ(mr.get_bytes_counter().total, 0); @@ -53,7 +53,7 @@ 
diff --git a/tests/mr/device/statistics_mr_tests.cpp b/tests/mr/device/statistics_mr_tests.cpp
index 6c5700f0b..f796a4c00 100644
--- a/tests/mr/device/statistics_mr_tests.cpp
+++ b/tests/mr/device/statistics_mr_tests.cpp
@@ -40,7 +40,7 @@ TEST(StatisticsTest, ThrowOnNullUpstream)
 TEST(StatisticsTest, Empty)
 {
-  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()};

   EXPECT_EQ(mr.get_bytes_counter().peak, 0);
   EXPECT_EQ(mr.get_bytes_counter().total, 0);
@@ -53,7 +53,7 @@ TEST(StatisticsTest, Empty)
 TEST(StatisticsTest, AllFreed)
 {
-  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   allocations.reserve(num_allocations);
@@ -71,7 +71,7 @@ TEST(StatisticsTest, AllFreed)
 TEST(StatisticsTest, PeakAllocations)
 {
-  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
@@ -127,9 +127,9 @@ TEST(StatisticsTest, PeakAllocations)
 TEST(StatisticsTest, MultiTracking)
 {
-  auto* orig_device_resource = rmm::mr::get_current_device_resource();
+  auto orig_device_resource = rmm::mr::get_current_device_resource_ref();
   statistics_adaptor mr{orig_device_resource};
-  rmm::mr::set_current_device_resource(&mr);
+  rmm::mr::set_current_device_resource_ref(mr);

   std::vector<std::shared_ptr<rmm::device_buffer>> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
@@ -139,8 +139,8 @@ TEST(StatisticsTest, MultiTracking)
   EXPECT_EQ(mr.get_allocations_counter().value, 10);

-  statistics_adaptor inner_mr{rmm::mr::get_current_device_resource()};
-  rmm::mr::set_current_device_resource(&inner_mr);
+  statistics_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()};
+  rmm::mr::set_current_device_resource_ref(inner_mr);

   for (std::size_t i = 0; i < num_more_allocations; ++i) {
     allocations.emplace_back(
@@ -172,7 +172,7 @@ TEST(StatisticsTest, MultiTracking)
   EXPECT_EQ(inner_mr.get_allocations_counter().peak, 5);

   // Reset the current device resource
-  rmm::mr::set_current_device_resource(orig_device_resource);
+  rmm::mr::set_current_device_resource_ref(orig_device_resource);
 }

 TEST(StatisticsTest, NegativeInnerTracking)
@@ -180,7 +180,7 @@ TEST(StatisticsTest, NegativeInnerTracking)
   // This tests the unlikely scenario where pointers are deallocated on an inner
   // wrapped memory resource. This can happen if the MR is not saved with the
   // memory pointer
-  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
     allocations.push_back(mr.allocate(ten_MiB));
@@ -236,7 +236,7 @@ TEST(StatisticsTest, NegativeInnerTracking)
 TEST(StatisticsTest, Nested)
 {
-  statistics_adaptor mr{rmm::mr::get_current_device_resource()};
+  statistics_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   void* a0 = mr.allocate(ten_MiB);
   EXPECT_EQ(mr.get_bytes_counter().value, ten_MiB);
   EXPECT_EQ(mr.get_allocations_counter().value, 1);
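Note that in `MultiTracking`, `orig_device_resource` becomes a value-semantic ref rather than a raw pointer, so saving and restoring the current resource no longer needs `&` or `*`. A sketch of the counter behaviour these tests assert, assuming the `statistics_adaptor` alias used above is `rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>` (scaffold and sizes illustrative):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

#include <cassert>
#include <cstddef>
#include <cstdint>

int main()
{
  rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> mr{
    rmm::mr::get_current_device_resource_ref()};

  auto const bytes = std::size_t{1} << 20;
  void* ptr = mr.allocate(bytes);
  assert(mr.get_bytes_counter().value == static_cast<std::int64_t>(bytes));  // outstanding now
  mr.deallocate(ptr, bytes);
  assert(mr.get_bytes_counter().value == 0);                                 // all freed...
  assert(mr.get_bytes_counter().peak == static_cast<std::int64_t>(bytes));   // ...peak is retained
  return 0;
}
```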
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index 91ae396ed..84f599957 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -37,9 +37,9 @@ namespace {
 struct allocator_test : public mr_ref_test {};

 // Disable until we support resource_ref with set_current_device_resource
-/*TEST_P(allocator_test, first)
+TEST_P(allocator_test, first)
 {
-  rmm::mr::set_current_device_resource(this->mr.get());
+  rmm::mr::set_current_device_resource_ref(this->ref);
   auto const num_ints{100};
   rmm::device_vector<int> ints(num_ints, 1);
   EXPECT_EQ(num_ints, thrust::reduce(ints.begin(), ints.end()));
@@ -47,12 +47,12 @@ struct allocator_test : public mr_ref_test {};

 TEST_P(allocator_test, defaults)
 {
-  rmm::mr::set_current_device_resource(this->mr.get());
+  rmm::mr::set_current_device_resource_ref(this->ref);
   rmm::mr::thrust_allocator<int> allocator(rmm::cuda_stream_default);
   EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default);
   EXPECT_EQ(allocator.get_upstream_resource(),
-            rmm::device_async_resource_ref{rmm::mr::get_current_device_resource()});
-}*/
+            rmm::device_async_resource_ref{rmm::mr::get_current_device_resource_ref()});
+}

 TEST_P(allocator_test, multi_device)
 {
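Re-enabling these tests is the payoff of the migration: `set_current_device_resource_ref` accepts the fixture's resource ref directly, which the old pointer-based setter could not. The re-enabled `first` test boils down to the following pattern (standalone sketch; compile as CUDA C++ with nvcc):

```cpp
#include <rmm/device_vector.hpp>
#include <thrust/reduce.h>

int main()
{
  // rmm::device_vector allocates through the current device resource via
  // rmm::mr::thrust_allocator, instead of thrust's default cudaMalloc/cudaFree.
  rmm::device_vector<int> ints(100, 1);
  int const sum = thrust::reduce(ints.begin(), ints.end());
  return sum == 100 ? 0 : 1;  // expect 100
}
```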
diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp
index 7c2532c60..acd540ae6 100644
--- a/tests/mr/device/tracking_mr_tests.cpp
+++ b/tests/mr/device/tracking_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,14 +42,14 @@ TEST(TrackingTest, ThrowOnNullUpstream)
 TEST(TrackingTest, Empty)
 {
-  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   EXPECT_EQ(mr.get_outstanding_allocations().size(), 0);
   EXPECT_EQ(mr.get_allocated_bytes(), 0);
 }

 TEST(TrackingTest, AllFreed)
 {
-  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   allocations.reserve(num_allocations);
   for (int i = 0; i < num_allocations; ++i) {
@@ -64,7 +64,7 @@ TEST(TrackingTest, AllFreed)
 TEST(TrackingTest, AllocationsLeftWithStacks)
 {
-  tracking_adaptor mr{rmm::mr::get_current_device_resource(), true};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref(), true};
   std::vector<void*> allocations;
   allocations.reserve(num_allocations);
   for (int i = 0; i < num_allocations; ++i) {
@@ -82,7 +82,7 @@ TEST(TrackingTest, AllocationsLeftWithStacks)
 TEST(TrackingTest, AllocationsLeftWithoutStacks)
 {
-  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   allocations.reserve(num_allocations);
   for (int i = 0; i < num_allocations; ++i) {
@@ -101,9 +101,9 @@ TEST(TrackingTest, AllocationsLeftWithoutStacks)
 TEST(TrackingTest, MultiTracking)
 {
-  auto* orig_device_resource = rmm::mr::get_current_device_resource();
+  auto orig_device_resource = rmm::mr::get_current_device_resource_ref();
   tracking_adaptor mr{orig_device_resource, true};
-  rmm::mr::set_current_device_resource(&mr);
+  rmm::mr::set_current_device_resource_ref(mr);

   std::vector<std::shared_ptr<rmm::device_buffer>> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
@@ -113,8 +113,8 @@ TEST(TrackingTest, MultiTracking)
   EXPECT_EQ(mr.get_outstanding_allocations().size(), num_allocations);

-  tracking_adaptor inner_mr{rmm::mr::get_current_device_resource()};
-  rmm::mr::set_current_device_resource(&inner_mr);
+  tracking_adaptor inner_mr{rmm::mr::get_current_device_resource_ref()};
+  rmm::mr::set_current_device_resource_ref(inner_mr);

   for (std::size_t i = 0; i < num_more_allocations; ++i) {
     allocations.emplace_back(
@@ -141,7 +141,7 @@ TEST(TrackingTest, MultiTracking)
   EXPECT_EQ(inner_mr.get_allocated_bytes(), 0);

   // Reset the current device resource
-  rmm::mr::set_current_device_resource(orig_device_resource);
+  rmm::mr::set_current_device_resource_ref(orig_device_resource);
 }

 TEST(TrackingTest, NegativeInnerTracking)
@@ -149,7 +149,7 @@ TEST(TrackingTest, NegativeInnerTracking)
   // This tests the unlikely scenario where pointers are deallocated on an inner
   // wrapped memory resource. This can happen if the MR is not saved with the
   // memory pointer
-  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
     allocations.push_back(mr.allocate(ten_MiB));
@@ -181,7 +181,7 @@ TEST(TrackingTest, NegativeInnerTracking)
 TEST(TrackingTest, DeallocWrongBytes)
 {
-  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
     allocations.push_back(mr.allocate(ten_MiB));
@@ -207,7 +207,7 @@ TEST(TrackingTest, LogOutstandingAllocations)
   rmm::logger().sinks().push_back(oss_sink);
   auto old_level = rmm::logger().level();

-  tracking_adaptor mr{rmm::mr::get_current_device_resource()};
+  tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
   for (std::size_t i = 0; i < num_allocations; ++i) {
     allocations.push_back(mr.allocate(ten_MiB));
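The tracking adaptor follows the same pattern as the statistics adaptor above: the upstream is bound by ref, and outstanding allocations and byte counts remain observable either way. A sketch, assuming `tracking_adaptor` is `rmm::mr::tracking_resource_adaptor<rmm::mr::device_memory_resource>` as aliased in the test file (scaffold illustrative):

```cpp
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/tracking_resource_adaptor.hpp>

#include <cassert>

int main()
{
  rmm::mr::tracking_resource_adaptor<rmm::mr::device_memory_resource> mr{
    rmm::mr::get_current_device_resource_ref()};

  void* ptr = mr.allocate(256);
  assert(mr.get_outstanding_allocations().size() == 1);  // one live allocation tracked
  assert(mr.get_allocated_bytes() == 256);

  mr.deallocate(ptr, 256);
  assert(mr.get_outstanding_allocations().empty());      // fully released
  assert(mr.get_allocated_bytes() == 0);
  return 0;
}
```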
diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp
index 8445ab1f5..071739575 100644
--- a/tests/mr/host/mr_ref_tests.cpp
+++ b/tests/mr/host/mr_ref_tests.cpp
@@ -233,14 +233,17 @@ TYPED_TEST(MRRefTest, UnsupportedAlignmentTest)
   for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) {
     for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
          alignment *= TestedAlignmentMultiplier) {
+#ifdef NDEBUG
       auto allocation_size = size_distribution(generator);
       void* ptr{nullptr};
       // An unsupported alignment (like an odd number) should result in an
       // alignment of `alignof(std::max_align_t)`
       auto const bad_alignment = alignment + 1;
+
       EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment));
       EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t)));
       EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment));
+#endif
     }
   }
 }
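The new `NDEBUG` guard reflects that the fallback to `alignof(std::max_align_t)` for unsupported alignments is only exercised in release builds; debug builds may assert on the bad alignment before the fallback applies, so the checks are compiled out there. A hypothetical, host-only illustration of the `is_aligned` predicate the test relies on (the helper name matches the test's; everything else is scaffold):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Same shape as the test's is_aligned(ptr, alignment) check.
bool is_aligned(void const* ptr, std::size_t alignment)
{
  return reinterpret_cast<std::uintptr_t>(ptr) % alignment == 0;
}

int main()
{
  // malloc returns storage suitably aligned for std::max_align_t, the same
  // floor the host resources fall back to for an unsupported alignment.
  void* ptr = std::malloc(64);
  assert(is_aligned(ptr, alignof(std::max_align_t)));
  std::free(ptr);
  return 0;
}
```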