Merge branch 'branch-24.10' of github.com:rapidsai/rmm into failure_alternate_resource_adaptor
madsbk committed Sep 10, 2024
2 parents 865d010 + 6729def commit 38aaf5c
Showing 55 changed files with 800 additions and 427 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -771,8 +771,8 @@ of 1 GiB and a maximum size of 4 GiB. The pool uses
>>> import rmm
>>> pool = rmm.mr.PoolMemoryResource(
... rmm.mr.CudaMemoryResource(),
... initial_pool_size=2**30,
... maximum_pool_size=2**32
... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30
... maximum_pool_size="4GiB"
... )
>>> rmm.mr.set_current_device_resource(pool)
```
18 changes: 9 additions & 9 deletions benchmarks/device_uvector/device_uvector_bench.cu
@@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
@@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_UvectorSizeConstruction)
@@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
&cuda_mr, rmm::percent_of_free_device_memory(50)};
rmm::mr::set_current_device_resource(&mr);
rmm::mr::set_current_device_resource_ref(mr);

for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
rmm::device_vector<std::int32_t> vec(state.range(0));
@@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)

state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK(BM_ThrustVectorSizeConstruction)
@@ -140,7 +140,7 @@ template <typename Vector>
void BM_VectorWorkflow(benchmark::State& state)
{
rmm::mr::cuda_async_memory_resource cuda_async_mr{};
rmm::mr::set_current_device_resource(&cuda_async_mr);
rmm::mr::set_current_device_resource_ref(cuda_async_mr);

rmm::cuda_stream input_stream;
std::vector<rmm::cuda_stream> streams(4);
@@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state)
auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses;
state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
@@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector) // NOLINT
->Unit(benchmark::kMicrosecond)
->UseManualTime();

// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
// `cuda_async_memory_resource`, which is faster.
// The only difference here is that `rmm::device_vector` uses
// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses
// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster.
BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector) // NOLINT
->RangeMultiplier(10) // NOLINT
->Range(100'000, 100'000'000) // NOLINT
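
The pattern in this file — install a resource with `set_current_device_resource_ref`, run the workload, then call `reset_current_device_resource_ref()` before the resource is destroyed — can be reproduced outside the benchmark harness. A minimal sketch, assuming RMM headers and a CUDA device are available (the 50% pool sizing simply mirrors the benchmark above, it is not a recommendation):

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <cstdint>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
    &cuda_mr, rmm::percent_of_free_device_memory(50)};

  // Install the pool by reference; no raw device_memory_resource* is involved.
  rmm::mr::set_current_device_resource_ref(pool_mr);

  {
    // Containers that default to the current device resource now draw from pool_mr.
    rmm::device_uvector<std::int32_t> vec(1'000'000, rmm::cuda_stream_view{});
  }

  // Restore the default resource before pool_mr goes out of scope.
  rmm::mr::reset_current_device_resource_ref();
  return 0;
}
```
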
@@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
{
auto mr = factory();

rmm::mr::set_current_device_resource(mr.get());
rmm::mr::set_current_device_resource_ref(mr.get());

auto num_streams = state.range(0);
auto num_kernels = state.range(1);
@@ -92,7 +92,7 @@

state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_kernels));

rmm::mr::set_current_device_resource(nullptr);
rmm::mr::reset_current_device_resource_ref();
}

inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
8 changes: 6 additions & 2 deletions doxygen/Doxyfile
@@ -2176,7 +2176,11 @@ INCLUDE_FILE_PATTERNS =
# recursively expanded use the := operator instead of the = operator.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

PREDEFINED = RMM_NAMESPACE=rmm RMM_EXPORT
# These need to be kept in sync with the macros set in rmm/detail/export.hpp
# Since we are excluding detail files in EXCLUDE_PATTERNS there
# appears to be no way of getting doxygen to still parse that file and
# make the definitions available via the preprocessor :(
PREDEFINED = RMM_EXPORT RMM_HIDDEN RMM_NAMESPACE=rmm

# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
# tag can be used to specify a list of macro names that should be expanded. The
@@ -2185,7 +2189,7 @@ PREDEFINED = RMM_NAMESPACE=rmm RMM_EXPORT
# definition found in the source code.
# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.

EXPAND_AS_DEFINED = RMM_NAMESPACE
EXPAND_AS_DEFINED =

# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
# remove all references to function-like macros that are alone on a line, have
1 change: 1 addition & 0 deletions include/rmm/detail/export.hpp
@@ -24,4 +24,5 @@
#else
#define RMM_EXPORT
#define RMM_HIDDEN
#define RMM_NAMESPACE rmm
#endif
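
For context, a sketch of how these macros are consumed by the headers in this diff (for example `namespace RMM_NAMESPACE {` in device_buffer.hpp). Only the fallback branch shown above is illustrated; the attribute-emitting branch of export.hpp is assumed and omitted here:

```cpp
// Fallback branch of export.hpp: the annotations expand to nothing and the
// namespace macro expands to the plain `rmm` name.
#define RMM_EXPORT
#define RMM_HIDDEN
#define RMM_NAMESPACE rmm

namespace RMM_NAMESPACE {  // expands to: namespace rmm {

RMM_EXPORT struct example {  // expands to: struct example {
  int value{};
};

}  // namespace RMM_NAMESPACE
```

The added fallback definition ensures `RMM_NAMESPACE` is always available, whichever branch of the header is taken.
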
14 changes: 7 additions & 7 deletions include/rmm/device_buffer.hpp
@@ -41,7 +41,7 @@ namespace RMM_NAMESPACE {
*
* This class allocates untyped and *uninitialized* device memory using a
* `device_async_resource_ref`. If not explicitly specified, the memory resource
* returned from `get_current_device_resource()` is used.
* returned from `get_current_device_resource_ref()` is used.
*
* @note Unlike `std::vector` or `thrust::device_vector`, the device memory
* allocated by a `device_buffer` is uninitialized. Therefore, it is undefined
@@ -95,7 +95,7 @@ class device_buffer {
// `__host__ __device__` specifiers to the defaulted constructor when it is called within the
// context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host-
// device function. This causes warnings/errors because this ctor invokes host-only functions.
device_buffer() : _mr{rmm::mr::get_current_device_resource()} {}
device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}

/**
* @brief Constructs a new device buffer of `size` uninitialized bytes
@@ -109,7 +109,7 @@
*/
explicit device_buffer(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
@@ -138,7 +138,7 @@ class device_buffer {
device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _stream{stream}, _mr{mr}
{
cuda_set_device_raii dev{_device};
@@ -169,7 +169,7 @@
*/
device_buffer(device_buffer const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: device_buffer{other.data(), other.size(), stream, mr}
{
}
@@ -419,8 +419,8 @@ class device_buffer {
cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation

rmm::device_async_resource_ref _mr{
rmm::mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
rmm::mr::get_current_device_resource_ref()}; ///< The memory resource used to
///< allocate/deallocate device memory
cuda_device_id _device{get_current_cuda_device()};

/**
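
A short usage sketch of the updated defaults (assumed host code with a valid CUDA context): omitting the resource argument binds `get_current_device_resource_ref()`, and an explicit resource can still be supplied because `device_async_resource_ref` binds to any suitable resource object:

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::cuda_stream stream;

  // 1 KiB of uninitialized device memory from the current device resource ref.
  rmm::device_buffer default_mr_buf(1024, stream.view());

  // The same, but with an explicit resource bound as a device_async_resource_ref.
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::device_buffer explicit_mr_buf(1024, stream.view(), cuda_mr);

  stream.synchronize();
  return 0;
}
```
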
6 changes: 3 additions & 3 deletions include/rmm/device_scalar.hpp
@@ -95,7 +95,7 @@ class device_scalar {
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
}
@@ -118,7 +118,7 @@
*/
explicit device_scalar(value_type const& initial_value,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{1, stream, mr}
{
set_value_async(initial_value, stream);
@@ -138,7 +138,7 @@
*/
device_scalar(device_scalar const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
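
A matching sketch for `device_scalar` (assumed host code with a valid CUDA context; the update value is kept in a named variable so it remains valid for the duration of the asynchronous copy):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>

int main()
{
  rmm::cuda_stream stream;

  // One device-side int, allocated from get_current_device_resource_ref() by default.
  rmm::device_scalar<int> scalar{42, stream.view()};

  int const new_value = 7;
  scalar.set_value_async(new_value, stream.view());    // asynchronous host-to-device copy
  int const host_value = scalar.value(stream.view());  // copy the value back to the host

  return host_value == 7 ? 0 : 1;
}
```
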
6 changes: 3 additions & 3 deletions include/rmm/device_uvector.hpp
@@ -48,7 +48,7 @@ namespace RMM_NAMESPACE {
*
* Example:
* @code{.cpp}
* rmm::mr::device_memory_resource * mr = new my_custom_resource();
* auto mr = new my_custom_resource();
* rmm::cuda_stream_view s{};
*
* // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the
@@ -126,7 +126,7 @@ class device_uvector {
*/
explicit device_uvector(std::size_t size,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{elements_to_bytes(size), stream, mr}
{
}
@@ -142,7 +142,7 @@
*/
explicit device_uvector(device_uvector const& other,
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: _storage{other._storage, stream, mr}
{
}
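
And a corresponding sketch for `device_uvector`, using `cuda_memory_resource` in place of the doc comment's `my_custom_resource` and passing it by reference rather than as a raw pointer (assumed host code with a valid CUDA context):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::cuda_stream stream;
  rmm::mr::cuda_memory_resource mr{};

  // 100 uninitialized ints allocated on `stream` from `mr`, bound as a resource ref.
  rmm::device_uvector<int> vec(100, stream.view(), mr);

  // Deep copy on the same stream; the resource defaults to get_current_device_resource_ref().
  rmm::device_uvector<int> copy(vec, stream.view());

  stream.synchronize();
  return 0;
}
```
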
4 changes: 2 additions & 2 deletions include/rmm/exec_policy.hpp
@@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t {
* @param mr The resource to use for allocating temporary memory
*/
explicit exec_policy(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_t(
thrust::cuda::par(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
@@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t =
class exec_policy_nosync : public thrust_exec_policy_nosync_t {
public:
explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default,
device_async_resource_ref mr = mr::get_current_device_resource())
device_async_resource_ref mr = mr::get_current_device_resource_ref())
: thrust_exec_policy_nosync_t(
thrust::cuda::par_nosync(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
{
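
A brief sketch of `rmm::exec_policy` with a Thrust algorithm (assumed: Thrust is available through RMM's dependencies; temporary allocations made by the algorithm come from the policy's resource, which now defaults to `get_current_device_resource_ref()`):

```cpp
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/fill.h>
#include <thrust/reduce.h>

int main()
{
  rmm::cuda_stream stream;
  rmm::device_uvector<int> vec(1000, stream.view());

  // Any scratch memory Thrust needs is allocated through the policy's resource.
  thrust::fill(rmm::exec_policy(stream.view()), vec.begin(), vec.end(), 1);
  int const sum = thrust::reduce(rmm::exec_policy(stream.view()), vec.begin(), vec.end());

  return sum == 1000 ? 0 : 1;
}
```
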
2 changes: 1 addition & 1 deletion include/rmm/logger.hpp
@@ -107,7 +107,7 @@ struct bytes {
*
* @return spdlog::logger& The logger.
*/
inline spdlog::logger& logger()
RMM_EXPORT inline spdlog::logger& logger()
{
static detail::logger_wrapper wrapped{};
return wrapped.logger_;
51 changes: 34 additions & 17 deletions include/rmm/mr/device/aligned_resource_adaptor.hpp
@@ -20,6 +20,7 @@
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cstddef>
@@ -59,20 +60,40 @@ class aligned_resource_adaptor final : public device_memory_resource {
/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `allocation_alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
explicit aligned_resource_adaptor(device_async_resource_ref upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}

/**
* @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
*
* @throws rmm::logic_error if `upstream == nullptr`
* @throws rmm::logic_error if `alignment` is not a power of 2
*
* @param upstream The resource used for allocating/deallocating device memory.
* @param alignment The size used for allocation alignment.
* @param alignment_threshold Only allocations with a size larger than or equal to this threshold
* are aligned.
*/
explicit aligned_resource_adaptor(Upstream* upstream,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
std::size_t alignment_threshold = default_alignment_threshold)
: upstream_{to_device_async_resource_ref_checked(upstream)},
alignment_{alignment},
alignment_threshold_{alignment_threshold}
{
RMM_EXPECTS(rmm::is_supported_alignment(alignment),
"Allocation alignment is not a power of 2.");
}
@@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource {
return upstream_;
}

/**
* @briefreturn{Upstream* to the upstream memory resource}
*/
[[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }

/**
* @brief The default alignment used by the adaptor.
*/
@@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
using lock_guard = std::lock_guard<std::mutex>;

/**
* @brief Allocates memory of size at least `bytes` using the upstream resource with the specified
* alignment.
* @brief Allocates memory of size at least `bytes` using the upstream resource with the
* specified alignment.
*
* @throws rmm::bad_alloc if the requested allocation could not be fulfilled
* by the upstream resource.
@@ -119,10 +135,10 @@
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return upstream_->allocate(bytes, stream);
return get_upstream_resource().allocate_async(bytes, 1, stream);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = upstream_->allocate(size, stream);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
@@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
upstream_->deallocate(ptr, bytes, stream);
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
} else {
{
lock_guard lock(mtx_);
@@ -155,7 +171,7 @@
pointers_.erase(iter);
}
}
upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream);
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
}
}

@@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
}

/**
* @brief Calculate the allocation size needed from upstream to account for alignments of both the
* size and the base pointer.
* @brief Calculate the allocation size needed from upstream to account for alignments of both
* the size and the base pointer.
*
* @param bytes The requested allocation size.
* @return Allocation size needed from upstream to align both the size and the base pointer.
@@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT;
}

Upstream* upstream_; ///< The upstream resource used for satisfying allocation requests
/// The upstream resource used for satisfying allocation requests
device_async_resource_ref upstream_;
std::unordered_map<void*, void*> pointers_; ///< Map of aligned pointers to upstream pointers.
std::size_t alignment_; ///< The size used for allocation alignment
std::size_t alignment_threshold_; ///< The size above which allocations should be aligned
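
Finally, a usage sketch of the new `device_async_resource_ref` constructor (the 4 KiB alignment and 64 KiB threshold are illustrative values, not defaults):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream{};

  // The upstream is passed by resource ref; no raw pointer and no null check needed.
  rmm::mr::aligned_resource_adaptor<rmm::mr::cuda_memory_resource> aligned_mr{
    upstream, /*alignment=*/4096, /*alignment_threshold=*/1 << 16};

  // Allocations at or above the threshold come back aligned to 4096 bytes.
  rmm::device_buffer buf(1 << 20, rmm::cuda_stream_view{}, aligned_mr);
  return 0;
}
```
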