From 90adef2d6d55f324067b1f750742a75a0efcf2af Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:22:49 +0200 Subject: [PATCH 01/30] Add cccl repo to build dependencies --- CMakeLists.txt | 1 + cmake/thirdparty/get_libcudacxx.cmake | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 cmake/thirdparty/get_libcudacxx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c883bc4c..808e52ded 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ rapids_cpm_init() include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) +include(cmake/thirdparty/get_libcudacxx.cmake) include(cmake/thirdparty/get_thrust.cmake) # ################################################################################################## diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake new file mode 100644 index 000000000..b1f990d39 --- /dev/null +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Use CPM to find or clone libcudacxx +function(find_and_configure_libcudacxx) + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) + +endfunction() + +find_and_configure_libcudacxx() From a49fe9c352b4609f011895ed59d936abdb898915 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:29:12 +0200 Subject: [PATCH 02/30] PoC for the new design of `cuda::mr::{async_}resource_ref` --- CMakeLists.txt | 1 + include/rmm/cuda_stream_view.hpp | 16 + .../mr/device/callback_memory_resource.hpp | 4 +- .../rmm/mr/device/device_memory_resource.hpp | 150 +++++++ include/rmm/mr/host/host_memory_resource.hpp | 35 ++ tests/CMakeLists.txt | 7 + tests/mr/device/adaptor_tests.cpp | 26 ++ tests/mr/device/cuda_async_mr_tests.cpp | 2 + tests/mr/device/cuda_async_view_mr_tests.cpp | 4 + .../mr/device/mr_ref_multithreaded_tests.cpp | 221 ++++++++++ tests/mr/device/mr_ref_test.hpp | 412 ++++++++++++++++++ tests/mr/device/mr_ref_tests.cpp | 109 +++++ tests/mr/host/mr_ref_tests.cpp | 263 +++++++++++ tests/mr/host/mr_tests.cpp | 5 + 14 files changed, 1253 insertions(+), 2 deletions(-) create mode 100644 tests/mr/device/mr_ref_multithreaded_tests.cpp create mode 100644 tests/mr/device/mr_ref_test.hpp create mode 100644 tests/mr/device/mr_ref_tests.cpp create mode 100644 tests/mr/host/mr_ref_tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 808e52ded..9455c4267 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,7 @@ target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) target_compile_features(rmm INTERFACE cxx_std_17 $) 
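# The definition added below opts RMM into libcudacxx's experimental <cuda/memory_resource>
# machinery (cuda::mr::resource_ref / cuda::mr::async_resource_ref and the property system),
# which the later patches in this series build on.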
+target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # ################################################################################################## # * tests and benchmarks --------------------------------------------------------------------------- diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index fe07fa1b9..ad2768535 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -58,6 +60,13 @@ class cuda_stream_view { */ constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} + /** + * @brief Implicit conversion from cuda::stream_ref. + * + * @param stream The underlying stream for this view + */ + constexpr cuda_stream_view(cuda::stream_ref stream) noexcept : stream_{stream.get()} {} + /** * @brief Get the wrapped stream. * @@ -72,6 +81,13 @@ class cuda_stream_view { */ constexpr operator cudaStream_t() const noexcept { return value(); } + /** + * @brief Implicit conversion to stream_ref. + * + * @return cuda::stream_ref The underlying stream referenced by this cuda_stream_view + */ + constexpr operator cuda::stream_ref() const noexcept { return value(); } + /** * @briefreturn{true if the wrapped stream is the CUDA per-thread default stream} */ diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c6519ed5c..36802c83a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -143,8 +143,8 @@ class callback_memory_resource final : public device_memory_resource { throw std::runtime_error("cannot get free / total memory"); } - [[nodiscard]] virtual bool supports_streams() const noexcept { return false; } - [[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; } + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } allocate_callback_t allocate_callback_; deallocate_callback_t deallocate_callback_; diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 8ad84644b..cc1ae2c3f 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -18,6 +18,9 @@ #include #include +#include +#include + #include #include @@ -157,6 +160,150 @@ class device_memory_resource { return do_is_equal(other); } + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate(std::size_t bytes, std::size_t alignment) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Deallocate memory pointed to by \p p. 
+ * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate(void* ptr, std::size_t bytes, std::size_t alignment) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, cuda_stream_view stream) + { + return do_allocate(bytes, stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda_stream_view stream) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. 
+ * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream) + { + do_deallocate(ptr, bytes, stream); + } + + [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + /** * @brief Query whether the resource supports use of non-null CUDA streams for * allocation/deallocation. @@ -185,6 +332,8 @@ class device_memory_resource { return do_get_mem_info(stream); } + friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} + private: /** * @brief Allocates memory of size at least \p bytes. @@ -243,5 +392,6 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index 8a5739b2a..aa40d57cd 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -114,6 +116,37 @@ class host_memory_resource { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ + [[nodiscard]] bool operator==(host_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ + [[nodiscard]] bool operator!=(host_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + + /** + * @brief Enables the `cuda::mr::host_accessible` property + * + * This property declares that a `host_memory_resource` provides host accessible memory + */ + friend void get_property(host_memory_resource const&, cuda::mr::host_accessible) noexcept {} + private: /** * @brief Allocates memory on the host of size at least `bytes` bytes. 
@@ -166,5 +199,7 @@ class host_memory_resource { return this == &other; } }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group + } // namespace rmm::mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 36c3aa043..80e2b846b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,6 +126,10 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) +# device mr tests +ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp + mr/device/mr_ref_multithreaded_tests.cpp) + # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) @@ -162,6 +166,9 @@ ConfigureTest(LIMITING_TEST mr/device/limiting_mr_tests.cpp) # host mr tests ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) +# host mr_ref tests +ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 44c14240b..3e00dd208 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include @@ -64,6 +66,30 @@ using adaptors = ::testing::Types, thread_safe_resource_adaptor, tracking_resource_adaptor>; +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); + template struct AdaptorTest : public ::testing::Test { using adaptor_type = MemoryResourceType; diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 37ed5c306..90c7b0ff9 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -24,6 +24,8 @@ namespace rmm::test { namespace { using cuda_async_mr = rmm::mr::cuda_async_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); class AsyncMRTest : public ::testing::Test { protected: diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index 209429b4b..fe82431a9 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -18,12 +18,16 @@ #include #include +#include + #include namespace rmm::test { namespace { using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); #if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp new file mode 100644 index 000000000..dd39d1e1d --- /dev/null +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace rmm::test { +namespace { + +struct mr_ref_test_mt : public mr_ref_test {}; + +INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); + +template +void spawn_n(std::size_t num_threads, Task task, Arguments&&... args) +{ + std::vector threads; + threads.reserve(num_threads); + for (std::size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(std::thread(task, std::forward(args)...)); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +template +void spawn(Task task, Arguments&&... args) +{ + spawn_n(4, task, std::forward(args)...); +} + +TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } + +TEST_P(mr_ref_test_mt, AllocateDefaultStream) +{ + spawn(test_various_async_allocations, this->ref, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, AllocateOnStream) +{ + spawn(test_various_async_allocations, this->ref, this->stream.view()); +} + +TEST_P(mr_ref_test_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->ref, default_num_allocations, default_max_size); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsDefaultStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + this->stream.view()); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->ref, default_max_size); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeDefaultStream) +{ + spawn( + test_mixed_random_async_allocation_free, this->ref, default_max_size, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_async_allocation_free, this->ref, default_max_size, this->stream.view()); +} + +void allocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t size = size_distribution(generator); + void* ptr = ref.allocate_async(size, stream); + { + std::lock_guard lock(mtx); + RMM_CUDA_TRY(cudaEventRecord(event, stream.value())); + allocations.emplace_back(ptr, size); + } + } +} + +void deallocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& 
allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + for (std::size_t i = 0; i < num_allocations;) { + std::lock_guard lock(mtx); + if (allocations.empty()) { continue; } + i++; + RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); + allocation alloc = allocations.front(); + allocations.pop_front(); + ref.deallocate_async(alloc.ptr, alloc.size, stream); + } +} + +void test_allocate_async_free_different_threads(async_resource_ref ref, + rmm::cuda_stream_view streamA, + rmm::cuda_stream_view streamB) +{ + constexpr std::size_t num_allocations{100}; + + std::mutex mtx; + std::list allocations; + cudaEvent_t event; + + RMM_CUDA_TRY(cudaEventCreate(&event)); + + std::thread producer(allocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamA); + + std::thread consumer(deallocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamB); + + producer.join(); + consumer.join(); + + RMM_CUDA_TRY(cudaEventDestroy(event)); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_default, rmm::cuda_stream_default); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsPerThreadDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_per_thread, rmm::cuda_stream_per_thread); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_async_free_different_threads(this->ref, this->stream, this->stream); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDifferentStream) +{ + rmm::cuda_stream streamB; + test_allocate_async_free_different_threads(this->ref, this->stream, streamB); + streamB.synchronize(); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp new file mode 100644 index 000000000..ecb13c2a7 --- /dev/null +++ b/tests/mr/device/mr_ref_test.hpp @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +using resource_ref = cuda::mr::resource_ref; +using async_resource_ref = cuda::mr::async_resource_ref; + +namespace rmm::test { + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. 
+ */ +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +enum size_in_bytes : size_t {}; + +constexpr auto default_num_allocations{100}; +constexpr size_in_bytes default_max_size{5_MiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; + +// Various test functions, shared between single-threaded and multithreaded tests. +inline void test_allocate(resource_ref ref, std::size_t bytes) +{ + try { + void* ptr = ref.allocate(bytes); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate(ptr, bytes); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +inline void test_allocate_async(async_resource_ref ref, + std::size_t bytes, + cuda_stream_view stream = {}) +{ + try { + void* ptr = ref.allocate_async(bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate_async(ptr, bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +inline void concurrent_allocations_are_different(resource_ref ref) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate(size); + void* ptr2 = ref.allocate(size); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate(ptr1, size); + ref.deallocate(ptr2, size); +} + +inline void concurrent_async_allocations_are_different(async_resource_ref ref, + cuda_stream_view stream) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate_async(size, stream); + void* ptr2 = ref.allocate_async(size, stream); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate_async(ptr1, size, stream); + ref.deallocate_async(ptr2, size, stream); +} + +inline void test_various_allocations(resource_ref ref) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate(0); + EXPECT_NO_THROW(ref.deallocate(ptr, 0)); + } + + test_allocate(ref, 4_B); + test_allocate(ref, 1_KiB); + test_allocate(ref, 1_MiB); + test_allocate(ref, 1_GiB); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate(1_PiB), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate(1_PiB); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_various_async_allocations(async_resource_ref ref, cuda_stream_view stream) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate_async(0, stream); + stream.synchronize(); + EXPECT_NO_THROW(ref.deallocate_async(ptr, 0, stream)); + stream.synchronize(); + } + + test_allocate_async(ref, 4_B, stream); + test_allocate_async(ref, 1_KiB, stream); + test_allocate_async(ref, 1_MiB, stream); + 
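  // The 1 GiB request below may exceed free memory on smaller devices; test_allocate_async
  // treats rmm::out_of_memory as an acceptable outcome (see its try/catch above), so an OOM
  // here does not fail the test.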
test_allocate_async(ref, 1_GiB, stream); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate_async(1_PiB, stream), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate_async(1_PiB, stream); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_random_allocations(resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, &ref](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [&ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +inline void test_random_async_allocations(async_resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each(allocations.begin(), + allocations.end(), + [&generator, &distribution, &ref, stream](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [stream, &ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + }); +} + +inline void test_mixed_random_allocation_free(resource_ref ref, + size_in_bytes max_size = default_max_size) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate(size), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + 
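      // Free a randomly chosen live allocation and drop it from the bookkeeping vector.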
active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate(to_free.ptr, to_free.size)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +inline void test_mixed_random_async_allocation_free(async_resource_ref ref, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate_async(size, stream), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate_async(to_free.ptr, to_free.size, stream)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +using MRFactoryFunc = std::function()>; + +/// Encapsulates a `device_memory_resource` factory function and associated name +struct mr_factory { + mr_factory(std::string name, MRFactoryFunc factory) + : name{std::move(name)}, factory{std::move(factory)} + { + } + + std::string name; ///< Name to associate with tests that use this factory + MRFactoryFunc factory; ///< Factory function that returns shared_ptr to `device_memory_resource` + ///< instance to use in test +}; + +/// Test fixture class value-parameterized on different `mr_factory`s +struct mr_ref_test : public ::testing::TestWithParam { + void SetUp() override + { + auto factory = GetParam().factory; + mr = factory(); + if (mr == nullptr) { + GTEST_SKIP() << "Skipping tests since the memory resource is not supported with this CUDA " + << "driver/runtime version"; + } + ref = async_resource_ref{*mr}; + } + + std::shared_ptr mr; ///< Pointer to resource to use in tests + async_resource_ref ref{*mr}; + rmm::cuda_stream stream{}; +}; + +struct mr_ref_allocation_test : public mr_ref_test {}; + +/// MR factory functions +inline auto make_cuda() { return std::make_shared(); } + +inline auto make_cuda_async() +{ + if (rmm::detail::async_alloc::is_supported()) { + return std::make_shared(); + } + return std::shared_ptr{nullptr}; +} + +inline auto make_managed() { return std::make_shared(); } + +inline auto make_pool() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_arena() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_fixed_size() +{ + return 
rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_binning() +{ + auto pool = make_pool(); + // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB + // Larger allocations will use the pool resource + auto const bin_range_start{18}; + auto const bin_range_end{22}; + + auto mr = rmm::mr::make_owning_wrapper( + pool, bin_range_start, bin_range_end); + return mr; +} + +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp new file mode 100644 index 000000000..2507d468a --- /dev/null +++ b/tests/mr/device/mr_ref_tests.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include + +#include + +namespace rmm::test { +namespace { + +INSTANTIATE_TEST_SUITE_P(ResourceTests, + mr_ref_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}, + mr_factory{"Fixed_Size", &make_fixed_size}), + [](auto const& info) { return info.param.name; }); + +// Leave out fixed-size MR here because it can't handle the dynamic allocation sizes +INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests, + mr_ref_allocation_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); +TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +TEST_P(mr_ref_test, AllocationsAreDifferent) { concurrent_allocations_are_different(this->ref); } + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferentDefaultStream) +{ + concurrent_async_allocations_are_different(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferent) +{ + concurrent_async_allocations_are_different(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, AllocateDefault) { test_various_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, AllocateDefaultStream) +{ + test_various_async_allocations(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, AllocateOnStream) +{ + test_various_async_allocations(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, RandomAllocations) { test_random_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, RandomAllocationsDefaultStream) +{ + test_random_async_allocations( + this->ref, default_num_allocations, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, RandomAllocationsStream) +{ + 
test_random_async_allocations(this->ref, default_num_allocations, default_max_size, this->stream); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFree) +{ + test_mixed_random_allocation_free(this->ref, default_max_size); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeDefaultStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, this->stream); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp new file mode 100644 index 000000000..568e18576 --- /dev/null +++ b/tests/mr/host/mr_ref_tests.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace rmm::test { +namespace { +inline bool is_aligned(void* ptr, std::size_t alignment = alignof(std::max_align_t)) +{ + return rmm::detail::is_pointer_aligned(ptr, alignment); +} + +// Returns true if a pointer points to a device memory or managed memory allocation. +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +/** + * @brief Returns if a pointer `p` points to pinned host memory. 
+ */ +inline bool is_pinned_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } + return attributes.type == cudaMemoryTypeHost; +} + +constexpr std::size_t size_word{4_B}; +constexpr std::size_t size_kb{1_KiB}; +constexpr std::size_t size_mb{1_MiB}; +constexpr std::size_t size_gb{1_GiB}; +constexpr std::size_t size_pb{1_PiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; +} // namespace + +template +struct MRRefTest : public ::testing::Test { + MemoryResourceType mr; + cuda::mr::resource_ref ref; + + MRRefTest() : mr{}, ref{mr} {} +}; + +using resources = ::testing::Types; +static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with, + ""); + +TYPED_TEST_CASE(MRRefTest, resources); + +TYPED_TEST(MRRefTest, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +TYPED_TEST(MRRefTest, AllocateZeroBytes) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(0)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, 0)); +} + +TYPED_TEST(MRRefTest, AllocateWord) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_word)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_word)); +} + +TYPED_TEST(MRRefTest, AllocateKB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_kb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_kb)); +} + +TYPED_TEST(MRRefTest, AllocateMB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_mb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_mb)); +} + +TYPED_TEST(MRRefTest, AllocateGB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_gb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_gb)); +} + +TYPED_TEST(MRRefTest, AllocateTooMuch) +{ + void* ptr{nullptr}; + EXPECT_THROW(ptr = this->ref.allocate(size_pb), std::bad_alloc); + EXPECT_EQ(nullptr, ptr); +} + +TYPED_TEST(MRRefTest, RandomAllocations) +{ + constexpr std::size_t num_allocations{100}; + std::vector allocations(num_allocations); + + constexpr std::size_t MAX_ALLOCATION_SIZE{5 * size_mb}; + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, MAX_ALLOCATION_SIZE); + + // 100 allocations from [0,5MB) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, this](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = this->ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(is_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [this](allocation& alloc) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +TYPED_TEST(MRRefTest, MixedRandomAllocationFree) +{ + std::default_random_engine generator; + + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + // How often a free will occur. 
For example, if `1`, then every allocation + // will immediately be free'd. Or, if 4, on average, a free will occur after + // every 4th allocation + constexpr std::size_t FREE_FREQUENCY{4}; + std::uniform_int_distribution free_distribution(1, FREE_FREQUENCY); + + std::deque allocations; + + constexpr std::size_t num_allocations{100}; + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t allocation_size = size_distribution(generator); + EXPECT_NO_THROW(allocations.emplace_back(this->ref.allocate(allocation_size), allocation_size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(is_aligned(new_allocation.ptr)); + + bool const free_front{free_distribution(generator) == free_distribution.max()}; + + if (free_front) { + auto front = allocations.front(); + EXPECT_NO_THROW(this->ref.deallocate(front.ptr, front.size)); + allocations.pop_front(); + } + } + // free any remaining allocations + for (auto alloc : allocations) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + allocations.pop_front(); + } +} + +static constexpr std::size_t MinTestedAlignment{16}; +static constexpr std::size_t MaxTestedAlignment{4096}; +static constexpr std::size_t TestedAlignmentMultiplier{2}; +static constexpr std::size_t NUM_TRIALS{100}; + +TYPED_TEST(MRRefTest, AlignmentTest) +{ + std::default_random_engine generator(0); + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, alignment)); + EXPECT_TRUE(is_aligned(ptr, alignment)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, alignment)); + } + } +} + +TYPED_TEST(MRRefTest, UnsupportedAlignmentTest) +{ + std::default_random_engine generator(0); + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + // An unsupported alignment (like an odd number) should result in an + // alignment of `alignof(std::max_align_t)` + auto const bad_alignment = alignment + 1; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment)); + EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t))); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment)); + } + } +} + +TEST(PinnedResource, isPinned) +{ + rmm::mr::pinned_memory_resource mr; + cuda::mr::resource_ref ref{mr}; + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = ref.allocate(100)); + EXPECT_TRUE(is_pinned_memory(ptr)); + EXPECT_NO_THROW(ref.deallocate(ptr, 100)); +} +} // namespace rmm::test diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp index 1cd59f5a6..ff976ff96 100644 --- a/tests/mr/host/mr_tests.cpp +++ b/tests/mr/host/mr_tests.cpp @@ -23,6 +23,8 @@ #include +#include + #include #include @@ -80,6 +82,9 @@ struct MRTest : public ::testing::Test { }; using resources = ::testing::Types; 
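// The static_asserts added below verify at compile time that the existing host resources model
// the new cuda::mr resource concept together with the host_accessible property, i.e. they can be
// bound to a non-owning cuda::mr::resource_ref without further changes. A minimal usage sketch
// (illustration only, not part of the patch):
//   rmm::mr::new_delete_resource mr;
//   cuda::mr::resource_ref<cuda::mr::host_accessible> ref{mr};
//   void* ptr = ref.allocate(64);
//   ref.deallocate(ptr, 64);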
+static_assert(cuda::mr::resource_with, "");
+static_assert(cuda::mr::resource_with,
+              "");
 TYPED_TEST_CASE(MRTest, resources);

From 0e78d9e336a4103ea62fbd9e398b67c2c5d3fee6 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Fri, 8 Sep 2023 14:14:48 +0200
Subject: [PATCH 03/30] Make `pinned_memory_resource` usable for `pool_memory_resource`

---
 .../rmm/mr/device/pool_memory_resource.hpp | 69 ++++++++++++-
 .../rmm/mr/host/pinned_memory_resource.hpp | 74 ++++++++++++++
 tests/CMakeLists.txt | 3 +
 tests/mr/device/pool_mr_tests.cpp | 51 ++++++++++
 tests/mr/host/pinned_pool_mr_tests.cpp | 96 +++++++++++++++++++
 5 files changed, 290 insertions(+), 3 deletions(-)
 create mode 100644 tests/mr/host/pinned_pool_mr_tests.cpp

diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp
index f6d3710e9..7cca60613 100644
--- a/include/rmm/mr/device/pool_memory_resource.hpp
+++ b/include/rmm/mr/device/pool_memory_resource.hpp
@@ -51,6 +51,35 @@ namespace rmm::mr {
 * @{
 * @file
 */
+namespace detail {
+/**
+ * @brief A helper class to remove the device_accessible property
+ *
+ * We want to be able to use the pool_memory_resource with an upstream that may not
+ * be device accessible. To avoid rewriting the world, we conditionally remove
+ * the cuda::mr::device_accessible property.
+ *
+ * @tparam PoolResource the pool_memory_resource class
+ * @tparam Upstream memory_resource to use for allocating the pool.
+ */
+template 
+struct maybe_remove_property {};
+
+/**
+ * @brief Specialization of maybe_remove_property that does not propagate non-existent properties
+ */
+template 
+struct maybe_remove_property<
+  PoolResource,
+  Upstream,
+  cuda::std::enable_if_t>> {
+  /**
+   * @brief Explicit removal of the friend function so we do not pretend to provide device
+   * accessible memory
+   */
+  friend void get_property(const PoolResource&, cuda::mr::device_accessible) = delete;
+};
+} // namespace detail

 /**
 * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from
@@ -64,8 +93,10 @@ namespace rmm::mr {
 */
 template 
 class pool_memory_resource final
-  : public detail::stream_ordered_memory_resource,
-    detail::coalescing_free_list> {
+  : public detail::maybe_remove_property, Upstream>,
+    public detail::stream_ordered_memory_resource,
+      detail::coalescing_free_list>,
+    public cuda::forward_property, Upstream> {
  public:
  friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>;
@@ -104,6 +135,31 @@ class pool_memory_resource final
     initialize_pool(initial_pool_size, maximum_pool_size);
   }
+  /**
+   * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using
+   * `upstream_mr`.
+   *
+   * @throws rmm::logic_error if `upstream_mr == nullptr`
+   * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a
+   * multiple of pool_memory_resource::allocation_alignment bytes.
+   * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a
+   * multiple of pool_memory_resource::allocation_alignment bytes.
+   *
+   * @param upstream_mr The memory_resource from which to allocate blocks for the pool.
+   * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to half of the
+   * available memory on the current device.
+   * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all
+   * of the available memory on the current device.
+   */
+  template , int> = 0>
+  explicit pool_memory_resource(Upstream2& upstream_mr,
+                                thrust::optional initial_pool_size = thrust::nullopt,
+                                thrust::optional maximum_pool_size = thrust::nullopt)
+    : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)
+  {
+  }
+
 /**
 * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using
 * the upstream resource.
@@ -131,6 +187,13 @@ class pool_memory_resource final
 */
 [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; }
+  /**
+   * @brief Get the upstream memory_resource object.
+   *
+   * @return const reference to the upstream memory resource.
+   */
+  [[nodiscard]] const Upstream& upstream_resource() const noexcept { return *upstream_mr_; }
+
 /**
 * @brief Get the upstream memory_resource object.
 *
@@ -296,7 +359,7 @@ class pool_memory_resource final
 if (size == 0) { return {}; }
 try {
-  void* ptr = get_upstream()->allocate(size, stream);
+  void* ptr = get_upstream()->allocate_async(size, stream);
 return thrust::optional{
 *upstream_blocks_.emplace(static_cast(ptr), size, true).first};
 } catch (std::exception const& e) {

diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp
index 514cc1664..5da2d8b7b 100644
--- a/include/rmm/mr/host/pinned_memory_resource.hpp
+++ b/include/rmm/mr/host/pinned_memory_resource.hpp
@@ -15,6 +15,7 @@
 */
 #pragma once
+#include
 #include
 #include
 #include
@@ -46,6 +47,78 @@ class pinned_memory_resource final : public host_memory_resource {
 pinned_memory_resource& operator=(pinned_memory_resource&&) = default; ///< @default_move_assignment{pinned_memory_resource}
+  /**
+   * @brief Query whether the pinned_memory_resource supports use of non-null CUDA streams for
+   * allocation/deallocation.
+   *
+   * @returns bool false.
+   */
+  [[nodiscard]] bool supports_streams() const noexcept { return false; }
+
+  /**
+   * @brief Query whether the resource supports the get_mem_info API.
+   *
+   * @return bool false.
+   */
+  [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; }
+
+  /**
+   * @brief Queries the amount of free and total memory for the resource.
+   *
+   * @param stream the stream whose memory manager we want to retrieve
+   *
+   * @returns a pair containing the free memory in bytes in .first and total amount of memory in
+   * .second
+   */
+  [[nodiscard]] std::pair get_mem_info(cuda_stream_view stream) const
+  {
+    return std::make_pair(0, 0);
+  }
+
+  /**
+   * @brief Pretend to support the allocate_async interface, falling back to stream 0
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @param alignment The expected alignment of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  [[nodiscard]] void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view)
+  {
+    return do_allocate(bytes, alignment);
+  }
+
+  /**
+   * @brief Pretend to support the allocate_async interface, falling back to stream 0
+   *
+   * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on
+   * the specified `stream`.
+   *
+   * @param bytes The size of the allocation
+   * @return void* Pointer to the newly allocated memory
+   */
+  [[nodiscard]] void* allocate_async(std::size_t bytes, cuda_stream_view)
+  {
+    return do_allocate(bytes);
+  }
+
+  /**
+   * @brief Pretend to support the deallocate_async interface, falling back to stream 0
+   *
+   * @throws Nothing.
+   *
+   * @param ptr Pointer to be deallocated
+   * @param bytes The size in bytes of the allocation. This must be equal to the
+   * value of `bytes` that was passed to the `allocate` call that returned `p`.
+   * @param alignment The alignment that was passed to the `allocate` call that returned `p`
+   */
+  void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda_stream_view)
+  {
+    do_deallocate(ptr, rmm::detail::align_up(bytes, alignment));
+  }
+
 private:
 /**
 * @brief Allocates pinned memory on the host of size at least `bytes` bytes.
@@ -101,5 +174,6 @@ class pinned_memory_resource final : public host_memory_resource {
 ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); });
 }
 };
+static_assert(cuda::mr::async_resource);
 /** @} */ // end of group
 } // namespace rmm::mr

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 80e2b846b..4e9866905 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -169,6 +169,9 @@ ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp)
 # host mr_ref tests
 ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp)
+# pinned pool mr tests
+ConfigureTest(PINNED_POOL_MR_TEST mr/host/pinned_pool_mr_tests.cpp)
+
 # cuda stream tests
 ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp)

diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index 4a234d2f9..2f32889d0 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -55,6 +55,18 @@ TEST(PoolTest, ThrowMaxLessThanInitial)
 EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
 }
+TEST(PoolTest, ReferenceThrowMaxLessThanInitial)
+{
+  // Make sure first argument is enough larger than the second that alignment rounding doesn't
+  // make them equal
+  auto max_less_than_initial = []() {
+    const auto initial{1024};
+    const auto maximum{256};
+    pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum};
+  };
+  EXPECT_THROW(max_less_than_initial(), rmm::logic_error);
+}
+
 TEST(PoolTest, AllocateNinetyPercent)
 {
 auto allocate_ninety = []() {
@@ -190,4 +202,43 @@ TEST(PoolTest, MultidevicePool)
 }
 } // namespace
+
+namespace test_properties {
+class fake_async_resource {
+ public:
+  // To model `async_resource`
+  void* allocate(std::size_t, std::size_t) { return nullptr; }
+  void deallocate(void* ptr, std::size_t, std::size_t) {}
+  void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; }
+  void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {}
+
+  bool operator==(const fake_async_resource& other) const { return true; }
+  bool operator!=(const fake_async_resource& other) const { return false; }
+
+  // To model stream_resource
+  [[nodiscard]] bool supports_streams() const noexcept { return false; }
+  [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; }
+
+ private:
+  void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; }
+  void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {}
+  [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; }
+};
+static_assert(!cuda::has_property);
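// fake_async_resource intentionally does not define the device_accessible property; the checks
// here verify that a pool built on such an upstream does not advertise device accessibility,
// while the fake_async_resource_device_accessible type below shows the property being forwarded
// when the upstream does provide it.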
+static_assert(!cuda::has_property, + cuda::mr::device_accessible>); + +// Ensure that we forward the property if it is there +class fake_async_resource_device_accessible : public fake_async_resource { + friend void get_property(const fake_async_resource_device_accessible&, + cuda::mr::device_accessible) + { + } +}; +static_assert( + cuda::has_property); +static_assert( + cuda::has_property, + cuda::mr::device_accessible>); +} // namespace test_properties } // namespace rmm::test diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp new file mode 100644 index 000000000..8e202ada3 --- /dev/null +++ b/tests/mr/host/pinned_pool_mr_tests.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +// explicit instantiation for test coverage purposes +template class rmm::mr::pool_memory_resource; + +namespace rmm::test { +namespace { +using pool_mr = rmm::mr::pool_memory_resource; + +TEST(PinnedPoolTest, ThrowOnNullUpstream) +{ + auto construct_nullptr = []() { pool_mr mr{nullptr}; }; + EXPECT_THROW(construct_nullptr(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{&pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ReferenceThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +// Issue #527 +TEST(PinnedPoolTest, InitialAndMaxPoolSizeEqual) +{ + EXPECT_NO_THROW([]() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000192); + mr.allocate(1000); + }()); +} + +TEST(PinnedPoolTest, NonAlignedPoolSize) +{ + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000031, 1000192); + mr.allocate(1000); + }(), + rmm::logic_error); + + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000200); + mr.allocate(1000); + }(), + rmm::logic_error); +} + +} // namespace +} // namespace rmm::test From 4d25841ad29c94a9b8d6f0602b3bdfc85c04528b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 14:15:09 +0200 Subject: [PATCH 04/30] Port `device_buffer` and `device_uvector` to the new interface --- include/rmm/device_buffer.hpp | 34 +++++++++----- 
include/rmm/device_uvector.hpp | 27 +++++++---- .../rmm/mr/device/device_memory_resource.hpp | 25 +++++++++-- include/rmm/mr/device/per_device_resource.hpp | 29 ++++++++++++ tests/device_buffer_tests.cu | 45 ++++++++----------- tests/device_uvector_tests.cpp | 5 ++- 6 files changed, 112 insertions(+), 53 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index a49f9caa9..811c8d9cb 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,6 +27,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -38,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_memory_resource`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -79,6 +81,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -92,7 +96,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -106,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -134,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -164,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -238,7 +242,6 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -399,16 +402,23 @@ class device_buffer { /** * @briefreturn{Pointer to the memory resource used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } + + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_buffer` provides device accessible memory + */ + friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} private: void* _data{nullptr}; ///< Pointer to device 
memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + async_resource_ref _mr{ + mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -423,7 +433,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -437,7 +447,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 930cda157..cfa6eebd4 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } @@ -526,11 +527,19 @@ class device_uvector { /** * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] cuda::mr::async_resource_ref memory_resource() + const noexcept { return _storage.memory_resource(); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_uvector` provides device accessible memory + */ + friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} + /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 
cc1ae2c3f..92baf3f0a 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -193,7 +193,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -257,7 +257,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -284,7 +284,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform allocation @@ -294,11 +294,25 @@ class device_memory_resource { do_deallocate(ptr, bytes, stream); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept { return !do_is_equal(other); @@ -332,6 +346,11 @@ class device_memory_resource { return do_get_mem_info(stream); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_memory_resource` provides device accessible memory + */ friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} private: diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index aa7217758..1c4108a25 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -24,6 +24,8 @@ #include #include +#include + /** * @file per_device_resource.hpp * @brief Management of per-device `device_memory_resource`s @@ -233,4 +235,31 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc { return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } + +/** + * @brief Get the memory resource for the current device as a `cuda::mr::async_resource_ref`. + * + * Returns a `cuda::mr::async_resource_ref` from the memory_resource set for the current device. + * The initial resource is a `cuda_memory_resource`. + * + * The "current device" is the device returned by `cudaGetDevice`. 
+ * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, + * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. + * Concurrent calls to any of these functions will result in a valid state, but the order of + * execution is undefined. + * + * @note The returned `cuda::mr::async_resource_ref` should only be used with the current CUDA + * device. Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned + * resource can result in undefined behavior. The behavior of a device_memory_resource is undefined + * if used while the active CUDA device is a different device from the one that was active when the + * device_memory_resource was created. + * + * @return cuda::mr::async_resource_ref of the resource for the current device + */ +inline cuda::mr::async_resource_ref get_current_device_resource_ref() +{ + return cuda::mr::async_resource_ref{get_current_device_resource()}; +} + } // namespace rmm::mr diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index d4c34385e..56992a06c 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,6 +57,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -73,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -84,30 +85,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -119,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + 
EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -135,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -148,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -174,8 +173,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -187,7 +185,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -217,8 +215,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -243,7 +240,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -275,7 +271,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); 
EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -291,7 +286,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -309,7 +304,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -319,7 +313,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -338,7 +332,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -347,7 +340,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -366,7 +359,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -375,7 +367,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -394,7 +386,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -403,7 +394,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..46023d519 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); 
TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource_ref()); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From de71dcc6a58a2d3a6000c060305b79d78eee3ff0 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:22:49 +0200 Subject: [PATCH 05/30] Add cccl repo to build dependencies --- CMakeLists.txt | 2 ++ cmake/thirdparty/get_libcudacxx.cmake | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 cmake/thirdparty/get_libcudacxx.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c883bc4c..79119e4fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ rapids_cpm_init() include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) +include(cmake/thirdparty/get_libcudacxx.cmake) include(cmake/thirdparty/get_thrust.cmake) # ################################################################################################## @@ -89,6 +90,7 @@ else() target_link_libraries(rmm INTERFACE CUDA::cudart) endif() +target_link_libraries(rmm INTERFACE libcudacxx::libcudacxx) target_link_libraries(rmm INTERFACE rmm::Thrust) target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake new file mode 100644 index 000000000..b1f990d39 --- /dev/null +++ b/cmake/thirdparty/get_libcudacxx.cmake @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +# Use CPM to find or clone libcudacxx +function(find_and_configure_libcudacxx) + + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) + rapids_cpm_libcudacxx(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports) + +endfunction() + +find_and_configure_libcudacxx() From 13a990fdaa45b5de18958835c7daee6fbaf520f1 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 12:29:12 +0200 Subject: [PATCH 06/30] PoC for the new design of `cuda::mr::{async_}resource_ref` --- CMakeLists.txt | 1 + include/rmm/cuda_stream_view.hpp | 16 + .../mr/device/callback_memory_resource.hpp | 4 +- .../rmm/mr/device/device_memory_resource.hpp | 150 +++++++ include/rmm/mr/host/host_memory_resource.hpp | 35 ++ tests/CMakeLists.txt | 7 + tests/mr/device/adaptor_tests.cpp | 26 ++ tests/mr/device/cuda_async_mr_tests.cpp | 2 + tests/mr/device/cuda_async_view_mr_tests.cpp | 4 + .../mr/device/mr_ref_multithreaded_tests.cpp | 221 ++++++++++ tests/mr/device/mr_ref_test.hpp | 412 ++++++++++++++++++ tests/mr/device/mr_ref_tests.cpp | 109 +++++ tests/mr/host/mr_ref_tests.cpp | 263 +++++++++++ tests/mr/host/mr_tests.cpp | 5 + 14 files changed, 1253 insertions(+), 2 deletions(-) create mode 100644 tests/mr/device/mr_ref_multithreaded_tests.cpp create mode 100644 tests/mr/device/mr_ref_test.hpp create mode 100644 tests/mr/device/mr_ref_tests.cpp create mode 100644 tests/mr/host/mr_ref_tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 79119e4fe..93fcdabe9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) target_compile_features(rmm INTERFACE cxx_std_17 $) +target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE) # ################################################################################################## # * tests and benchmarks --------------------------------------------------------------------------- diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index fe07fa1b9..ad2768535 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -58,6 +60,13 @@ class cuda_stream_view { */ constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} + /** + * @brief Implicit conversion from cuda::stream_ref. + * + * @param stream The underlying stream for this view + */ + constexpr cuda_stream_view(cuda::stream_ref stream) noexcept : stream_{stream.get()} {} + /** * @brief Get the wrapped stream. * @@ -72,6 +81,13 @@ class cuda_stream_view { */ constexpr operator cudaStream_t() const noexcept { return value(); } + /** + * @brief Implicit conversion to stream_ref. 
+ * + * @return cuda::stream_ref The underlying stream referenced by this cuda_stream_view + */ + constexpr operator cuda::stream_ref() const noexcept { return value(); } + /** * @briefreturn{true if the wrapped stream is the CUDA per-thread default stream} */ diff --git a/include/rmm/mr/device/callback_memory_resource.hpp b/include/rmm/mr/device/callback_memory_resource.hpp index c6519ed5c..36802c83a 100644 --- a/include/rmm/mr/device/callback_memory_resource.hpp +++ b/include/rmm/mr/device/callback_memory_resource.hpp @@ -143,8 +143,8 @@ class callback_memory_resource final : public device_memory_resource { throw std::runtime_error("cannot get free / total memory"); } - [[nodiscard]] virtual bool supports_streams() const noexcept { return false; } - [[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; } + [[nodiscard]] bool supports_streams() const noexcept override { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } allocate_callback_t allocate_callback_; deallocate_callback_t deallocate_callback_; diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 8ad84644b..cc1ae2c3f 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -18,6 +18,9 @@ #include #include +#include +#include + #include #include @@ -157,6 +160,150 @@ class device_memory_resource { return do_is_equal(other); } + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate(std::size_t bytes, std::size_t alignment) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate(void* ptr, std::size_t bytes, std::size_t alignment) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), cuda_stream_view{}); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. 
+ * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view stream) + { + return do_allocate(rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Allocates memory of size at least \p bytes. + * + * The returned pointer will have at minimum 256 byte alignment. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param stream Stream on which to perform allocation + * @return void* Pointer to the newly allocated memory + */ + void* allocate_async(std::size_t bytes, cuda_stream_view stream) + { + return do_allocate(bytes, stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda_stream_view stream) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment), stream); + } + + /** + * @brief Deallocate memory pointed to by \p p. + * + * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * a `device_memory_resource` that compares equal to `*this`, and the storage + * it points to must not yet have been deallocated, otherwise behavior is + * undefined. + * + * If supported, this operation may optionally be executed on a stream. + * Otherwise, the stream is ignored and the null stream is used. + * + * @throws Nothing. + * + * @param p Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform allocation + */ + void deallocate_async(void* ptr, std::size_t bytes, cuda_stream_view stream) + { + do_deallocate(ptr, bytes, stream); + } + + [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + /** * @brief Query whether the resource supports use of non-null CUDA streams for * allocation/deallocation. @@ -185,6 +332,8 @@ class device_memory_resource { return do_get_mem_info(stream); } + friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} + private: /** * @brief Allocates memory of size at least \p bytes. 
@@ -243,5 +392,6 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index 8a5739b2a..aa40d57cd 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -114,6 +116,37 @@ class host_memory_resource { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ + [[nodiscard]] bool operator==(host_memory_resource const& other) const noexcept + { + return do_is_equal(other); + } + + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ + [[nodiscard]] bool operator!=(host_memory_resource const& other) const noexcept + { + return !do_is_equal(other); + } + + /** + * @brief Enables the `cuda::mr::host_accessible` property + * + * This property declares that a `host_memory_resource` provides host accessible memory + */ + friend void get_property(host_memory_resource const&, cuda::mr::host_accessible) noexcept {} + private: /** * @brief Allocates memory on the host of size at least `bytes` bytes. @@ -166,5 +199,7 @@ class host_memory_resource { return this == &other; } }; +static_assert(cuda::mr::resource_with, ""); /** @} */ // end of group + } // namespace rmm::mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 36c3aa043..80e2b846b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,6 +126,10 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) +# device mr tests +ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp + mr/device/mr_ref_multithreaded_tests.cpp) + # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) @@ -162,6 +166,9 @@ ConfigureTest(LIMITING_TEST mr/device/limiting_mr_tests.cpp) # host mr tests ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) +# host mr_ref tests +ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 44c14240b..3e00dd208 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include @@ -64,6 +66,30 @@ using adaptors = ::testing::Types, thread_safe_resource_adaptor, tracking_resource_adaptor>; +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, ""); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); 
+static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>, + ""); +static_assert( + cuda::mr::resource_with, cuda::mr::device_accessible>, + ""); + template struct AdaptorTest : public ::testing::Test { using adaptor_type = MemoryResourceType; diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 37ed5c306..90c7b0ff9 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -24,6 +24,8 @@ namespace rmm::test { namespace { using cuda_async_mr = rmm::mr::cuda_async_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); class AsyncMRTest : public ::testing::Test { protected: diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index 209429b4b..fe82431a9 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -18,12 +18,16 @@ #include #include +#include + #include namespace rmm::test { namespace { using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); #if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp new file mode 100644 index 000000000..dd39d1e1d --- /dev/null +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace rmm::test { +namespace { + +struct mr_ref_test_mt : public mr_ref_test {}; + +INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); + +template +void spawn_n(std::size_t num_threads, Task task, Arguments&&... args) +{ + std::vector threads; + threads.reserve(num_threads); + for (std::size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(std::thread(task, std::forward(args)...)); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +template +void spawn(Task task, Arguments&&... 
args) +{ + spawn_n(4, task, std::forward(args)...); +} + +TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } + +TEST_P(mr_ref_test_mt, AllocateDefaultStream) +{ + spawn(test_various_async_allocations, this->ref, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, AllocateOnStream) +{ + spawn(test_various_async_allocations, this->ref, this->stream.view()); +} + +TEST_P(mr_ref_test_mt, RandomAllocations) +{ + spawn(test_random_allocations, this->ref, default_num_allocations, default_max_size); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsDefaultStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, RandomAllocationsStream) +{ + spawn(test_random_async_allocations, + this->ref, + default_num_allocations, + default_max_size, + this->stream.view()); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFree) +{ + spawn(test_mixed_random_allocation_free, this->ref, default_max_size); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeDefaultStream) +{ + spawn( + test_mixed_random_async_allocation_free, this->ref, default_max_size, rmm::cuda_stream_view{}); +} + +TEST_P(mr_ref_test_mt, MixedRandomAllocationFreeStream) +{ + spawn(test_mixed_random_async_allocation_free, this->ref, default_max_size, this->stream.view()); +} + +void allocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + constexpr std::size_t max_size{1_MiB}; + + std::default_random_engine generator; + std::uniform_int_distribution size_distribution(1, max_size); + + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t size = size_distribution(generator); + void* ptr = ref.allocate_async(size, stream); + { + std::lock_guard lock(mtx); + RMM_CUDA_TRY(cudaEventRecord(event, stream.value())); + allocations.emplace_back(ptr, size); + } + } +} + +void deallocate_async_loop(async_resource_ref ref, + std::size_t num_allocations, + std::list& allocations, + std::mutex& mtx, + cudaEvent_t& event, + rmm::cuda_stream_view stream) +{ + for (std::size_t i = 0; i < num_allocations;) { + std::lock_guard lock(mtx); + if (allocations.empty()) { continue; } + i++; + RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); + allocation alloc = allocations.front(); + allocations.pop_front(); + ref.deallocate_async(alloc.ptr, alloc.size, stream); + } +} + +void test_allocate_async_free_different_threads(async_resource_ref ref, + rmm::cuda_stream_view streamA, + rmm::cuda_stream_view streamB) +{ + constexpr std::size_t num_allocations{100}; + + std::mutex mtx; + std::list allocations; + cudaEvent_t event; + + RMM_CUDA_TRY(cudaEventCreate(&event)); + + std::thread producer(allocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamA); + + std::thread consumer(deallocate_async_loop, + ref, + num_allocations, + std::ref(allocations), + std::ref(mtx), + std::ref(event), + streamB); + + producer.join(); + consumer.join(); + + RMM_CUDA_TRY(cudaEventDestroy(event)); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_default, rmm::cuda_stream_default); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsPerThreadDefaultStream) +{ + test_allocate_async_free_different_threads( + this->ref, rmm::cuda_stream_per_thread, 
rmm::cuda_stream_per_thread); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsSameStream) +{ + test_allocate_async_free_different_threads(this->ref, this->stream, this->stream); +} + +TEST_P(mr_ref_test_mt, AllocFreeDifferentThreadsDifferentStream) +{ + rmm::cuda_stream streamB; + test_allocate_async_free_different_threads(this->ref, this->stream, streamB); + streamB.synchronize(); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp new file mode 100644 index 000000000..ecb13c2a7 --- /dev/null +++ b/tests/mr/device/mr_ref_test.hpp @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +using resource_ref = cuda::mr::resource_ref; +using async_resource_ref = cuda::mr::async_resource_ref; + +namespace rmm::test { + +/** + * @brief Returns if a pointer points to a device memory or managed memory + * allocation. + */ +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +enum size_in_bytes : size_t {}; + +constexpr auto default_num_allocations{100}; +constexpr size_in_bytes default_max_size{5_MiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; + +// Various test functions, shared between single-threaded and multithreaded tests. 
+inline void test_allocate(resource_ref ref, std::size_t bytes) +{ + try { + void* ptr = ref.allocate(bytes); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate(ptr, bytes); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +inline void test_allocate_async(async_resource_ref ref, + std::size_t bytes, + cuda_stream_view stream = {}) +{ + try { + void* ptr = ref.allocate_async(bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(ptr)); + EXPECT_TRUE(is_device_memory(ptr)); + ref.deallocate_async(ptr, bytes, stream); + if (not stream.is_default()) { stream.synchronize(); } + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } +} + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +inline void concurrent_allocations_are_different(resource_ref ref) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate(size); + void* ptr2 = ref.allocate(size); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate(ptr1, size); + ref.deallocate(ptr2, size); +} + +inline void concurrent_async_allocations_are_different(async_resource_ref ref, + cuda_stream_view stream) +{ + const auto size{8_B}; + void* ptr1 = ref.allocate_async(size, stream); + void* ptr2 = ref.allocate_async(size, stream); + + EXPECT_NE(ptr1, ptr2); + + ref.deallocate_async(ptr1, size, stream); + ref.deallocate_async(ptr2, size, stream); +} + +inline void test_various_allocations(resource_ref ref) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate(0); + EXPECT_NO_THROW(ref.deallocate(ptr, 0)); + } + + test_allocate(ref, 4_B); + test_allocate(ref, 1_KiB); + test_allocate(ref, 1_MiB); + test_allocate(ref, 1_GiB); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate(1_PiB), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate(1_PiB); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_various_async_allocations(async_resource_ref ref, cuda_stream_view stream) +{ + // test allocating zero bytes on non-default stream + { + void* ptr = ref.allocate_async(0, stream); + stream.synchronize(); + EXPECT_NO_THROW(ref.deallocate_async(ptr, 0, stream)); + stream.synchronize(); + } + + test_allocate_async(ref, 4_B, stream); + test_allocate_async(ref, 1_KiB, stream); + test_allocate_async(ref, 1_MiB, stream); + test_allocate_async(ref, 1_GiB, stream); + + // should fail to allocate too much + { + void* ptr{nullptr}; + EXPECT_THROW(ptr = ref.allocate_async(1_PiB, stream), rmm::out_of_memory); + EXPECT_EQ(nullptr, ptr); + + // test e.what(); + try { + ptr = ref.allocate_async(1_PiB, stream); + } catch (rmm::out_of_memory const& e) { + EXPECT_NE(std::string{e.what()}.find("out_of_memory"), std::string::npos); + } + } +} + +inline void test_random_allocations(resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each( + allocations.begin(), 
allocations.end(), [&generator, &distribution, &ref](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [&ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +inline void test_random_async_allocations(async_resource_ref ref, + std::size_t num_allocations = default_num_allocations, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::vector allocations(num_allocations); + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, max_size); + + // num_allocations allocations from [0,max_size) + std::for_each(allocations.begin(), + allocations.end(), + [&generator, &distribution, &ref, stream](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = ref.allocate(alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [stream, &ref](allocation& alloc) { + EXPECT_NO_THROW(ref.deallocate(alloc.ptr, alloc.size)); + if (not stream.is_default()) { stream.synchronize(); } + }); +} + +inline void test_mixed_random_allocation_free(resource_ref ref, + size_in_bytes max_size = default_max_size) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate(size), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate(to_free.ptr, to_free.size)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +inline void test_mixed_random_async_allocation_free(async_resource_ref ref, + size_in_bytes max_size = default_max_size, + cuda_stream_view stream = {}) +{ + std::default_random_engine generator; + constexpr std::size_t num_allocations{100}; + + std::uniform_int_distribution size_distribution(1, max_size); + + constexpr int allocation_probability{53}; // percent + constexpr int max_probability{99}; + std::uniform_int_distribution op_distribution(0, max_probability); + std::uniform_int_distribution index_distribution(0, num_allocations - 
1); + + std::size_t active_allocations{0}; + std::size_t allocation_count{0}; + + std::vector allocations; + + for (std::size_t i = 0; i < num_allocations * 2; ++i) { + bool do_alloc = true; + if (active_allocations > 0) { + int chance = op_distribution(generator); + do_alloc = (chance < allocation_probability) && (allocation_count < num_allocations); + } + + if (do_alloc) { + std::size_t size = size_distribution(generator); + active_allocations++; + allocation_count++; + EXPECT_NO_THROW(allocations.emplace_back(ref.allocate_async(size, stream), size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(rmm::detail::is_pointer_aligned(new_allocation.ptr)); + } else { + auto const index = static_cast(index_distribution(generator) % active_allocations); + active_allocations--; + allocation to_free = allocations[index]; + allocations.erase(std::next(allocations.begin(), index)); + EXPECT_NO_THROW(ref.deallocate_async(to_free.ptr, to_free.size, stream)); + } + } + + EXPECT_EQ(active_allocations, 0); + EXPECT_EQ(allocations.size(), active_allocations); +} + +using MRFactoryFunc = std::function()>; + +/// Encapsulates a `device_memory_resource` factory function and associated name +struct mr_factory { + mr_factory(std::string name, MRFactoryFunc factory) + : name{std::move(name)}, factory{std::move(factory)} + { + } + + std::string name; ///< Name to associate with tests that use this factory + MRFactoryFunc factory; ///< Factory function that returns shared_ptr to `device_memory_resource` + ///< instance to use in test +}; + +/// Test fixture class value-parameterized on different `mr_factory`s +struct mr_ref_test : public ::testing::TestWithParam { + void SetUp() override + { + auto factory = GetParam().factory; + mr = factory(); + if (mr == nullptr) { + GTEST_SKIP() << "Skipping tests since the memory resource is not supported with this CUDA " + << "driver/runtime version"; + } + ref = async_resource_ref{*mr}; + } + + std::shared_ptr mr; ///< Pointer to resource to use in tests + async_resource_ref ref{*mr}; + rmm::cuda_stream stream{}; +}; + +struct mr_ref_allocation_test : public mr_ref_test {}; + +/// MR factory functions +inline auto make_cuda() { return std::make_shared(); } + +inline auto make_cuda_async() +{ + if (rmm::detail::async_alloc::is_supported()) { + return std::make_shared(); + } + return std::shared_ptr{nullptr}; +} + +inline auto make_managed() { return std::make_shared(); } + +inline auto make_pool() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_arena() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_fixed_size() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + +inline auto make_binning() +{ + auto pool = make_pool(); + // Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB + // Larger allocations will use the pool resource + auto const bin_range_start{18}; + auto const bin_range_end{22}; + + auto mr = rmm::mr::make_owning_wrapper( + pool, bin_range_start, bin_range_end); + return mr; +} + +} // namespace rmm::test diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp new file mode 100644 index 000000000..2507d468a --- /dev/null +++ b/tests/mr/device/mr_ref_tests.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mr_ref_test.hpp" + +#include + +#include + +#include + +namespace rmm::test { +namespace { + +INSTANTIATE_TEST_SUITE_P(ResourceTests, + mr_ref_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}, + mr_factory{"Fixed_Size", &make_fixed_size}), + [](auto const& info) { return info.param.name; }); + +// Leave out fixed-size MR here because it can't handle the dynamic allocation sizes +INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests, + mr_ref_allocation_test, + ::testing::Values(mr_factory{"CUDA", &make_cuda}, +#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT + mr_factory{"CUDA_Async", &make_cuda_async}, +#endif + mr_factory{"Managed", &make_managed}, + mr_factory{"Pool", &make_pool}, + mr_factory{"Arena", &make_arena}, + mr_factory{"Binning", &make_binning}), + [](auto const& info) { return info.param.name; }); +TEST_P(mr_ref_test, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +// Simple reproducer for https://github.com/rapidsai/rmm/issues/861 +TEST_P(mr_ref_test, AllocationsAreDifferent) { concurrent_allocations_are_different(this->ref); } + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferentDefaultStream) +{ + concurrent_async_allocations_are_different(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_test, AsyncAllocationsAreDifferent) +{ + concurrent_async_allocations_are_different(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, AllocateDefault) { test_various_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, AllocateDefaultStream) +{ + test_various_async_allocations(this->ref, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, AllocateOnStream) +{ + test_various_async_allocations(this->ref, this->stream); +} + +TEST_P(mr_ref_allocation_test, RandomAllocations) { test_random_allocations(this->ref); } + +TEST_P(mr_ref_allocation_test, RandomAllocationsDefaultStream) +{ + test_random_async_allocations( + this->ref, default_num_allocations, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, RandomAllocationsStream) +{ + test_random_async_allocations(this->ref, default_num_allocations, default_max_size, this->stream); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFree) +{ + test_mixed_random_allocation_free(this->ref, default_max_size); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeDefaultStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, cuda_stream_view{}); +} + +TEST_P(mr_ref_allocation_test, MixedRandomAllocationFreeStream) +{ + test_mixed_random_async_allocation_free(this->ref, default_max_size, this->stream); +} + +} // namespace +} // namespace rmm::test diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp new file mode 100644 index 000000000..568e18576 --- /dev/null +++ b/tests/mr/host/mr_ref_tests.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2021, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../byte_literals.hpp" + +#include +#include +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace rmm::test { +namespace { +inline bool is_aligned(void* ptr, std::size_t alignment = alignof(std::max_align_t)) +{ + return rmm::detail::is_pointer_aligned(ptr, alignment); +} + +// Returns true if a pointer points to a device memory or managed memory allocation. +inline bool is_device_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } +#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 + return attributes.memoryType == cudaMemoryTypeDevice; +#else + return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); +#endif +} + +/** + * @brief Returns if a pointer `p` points to pinned host memory. + */ +inline bool is_pinned_memory(void* ptr) +{ + cudaPointerAttributes attributes{}; + if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } + return attributes.type == cudaMemoryTypeHost; +} + +constexpr std::size_t size_word{4_B}; +constexpr std::size_t size_kb{1_KiB}; +constexpr std::size_t size_mb{1_MiB}; +constexpr std::size_t size_gb{1_GiB}; +constexpr std::size_t size_pb{1_PiB}; + +struct allocation { + void* ptr{nullptr}; + std::size_t size{0}; + allocation(void* ptr, std::size_t size) : ptr{ptr}, size{size} {} + allocation() = default; +}; +} // namespace + +template +struct MRRefTest : public ::testing::Test { + MemoryResourceType mr; + cuda::mr::resource_ref ref; + + MRRefTest() : mr{}, ref{mr} {} +}; + +using resources = ::testing::Types; +static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with, + ""); + +TYPED_TEST_CASE(MRRefTest, resources); + +TYPED_TEST(MRRefTest, SelfEquality) { EXPECT_TRUE(this->ref == this->ref); } + +TYPED_TEST(MRRefTest, AllocateZeroBytes) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(0)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, 0)); +} + +TYPED_TEST(MRRefTest, AllocateWord) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_word)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_word)); +} + +TYPED_TEST(MRRefTest, AllocateKB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_kb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_kb)); +} + +TYPED_TEST(MRRefTest, AllocateMB) +{ + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(size_mb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_mb)); +} + +TYPED_TEST(MRRefTest, AllocateGB) +{ + void* ptr{nullptr}; + 
EXPECT_NO_THROW(ptr = this->ref.allocate(size_gb)); + EXPECT_NE(nullptr, ptr); + EXPECT_TRUE(is_aligned(ptr)); + EXPECT_FALSE(is_device_memory(ptr)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, size_gb)); +} + +TYPED_TEST(MRRefTest, AllocateTooMuch) +{ + void* ptr{nullptr}; + EXPECT_THROW(ptr = this->ref.allocate(size_pb), std::bad_alloc); + EXPECT_EQ(nullptr, ptr); +} + +TYPED_TEST(MRRefTest, RandomAllocations) +{ + constexpr std::size_t num_allocations{100}; + std::vector allocations(num_allocations); + + constexpr std::size_t MAX_ALLOCATION_SIZE{5 * size_mb}; + + std::default_random_engine generator; + std::uniform_int_distribution distribution(1, MAX_ALLOCATION_SIZE); + + // 100 allocations from [0,5MB) + std::for_each( + allocations.begin(), allocations.end(), [&generator, &distribution, this](allocation& alloc) { + alloc.size = distribution(generator); + EXPECT_NO_THROW(alloc.ptr = this->ref.allocate(alloc.size)); + EXPECT_NE(nullptr, alloc.ptr); + EXPECT_TRUE(is_aligned(alloc.ptr)); + }); + + std::for_each(allocations.begin(), allocations.end(), [this](allocation& alloc) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + }); +} + +TYPED_TEST(MRRefTest, MixedRandomAllocationFree) +{ + std::default_random_engine generator; + + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + // How often a free will occur. For example, if `1`, then every allocation + // will immediately be free'd. Or, if 4, on average, a free will occur after + // every 4th allocation + constexpr std::size_t FREE_FREQUENCY{4}; + std::uniform_int_distribution free_distribution(1, FREE_FREQUENCY); + + std::deque allocations; + + constexpr std::size_t num_allocations{100}; + for (std::size_t i = 0; i < num_allocations; ++i) { + std::size_t allocation_size = size_distribution(generator); + EXPECT_NO_THROW(allocations.emplace_back(this->ref.allocate(allocation_size), allocation_size)); + auto new_allocation = allocations.back(); + EXPECT_NE(nullptr, new_allocation.ptr); + EXPECT_TRUE(is_aligned(new_allocation.ptr)); + + bool const free_front{free_distribution(generator) == free_distribution.max()}; + + if (free_front) { + auto front = allocations.front(); + EXPECT_NO_THROW(this->ref.deallocate(front.ptr, front.size)); + allocations.pop_front(); + } + } + // free any remaining allocations + for (auto alloc : allocations) { + EXPECT_NO_THROW(this->ref.deallocate(alloc.ptr, alloc.size)); + allocations.pop_front(); + } +} + +static constexpr std::size_t MinTestedAlignment{16}; +static constexpr std::size_t MaxTestedAlignment{4096}; +static constexpr std::size_t TestedAlignmentMultiplier{2}; +static constexpr std::size_t NUM_TRIALS{100}; + +TYPED_TEST(MRRefTest, AlignmentTest) +{ + std::default_random_engine generator(0); + constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, alignment)); + EXPECT_TRUE(is_aligned(ptr, alignment)); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, alignment)); + } + } +} + +TYPED_TEST(MRRefTest, UnsupportedAlignmentTest) +{ + std::default_random_engine generator(0); + 
constexpr std::size_t MAX_ALLOCATION_SIZE{10 * size_mb}; + std::uniform_int_distribution size_distribution(1, MAX_ALLOCATION_SIZE); + + for (std::size_t num_trials = 0; num_trials < NUM_TRIALS; ++num_trials) { + for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment; + alignment *= TestedAlignmentMultiplier) { + auto allocation_size = size_distribution(generator); + void* ptr{nullptr}; + // An unsupported alignment (like an odd number) should result in an + // alignment of `alignof(std::max_align_t)` + auto const bad_alignment = alignment + 1; + EXPECT_NO_THROW(ptr = this->ref.allocate(allocation_size, bad_alignment)); + EXPECT_TRUE(is_aligned(ptr, alignof(std::max_align_t))); + EXPECT_NO_THROW(this->ref.deallocate(ptr, allocation_size, bad_alignment)); + } + } +} + +TEST(PinnedResource, isPinned) +{ + rmm::mr::pinned_memory_resource mr; + cuda::mr::resource_ref ref{mr}; + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = ref.allocate(100)); + EXPECT_TRUE(is_pinned_memory(ptr)); + EXPECT_NO_THROW(ref.deallocate(ptr, 100)); +} +} // namespace rmm::test diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp index 1cd59f5a6..ff976ff96 100644 --- a/tests/mr/host/mr_tests.cpp +++ b/tests/mr/host/mr_tests.cpp @@ -23,6 +23,8 @@ #include +#include + #include #include @@ -80,6 +82,9 @@ struct MRTest : public ::testing::Test { }; using resources = ::testing::Types; +static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with, + ""); TYPED_TEST_CASE(MRTest, resources); From 9ceab835b23a1b30f91a0adf4dcf709cc7312a44 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 14:14:48 +0200 Subject: [PATCH 07/30] Make `pinned_memory_resource` usable for `pool_memory_resource` --- .../rmm/mr/device/pool_memory_resource.hpp | 69 ++++++++++++- .../rmm/mr/host/pinned_memory_resource.hpp | 74 ++++++++++++++ tests/CMakeLists.txt | 3 + tests/mr/device/pool_mr_tests.cpp | 51 ++++++++++ tests/mr/host/pinned_pool_mr_tests.cpp | 96 +++++++++++++++++++ 5 files changed, 290 insertions(+), 3 deletions(-) create mode 100644 tests/mr/host/pinned_pool_mr_tests.cpp diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index f6d3710e9..7cca60613 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -51,6 +51,35 @@ namespace rmm::mr { * @{ * @file */ +namespace detail { +/** + * @brief A helper class to remove the device_accessible property + * + * We want to be able to use the pool_memory_resource with an upstream that may not + * be device accessible. To avoid rewriting the world, we allow to conditionally remove + * the cuda::mr::device_accessible property. + * + * @tparam PoolResource the pool_memory_resource class + * @tparam Upstream memory_resource to use for allocating the pool. 
+ */ +template +struct maybe_remove_property {}; + +/** + * @brief Specialization of maybe_remove_property to not propagate non existing properties + */ +template +struct maybe_remove_property< + PoolResource, + Upstream, + cuda::std::enable_if_t>> { + /** + * @brief Explicit removal of the friend function so we do not pretent to provide device + * accessible memory + */ + friend void get_property(const PoolResource&, cuda::mr::device_accessible) = delete; +}; +} // namespace detail /** * @brief A coalescing best-fit suballocator which uses a pool of memory allocated from @@ -64,8 +93,10 @@ namespace rmm::mr { */ template class pool_memory_resource final - : public detail::stream_ordered_memory_resource, - detail::coalescing_free_list> { + : public detail::maybe_remove_property, Upstream>, + public detail::stream_ordered_memory_resource, + detail::coalescing_free_list>, + public cuda::forward_property, Upstream> { public: friend class detail::stream_ordered_memory_resource, detail::coalescing_free_list>; @@ -104,6 +135,31 @@ class pool_memory_resource final initialize_pool(initial_pool_size, maximum_pool_size); } + /** + * @brief Construct a `pool_memory_resource` and allocate the initial device memory pool using + * `upstream_mr`. + * + * @throws rmm::logic_error if `upstream_mr == nullptr` + * @throws rmm::logic_error if `initial_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * @throws rmm::logic_error if `maximum_pool_size` is neither the default nor aligned to a + * multiple of pool_memory_resource::allocation_alignment bytes. + * + * @param upstream_mr The memory_resource from which to allocate blocks for the pool. + * @param initial_pool_size Minimum size, in bytes, of the initial pool. Defaults to half of the + * available memory on the current device. + * @param maximum_pool_size Maximum size, in bytes, that the pool can grow to. Defaults to all + * of the available memory on the current device. + */ + template , int> = 0> + explicit pool_memory_resource(Upstream2& upstream_mr, + thrust::optional initial_pool_size = thrust::nullopt, + thrust::optional maximum_pool_size = thrust::nullopt) + : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size) + { + } + /** * @brief Destroy the `pool_memory_resource` and deallocate all memory it allocated using * the upstream resource. @@ -131,6 +187,13 @@ class pool_memory_resource final */ [[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; } + /** + * @brief Get the upstream memory_resource object. + * + * @return const reference to the upstream memory resource. + */ + [[nodiscard]] const Upstream& upstream_resource() const noexcept { return *upstream_mr_; } + /** * @brief Get the upstream memory_resource object. 
* @@ -296,7 +359,7 @@ class pool_memory_resource final if (size == 0) { return {}; } try { - void* ptr = get_upstream()->allocate(size, stream); + void* ptr = get_upstream()->allocate_async(size, stream); return thrust::optional{ *upstream_blocks_.emplace(static_cast(ptr), size, true).first}; } catch (std::exception const& e) { diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp index 514cc1664..5da2d8b7b 100644 --- a/include/rmm/mr/host/pinned_memory_resource.hpp +++ b/include/rmm/mr/host/pinned_memory_resource.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -46,6 +47,78 @@ class pinned_memory_resource final : public host_memory_resource { pinned_memory_resource& operator=(pinned_memory_resource&&) = default; ///< @default_move_assignment{pinned_memory_resource} + /** + * @brief Query whether the pinned_memory_resource supports use of non-null CUDA streams for + * allocation/deallocation. + * + * @returns bool false. + */ + [[nodiscard]] bool supports_streams() const noexcept { return false; } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return bool false. + */ + [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; } + + /** + * @brief Queries the amount of free and total memory for the resource. + * + * @param stream the stream whose memory manager we want to retrieve + * + * @returns a pair containing the free memory in bytes in .first and total amount of memory in + * .second + */ + [[nodiscard]] std::pair get_mem_info(cuda_stream_view stream) const + { + return std::make_pair(0, 0); + } + + /** + * @brief Pretend to support the allocate_async interface, falling back to stream 0 + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @param alignment The expected alignment of the allocation + * @return void* Pointer to the newly allocated memory + */ + [[nodiscard]] void* allocate_async(std::size_t bytes, std::size_t alignment, cuda_stream_view) + { + return do_allocate(bytes, alignment); + } + + /** + * @brief Pretend to support the allocate_async interface, falling back to stream 0 + * + * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * the specified `stream`. + * + * @param bytes The size of the allocation + * @return void* Pointer to the newly allocated memory + */ + [[nodiscard]] void* allocate_async(std::size_t bytes, cuda_stream_view) + { + return do_allocate(bytes); + } + + /** + * @brief Pretend to support the deallocate_async interface, falling back to stream 0 + * + * @throws Nothing. + * + * @param ptr Pointer to be deallocated + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `p`. + * @param alignment The alignment that was passed to the `allocate` call that returned `p` + */ + void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda_stream_view) + { + do_deallocate(ptr, rmm::detail::align_up(bytes, alignment)); + } + private: /** * @brief Allocates pinned memory on the host of size at least `bytes` bytes. 
@@ -101,5 +174,6 @@ class pinned_memory_resource final : public host_memory_resource { ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); }); } }; +static_assert(cuda::mr::async_resource); /** @} */ // end of group } // namespace rmm::mr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 80e2b846b..4e9866905 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -169,6 +169,9 @@ ConfigureTest(HOST_MR_TEST mr/host/mr_tests.cpp) # host mr_ref tests ConfigureTest(HOST_MR_REF_TEST mr/host/mr_ref_tests.cpp) +# pinned pool mr tests +ConfigureTest(PINNED_POOL_MR_TEST mr/host/pinned_pool_mr_tests.cpp) + # cuda stream tests ConfigureTest(CUDA_STREAM_TEST cuda_stream_tests.cpp cuda_stream_pool_tests.cpp) diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp index 4a234d2f9..2f32889d0 100644 --- a/tests/mr/device/pool_mr_tests.cpp +++ b/tests/mr/device/pool_mr_tests.cpp @@ -55,6 +55,18 @@ TEST(PoolTest, ThrowMaxLessThanInitial) EXPECT_THROW(max_less_than_initial(), rmm::logic_error); } +TEST(PoolTest, ReferenceThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{*rmm::mr::get_current_device_resource(), initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + TEST(PoolTest, AllocateNinetyPercent) { auto allocate_ninety = []() { @@ -190,4 +202,43 @@ TEST(PoolTest, MultidevicePool) } } // namespace + +namespace test_properties { +class fake_async_resource { + public: + // To model `async_resource` + void* allocate(std::size_t, std::size_t) { return nullptr; } + void deallocate(void* ptr, std::size_t, std::size_t) {} + void* allocate_async(std::size_t, std::size_t, cuda::stream_ref) { return nullptr; } + void deallocate_async(void* ptr, std::size_t, std::size_t, cuda::stream_ref) {} + + bool operator==(const fake_async_resource& other) const { return true; } + bool operator!=(const fake_async_resource& other) const { return false; } + + // To model stream_resource + [[nodiscard]] bool supports_streams() const noexcept { return false; } + [[nodiscard]] bool supports_get_mem_info() const noexcept { return false; } + + private: + void* do_allocate(std::size_t bytes, cuda_stream_view) { return nullptr; } + void do_deallocate(void* ptr, std::size_t, cuda_stream_view) {} + [[nodiscard]] bool do_is_equal(fake_async_resource const& other) const noexcept { return true; } +}; +static_assert(!cuda::has_property); +static_assert(!cuda::has_property, + cuda::mr::device_accessible>); + +// Ensure that we forward the property if it is there +class fake_async_resource_device_accessible : public fake_async_resource { + friend void get_property(const fake_async_resource_device_accessible&, + cuda::mr::device_accessible) + { + } +}; +static_assert( + cuda::has_property); +static_assert( + cuda::has_property, + cuda::mr::device_accessible>); +} // namespace test_properties } // namespace rmm::test diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp new file mode 100644 index 000000000..8e202ada3 --- /dev/null +++ b/tests/mr/host/pinned_pool_mr_tests.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +// explicit instantiation for test coverage purposes +template class rmm::mr::pool_memory_resource; + +namespace rmm::test { +namespace { +using pool_mr = rmm::mr::pool_memory_resource; + +TEST(PinnedPoolTest, ThrowOnNullUpstream) +{ + auto construct_nullptr = []() { pool_mr mr{nullptr}; }; + EXPECT_THROW(construct_nullptr(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{&pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +TEST(PinnedPoolTest, ReferenceThrowMaxLessThanInitial) +{ + // Make sure first argument is enough larger than the second that alignment rounding doesn't + // make them equal + auto max_less_than_initial = []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + const auto initial{1024}; + const auto maximum{256}; + pool_mr mr{pinned_mr, initial, maximum}; + }; + EXPECT_THROW(max_less_than_initial(), rmm::logic_error); +} + +// Issue #527 +TEST(PinnedPoolTest, InitialAndMaxPoolSizeEqual) +{ + EXPECT_NO_THROW([]() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000192); + mr.allocate(1000); + }()); +} + +TEST(PinnedPoolTest, NonAlignedPoolSize) +{ + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000031, 1000192); + mr.allocate(1000); + }(), + rmm::logic_error); + + EXPECT_THROW( + []() { + rmm::mr::pinned_memory_resource pinned_mr{}; + pool_mr mr(pinned_mr, 1000192, 1000200); + mr.allocate(1000); + }(), + rmm::logic_error); +} + +} // namespace +} // namespace rmm::test From c0df290ba4394cbab81b44da3bb6e2f00d6be17f Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 8 Sep 2023 14:15:09 +0200 Subject: [PATCH 08/30] Port `device_buffer` and `device_uvector` to the new interface --- include/rmm/device_buffer.hpp | 34 +++++++++----- include/rmm/device_uvector.hpp | 27 +++++++---- .../rmm/mr/device/device_memory_resource.hpp | 25 +++++++++-- include/rmm/mr/device/per_device_resource.hpp | 29 ++++++++++++ tests/device_buffer_tests.cu | 45 ++++++++----------- tests/device_uvector_tests.cpp | 5 ++- 6 files changed, 112 insertions(+), 53 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index a49f9caa9..811c8d9cb 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,6 +27,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -38,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_memory_resource`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource()` is used. + * returned from `get_current_device_resource_ref()` is used. 
* * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -79,6 +81,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -92,7 +96,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -106,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -134,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -164,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource_ref()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -238,7 +242,6 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -399,16 +402,23 @@ class device_buffer { /** * @briefreturn{Pointer to the memory resource used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } + + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_buffer` provides device accessible memory + */ + friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + async_resource_ref _mr{ + mr::get_current_device_resource_ref()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -423,7 +433,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? 
_mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -437,7 +447,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 930cda157..cfa6eebd4 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) : _storage{other._storage, stream, mr} { } @@ -526,11 +527,19 @@ class device_uvector { /** * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] cuda::mr::async_resource_ref memory_resource() + const noexcept { return _storage.memory_resource(); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_uvector` provides device accessible memory + */ + friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} + /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index cc1ae2c3f..92baf3f0a 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -193,7 +193,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -257,7 +257,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. 
This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param alignment The alignment that was passed to the `allocate` call that returned `p` @@ -284,7 +284,7 @@ class device_memory_resource { * * @throws Nothing. * - * @param p Pointer to be deallocated + * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform allocation @@ -294,11 +294,25 @@ class device_memory_resource { do_deallocate(ptr, bytes, stream); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return true If the two resources are equivalent + * @return false If the two resources are not equivalent + */ [[nodiscard]] bool operator==(device_memory_resource const& other) const noexcept { return do_is_equal(other); } + /** + * @brief Comparison operator with another device_memory_resource + * + * @param other The other resource to compare to + * @return false If the two resources are equivalent + * @return true If the two resources are not equivalent + */ [[nodiscard]] bool operator!=(device_memory_resource const& other) const noexcept { return !do_is_equal(other); @@ -332,6 +346,11 @@ class device_memory_resource { return do_get_mem_info(stream); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_memory_resource` provides device accessible memory + */ friend void get_property(device_memory_resource const&, cuda::mr::device_accessible) noexcept {} private: diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index aa7217758..1c4108a25 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -24,6 +24,8 @@ #include #include +#include + /** * @file per_device_resource.hpp * @brief Management of per-device `device_memory_resource`s @@ -233,4 +235,31 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc { return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } + +/** + * @brief Get the memory resource for the current device as a `cuda::mr::async_resource_ref`. + * + * Returns a `cuda::mr::async_resource_ref` from the memory_resource set for the current device. + * The initial resource is a `cuda_memory_resource`. + * + * The "current device" is the device returned by `cudaGetDevice`. + * + * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, + * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. + * Concurrent calls to any of these functions will result in a valid state, but the order of + * execution is undefined. + * + * @note The returned `cuda::mr::async_resource_ref` should only be used with the current CUDA + * device. Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned + * resource can result in undefined behavior. The behavior of a device_memory_resource is undefined + * if used while the active CUDA device is a different device from the one that was active when the + * device_memory_resource was created. 
+ * + * @return cuda::mr::async_resource_ref of the resource for the current device + */ +inline cuda::mr::async_resource_ref get_current_device_resource_ref() +{ + return cuda::mr::async_resource_ref{get_current_device_resource()}; +} + } // namespace rmm::mr diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index d4c34385e..56992a06c 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,6 +57,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -73,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -84,30 +85,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -119,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -135,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -148,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, 
buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -174,8 +173,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -187,7 +185,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -217,8 +215,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -243,7 +240,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -275,7 +271,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -291,7 +286,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -309,7 +304,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -319,7 
+313,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -338,7 +332,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -347,7 +340,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -366,7 +359,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -375,7 +367,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -394,7 +386,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -403,7 +394,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..46023d519 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource_ref()); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 8b1bc04979a42799b236cdf10f007dff320fd75a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 17 Oct 2023 10:29:29 -0700 Subject: [PATCH 09/30] Use upstream patch for memory resource. --- fetch_rapids.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 899db0467..3aa4f8be0 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -11,6 +11,8 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. 
# ============================================================================= -set(rapids-cmake-repo bdice/rapids-cmake) -set(rapids-cmake-branch libcudacxx-memory-resource-patch) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/RMM_RAPIDS.cmake) From 6e610a33cf3e524fe5f3b889cf80a78c8b60b135 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 18 Oct 2023 06:11:23 +0000 Subject: [PATCH 12/30] Add restrictions on memory usage for tests --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4e9866905..3071b89b0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -128,7 +128,7 @@ ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_t # device mr tests ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp - mr/device/mr_ref_multithreaded_tests.cpp) + mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 50) # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) From edb8f78f2be34bb1fcdda06664882079aaaf17a5 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 18 Oct 2023 06:34:26 +0000 Subject: [PATCH 13/30] Remove `get_current_device_resource_ref` It is not really useful, as we are constrained to only using `device_accessible` and we can easily construct any resource_ref from it anyway --- include/rmm/device_buffer.hpp | 16 ++++++------ include/rmm/device_uvector.hpp | 6 ++--- include/rmm/mr/device/per_device_resource.hpp | 26 ------------------- tests/device_buffer_tests.cu | 16 +++++++----- tests/device_uvector_tests.cpp | 2 +- 5 files changed, 21 insertions(+), 45 deletions(-) diff --git a/include/rmm/device_buffer.hpp 
b/include/rmm/device_buffer.hpp index 811c8d9cb..3b715ff5c 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -40,7 +40,7 @@ namespace rmm { * * This class allocates untyped and *uninitialized* device memory using a * `device_memory_resource`. If not explicitly specified, the memory resource - * returned from `get_current_device_resource_ref()` is used. + * returned from `get_current_device_resource()` is used. * * @note Unlike `std::vector` or `thrust::device_vector`, the device memory * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined @@ -96,7 +96,7 @@ class device_buffer { // `__host__ __device__` specifiers to the defaulted constructor when it is called within the // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host- // device function. This causes warnings/errors because this ctor invokes host-only functions. - device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} + device_buffer() : _mr{rmm::mr::get_current_device_resource()} {} /** * @brief Constructs a new device buffer of `size` uninitialized bytes @@ -110,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource_ref()) + async_resource_ref mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -138,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -168,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -400,7 +400,7 @@ class device_buffer { void set_stream(cuda_stream_view stream) noexcept { _stream = stream; } /** - * @briefreturn{Pointer to the memory resource used to allocate and deallocate} + * @briefreturn{The async_resource_ref used to allocate and deallocate} */ [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } @@ -417,8 +417,8 @@ class device_buffer { std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation async_resource_ref _mr{ - mr::get_current_device_resource_ref()}; ///< The memory resource used to - ///< allocate/deallocate device memory + rmm::mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. 
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index cfa6eebd4..43370588c 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -126,7 +126,7 @@ class device_uvector { */ explicit device_uvector(std::size_t size, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -142,7 +142,7 @@ class device_uvector { */ explicit device_uvector(device_uvector const& other, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -525,7 +525,7 @@ class device_uvector { [[nodiscard]] bool is_empty() const noexcept { return size() == 0; } /** - * @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage} + * @briefreturn{The async_resource_ref used to allocate and deallocate the device storage} */ [[nodiscard]] cuda::mr::async_resource_ref memory_resource() const noexcept diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index 1c4108a25..c0a26d2ec 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -236,30 +236,4 @@ inline device_memory_resource* set_current_device_resource(device_memory_resourc return set_per_device_resource(rmm::get_current_cuda_device(), new_mr); } -/** - * @brief Get the memory resource for the current device as a `cuda::mr::async_resource_ref`. - * - * Returns a `cuda::mr::async_resource_ref` from the memory_resource set for the current device. - * The initial resource is a `cuda_memory_resource`. - * - * The "current device" is the device returned by `cudaGetDevice`. - * - * This function is thread-safe with respect to concurrent calls to `set_per_device_resource`, - * `get_per_device_resource`, `get_current_device_resource`, and `set_current_device_resource`. - * Concurrent calls to any of these functions will result in a valid state, but the order of - * execution is undefined. - * - * @note The returned `cuda::mr::async_resource_ref` should only be used with the current CUDA - * device. Changing the current device (e.g. using `cudaSetDevice()`) and then using the returned - * resource can result in undefined behavior. The behavior of a device_memory_resource is undefined - * if used while the active CUDA device is a different device from the one that was active when the - * device_memory_resource was created. 
- * - * @return cuda::mr::async_resource_ref of the resource for the current device - */ -inline cuda::mr::async_resource_ref get_current_device_resource_ref() -{ - return cuda::mr::async_resource_ref{get_current_device_resource()}; -} - } // namespace rmm::mr diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index 56992a06c..c414c86c4 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -74,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -85,7 +85,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } @@ -118,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -134,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -147,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource_ref(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -173,7 +173,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -215,7 +216,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource_ref()); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); 
EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 46023d519..3c042a437 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp @@ -38,7 +38,7 @@ TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource_ref()); + EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 4ea703ecc043f9594b1aa52ef84c67989e1c4c81 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 18 Oct 2023 11:47:13 +0000 Subject: [PATCH 14/30] Make CI happy --- python/rmm/tests/test_rmm.py | 6 +++--- tests/CMakeLists.txt | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index fd537749b..b2ce8fe6d 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5000, + "current_bytes": 5040, "current_count": 5, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 10000, + "total_bytes": 10080, "total_count": 10, } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3071b89b0..bf4fac1f9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -126,9 +126,9 @@ endfunction() ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_tests.cpp GPUS 1 PERCENT 90) -# device mr tests +# device mr_ref tests ConfigureTest(DEVICE_MR_REF_TEST mr/device/mr_ref_tests.cpp - mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 50) + mr/device/mr_ref_multithreaded_tests.cpp GPUS 1 PERCENT 100) # general adaptor tests ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) From 8e7c91b39befc93f0e6e809d945161cac251a2c5 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 1 Nov 2023 09:01:22 +0000 Subject: [PATCH 15/30] Address review comments --- .../rmm/mr/device/device_memory_resource.hpp | 3 +-- include/rmm/mr/host/host_memory_resource.hpp | 2 +- tests/mr/device/adaptor_tests.cpp | 27 +++++++------------ tests/mr/host/mr_ref_tests.cpp | 5 ++-- tests/mr/host/mr_tests.cpp | 5 ++-- 5 files changed, 16 insertions(+), 26 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 636c98c44..6681e598c 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -409,6 +408,6 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; -static_assert(cuda::mr::resource_with, ""); +static_assert(cuda::mr::resource_with); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index f5f1ddf58..ce870287c 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -195,7 +195,7 @@ class host_memory_resource { return this == &other; } }; -static_assert(cuda::mr::resource_with, ""); 
+static_assert(cuda::mr::resource_with); /** @} */ // end of group } // namespace rmm::mr diff --git a/tests/mr/device/adaptor_tests.cpp b/tests/mr/device/adaptor_tests.cpp index 3e00dd208..98fc3a429 100644 --- a/tests/mr/device/adaptor_tests.cpp +++ b/tests/mr/device/adaptor_tests.cpp @@ -67,28 +67,21 @@ using adaptors = ::testing::Types, tracking_resource_adaptor>; static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); + cuda::mr::resource_with, cuda::mr::device_accessible>); static_assert(cuda::mr::resource_with, - cuda::mr::device_accessible>, - ""); + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); + cuda::mr::resource_with, cuda::mr::device_accessible>); static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); -static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, ""); + cuda::mr::resource_with, cuda::mr::device_accessible>); static_assert(cuda::mr::resource_with, - cuda::mr::device_accessible>, - ""); + cuda::mr::device_accessible>); static_assert(cuda::mr::resource_with, - cuda::mr::device_accessible>, - ""); -static_assert( - cuda::mr::resource_with, cuda::mr::device_accessible>, - ""); + cuda::mr::device_accessible>); +static_assert(cuda::mr::resource_with, + cuda::mr::device_accessible>); template struct AdaptorTest : public ::testing::Test { diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 568e18576..1439ebabe 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -83,9 +83,8 @@ struct MRRefTest : public ::testing::Test { }; using resources = ::testing::Types; -static_assert(cuda::mr::resource_with, ""); -static_assert(cuda::mr::resource_with, - ""); +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::resource_with); TYPED_TEST_CASE(MRRefTest, resources); diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp index ff976ff96..1b95bea5d 100644 --- a/tests/mr/host/mr_tests.cpp +++ b/tests/mr/host/mr_tests.cpp @@ -82,9 +82,8 @@ struct MRTest : public ::testing::Test { }; using resources = ::testing::Types; -static_assert(cuda::mr::resource_with, ""); -static_assert(cuda::mr::resource_with, - ""); +static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::resource_with); TYPED_TEST_CASE(MRTest, resources); From 849c88045ac203504413cc498b9e3881e355342c Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Wed, 1 Nov 2023 09:06:09 +0000 Subject: [PATCH 16/30] Address alignment differences with `cuda::mr::resource_ref` Our implementation of `cuda::mr::{async_}resource_ref` currently aligns to `alignof(max_align_t)` This is a minor difference to the current device_memory_resource. We intent to expand the interface of `cuda::mr::{async_}resource_ref` to take desired alignment into account. 
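For illustration only (a sketch, not part of this patch, assuming alignof(std::max_align_t) == 16 on the target platform and the rmm::detail::align_up helper used elsewhere in RMM):

    #include <rmm/detail/aligned.hpp>
    #include <cassert>
    #include <cstddef>

    // A 1000-byte request is padded to the next multiple of 16 before it
    // reaches the upstream resource, which is why the allocation statistics
    // in the Python tests below move from multiples of 1000 to multiples of 1008.
    std::size_t padded = rmm::detail::align_up(std::size_t{1000}, alignof(std::max_align_t));
    assert(padded == 1008);  // 5 buffers -> 5040 bytes, 10 buffers -> 10080 bytes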
However, this is out of scope of this PR and will be done in the cccl repository --- python/rmm/tests/test_rmm.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index b2ce8fe6d..b5dc81c1f 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2000, + "current_bytes": 2016, "current_count": 2, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7000, + "current_bytes": 7056, "current_count": 7, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5000 + assert mr.get_allocated_bytes() == 5040 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2000 - assert mr.get_allocated_bytes() == 7000 + assert mr2.get_allocated_bytes() == 2016 + assert mr.get_allocated_bytes() == 7056 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 From 2cf03a782a809a070e6fa5c0c2131b92eff9b42e Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 2 Nov 2023 08:12:17 +0000 Subject: [PATCH 17/30] Roll back the changes to `device_u{buffer, vector}` Those are potentially breaking and we want to do that in a followup --- include/rmm/device_buffer.hpp | 30 ++++++++-------------- include/rmm/device_uvector.hpp | 27 +++++++------------ python/rmm/tests/test_rmm.py | 32 +++++++++++------------ tests/device_buffer_tests.cu | 47 +++++++++++++++++++--------------- tests/device_uvector_tests.cpp | 5 ++-- 5 files changed, 64 insertions(+), 77 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 688f218b9..5762d2bbe 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,8 +27,6 @@ #include #include -#include - namespace rmm { /** * @addtogroup data_containers @@ -81,8 +79,6 @@ namespace rmm { *``` */ class device_buffer { - using async_resource_ref = cuda::mr::async_resource_ref; - public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -110,7 +106,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - async_resource_ref mr = mr::get_current_device_resource()) + mr::device_memory_resource* mr = mr::get_current_device_resource()) 
: _stream{stream}, _mr{mr} { allocate_async(size); @@ -138,7 +134,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + mr::device_memory_resource* mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -168,7 +164,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -240,6 +236,7 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); + _mr = nullptr; _stream = cuda_stream_view{}; } @@ -400,23 +397,16 @@ class device_buffer { /** * @briefreturn{The async_resource_ref used to allocate and deallocate} */ - [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } - - /** - * @brief Enables the `cuda::mr::device_accessible` property - * - * This property declares that a `device_buffer` provides device accessible memory - */ - friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} + [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - async_resource_ref _mr{ - rmm::mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + mr::device_memory_resource* _mr{ + mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -431,7 +421,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; + _data = (bytes > 0) ? 
memory_resource()->allocate(bytes, stream()) : nullptr; } /** @@ -445,7 +435,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } + if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 43370588c..840da8ca8 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,8 +26,6 @@ #include #include -#include - namespace rmm { /** * @addtogroup data_containers @@ -74,7 +72,6 @@ namespace rmm { */ template class device_uvector { - using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -124,9 +121,10 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector(std::size_t size, - cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + explicit device_uvector( + std::size_t size, + cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -140,9 +138,10 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector(device_uvector const& other, - cuda_stream_view stream, - async_resource_ref mr = rmm::mr::get_current_device_resource()) + explicit device_uvector( + device_uvector const& other, + cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -527,19 +526,11 @@ class device_uvector { /** * @briefreturn{The async_resource_ref used to allocate and deallocate the device storage} */ - [[nodiscard]] cuda::mr::async_resource_ref memory_resource() - const noexcept + [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _storage.memory_resource(); } - /** - * @brief Enables the `cuda::mr::device_accessible` property - * - * This property declares that a `device_uvector` provides device accessible memory - */ - friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} - /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index b5dc81c1f..fd537749b 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5040, + "current_bytes": 5000, "current_count": 5, - "peak_bytes": 10080, + "peak_bytes": 10000, "peak_count": 10, - "total_bytes": 10080, + "total_bytes": 10000, "total_count": 10, } @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2016, + "current_bytes": 2000, "current_count": 2, - "peak_bytes": 2016, + "peak_bytes": 2000, "peak_count": 2, - "total_bytes": 2016, + "total_bytes": 2000, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7056, + "current_bytes": 7000, "current_count": 
7, - "peak_bytes": 10080, + "peak_bytes": 10000, "peak_count": 10, - "total_bytes": 12096, + "total_bytes": 12000, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2016, + "peak_bytes": 2000, "peak_count": 2, - "total_bytes": 2016, + "total_bytes": 2000, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10080, + "peak_bytes": 10000, "peak_count": 10, - "total_bytes": 12096, + "total_bytes": 12000, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5040 + assert mr.get_allocated_bytes() == 5000 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2016 - assert mr.get_allocated_bytes() == 7056 + assert mr2.get_allocated_bytes() == 2000 + assert mr.get_allocated_bytes() == 7000 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index c414c86c4..d4c34385e 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,7 +57,6 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; -using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -74,7 +73,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -85,28 +84,30 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); + EXPECT_EQ(&this->mr, buff.memory_resource()); + EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, this->mr); + rmm::device_buffer buff(this->size, this->stream, &this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); + EXPECT_EQ(&this->mr, buff.memory_resource()); + EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); 
EXPECT_EQ(this->stream, buff.stream()); } @@ -118,7 +119,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -134,7 +135,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -147,7 +148,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); + EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -173,8 +174,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), - async_resource_ref{rmm::mr::get_current_device_resource()}); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -186,7 +187,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); + EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -216,8 +217,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), - async_resource_ref{rmm::mr::get_current_device_resource()}); + EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -242,6 +243,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); + EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ 
-273,6 +275,7 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); + EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -288,7 +291,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto mr = buff.memory_resource(); + auto* mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -306,6 +309,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) + EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -315,7 +319,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto mr = buff.memory_resource(); + auto* mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -334,6 +338,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) + EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -342,7 +347,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto mr = src.memory_resource(); + auto* mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -361,6 +366,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); + EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -369,7 +375,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto mr = src.memory_resource(); + auto* mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -388,6 +394,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); + EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -396,7 +403,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto mr = buff.memory_resource(); + auto* mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 3c042a437..69d89e305 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp 
@@ -30,15 +30,14 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; -using async_resource_ref = cuda::mr::async_resource_ref; +using TestTypes = ::testing::Types; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); + EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 76d55fa29b74667de174c71b48be2f374b4e1cd2 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 2 Nov 2023 08:44:19 +0000 Subject: [PATCH 18/30] Address review comments --- .../rmm/mr/device/device_memory_resource.hpp | 18 ----------------- include/rmm/mr/device/per_device_resource.hpp | 2 -- .../rmm/mr/device/pool_memory_resource.hpp | 20 ++++++++++--------- .../rmm/mr/host/pinned_memory_resource.hpp | 11 +++++++++- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 6681e598c..2f81a117f 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -162,9 +162,6 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on * the specified `stream`. * @@ -185,9 +182,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws Nothing. * * @param ptr Pointer to be deallocated @@ -205,9 +199,6 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on * the specified `stream`. * @@ -226,9 +217,6 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on * the specified `stream`. * @@ -249,9 +237,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws Nothing. * * @param ptr Pointer to be deallocated @@ -276,9 +261,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * If supported, this operation may optionally be executed on a stream. - * Otherwise, the stream is ignored and the null stream is used. - * * @throws Nothing. 
* * @param ptr Pointer to be deallocated diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp index c0a26d2ec..cbdb71532 100644 --- a/include/rmm/mr/device/per_device_resource.hpp +++ b/include/rmm/mr/device/per_device_resource.hpp @@ -24,8 +24,6 @@ #include #include -#include - /** * @file per_device_resource.hpp * @brief Management of per-device `device_memory_resource`s diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index 7cca60613..face43993 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -61,23 +61,24 @@ namespace detail { * * @tparam PoolResource the pool_memory_resource class * @tparam Upstream memory_resource to use for allocating the pool. + * @tparam Property The property we want to potentially remove. */ -template +template struct maybe_remove_property {}; /** * @brief Specialization of maybe_remove_property to not propagate non existing properties */ -template -struct maybe_remove_property< - PoolResource, - Upstream, - cuda::std::enable_if_t>> { +template +struct maybe_remove_property>> { /** - * @brief Explicit removal of the friend function so we do not pretent to provide device + * @brief Explicit removal of the friend function so we do not pretend to provide device * accessible memory */ - friend void get_property(const PoolResource&, cuda::mr::device_accessible) = delete; + friend void get_property(const PoolResource&, Property) = delete; }; } // namespace detail @@ -93,7 +94,8 @@ struct maybe_remove_property< */ template class pool_memory_resource final - : public detail::maybe_remove_property, Upstream>, + : public detail:: + maybe_remove_property, Upstream, cuda::mr::device_accessible>, public detail::stream_ordered_memory_resource, detail::coalescing_free_list>, public cuda::forward_property, Upstream> { diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp index b4551af60..8ea259263 100644 --- a/include/rmm/mr/host/pinned_memory_resource.hpp +++ b/include/rmm/mr/host/pinned_memory_resource.hpp @@ -119,6 +119,13 @@ class pinned_memory_resource final : public host_memory_resource { do_deallocate(ptr, rmm::detail::align_up(bytes, alignment)); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `pinned_memory_resource` provides device accessible memory + */ + friend void get_property(pinned_memory_resource const&, cuda::mr::device_accessible) noexcept {} + private: /** * @brief Allocates pinned memory on the host of size at least `bytes` bytes. @@ -172,6 +179,8 @@ class pinned_memory_resource final : public host_memory_resource { ptr, bytes, alignment, [](void* ptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeHost(ptr)); }); } }; -static_assert(cuda::mr::async_resource); +static_assert(cuda::mr::async_resource_with); /** @} */ // end of group } // namespace rmm::mr From 3390fb841b113acd3ea7679626b2cdccbc029a0b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Fri, 3 Nov 2023 08:23:25 +0000 Subject: [PATCH 19/30] Revert "Roll back the changes to `device_u{buffer, vector}`" This reverts commit 2cf03a782a809a070e6fa5c0c2131b92eff9b42e. 
--- include/rmm/device_buffer.hpp | 30 ++++++++++++++-------- include/rmm/device_uvector.hpp | 27 ++++++++++++------- python/rmm/tests/test_rmm.py | 32 +++++++++++------------ tests/device_buffer_tests.cu | 47 +++++++++++++++------------------- tests/device_uvector_tests.cpp | 5 ++-- 5 files changed, 77 insertions(+), 64 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 5762d2bbe..688f218b9 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -27,6 +27,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -79,6 +81,8 @@ namespace rmm { *``` */ class device_buffer { + using async_resource_ref = cuda::mr::async_resource_ref; + public: // The copy constructor and copy assignment operator without a stream are deleted because they // provide no way to specify an explicit stream @@ -106,7 +110,7 @@ class device_buffer { */ explicit device_buffer(std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -134,7 +138,7 @@ class device_buffer { device_buffer(void const* source_data, std::size_t size, cuda_stream_view stream, - mr::device_memory_resource* mr = mr::get_current_device_resource()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _stream{stream}, _mr{mr} { allocate_async(size); @@ -164,7 +168,7 @@ class device_buffer { */ device_buffer(device_buffer const& other, cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + async_resource_ref mr = rmm::mr::get_current_device_resource()) : device_buffer{other.data(), other.size(), stream, mr} { } @@ -236,7 +240,6 @@ class device_buffer { ~device_buffer() noexcept { deallocate_async(); - _mr = nullptr; _stream = cuda_stream_view{}; } @@ -397,16 +400,23 @@ class device_buffer { /** * @briefreturn{The async_resource_ref used to allocate and deallocate} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; } + + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_buffer` provides device accessible memory + */ + friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {} private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation std::size_t _capacity{}; ///< The actual size of the device memory allocation cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation - mr::device_memory_resource* _mr{ - mr::get_current_device_resource()}; ///< The memory resource used to - ///< allocate/deallocate device memory + async_resource_ref _mr{ + rmm::mr::get_current_device_resource()}; ///< The memory resource used to + ///< allocate/deallocate device memory /** * @brief Allocates the specified amount of memory and updates the size/capacity accordingly. @@ -421,7 +431,7 @@ class device_buffer { { _size = bytes; _capacity = bytes; - _data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr; + _data = (bytes > 0) ? 
_mr.allocate_async(bytes, stream()) : nullptr; } /** @@ -435,7 +445,7 @@ class device_buffer { */ void deallocate_async() noexcept { - if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); } + if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } _size = 0; _capacity = 0; _data = nullptr; diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp index 840da8ca8..43370588c 100644 --- a/include/rmm/device_uvector.hpp +++ b/include/rmm/device_uvector.hpp @@ -26,6 +26,8 @@ #include #include +#include + namespace rmm { /** * @addtogroup data_containers @@ -72,6 +74,7 @@ namespace rmm { */ template class device_uvector { + using async_resource_ref = cuda::mr::async_resource_ref; static_assert(std::is_trivially_copyable::value, "device_uvector only supports types that are trivially copyable."); @@ -121,10 +124,9 @@ class device_uvector { * @param stream The stream on which to perform the allocation * @param mr The resource used to allocate the device storage */ - explicit device_uvector( - std::size_t size, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(std::size_t size, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{elements_to_bytes(size), stream, mr} { } @@ -138,10 +140,9 @@ class device_uvector { * @param stream The stream on which to perform the copy * @param mr The resource used to allocate device memory for the new vector */ - explicit device_uvector( - device_uvector const& other, - cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + explicit device_uvector(device_uvector const& other, + cuda_stream_view stream, + async_resource_ref mr = rmm::mr::get_current_device_resource()) : _storage{other._storage, stream, mr} { } @@ -526,11 +527,19 @@ class device_uvector { /** * @briefreturn{The async_resource_ref used to allocate and deallocate the device storage} */ - [[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept + [[nodiscard]] cuda::mr::async_resource_ref memory_resource() + const noexcept { return _storage.memory_resource(); } + /** + * @brief Enables the `cuda::mr::device_accessible` property + * + * This property declares that a `device_uvector` provides device accessible memory + */ + friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {} + /** * @briefreturn{Stream most recently specified for allocation/deallocation} */ diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index fd537749b..b5dc81c1f 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -630,11 +630,11 @@ def test_statistics_resource_adaptor(stats_mr): del buffers[i] assert stats_mr.allocation_counts == { - "current_bytes": 5000, + "current_bytes": 5040, "current_count": 5, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 10000, + "total_bytes": 10080, "total_count": 10, } @@ -646,19 +646,19 @@ def test_statistics_resource_adaptor(stats_mr): buffers.append(rmm.DeviceBuffer(size=1000)) assert mr2.allocation_counts == { - "current_bytes": 2000, + "current_bytes": 2016, "current_count": 2, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { - "current_bytes": 7000, + "current_bytes": 7056, "current_count": 7, - 
"peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } @@ -668,17 +668,17 @@ def test_statistics_resource_adaptor(stats_mr): assert mr2.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 2000, + "peak_bytes": 2016, "peak_count": 2, - "total_bytes": 2000, + "total_bytes": 2016, "total_count": 2, } assert stats_mr.allocation_counts == { "current_bytes": 0, "current_count": 0, - "peak_bytes": 10000, + "peak_bytes": 10080, "peak_count": 10, - "total_bytes": 12000, + "total_bytes": 12096, "total_count": 12, } gc.collect() @@ -696,7 +696,7 @@ def test_tracking_resource_adaptor(): for i in range(9, 0, -2): del buffers[i] - assert mr.get_allocated_bytes() == 5000 + assert mr.get_allocated_bytes() == 5040 # Push a new Tracking adaptor mr2 = rmm.mr.TrackingResourceAdaptor(mr, capture_stacks=True) @@ -705,8 +705,8 @@ def test_tracking_resource_adaptor(): for _ in range(2): buffers.append(rmm.DeviceBuffer(size=1000)) - assert mr2.get_allocated_bytes() == 2000 - assert mr.get_allocated_bytes() == 7000 + assert mr2.get_allocated_bytes() == 2016 + assert mr.get_allocated_bytes() == 7056 # Ensure we get back a non-empty string for the allocations assert len(mr.get_outstanding_allocations_str()) > 0 diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu index d4c34385e..c414c86c4 100644 --- a/tests/device_buffer_tests.cu +++ b/tests/device_buffer_tests.cu @@ -57,6 +57,7 @@ struct DeviceBufferTest : public ::testing::Test { }; using resources = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(DeviceBufferTest, resources); @@ -73,7 +74,7 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResource) EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.ssize()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -84,30 +85,28 @@ TYPED_TEST(DeviceBufferTest, DefaultMemoryResourceStream) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(this->stream, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResource) { - rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, &this->mr); + rmm::device_buffer buff(this->size, rmm::cuda_stream_view{}, this->mr); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } TYPED_TEST(DeviceBufferTest, ExplicitMemoryResourceStream) { - rmm::device_buffer buff(this->size, this->stream, &this->mr); + rmm::device_buffer buff(this->size, this->stream, this->mr); this->stream.synchronize(); EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(&this->mr, buff.memory_resource()); - EXPECT_TRUE(this->mr.is_equal(*buff.memory_resource())); + EXPECT_EQ(async_resource_ref{this->mr}, buff.memory_resource()); 
EXPECT_EQ(this->stream, buff.stream()); } @@ -119,7 +118,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // TODO check for equality between the contents of the two allocations @@ -135,7 +134,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) EXPECT_NE(nullptr, buff.data()); EXPECT_EQ(this->size, buff.size()); EXPECT_EQ(this->size, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); buff.stream().synchronize(); // TODO check for equality between the contents of the two allocations @@ -148,7 +147,7 @@ TYPED_TEST(DeviceBufferTest, CopyFromNullptr) EXPECT_EQ(nullptr, buff.data()); EXPECT_EQ(0, buff.size()); EXPECT_EQ(0, buff.capacity()); - EXPECT_EQ(rmm::mr::get_current_device_resource(), buff.memory_resource()); + EXPECT_EQ(async_resource_ref{rmm::mr::get_current_device_resource()}, buff.memory_resource()); EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); } @@ -174,8 +173,8 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) EXPECT_NE(buff.data(), buff_copy.data()); EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -187,7 +186,7 @@ TYPED_TEST(DeviceBufferTest, CopyConstructor) // now use buff's stream and MR rmm::device_buffer buff_copy2(buff, buff.stream(), buff.memory_resource()); EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); - EXPECT_TRUE(buff_copy2.memory_resource()->is_equal(*buff.memory_resource())); + EXPECT_EQ(buff_copy2.memory_resource(), buff.memory_resource()); EXPECT_EQ(buff_copy2.stream(), buff.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -217,8 +216,8 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSize) // The capacity of the copy should be equal to the `size()` of the original EXPECT_EQ(new_size, buff_copy.capacity()); - EXPECT_EQ(buff_copy.memory_resource(), rmm::mr::get_current_device_resource()); - EXPECT_TRUE(buff_copy.memory_resource()->is_equal(*rmm::mr::get_current_device_resource())); + EXPECT_EQ(buff_copy.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); EXPECT_EQ(buff_copy.stream(), rmm::cuda_stream_view{}); EXPECT_TRUE(thrust::equal(rmm::exec_policy(rmm::cuda_stream_default), @@ -243,7 +242,6 @@ TYPED_TEST(DeviceBufferTest, CopyConstructorExplicitMr) EXPECT_EQ(buff.size(), buff_copy.size()); EXPECT_EQ(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ 
-275,7 +273,6 @@ TYPED_TEST(DeviceBufferTest, CopyCapacityLargerThanSizeExplicitMr) EXPECT_EQ(new_size, buff_copy.capacity()); EXPECT_NE(buff.capacity(), buff_copy.capacity()); EXPECT_EQ(buff.memory_resource(), buff_copy.memory_resource()); - EXPECT_TRUE(buff.memory_resource()->is_equal(*buff_copy.memory_resource())); EXPECT_NE(buff.stream(), buff_copy.stream()); EXPECT_TRUE(thrust::equal(rmm::exec_policy(buff_copy.stream()), @@ -291,7 +288,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -309,7 +306,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructor) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_default, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveConstructorStream) @@ -319,7 +315,7 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); // New buffer should have the same contents as the original @@ -338,7 +334,6 @@ TYPED_TEST(DeviceBufferTest, MoveConstructorStream) EXPECT_EQ(0, buff.size()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(0, buff.capacity()); // NOLINT(bugprone-use-after-move) EXPECT_EQ(rmm::cuda_stream_view{}, buff.stream()); // NOLINT(bugprone-use-after-move) - EXPECT_NE(nullptr, buff.memory_resource()); // NOLINT(bugprone-use-after-move) } TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) @@ -347,7 +342,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest; @@ -366,7 +361,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignmentToDefault) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, MoveAssignment) @@ -375,7 +369,7 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) auto* ptr = src.data(); auto size = src.size(); auto capacity = src.capacity(); - auto* mr = src.memory_resource(); + auto mr = src.memory_resource(); auto stream = src.stream(); rmm::device_buffer dest(this->size - 1, rmm::cuda_stream_default, &this->mr); @@ -394,7 +388,6 @@ TYPED_TEST(DeviceBufferTest, MoveAssignment) EXPECT_EQ(0, src.size()); EXPECT_EQ(0, src.capacity()); EXPECT_EQ(rmm::cuda_stream_default, src.stream()); - EXPECT_NE(nullptr, src.memory_resource()); } TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) @@ -403,7 +396,7 @@ TYPED_TEST(DeviceBufferTest, SelfMoveAssignment) auto* ptr = buff.data(); auto size = buff.size(); auto capacity = buff.capacity(); - auto* mr = buff.memory_resource(); + auto mr = buff.memory_resource(); auto stream = buff.stream(); buff = std::move(buff); // self-move-assignment shouldn't modify the buffer diff --git a/tests/device_uvector_tests.cpp b/tests/device_uvector_tests.cpp index 69d89e305..3c042a437 100644 --- a/tests/device_uvector_tests.cpp +++ b/tests/device_uvector_tests.cpp 
@@ -30,14 +30,15 @@ struct TypedUVectorTest : ::testing::Test { [[nodiscard]] rmm::cuda_stream_view stream() const noexcept { return rmm::cuda_stream_view{}; } }; -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; +using async_resource_ref = cuda::mr::async_resource_ref; TYPED_TEST_CASE(TypedUVectorTest, TestTypes); TYPED_TEST(TypedUVectorTest, MemoryResource) { rmm::device_uvector vec(128, this->stream()); - EXPECT_EQ(vec.memory_resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(vec.memory_resource(), async_resource_ref{rmm::mr::get_current_device_resource()}); } TYPED_TEST(TypedUVectorTest, ZeroSizeConstructor) From 559a6744b3e9638705dd5bfa9a44c96823f4f553 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 08:00:05 +0000 Subject: [PATCH 20/30] Add a check that we do not store unnecessary data inside a `async_resource_ref ` While we are at it also properly test that a `device_memory_resource` is a `async_resource` --- include/rmm/mr/device/device_memory_resource.hpp | 5 ++++- include/rmm/mr/host/host_memory_resource.hpp | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index ab03a24cd..710db8eac 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -390,6 +390,9 @@ class device_memory_resource { [[nodiscard]] virtual std::pair do_get_mem_info( cuda_stream_view stream) const = 0; }; -static_assert(cuda::mr::resource_with); +static_assert(cuda::mr::async_resource_with); +static_assert( + sizeof(cuda::mr::async_resource_ref) == + 2 * sizeof(void*)); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index ce870287c..e6403ad2b 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -196,6 +196,8 @@ class host_memory_resource { } }; static_assert(cuda::mr::resource_with); +static_assert(sizeof(cuda::mr::resource_ref) == + 2 * sizeof(void*)); /** @} */ // end of group } // namespace rmm::mr From c1f36d8fa015ba71e2e2934fe242ab0dc8db2fc6 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 08:03:58 +0000 Subject: [PATCH 21/30] Properly test that a `device_memory_resource` is an `async_resource` --- include/rmm/mr/device/device_memory_resource.hpp | 3 --- include/rmm/mr/host/host_memory_resource.hpp | 2 -- 2 files changed, 5 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 710db8eac..b5c1fbbb9 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -391,8 +391,5 @@ class device_memory_resource { cuda_stream_view stream) const = 0; }; static_assert(cuda::mr::async_resource_with); -static_assert( - sizeof(cuda::mr::async_resource_ref) == - 2 * sizeof(void*)); /** @} */ // end of group } // namespace rmm::mr diff --git a/include/rmm/mr/host/host_memory_resource.hpp b/include/rmm/mr/host/host_memory_resource.hpp index e6403ad2b..ce870287c 100644 --- a/include/rmm/mr/host/host_memory_resource.hpp +++ b/include/rmm/mr/host/host_memory_resource.hpp @@ -196,8 +196,6 @@ class host_memory_resource { } }; static_assert(cuda::mr::resource_with); -static_assert(sizeof(cuda::mr::resource_ref) == - 2 * sizeof(void*)); /** @} */ // end 
of group } // namespace rmm::mr

From 1d422b5a902c4f1e644ca286058df7207848199a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Tue, 7 Nov 2023 08:14:45 +0000
Subject: [PATCH 22/30] Also port `rmm::mr::thrust_allocator` to `async_resource_ref`
---
.../mr/device/thrust_allocator_adaptor.hpp | 26 +++++++++++++------
tests/mr/device/thrust_allocator_tests.cu | 4 ++-
2 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/include/rmm/mr/device/thrust_allocator_adaptor.hpp b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
index a1386a842..562a0d79e 100644
--- a/include/rmm/mr/device/thrust_allocator_adaptor.hpp
+++ b/include/rmm/mr/device/thrust_allocator_adaptor.hpp
@@ -24,6 +24,8 @@
#include
#include
+#include
+
namespace rmm::mr {
/**
* @addtogroup device_resource_adaptors
@@ -42,6 +44,8 @@ namespace rmm::mr {
*/
template
class thrust_allocator : public thrust::device_malloc_allocator {
+ using async_resource_ref = cuda::mr::async_resource_ref;
+
public:
using Base = thrust::device_malloc_allocator; ///< The base type of this allocator
using pointer = typename Base::pointer; ///< The pointer type
@@ -79,9 +83,7 @@ class thrust_allocator : public thrust::device_malloc_allocator {
* @param mr The resource to be used for device memory allocation
* @param stream The stream to be used for device memory (de)allocation
*/
- thrust_allocator(cuda_stream_view stream, device_memory_resource* mr) : _stream{stream}, _mr(mr)
- {
- }
+ thrust_allocator(cuda_stream_view stream, async_resource_ref mr) : _stream{stream}, _mr(mr) {}
/**
* @brief Copy constructor. Copies the resource pointer and stream.
@@ -102,7 +104,8 @@ class thrust_allocator : public thrust::device_malloc_allocator {
*/
pointer allocate(size_type num)
{
- return thrust::device_pointer_cast(static_cast(_mr->allocate(num * sizeof(T), _stream)));
+ return thrust::device_pointer_cast(
+ static_cast(_mr.allocate_async(num * sizeof(T), _stream)));
}
/**
@@ -114,22 +117,29 @@ class thrust_allocator : public thrust::device_malloc_allocator {
*/
void deallocate(pointer ptr, size_type num)
{
- return _mr->deallocate(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
+ return _mr.deallocate_async(thrust::raw_pointer_cast(ptr), num * sizeof(T), _stream);
}
/**
- * @briefreturn{The device memory resource used by this}
+ * @briefreturn{The async_resource_ref used to allocate and deallocate}
*/
- [[nodiscard]] device_memory_resource* resource() const noexcept { return _mr; }
+ [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; }
/**
* @briefreturn{The stream used by this allocator}
*/
[[nodiscard]] cuda_stream_view stream() const noexcept { return _stream; }
+ /**
+ * @brief Enables the `cuda::mr::device_accessible` property
+ *
+ * This property declares that a `thrust_allocator` provides device accessible memory
+ */
+ friend void get_property(thrust_allocator const&, cuda::mr::device_accessible) noexcept {}
+
private:
cuda_stream_view _stream{};
- device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
+ async_resource_ref _mr{rmm::mr::get_current_device_resource()};
};
/** @} */ // end of group
} // namespace rmm::mr
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index 41fb15973..ed8875cbe 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -32,6 +32,7 @@ namespace rmm::test {
namespace {
struct allocator_test : public mr_test {};
+using async_resource_ref =
cuda::mr::async_resource_ref; TEST_P(allocator_test, first) { @@ -44,7 +45,8 @@ TEST_P(allocator_test, defaults) { rmm::mr::thrust_allocator allocator(rmm::cuda_stream_default); EXPECT_EQ(allocator.stream(), rmm::cuda_stream_default); - EXPECT_EQ(allocator.resource(), rmm::mr::get_current_device_resource()); + EXPECT_EQ(allocator.memory_resource(), + async_resource_ref{rmm::mr::get_current_device_resource()}); } INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests, From e867442eacf7b281ded7fcf120dbf59c335c0128 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 10:30:32 +0000 Subject: [PATCH 23/30] Do not put `rmm::bad_alloc` in quotes Also `Nothing` seems to be an invalid type --- include/rmm/mr/device/device_memory_resource.hpp | 12 +++--------- include/rmm/mr/host/pinned_memory_resource.hpp | 6 ++---- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index b5c1fbbb9..9c29a2ad7 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -162,7 +162,7 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -182,8 +182,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. @@ -199,7 +197,7 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -217,7 +215,7 @@ class device_memory_resource { * * The returned pointer will have at minimum 256 byte alignment. * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -237,8 +235,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. @@ -261,8 +257,6 @@ class device_memory_resource { * it points to must not yet have been deallocated, otherwise behavior is * undefined. * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. 
diff --git a/include/rmm/mr/host/pinned_memory_resource.hpp b/include/rmm/mr/host/pinned_memory_resource.hpp index 8ea259263..ad650800c 100644 --- a/include/rmm/mr/host/pinned_memory_resource.hpp +++ b/include/rmm/mr/host/pinned_memory_resource.hpp @@ -78,7 +78,7 @@ class pinned_memory_resource final : public host_memory_resource { /** * @brief Pretent to support the allocate_async interface, falling back to stream 0 * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -93,7 +93,7 @@ class pinned_memory_resource final : public host_memory_resource { /** * @brief Pretent to support the allocate_async interface, falling back to stream 0 * - * @throws `rmm::bad_alloc` When the requested `bytes` cannot be allocated on + * @throws rmm::bad_alloc When the requested `bytes` cannot be allocated on * the specified `stream`. * * @param bytes The size of the allocation @@ -107,8 +107,6 @@ class pinned_memory_resource final : public host_memory_resource { /** * @brief Pretent to support the deallocate_async interface, falling back to stream 0 * - * @throws Nothing. - * * @param ptr Pointer to be deallocated * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `p`. From 73efc2e5ef1c76199ddb6f5f641615a26f44ee04 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 7 Nov 2023 11:10:11 +0000 Subject: [PATCH 24/30] Try to avoid namespaces in comments --- include/rmm/cuda_stream_view.hpp | 4 ++-- include/rmm/device_uvector.hpp | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/rmm/cuda_stream_view.hpp b/include/rmm/cuda_stream_view.hpp index 082275ea9..f8564b16b 100644 --- a/include/rmm/cuda_stream_view.hpp +++ b/include/rmm/cuda_stream_view.hpp @@ -61,7 +61,7 @@ class cuda_stream_view { constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {} /** - * @brief Implicit conversion from cuda::stream_ref. + * @brief Implicit conversion from stream_ref. * * @param stream The underlying stream for this view */ @@ -84,7 +84,7 @@ class cuda_stream_view { /** * @brief Implicit conversion to stream_ref. 
*
- * @return cuda::stream_ref The underlying stream referenced by this cuda_stream_view
+ * @return stream_ref The underlying stream referenced by this cuda_stream_view
*/
constexpr operator cuda::stream_ref() const noexcept { return value(); }
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp
index 022d9dc02..3bd6a1c25 100644
--- a/include/rmm/device_uvector.hpp
+++ b/include/rmm/device_uvector.hpp
@@ -527,8 +527,7 @@ class device_uvector {
/**
* @briefreturn{The async_resource_ref used to allocate and deallocate the device storage}
*/
- [[nodiscard]] cuda::mr::async_resource_ref memory_resource()
- const noexcept
+ [[nodiscard]] async_resource_ref memory_resource() const noexcept
{
return _storage.memory_resource();
}

From c298fbc937fd8c2656b36ebce4315e082ea12ec7 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Tue, 7 Nov 2023 18:11:51 +0000
Subject: [PATCH 25/30] Tell sphinx to skip documenting libcu++ names
---
python/docs/conf.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/python/docs/conf.py b/python/docs/conf.py
index a063b52eb..ec6ddc70a 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -216,6 +216,18 @@ def on_missing_reference(app, env, node, contnode):
"cudaStreamPerThread",
"thrust",
"spdlog",
+ "stream_ref",
+ # libcu++ names
+ "cuda",
+ "cuda::mr",
+ "resource",
+ "resource_ref",
+ "async_resource",
+ "async_resource_ref",
+ "device_accessible",
+ "host_accessible",
+ "forward_property",
+ "enable_if_t",
# Unknown types
"int64_t",
"int8_t",

From 80201bd11bf7e1a681a8296ba22dddf4d30a8282 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Wed, 15 Nov 2023 17:59:14 -0600
Subject: [PATCH 26/30] Update copyright years.
---
cmake/thirdparty/get_libcudacxx.cmake | 2 +-
tests/mr/device/mr_ref_multithreaded_tests.cpp | 2 +-
tests/mr/device/mr_ref_test.hpp | 2 +-
tests/mr/device/mr_ref_tests.cpp | 2 +-
tests/mr/host/mr_ref_tests.cpp | 2 +-
tests/mr/host/pinned_pool_mr_tests.cpp | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/cmake/thirdparty/get_libcudacxx.cmake b/cmake/thirdparty/get_libcudacxx.cmake
index b1f990d39..14b0d492f 100644
--- a/cmake/thirdparty/get_libcudacxx.cmake
+++ b/cmake/thirdparty/get_libcudacxx.cmake
@@ -1,5 +1,5 @@
# =============================================================================
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index dd39d1e1d..0dea4f503 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp
index ecb13c2a7..126a4fde8 100644
--- a/tests/mr/device/mr_ref_test.hpp
+++ b/tests/mr/device/mr_ref_test.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp index 2507d468a..a9a94696a 100644 --- a/tests/mr/device/mr_ref_tests.cpp +++ b/tests/mr/device/mr_ref_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 1439ebabe..9f12159de 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/mr/host/pinned_pool_mr_tests.cpp b/tests/mr/host/pinned_pool_mr_tests.cpp index 8e202ada3..dcdae37fa 100644 --- a/tests/mr/host/pinned_pool_mr_tests.cpp +++ b/tests/mr/host/pinned_pool_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 5937e3ea172b680418c924c146c2474a05d8079d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Nov 2023 17:59:30 -0600 Subject: [PATCH 27/30] Fix typos and grammar. --- include/rmm/mr/device/device_memory_resource.hpp | 8 ++++---- include/rmm/mr/device/pool_memory_resource.hpp | 4 ++-- include/rmm/mr/host/pinned_memory_resource.hpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/rmm/mr/device/device_memory_resource.hpp b/include/rmm/mr/device/device_memory_resource.hpp index 9c29a2ad7..63e5f39a4 100644 --- a/include/rmm/mr/device/device_memory_resource.hpp +++ b/include/rmm/mr/device/device_memory_resource.hpp @@ -121,7 +121,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -177,7 +177,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -230,7 +230,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. * - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. @@ -252,7 +252,7 @@ class device_memory_resource { /** * @brief Deallocate memory pointed to by \p p. 
* - * `p` must have been returned by a prior call to `allocate(bytes,stream)` on + * `p` must have been returned by a prior call to `allocate(bytes, stream)` on * a `device_memory_resource` that compares equal to `*this`, and the storage * it points to must not yet have been deallocated, otherwise behavior is * undefined. diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index face43993..c85408359 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -56,7 +56,7 @@ namespace detail { * @brief A helper class to remove the device_accessible property * * We want to be able to use the pool_memory_resource with an upstream that may not - * be device accessible. To avoid rewriting the world, we allow to conditionally remove + * be device accessible. To avoid rewriting the world, we allow conditionally removing * the cuda::mr::device_accessible property. * * @tparam PoolResource the pool_memory_resource class @@ -67,7 +67,7 @@ template struct maybe_remove_property {}; /** - * @brief Specialization of maybe_remove_property to not propagate non existing properties + * @brief Specialization of maybe_remove_property to not propagate nonexistent properties */ template struct maybe_remove_property Date: Wed, 15 Nov 2023 17:59:49 -0600 Subject: [PATCH 28/30] Drop support for CUDA older than CUDA 11 in tests. --- tests/mr/device/mr_ref_test.hpp | 4 ---- tests/mr/device/mr_test.hpp | 4 ---- tests/mr/host/mr_ref_tests.cpp | 4 ---- tests/mr/host/mr_tests.cpp | 4 ---- 4 files changed, 16 deletions(-) diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp index 126a4fde8..804c710a5 100644 --- a/tests/mr/device/mr_ref_test.hpp +++ b/tests/mr/device/mr_ref_test.hpp @@ -57,11 +57,7 @@ inline bool is_device_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif } enum size_in_bytes : size_t {}; diff --git a/tests/mr/device/mr_test.hpp b/tests/mr/device/mr_test.hpp index 8c69df215..03f880e72 100644 --- a/tests/mr/device/mr_test.hpp +++ b/tests/mr/device/mr_test.hpp @@ -52,11 +52,7 @@ inline bool is_device_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif } enum size_in_bytes : size_t {}; diff --git a/tests/mr/host/mr_ref_tests.cpp b/tests/mr/host/mr_ref_tests.cpp index 9f12159de..6563eb635 100644 --- a/tests/mr/host/mr_ref_tests.cpp +++ b/tests/mr/host/mr_ref_tests.cpp @@ -43,11 +43,7 @@ inline bool is_device_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } -#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10 - return attributes.memoryType == cudaMemoryTypeDevice; -#else return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged); -#endif } /** diff --git a/tests/mr/host/mr_tests.cpp b/tests/mr/host/mr_tests.cpp 
index 1b95bea5d..678d6aeb8 100644
--- a/tests/mr/host/mr_tests.cpp
+++ b/tests/mr/host/mr_tests.cpp
@@ -43,11 +43,7 @@ inline bool is_device_memory(void* ptr)
{
cudaPointerAttributes attributes{};
if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
-#if CUDART_VERSION < 10000 // memoryType is deprecated in CUDA 10
- return attributes.memoryType == cudaMemoryTypeDevice;
-#else
return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged);
-#endif
}
/**

From af127339220e7e2999900366c29efa05a52e3b6a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa
Date: Thu, 16 Nov 2023 07:48:18 +0000
Subject: [PATCH 29/30] Do not propagate the `device_accessible` property for containers
We currently have no design on how we want to propagate the properties of the memory_resource, so drop this for now until we have a proper design ready
---
include/rmm/device_buffer.hpp | 7 -------
include/rmm/device_uvector.hpp | 7 -------
2 files changed, 14 deletions(-)
diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp
index c94736d2d..4a780018e 100644
--- a/include/rmm/device_buffer.hpp
+++ b/include/rmm/device_buffer.hpp
@@ -414,13 +414,6 @@ class device_buffer {
*/
[[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; }
- /**
- * @brief Enables the `cuda::mr::device_accessible` property
- *
- * This property declares that a `device_buffer` provides device accessible memory
- */
- friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {}
-
private:
void* _data{nullptr}; ///< Pointer to device memory allocation
std::size_t _size{}; ///< Requested size of the device memory allocation
diff --git a/include/rmm/device_uvector.hpp b/include/rmm/device_uvector.hpp
index 3bd6a1c25..3f77f59f7 100644
--- a/include/rmm/device_uvector.hpp
+++ b/include/rmm/device_uvector.hpp
@@ -532,13 +532,6 @@ class device_uvector {
return _storage.memory_resource();
}
- /**
- * @brief Enables the `cuda::mr::device_accessible` property
- *
- * This property declares that a `device_uvector` provides device accessible memory
- */
- friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {}
-
/**
* @briefreturn{Stream most recently specified for allocation/deallocation}
*/

From 37c1e3f70a032b6d9af1c32bd16226de8c205ec1 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Thu, 16 Nov 2023 21:59:41 +0000
Subject: [PATCH 30/30] Fix deadlock in new tests using fix from #1097
---
.../mr/device/mr_ref_multithreaded_tests.cpp | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index dd39d1e1d..5e5bf0c6b 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -122,6 +122,7 @@ void allocate_async_loop(async_resource_ref ref,
std::size_t num_allocations,
std::list& allocations,
std::mutex& mtx,
+ std::condition_variable& allocations_ready,
cudaEvent_t& event,
rmm::cuda_stream_view stream)
{
@@ -138,25 +139,32 @@ void allocate_async_loop(async_resource_ref ref,
RMM_CUDA_TRY(cudaEventRecord(event, stream.value()));
allocations.emplace_back(ptr, size);
}
+ allocations_ready.notify_one();
}
+
+ // Work around for threads going away before cudaEvent has finished async processing
+ cudaEventSynchronize(event);
}
void deallocate_async_loop(async_resource_ref ref,
std::size_t num_allocations, std::list& allocations, std::mutex& mtx, + std::condition_variable& allocations_ready, cudaEvent_t& event, rmm::cuda_stream_view stream) { - for (std::size_t i = 0; i < num_allocations;) { - std::lock_guard lock(mtx); - if (allocations.empty()) { continue; } - i++; + for (std::size_t i = 0; i < num_allocations; i++) { + std::unique_lock lock(mtx); + allocations_ready.wait(lock, [&allocations] { return !allocations.empty(); }); RMM_CUDA_TRY(cudaStreamWaitEvent(stream.value(), event)); allocation alloc = allocations.front(); allocations.pop_front(); ref.deallocate_async(alloc.ptr, alloc.size, stream); } + + // Work around for threads going away before cudaEvent has finished async processing + cudaEventSynchronize(event); } void test_allocate_async_free_different_threads(async_resource_ref ref, @@ -166,6 +174,7 @@ void test_allocate_async_free_different_threads(async_resource_ref ref, constexpr std::size_t num_allocations{100}; std::mutex mtx; + std::condition_variable allocations_ready; std::list allocations; cudaEvent_t event; @@ -176,6 +185,7 @@ void test_allocate_async_free_different_threads(async_resource_ref ref, num_allocations, std::ref(allocations), std::ref(mtx), + std::ref(allocations_ready), std::ref(event), streamA); @@ -184,6 +194,7 @@ void test_allocate_async_free_different_threads(async_resource_ref ref, num_allocations, std::ref(allocations), std::ref(mtx), + std::ref(allocations_ready), std::ref(event), streamB);