Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cuda::mr::memory_resource instead of raw device_memory_resource #1095

Merged
merged 37 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
90adef2
Add cccl repo to build dependencies
miscco Sep 8, 2023
a49fe9c
PoC for the new design of `cuda::mr::{async_}resource_ref`
miscco Sep 8, 2023
0e78d9e
Make `pinned_memory_resource` usable for `pool_memory_resource`
miscco Sep 8, 2023
4d25841
Port `device_buffer` and `device_uvector` to the new interface
miscco Sep 8, 2023
de71dcc
Add cccl repo to build dependencies
miscco Sep 8, 2023
13a990f
PoC for the new design of `cuda::mr::{async_}resource_ref`
miscco Sep 8, 2023
9ceab83
Make `pinned_memory_resource` usable for `pool_memory_resource`
miscco Sep 8, 2023
c0df290
Port `device_buffer` and `device_uvector` to the new interface
miscco Sep 8, 2023
8b1bc04
Use upstream patch for memory resource.
bdice Oct 17, 2023
bccabd1
Add target_link_libraries for libcudacxx.
bdice Oct 17, 2023
5b27e10
Revert "Use upstream patch for memory resource."
bdice Oct 17, 2023
194da78
Merge branch 'memory_resource' of github.com:miscco/rmm into memory_r…
miscco Oct 18, 2023
6e610a3
Add restrictions on memory usage for tests
miscco Oct 18, 2023
edb8f78
Remove `get_current_device_resource_ref`
miscco Oct 18, 2023
4ea703e
Make CI happy
miscco Oct 18, 2023
e9a5ace
Merge branch 'branch-23.12' into memory_resource
harrism Oct 25, 2023
df9c6e9
Merge branch 'branch-23.12' into memory_resource
miscco Nov 1, 2023
8e7c91b
Address review comments
miscco Nov 1, 2023
849c880
Address alignment differences with `cuda::mr::resource_ref`
miscco Nov 1, 2023
2cf03a7
Roll back the changes to `device_u{buffer, vector}`
miscco Nov 2, 2023
76d55fa
Address review comments
miscco Nov 2, 2023
3390fb8
Revert "Roll back the changes to `device_u{buffer, vector}`"
miscco Nov 3, 2023
c0c8504
Merge branch 'branch-23.12' into memory_resource
miscco Nov 3, 2023
559a674
Add a check that we do not store unnecessary data inside a `async_res…
miscco Nov 7, 2023
c1f36d8
Properly test that a `device_memory_resource` is an `async_resource`
miscco Nov 7, 2023
1d422b5
Also port `rmm::mr::thrust_allocator` to `async_resource_ref`
miscco Nov 7, 2023
17021f1
Merge branch 'branch-23.12' into memory_resource
miscco Nov 7, 2023
e867442
Do not put `rmm::bad_alloc` in quotes
miscco Nov 7, 2023
73efc2e
Try to avoid namespaces in comments
miscco Nov 7, 2023
c298fbc
Tell sphinx to skip documenting libcu++ names
miscco Nov 7, 2023
41c1bea
Merge branch 'branch-23.12' into memory_resource
harrism Nov 15, 2023
80201bd
Update copyright years.
bdice Nov 15, 2023
5937e3e
Fix typos and grammar.
bdice Nov 15, 2023
baf99f3
Drop support for CUDA older than CUDA 11 in tests.
bdice Nov 15, 2023
af12733
Do not propagate the `device_accessible` property for containers
miscco Nov 16, 2023
37c1e3f
Fix deadlock in new tests using fix from #1097
harrism Nov 16, 2023
67d1bdc
Merge branch 'memory_resource' of https://github.com/miscco/rmm into …
harrism Nov 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ rapids_cpm_init()

include(cmake/thirdparty/get_fmt.cmake)
include(cmake/thirdparty/get_spdlog.cmake)
include(cmake/thirdparty/get_libcudacxx.cmake)
miscco marked this conversation as resolved.
Show resolved Hide resolved
include(cmake/thirdparty/get_thrust.cmake)

# ##################################################################################################
Expand All @@ -89,11 +90,13 @@ else()
target_link_libraries(rmm INTERFACE CUDA::cudart)
endif()

target_link_libraries(rmm INTERFACE libcudacxx::libcudacxx)
target_link_libraries(rmm INTERFACE rmm::Thrust)
target_link_libraries(rmm INTERFACE fmt::fmt-header-only)
target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only)
target_link_libraries(rmm INTERFACE dl)
target_compile_features(rmm INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
target_compile_definitions(rmm INTERFACE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)

# ##################################################################################################
# * tests and benchmarks ---------------------------------------------------------------------------
Expand Down
23 changes: 23 additions & 0 deletions cmake/thirdparty/get_libcudacxx.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# =============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
bdice marked this conversation as resolved.
Show resolved Hide resolved
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Use CPM to find or clone libcudacxx
function(find_and_configure_libcudacxx)

  # rapids-cmake provides a pinned CPM package entry for libcudacxx.
  include(${rapids-cmake-dir}/cpm/libcudacxx.cmake)
  # Register libcudacxx with both the build-tree and install-tree export sets
  # so that consumers of rmm-exports receive the dependency transitively.
  rapids_cpm_libcudacxx(BUILD_EXPORT_SET rmm-exports INSTALL_EXPORT_SET rmm-exports)

endfunction()

find_and_configure_libcudacxx()
16 changes: 16 additions & 0 deletions include/rmm/cuda_stream_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

#include <cuda_runtime_api.h>

#include <cuda/stream_ref>

#include <atomic>
#include <cstddef>
#include <cstdint>
Expand Down Expand Up @@ -58,6 +60,13 @@ class cuda_stream_view {
*/
constexpr cuda_stream_view(cudaStream_t stream) noexcept : stream_{stream} {}

  /**
   * @brief Implicit conversion from stream_ref.
   *
   * Intentionally non-explicit so that a `cuda::stream_ref` can be passed
   * directly to any API taking a `cuda_stream_view`.
   *
   * @param stream The underlying stream for this view
   */
  constexpr cuda_stream_view(cuda::stream_ref stream) noexcept : stream_{stream.get()} {}

/**
* @brief Get the wrapped stream.
*
Expand All @@ -72,6 +81,13 @@ class cuda_stream_view {
*/
constexpr operator cudaStream_t() const noexcept { return value(); }

  /**
   * @brief Implicit conversion to stream_ref.
   *
   * Allows a `cuda_stream_view` to be passed directly to libcu++ APIs that
   * take a `cuda::stream_ref`.
   *
   * @return stream_ref The underlying stream referenced by this cuda_stream_view
   */
  constexpr operator cuda::stream_ref() const noexcept { return value(); }

/**
* @briefreturn{true if the wrapped stream is the CUDA per-thread default stream}
*/
Expand Down
32 changes: 21 additions & 11 deletions include/rmm/device_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include <stdexcept>
#include <utility>

#include <cuda/memory_resource>

namespace rmm {
/**
* @addtogroup data_containers
Expand Down Expand Up @@ -79,6 +81,8 @@ namespace rmm {
*```
*/
class device_buffer {
using async_resource_ref = cuda::mr::async_resource_ref<cuda::mr::device_accessible>;

public:
// The copy constructor and copy assignment operator without a stream are deleted because they
// provide no way to specify an explicit stream
Expand Down Expand Up @@ -106,7 +110,7 @@ class device_buffer {
*/
explicit device_buffer(std::size_t size,
cuda_stream_view stream,
mr::device_memory_resource* mr = mr::get_current_device_resource())
async_resource_ref mr = mr::get_current_device_resource())
: _stream{stream}, _mr{mr}
{
allocate_async(size);
Expand Down Expand Up @@ -134,7 +138,7 @@ class device_buffer {
device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
mr::device_memory_resource* mr = mr::get_current_device_resource())
async_resource_ref mr = rmm::mr::get_current_device_resource())
: _stream{stream}, _mr{mr}
{
allocate_async(size);
Expand Down Expand Up @@ -164,7 +168,7 @@ class device_buffer {
*/
  // Delegates to the (pointer, size) constructor: builds a new buffer from the
  // contents of `other`, allocating from `mr` on `stream`.
  device_buffer(device_buffer const& other,
                cuda_stream_view stream,
                async_resource_ref mr = rmm::mr::get_current_device_resource())
    : device_buffer{other.data(), other.size(), stream, mr}
  {
  }
Expand Down Expand Up @@ -236,7 +240,6 @@ class device_buffer {
~device_buffer() noexcept
{
deallocate_async();
_mr = nullptr;
_stream = cuda_stream_view{};
}

Expand Down Expand Up @@ -395,18 +398,25 @@ class device_buffer {
void set_stream(cuda_stream_view stream) noexcept { _stream = stream; }

  /**
   * @briefreturn{The async_resource_ref used to allocate and deallocate}
   */
  [[nodiscard]] async_resource_ref memory_resource() const noexcept { return _mr; }

/**
* @brief Enables the `cuda::mr::device_accessible` property
*
* This property declares that a `device_buffer` provides device accessible memory
*/
[[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept { return _mr; }
friend void get_property(device_buffer const&, cuda::mr::device_accessible) noexcept {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question (non-blocking): We haven't really talked about adding properties to containers yet. Is this something we feel ready to do? I don't think it will be strictly necessary for this PR.

Copy link
Member

@harrism harrism Nov 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@miscco can you comment? Since this is a pure addition and not a breaking change, the risk is low. However, if we decide to change this in the future perhaps it could be baggage that we wish we hadn't added so early?

I would like to understand the motivation for this as well, @miscco .

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my mind, giving proper guarantees for the memory allocator is not enough if you drop them on the floor once you allocated.

So for me it does make sense that the feature that actually uses the memory allocated by the resource ref is also declaring what properties it inherited from the allocator. I am happy to pull it out though

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dropped them for now, we can come up with a comprehensive decision later

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you drop them on the floor once you allocated.

Sorry, what do you mean by "drop them on the floor"? I wasn't saying you should remove this. Just trying to understand how it is used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point is that currently the properties are solely used on the allocator. Once you have allocated something we do not propagate the properties anymore, but that might actually be the relevant part.

Often someone will pass around a device_buffer and it would be awesome to know whether it contains pinned memory without going through the memory resource

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I agree that would be useful. But this property is hard coded. How would dynamic properties like the example you give work.


private:
void* _data{nullptr}; ///< Pointer to device memory allocation
std::size_t _size{}; ///< Requested size of the device memory allocation
std::size_t _capacity{}; ///< The actual size of the device memory allocation
cuda_stream_view _stream{}; ///< Stream to use for device memory deallocation
mr::device_memory_resource* _mr{
mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
async_resource_ref _mr{
rmm::mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory

/**
* @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
Expand All @@ -421,7 +431,7 @@ class device_buffer {
{
_size = bytes;
_capacity = bytes;
_data = (bytes > 0) ? memory_resource()->allocate(bytes, stream()) : nullptr;
_data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
miscco marked this conversation as resolved.
Show resolved Hide resolved
}

/**
Expand All @@ -435,7 +445,7 @@ class device_buffer {
*/
void deallocate_async() noexcept
{
if (capacity() > 0) { memory_resource()->deallocate(data(), capacity(), stream()); }
if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); }
_size = 0;
_capacity = 0;
_data = nullptr;
Expand Down
28 changes: 18 additions & 10 deletions include/rmm/device_uvector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include <cstddef>
#include <vector>

#include <cuda/memory_resource>

namespace rmm {
/**
* @addtogroup data_containers
Expand Down Expand Up @@ -72,6 +74,7 @@ namespace rmm {
*/
template <typename T>
class device_uvector {
using async_resource_ref = cuda::mr::async_resource_ref<cuda::mr::device_accessible>;
static_assert(std::is_trivially_copyable<T>::value,
"device_uvector only supports types that are trivially copyable.");

Expand Down Expand Up @@ -121,10 +124,9 @@ class device_uvector {
* @param stream The stream on which to perform the allocation
* @param mr The resource used to allocate the device storage
*/
explicit device_uvector(
std::size_t size,
cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
  /**
   * @brief Constructs a new device_uvector allocating storage for `size` elements.
   *
   * @param size The number of elements to allocate storage for
   * @param stream The stream on which to perform the allocation
   * @param mr The resource used to allocate the device storage
   */
  explicit device_uvector(std::size_t size,
                          cuda_stream_view stream,
                          async_resource_ref mr = rmm::mr::get_current_device_resource())
    : _storage{elements_to_bytes(size), stream, mr}
  {
  }
Expand All @@ -138,10 +140,9 @@ class device_uvector {
* @param stream The stream on which to perform the copy
* @param mr The resource used to allocate device memory for the new vector
*/
explicit device_uvector(
device_uvector const& other,
cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
  /**
   * @brief Constructs a new device_uvector by copying the device storage of `other`.
   *
   * @param other The vector to copy from
   * @param stream The stream on which to perform the copy
   * @param mr The resource used to allocate device memory for the new vector
   */
  explicit device_uvector(device_uvector const& other,
                          cuda_stream_view stream,
                          async_resource_ref mr = rmm::mr::get_current_device_resource())
    : _storage{other._storage, stream, mr}
  {
  }
Expand Down Expand Up @@ -524,13 +525,20 @@ class device_uvector {
[[nodiscard]] bool is_empty() const noexcept { return size() == 0; }

/**
* @briefreturn{Pointer to underlying resource used to allocate and deallocate the device storage}
* @briefreturn{The async_resource_ref used to allocate and deallocate the device storage}
*/
[[nodiscard]] mr::device_memory_resource* memory_resource() const noexcept
[[nodiscard]] async_resource_ref memory_resource() const noexcept
{
return _storage.memory_resource();
}

  /**
   * @brief Enables the `cuda::mr::device_accessible` property
   *
   * This property declares that a `device_uvector` provides device accessible memory
   *
   * NOTE(review): presumably discovered via ADL when libcu++ queries
   * properties on the container — confirm against the libcu++ docs.
   */
  friend void get_property(device_uvector const&, cuda::mr::device_accessible) noexcept {}

/**
* @briefreturn{Stream most recently specified for allocation/deallocation}
*/
Expand Down
4 changes: 2 additions & 2 deletions include/rmm/mr/device/callback_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,8 @@ class callback_memory_resource final : public device_memory_resource {
throw std::runtime_error("cannot get free / total memory");
}

[[nodiscard]] virtual bool supports_streams() const noexcept { return false; }
[[nodiscard]] virtual bool supports_get_mem_info() const noexcept { return false; }
[[nodiscard]] bool supports_streams() const noexcept override { return false; }
[[nodiscard]] bool supports_get_mem_info() const noexcept override { return false; }

allocate_callback_t allocate_callback_;
deallocate_callback_t deallocate_callback_;
Expand Down
Loading