From dbc33eae6cc043b2a2c445c5690c7e6075f9af6e Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:28:53 +1100 Subject: [PATCH] Provide explicit pool size for pool_memory_resources and clean up includes (#2088) This PR fixes up RAFT to avoid usage that will soon be deprecated in RMM. Depends on rapidsai/rmm#1417 Fixes #2087 Authors: - Mark Harris (https://github.com/harrism) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2088 --- cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 15 +++++++---- cpp/bench/ann/src/raft/raft_benchmark.cu | 14 +++++------ cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu | 5 +++- .../ann/src/raft/raft_ivf_flat_wrapper.h | 20 +++++++-------- cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h | 12 ++++----- cpp/bench/prims/common/benchmark.hpp | 12 ++++++--- cpp/bench/prims/matrix/select_k.cu | 1 - cpp/bench/prims/neighbors/refine.cuh | 6 +++-- .../raft/core/device_resources_manager.hpp | 7 ++++-- cpp/include/raft/neighbors/ivf_flat-inl.cuh | 4 +-- cpp/include/raft/util/memory_pool-ext.hpp | 7 +++--- cpp/include/raft/util/memory_pool-inl.hpp | 16 +++++++----- cpp/template/src/cagra_example.cu | 7 +++--- cpp/template/src/common.cuh | 3 ++- cpp/template/src/ivf_flat_example.cu | 8 +++--- cpp/test/core/device_resources_manager.cpp | 17 ++++++++----- cpp/test/core/handle.cpp | 25 +++++++++++-------- docs/source/vector_search_tutorial.md | 24 ++++++++++-------- 18 files changed, 118 insertions(+), 85 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h index 2b91c2588c..638f498b59 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,20 @@ * limitations under the License. 
*/ #pragma once -#include -#include -#include -#include + #include +#include #include #include #include + #include +#include + +#include +#include +#include +#include #include #include #include diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index 4ea8babea5..37f5fb674b 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -15,23 +15,23 @@ */ #include "../common/ann_types.hpp" - #include "raft_ann_bench_param_parser.h" +#include + +#include + +#define JSON_DIAGNOSTICS 1 +#include + #include #include #include -#include -#include -#include #include #include #include #include -#define JSON_DIAGNOSTICS 1 -#include - namespace raft::bench::ann { template diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu index 9a275a31f0..709b08db76 100644 --- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu @@ -18,6 +18,7 @@ #include "raft_ann_bench_param_parser.h" #include "raft_cagra_hnswlib_wrapper.h" +#include #include #define JSON_DIAGNOSTICS 1 @@ -85,7 +86,9 @@ int main(int argc, char** argv) { rmm::mr::cuda_memory_resource cuda_mr; // Construct a resource that uses a coalescing best-fit pool allocator - rmm::mr::pool_memory_resource pool_mr{&cuda_mr}; + // and is initially sized to half of free device memory. + rmm::mr::pool_memory_resource pool_mr{ + &cuda_mr, rmm::percent_of_free_device_memory(50)}; rmm::mr::set_current_device_resource( &pool_mr); // Updates the current device resource pointer to `pool_mr` rmm::mr::device_memory_resource* mr = diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h index 51b8b67f37..06ee355ae7 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #pragma once -#include -#include -#include -#include +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" + #include #include #include @@ -28,16 +27,15 @@ #include #include #include -#include -#include + +#include +#include +#include +#include #include #include #include -#include "../common/ann_types.hpp" -#include "raft_ann_bench_utils.h" -#include - namespace raft::bench::ann { template diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h index 9a373787ac..17e324f918 100644 --- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,9 @@ */ #pragma once +#include "../common/ann_types.hpp" +#include "raft_ann_bench_utils.h" + #include #include #include @@ -28,14 +31,11 @@ #include #include #include + #include #include -#include -#include -#include "../common/ann_types.hpp" -#include "raft_ann_bench_utils.h" -#include +#include namespace raft::bench::ann { diff --git a/cpp/bench/prims/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp index d3da3bff68..c924d8b50a 100644 --- a/cpp/bench/prims/common/benchmark.hpp +++ b/cpp/bench/prims/common/benchmark.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include +#include #include #include #include @@ -43,7 +44,7 @@ namespace raft::bench { struct using_pool_memory_res { private: rmm::mr::device_memory_resource* orig_res_; - rmm::mr::cuda_memory_resource cuda_res_; + rmm::mr::cuda_memory_resource cuda_res_{}; rmm::mr::pool_memory_resource pool_res_; public: @@ -54,7 +55,9 @@ struct using_pool_memory_res { rmm::mr::set_current_device_resource(&pool_res_); } - using_pool_memory_res() : orig_res_(rmm::mr::get_current_device_resource()), pool_res_(&cuda_res_) + using_pool_memory_res() + : orig_res_(rmm::mr::get_current_device_resource()), + pool_res_(&cuda_res_, rmm::percent_of_free_device_memory(50)) { rmm::mr::set_current_device_resource(&pool_res_); } @@ -114,7 +117,8 @@ class fixture { raft::device_resources handle; rmm::cuda_stream_view stream; - fixture(bool use_pool_memory_resource = false) : stream{resource::get_cuda_stream(handle)} + explicit fixture(bool use_pool_memory_resource = false) + : stream{resource::get_cuda_stream(handle)} { // Cache memory pool between test runs, since it is expensive to create. // This speeds up the time required to run the select_k bench by over 3x. diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu index 6364ab17da..a8e65d3e03 100644 --- a/cpp/bench/prims/matrix/select_k.cu +++ b/cpp/bench/prims/matrix/select_k.cu @@ -30,7 +30,6 @@ #include #include -#include #include #include diff --git a/cpp/bench/prims/neighbors/refine.cuh b/cpp/bench/prims/neighbors/refine.cuh index 121917f34f..7cda59b57e 100644 --- a/cpp/bench/prims/neighbors/refine.cuh +++ b/cpp/bench/prims/neighbors/refine.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -58,7 +59,8 @@ class RefineAnn : public fixture { state.SetLabel(label_stream.str()); auto old_mr = rmm::mr::get_current_device_resource(); - rmm::mr::pool_memory_resource pool_mr(old_mr); + rmm::mr::pool_memory_resource pool_mr( + old_mr, rmm::percent_of_free_device_memory(50)); rmm::mr::set_current_device_resource(&pool_mr); if (data.p.host_data) { diff --git a/cpp/include/raft/core/device_resources_manager.hpp b/cpp/include/raft/core/device_resources_manager.hpp index c3482b0c04..1f96a6f204 100644 --- a/cpp/include/raft/core/device_resources_manager.hpp +++ b/cpp/include/raft/core/device_resources_manager.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -170,7 +171,9 @@ struct device_resources_manager { if (upstream != nullptr) { result = std::make_shared>( - upstream, params.init_mem_pool_size, params.max_mem_pool_size); + upstream, + params.init_mem_pool_size.value_or(rmm::percent_of_free_device_memory(50)), + params.max_mem_pool_size); rmm::mr::set_current_device_resource(result.get()); } else { RAFT_LOG_WARN( diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh index b540de7f14..13c8e3d86d 100644 --- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh +++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,9 @@ #include #include +#include #include -#include #include #include diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/include/raft/util/memory_pool-ext.hpp index a02908346b..030a9c681e 100644 --- a/cpp/include/raft/util/memory_pool-ext.hpp +++ b/cpp/include/raft/util/memory_pool-ext.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,11 @@ */ #pragma once -#include // size_t -#include // std::unique_ptr #include // rmm::mr::device_memory_resource +#include // size_t +#include // std::unique_ptr + namespace raft { std::unique_ptr get_pool_memory_resource( diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp index ad94ee0096..bd7e0186b3 100644 --- a/cpp/include/raft/util/memory_pool-inl.hpp +++ b/cpp/include/raft/util/memory_pool-inl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,17 @@ */ #pragma once -#include -#include #include // RAFT_INLINE_CONDITIONAL + +#include #include #include #include +#include +#include + namespace raft { /** @@ -65,14 +68,15 @@ RAFT_INLINE_CONDITIONAL std::unique_ptr get_poo rmm::mr::device_memory_resource*& mr, size_t initial_size) { using pool_res_t = rmm::mr::pool_memory_resource; - std::unique_ptr pool_res{}; + std::unique_ptr pool_res{nullptr}; if (mr) return pool_res; mr = rmm::mr::get_current_device_resource(); if (!dynamic_cast(mr) && !dynamic_cast*>(mr) && !dynamic_cast*>(mr)) { - pool_res = std::make_unique(mr, (initial_size + 255) & (~255)); - mr = pool_res.get(); + pool_res = std::make_unique( + mr, rmm::align_down(initial_size, rmm::CUDA_ALLOCATION_ALIGNMENT)); + mr = pool_res.get(); } return pool_res; } diff --git a/cpp/template/src/cagra_example.cu b/cpp/template/src/cagra_example.cu index 7f3a7d6676..3c1be8b4f8 100644 --- a/cpp/template/src/cagra_example.cu +++ b/cpp/template/src/cagra_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,8 @@ * limitations under the License. */ -#include +#include "common.cuh" + #include #include #include @@ -23,7 +24,7 @@ #include #include -#include "common.cuh" +#include void cagra_build_search_simple(raft::device_resources const& dev_resources, raft::device_matrix_view dataset, diff --git a/cpp/template/src/common.cuh b/cpp/template/src/common.cuh index 193abc747d..3057257537 100644 --- a/cpp/template/src/common.cuh +++ b/cpp/template/src/common.cuh @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -28,6 +27,8 @@ #include #include +#include + // Fill dataset and queries with synthetic data. void generate_dataset(raft::device_resources const& dev_resources, raft::device_matrix_view dataset, diff --git a/cpp/template/src/ivf_flat_example.cu b/cpp/template/src/ivf_flat_example.cu index 5d91f8fe8b..2b3980f696 100644 --- a/cpp/template/src/ivf_flat_example.cu +++ b/cpp/template/src/ivf_flat_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ * limitations under the License. */ -#include -#include +#include "common.cuh" #include #include #include @@ -29,7 +28,8 @@ #include #include -#include "common.cuh" +#include +#include void ivf_flat_build_search_simple(raft::device_resources const& dev_resources, raft::device_matrix_view dataset, diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/test/core/device_resources_manager.cpp index 11d07e3c7b..335e1cf578 100644 --- a/cpp/test/core/device_resources_manager.cpp +++ b/cpp/test/core/device_resources_manager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,18 +13,23 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include -#include -#include -#include #include #include #include + #include #include #include #include + +#include + +#include + +#include + +#include +#include #include namespace raft { diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp index a1ad4385a7..0b0b4b54ab 100644 --- a/cpp/test/core/handle.cpp +++ b/cpp/test/core/handle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,20 +14,25 @@ * limitations under the License. 
 */

-#include
-#include
-#include
-#include
-#include
#include
#include
#include
#include
#include
#include
+
+#include
#include
#include
#include
+
+#include
+
+#include
+
+#include
+#include
+#include
#include

namespace raft {
@@ -279,8 +284,8 @@ TEST(Raft, WorkspaceResource)
  auto* orig_mr = resource::get_workspace_resource(handle)->get_upstream();

  // Let's create a pooled resource
-  auto pool_mr = std::shared_ptr{
-    new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())};
+  auto pool_mr = std::shared_ptr{new rmm::mr::pool_memory_resource(
+    rmm::mr::get_current_device_resource(), rmm::percent_of_free_device_memory(50))};

  // A tiny workspace of 1MB
  size_t max_size = 1024 * 1024;
@@ -318,8 +323,8 @@ TEST(Raft, WorkspaceResourceCopy)
  raft::resources tmp_res(res);
  resource::set_workspace_resource(
    tmp_res,
-    std::shared_ptr{
-      new rmm::mr::pool_memory_resource(rmm::mr::get_current_device_resource())},
+    std::shared_ptr{new rmm::mr::pool_memory_resource(
+      rmm::mr::get_current_device_resource(), rmm::percent_of_free_device_memory(50))},
    orig_size * 2);

  ASSERT_EQ(orig_mr, resource::get_workspace_resource(res));
diff --git a/docs/source/vector_search_tutorial.md b/docs/source/vector_search_tutorial.md
index 40e73d0032..73b1f1b28f 100644
--- a/docs/source/vector_search_tutorial.md
+++ b/docs/source/vector_search_tutorial.md
@@ -4,11 +4,11 @@ RAFT has several important algorithms for performing vector search on the GPU an
This tutorial assumes RAFT has been installed and/or added to your build so that you are able to compile and run RAFT code. If not done already, please follow the [build and install instructions](build.md) and consider taking a look at the [example c++ template project](https://github.com/rapidsai/raft/tree/HEAD/cpp/template) for ready-to-go examples that you can immediately build and start playing with. Also take a look at RAFT's library of [reproducible vector search benchmarks](raft_ann_benchmarks.md) to run benchmarks that compare RAFT against other state-of-the-art nearest neighbors algorithms at scale.

-For more information about the various APIs demonstrated in this tutorial, along with comprehensive usage examples of all the APIs offered by RAFT, please refer to [RAFT's C++ API Documentation](https://docs.rapids.ai/api/raft/nightly/cpp_api/).
+For more information about the various APIs demonstrated in this tutorial, along with comprehensive usage examples of all the APIs offered by RAFT, please refer to [RAFT's C++ API Documentation](https://docs.rapids.ai/api/raft/nightly/cpp_api/).

## Step 1: Starting off with RAFT

-### CUDA Development?
+### CUDA Development?

If you are reading this tutorial then you probably know about CUDA and its relationship to general-purpose GPU computing (GPGPU). You probably also know about Nvidia GPUs but might not necessarily be familiar with the programming model or GPU computing. The good news is that extensive knowledge of CUDA and GPUs is not needed in order to get started with or build applications with RAFT. RAFT hides away most of the complexities behind simple single-threaded stateless functions that are inherently asynchronous, meaning the result of a computation isn't necessarily ready to be used when the function executes and control is given back to the user. The functions are, however, allowed to be chained together in a sequence of calls that don't need to wait for subsequent computations to complete in order to continue execution.
In fact, the only time you need to wait for the computation to complete is when you are ready to use the result.
@@ -47,9 +47,9 @@ Since a stream is involved in the copy operation above, RAFT functions can be in
`rmm::device_uvector` is a great mechanism for allocating and managing a chunk of device memory. While it's possible to use a single array to represent objects in higher dimensions like matrices, it lacks the means to pass that information along. For example, in addition to knowing that we have a 2d structure, we would need to know the number of rows, the number of columns, and even whether we read the columns or rows first (referred to as column- or row-major respectively).

-For this reason, RAFT relies on the `mdspan` standard, which was composed specifically for this purpose. To be even more precise, `mdspan` itself doesn't actually allocate or own any data on host or device because it's just a view over existing memory on host or device. The `mdspan` simply gives us a way to represent multi-dimensional data so we can pass along the needed metadata to our APIs. Even more powerful is that we can design functions that only accept a matrix of `float` in device memory that is laid out in row-major format.
+For this reason, RAFT relies on the `mdspan` standard, which was composed specifically for this purpose. To be even more precise, `mdspan` itself doesn't actually allocate or own any data on host or device because it's just a view over existing memory on host or device. The `mdspan` simply gives us a way to represent multi-dimensional data so we can pass along the needed metadata to our APIs. Even more powerful is that we can design functions that only accept a matrix of `float` in device memory that is laid out in row-major format.

-The memory-owning counterpart to the `mdspan` is the `mdarray`, and the `mdarray` can allocate memory on device or host and carry along with it the metadata about its shape and layout. An `mdspan` can be produced from an `mdarray` for invoking RAFT APIs with `mdarray.view()`. They also follow similar paradigms to the STL, where we represent an immutable `mdspan` of `int` using `mdspan<const int>` instead of `const mdspan<int>` to ensure it's the type carried along by the `mdspan` that's not allowed to change.
+The memory-owning counterpart to the `mdspan` is the `mdarray`, and the `mdarray` can allocate memory on device or host and carry along with it the metadata about its shape and layout. An `mdspan` can be produced from an `mdarray` for invoking RAFT APIs with `mdarray.view()`. They also follow similar paradigms to the STL, where we represent an immutable `mdspan` of `int` using `mdspan<const int>` instead of `const mdspan<int>` to ensure it's the type carried along by the `mdspan` that's not allowed to change.

Many RAFT functions require an `mdspan<const T>` to represent immutable input data, and since there's no implicit conversion between `mdspan<T>` and `mdspan<const T>`, we use `raft::make_const_mdspan()` to alleviate the pain of constructing a new `mdspan` to invoke these functions.
@@ -159,7 +159,7 @@ auto index = raft::neighbors::cagra::build(res, index_params, r
### Query the CAGRA index

-Now that we've trained a CAGRA index, we can query it by first allocating our output `mdarray` objects and passing the trained index model into the search function.
+Now that we've trained a CAGRA index, we can query it by first allocating our output `mdarray` objects and passing the trained index model into the search function.
```c++
// create output arrays
auto indices = raft::make_device_matrix(res, n_queries, k);
auto distances = raft::make_device_matrix(res, n_queries, k);
// search K nearest neighbors
auto search_params = raft::neighbors::cagra::search_params{};
raft::neighbors::cagra::search(res, search_params, index, search, indices.view(), distances.view());
```
@@ -203,14 +203,14 @@ raft::stats::neighborhood_recall(res,
res.sync_stream();
```

-Notice we can invoke the functions for index build and search for both algorithms, one right after the other, because we don't need to access any outputs from the algorithms in host memory. We will need to synchronize the stream on the `raft::device_resources` instance before we can read the result of the `neighborhood_recall` computation, though.
+Notice we can invoke the functions for index build and search for both algorithms, one right after the other, because we don't need to access any outputs from the algorithms in host memory. We will need to synchronize the stream on the `raft::device_resources` instance before we can read the result of the `neighborhood_recall` computation, though.

Similar to a NumPy array, when we use a `host_scalar`, we are really using a multi-dimensional structure that contains only a single dimension, and further a single element. We can use element indexing to access the resulting element directly.

```c++
std::cout << recall_value(0) << std::endl;
```

-While it may seem like unnecessary additional work to wrap the result in a `host_scalar` mdspan, this API choice is made intentionally to support the possibility of also receiving the result as a `device_scalar` so that it can be used directly on the device for follow-on computations without having to incur the synchronization or transfer cost of bringing the result to host. This pattern becomes even more important when the result is being computed in a loop, such as an iterative solver, and the cost of synchronization and device-to-host (d2h) transfer becomes very expensive.
+While it may seem like unnecessary additional work to wrap the result in a `host_scalar` mdspan, this API choice is made intentionally to support the possibility of also receiving the result as a `device_scalar` so that it can be used directly on the device for follow-on computations without having to incur the synchronization or transfer cost of bringing the result to host. This pattern becomes even more important when the result is being computed in a loop, such as an iterative solver, and the cost of synchronization and device-to-host (d2h) transfer becomes very expensive.

## Advanced features
@@ -308,7 +308,9 @@ As an example, the following code snippet creates a `pool_memory_resource` and s
rmm::mr::cuda_memory_resource cuda_mr;

// Construct a resource that uses a coalescing best-fit pool allocator
-rmm::mr::pool_memory_resource pool_mr{&cuda_mr};
+// set the initial size to half of the free device memory
+auto init_size = rmm::percent_of_free_device_memory(50);
+rmm::mr::pool_memory_resource pool_mr{&cuda_mr, init_size};
rmm::mr::set_current_device_resource(&pool_mr); // Updates the current device resource pointer to `pool_mr`
```
@@ -316,9 +318,9 @@ The `raft::device_resources` object will now also use the `rmm::current_device_r

### Workspace memory resource
-As mentioned above, `raft::device_resources` will use `rmm::current_device_resource` by default for all memory allocations. However, there are times when a particular algorithm might benefit from using a different memory resource such as a `managed_memory_resource`, which creates a unified memory space between device and host memory, paging memory in and out of device as needed. Most of RAFT's algorithms allocate temporary memory as needed to perform their computations and we can control the memory resource used for these temporary allocations through the `workspace_resource` in the `raft::device_resources` instance.
+As mentioned above, `raft::device_resources` will use `rmm::current_device_resource` by default for all memory allocations. However, there are times when a particular algorithm might benefit from using a different memory resource such as a `managed_memory_resource`, which creates a unified memory space between device and host memory, paging memory in and out of device as needed. Most of RAFT's algorithms allocate temporary memory as needed to perform their computations and we can control the memory resource used for these temporary allocations through the `workspace_resource` in the `raft::device_resources` instance.

-For some applications, the `managed_memory_resource` can enable a memory space that is larger than the GPU, thus allowing a natural spilling to host memory when needed. This isn't always the best way to use managed memory, though, as it can quickly lead to thrashing and severely impact performance. Still, when it can be used, it provides a very powerful tool that can also avoid out-of-memory errors when enough host memory is available.
+For some applications, the `managed_memory_resource` can enable a memory space that is larger than the GPU, thus allowing a natural spilling to host memory when needed. This isn't always the best way to use managed memory, though, as it can quickly lead to thrashing and severely impact performance. Still, when it can be used, it provides a very powerful tool that can also avoid out-of-memory errors when enough host memory is available.

The following creates a managed memory allocator and sets it as the `workspace_resource` of the `raft::device_resources` instance:
```c++
@@ -329,7 +331,7 @@ std::shared_ptr managed_resource;
raft::device_resource res(managed_resource);
```

-The `workspace_resource` uses an `rmm::mr::limiting_resource_adaptor`, which limits the total amount of allocation possible. This allows RAFT algorithms to work within the confines of the memory constraints imposed by the user so that things like batch sizes can be automatically set to reasonable values without exceeding the allotted memory. By default, this limit restricts the memory allocation space for temporary workspace buffers to the memory available on the device.
+The `workspace_resource` uses an `rmm::mr::limiting_resource_adaptor`, which limits the total amount of allocation possible. This allows RAFT algorithms to work within the confines of the memory constraints imposed by the user so that things like batch sizes can be automatically set to reasonable values without exceeding the allotted memory. By default, this limit restricts the memory allocation space for temporary workspace buffers to the memory available on the device.

The example below sets the total number of bytes that RAFT can use for temporary workspace allocations to 3GB:
```c++
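// NOTE: a minimal sketch only, not necessarily the tutorial's own snippet. It assumes
// the three-argument resource::set_workspace_resource(resources, memory_resource,
// allocation_limit) overload exercised in cpp/test/core/handle.cpp earlier in this
// patch, and reuses the managed memory resource idea described above.
raft::device_resources res;
auto managed_mr = std::make_shared<rmm::mr::managed_memory_resource>();

// Cap temporary workspace allocations at 3GB (in bytes).
std::size_t workspace_limit = std::size_t{3} * 1024 * 1024 * 1024;
raft::resource::set_workspace_resource(res, managed_mr, workspace_limit);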