From f37c41c54fc64a4e3689e5a61851ba3821800fee Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 26 Sep 2024 09:45:11 -0500 Subject: [PATCH 1/5] bump NCCL floor to 2.18.1.1 (#2443) Contributes to https://github.com/rapidsai/build-planning/issues/102 Some RAPIDS libraries are using `ncclCommSplit()`, which was introduced in `nccl==2.18.1.1`. This is part of a series of PRs across RAPIDS updating libraries' pins to `nccl>=2.18.1.1` to ensure they get a new-enough version that supports that. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/raft/pull/2443 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libraft/conda_build_config.yaml | 2 +- conda/recipes/raft-ann-bench/conda_build_config.yaml | 2 +- conda/recipes/raft-dask/conda_build_config.yaml | 3 +++ conda/recipes/raft-dask/meta.yaml | 4 ++-- dependencies.yaml | 2 +- 13 files changed, 16 insertions(+), 13 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 462874a7e7..de4eb7e690 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cfd974a6a8..26f4c1efaa 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 82e391e9ae..692956502b 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0389427d13..133d42bfee 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index eff1c56840..f99cedd627 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - 
libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-aarch64=11.8 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 87b19d2952..08aea32ab1 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index ff3451c15c..572ad85ab5 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 085e099ae8..9fd23edb03 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 5c0047fb9c..6dc06648bd 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index db0083b583..bdb4e883ea 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index e6afed2890..58e8ec3c9e 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -24,3 +24,6 @@ ucxx_version: cmake_version: - ">=3.26.4,!=3.30.0" + +nccl_version: + - ">=2.18.1.1" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index 74b26b5935..bc13d352b7 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -50,7 +50,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - cython >=3.0.0 - - nccl >=2.9.9 + - nccl {{ nccl_version }} - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} @@ -68,7 +68,7 @@ requirements: - dask-cuda ={{ minor_version }} - rapids-dask-dependency ={{ minor_version }} - joblib >=0.11 - - nccl >=2.9.9 + - nccl {{ nccl_version }} - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} diff --git a/dependencies.yaml b/dependencies.yaml index 91fb176b56..e833e8519a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - nccl>=2.9.9 + - nccl>=2.18.1.1 - libucxx==0.40.*,>=0.0.0a0 
specific: - output_types: conda From 704feb1a0249e73bb0cab96968b892e8d5823b46 Mon Sep 17 00:00:00 2001 From: rhdong Date: Thu, 26 Sep 2024 09:32:41 -0700 Subject: [PATCH 2/5] [Opt] Strengthen unit test coverage and add a benchmark for `transpose` (#2438) Authors: - rhdong (https://github.com/rhdong) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2438 --- cpp/bench/prims/CMakeLists.txt | 1 + cpp/bench/prims/linalg/transpose.cu | 85 ++++ cpp/include/raft/linalg/detail/transpose.cuh | 67 ++- cpp/test/linalg/transpose.cu | 445 ++++++++++++++----- 4 files changed, 465 insertions(+), 133 deletions(-) create mode 100644 cpp/bench/prims/linalg/transpose.cu diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 9d80cbaac2..52c63ad73b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -132,6 +132,7 @@ if(BUILD_PRIMS_BENCH) linalg/reduce_rows_by_key.cu linalg/reduce.cu linalg/sddmm.cu + linalg/transpose.cu main.cpp ) diff --git a/cpp/bench/prims/linalg/transpose.cu b/cpp/bench/prims/linalg/transpose.cu new file mode 100644 index 0000000000..e60e50c125 --- /dev/null +++ b/cpp/bench/prims/linalg/transpose.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::bench::linalg { + +template +struct transpose_input { + IdxT rows, cols; +}; + +template +inline auto operator<<(std::ostream& os, const transpose_input& p) -> std::ostream& +{ + os << p.rows << "#" << p.cols; + return os; +} + +template +struct TransposeBench : public fixture { + TransposeBench(const transpose_input& p) + : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) + { + raft::random::RngState rng{1234}; + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); + } + + void run_benchmark(::benchmark::State& state) override + { + std::ostringstream label_stream; + label_stream << params; + state.SetLabel(label_stream.str()); + + loop_on_state(state, [this]() { + auto input_view = + raft::make_device_matrix_view(in.data(), params.rows, params.cols); + auto output_view = raft::make_device_vector_view(out.data(), params.rows); + raft::linalg::transpose(handle, + input_view.data_handle(), + output_view.data_handle(), + params.rows, + params.cols, + handle.get_stream()); + }); + } + + private: + transpose_input params; + rmm::device_uvector in, out; +}; // struct TransposeBench + +const std::vector> transpose_inputs_i32 = + raft::util::itertools::product>({10, 128, 256, 512, 1024}, + {10000, 100000, 1000000}); + +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); + +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); +RAFT_BENCH_REGISTER((TransposeBench), "", transpose_inputs_i32); + +} // namespace raft::bench::linalg diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index ec60aacc9c..c5f0544b5c 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -38,7 +38,9 @@ template RAFT_KERNEL transpose_half_kernel(IndexType n_rows, IndexType n_cols, const half* __restrict__ in, - half* __restrict__ out) + half* __restrict__ out, + const IndexType stride_in, + const IndexType stride_out) { __shared__ half tile[TILE_DIM][TILE_DIM + 1]; @@ -49,7 +51,7 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows, for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { if (x < n_cols && (y + j) < n_rows) { - tile[threadIdx.y + j][threadIdx.x] = __ldg(&in[(y + j) * n_cols + x]); + tile[threadIdx.y + j][threadIdx.x] = __ldg(&in[(y + j) * stride_in + x]); } } __syncthreads(); @@ -59,7 +61,7 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows, for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { if (x < n_rows && (y + j) < n_cols) { - out[(y + j) * n_rows + x] = tile[threadIdx.x][threadIdx.y + j]; + out[(y + j) * stride_out + x] = tile[threadIdx.x][threadIdx.y + j]; } } __syncthreads(); @@ -67,9 +69,33 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows, } } +/** + * @brief Transposes a matrix stored in row-major order. + * + * This function transposes a matrix of half-precision floating-point numbers (`half`). + * Both the input (`in`) and output (`out`) matrices are assumed to be stored in row-major order. + * + * @tparam IndexType The type used for indexing the matrix dimensions (e.g., int). + * @param handle The RAFT resource handle which contains resources. + * @param n_rows The number of rows in the input matrix. + * @param n_cols The number of columns in the input matrix. + * @param in Pointer to the input matrix in row-major order. 
+ * @param out Pointer to the output matrix in row-major order, where the transposed matrix will be + * stored. + * @param stride_in The stride (number of elements between consecutive rows) for the input matrix. + * Default is 1, which means the input matrix is contiguous in memory. + * @param stride_out The stride (number of elements between consecutive rows) for the output matrix. + * Default is 1, which means the output matrix is contiguous in memory. + */ + template -void transpose_half( - raft::resources const& handle, IndexType n_rows, IndexType n_cols, const half* in, half* out) +void transpose_half(raft::resources const& handle, + IndexType n_rows, + IndexType n_cols, + const half* in, + half* out, + const IndexType stride_in = 1, + const IndexType stride_out = 1) { if (n_cols == 0 || n_rows == 0) return; auto stream = resource::get_cuda_stream(handle); @@ -100,8 +126,13 @@ void transpose_half( dim3 grids(adjusted_grid_x, adjusted_grid_y); - transpose_half_kernel - <<>>(n_rows, n_cols, in, out); + if (stride_in > 1 || stride_out > 1) { + transpose_half_kernel + <<>>(n_rows, n_cols, in, out, stride_in, stride_out); + } else { + transpose_half_kernel + <<>>(n_rows, n_cols, in, out, n_cols, n_rows); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -118,7 +149,7 @@ void transpose(raft::resources const& handle, int out_n_cols = n_rows; if constexpr (std::is_same_v) { - transpose_half(handle, out_n_rows, out_n_cols, in, out); + transpose_half(handle, n_cols, n_rows, in, out); } else { cublasHandle_t cublas_h = resource::get_cublas_handle(handle); RAFT_CUBLAS_TRY(cublasSetStream(cublas_h, stream)); @@ -195,9 +226,13 @@ void transpose_row_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { - auto out_n_rows = in.extent(1); - auto out_n_cols = in.extent(0); - transpose_half(handle, out_n_cols, out_n_rows, in.data_handle(), out.data_handle()); + transpose_half(handle, + in.extent(0), + in.extent(1), + in.data_handle(), + out.data_handle(), + in.stride(0), + out.stride(0)); } template @@ -233,9 +268,13 @@ void transpose_col_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { - auto out_n_rows = in.extent(1); - auto out_n_cols = in.extent(0); - transpose_half(handle, out_n_rows, out_n_cols, in.data_handle(), out.data_handle()); + transpose_half(handle, + in.extent(1), + in.extent(0), + in.data_handle(), + out.data_handle(), + in.stride(1), + out.stride(1)); } }; // end namespace detail diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index cbe869a9a5..22fc1c1d60 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -29,48 +29,104 @@ #include +#include + +namespace std { +template <> +struct is_floating_point : std::true_type {}; +} // namespace std + namespace raft { namespace linalg { template -struct TranposeInputs { +void initialize_array(T* data_h, size_t size) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(0.0, 1.0); + + for (size_t i = 0; i < size; ++i) { + if constexpr (std::is_same_v) { + data_h[i] = __float2half(static_cast(dis(gen))); + } else { + data_h[i] = static_cast(dis(gen)); + } + } +} + +template +void cpu_transpose_row_major( + const T* input, T* output, int rows, int cols, int stride_in = -1, int stride_out = -1) +{ + stride_in = stride_in == -1 ? cols : stride_in; + stride_out = stride_out == -1 ? 
rows : stride_out; + if (stride_in) + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + output[j * stride_out + i] = input[i * stride_in + j]; + } + } +} + +template +void cpu_transpose_col_major( + const T* input, T* output, int rows, int cols, int stride_in = -1, int stride_out = -1) +{ + cpu_transpose_row_major(input, output, cols, rows, stride_in, stride_out); +} + +bool validate_half(const half* h_ref, const half* h_result, half tolerance, int len) +{ + bool success = true; + for (int i = 0; i < len; ++i) { + if (raft::abs(__half2float(h_result[i]) - __half2float(h_ref[i])) >= __half2float(tolerance)) { + success = false; + break; + } + if (!success) break; + } + return success; +} + +namespace transpose_regular_test { + +template +struct TransposeInputs { T tolerance; - int len; int n_row; int n_col; unsigned long long int seed; }; template -::std::ostream& operator<<(::std::ostream& os, const TranposeInputs& dims) -{ - return os; -} - -template -class TransposeTest : public ::testing::TestWithParam> { +class TransposeTest : public ::testing::TestWithParam> { public: TransposeTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(resource::get_cuda_stream(handle)), - data(params.len, stream), - data_trans_ref(params.len, stream), - data_trans(params.len, stream) + data(params.n_row * params.n_col, stream), + data_trans_ref(params.n_row * params.n_col, stream), + data_trans(params.n_row * params.n_col, stream) { } protected: void SetUp() override { - int len = params.len; - ASSERT(params.len == 9, "This test works only with len=9!"); - T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; - raft::update_device(data.data(), data_h, len, stream); - T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; - raft::update_device(data_trans_ref.data(), data_ref_h, len, stream); + int len = params.n_row * params.n_col; + std::vector data_h(len); + std::vector data_ref_h(len); + + initialize_array(data_h.data(), len); + + cpu_transpose_col_major(data_h.data(), data_ref_h.data(), params.n_row, params.n_col); + + raft::update_device(data.data(), data_h.data(), len, stream); + raft::update_device(data_trans_ref.data(), data_ref_h.data(), len, stream); transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream); - transpose(data.data(), params.n_row, stream); + if (params.n_row == params.n_col) { transpose(data.data(), params.n_col, stream); } resource::sync_stream(handle, stream); } @@ -78,28 +134,45 @@ class TransposeTest : public ::testing::TestWithParam> { raft::resources handle; cudaStream_t stream; - TranposeInputs params; + TransposeInputs params; rmm::device_uvector data, data_trans, data_trans_ref; }; -const std::vector> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}}; - -const std::vector> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; - -const std::vector> inputsh2 = {{0.1, 3 * 3, 3, 3, 1234ULL}}; +const std::vector> inputsf2 = {{0.1f, 3, 3, 1234ULL}, + {0.1f, 3, 4, 1234ULL}, + {0.1f, 300, 300, 1234ULL}, + {0.1f, 300, 4100, 1234ULL}, + {0.1f, 1, 13000, 1234ULL}, + {0.1f, 3, 130001, 1234ULL}}; + +const std::vector> inputsd2 = {{0.1f, 3, 3, 1234ULL}, + {0.1f, 3, 4, 1234ULL}, + {0.1f, 300, 300, 1234ULL}, + {0.1f, 300, 4100, 1234ULL}, + {0.1f, 1, 13000, 1234ULL}, + {0.1f, 3, 130001, 1234ULL}}; + +const std::vector> inputsh2 = {{0.1f, 3, 3, 1234ULL}, + {0.1f, 3, 4, 1234ULL}, + {0.1f, 300, 300, 1234ULL}, + {0.1f, 300, 4100, 1234ULL}, + {0.1f, 1, 13000, 1234ULL}, + {0.1f, 3, 130001, 
1234ULL}}; typedef TransposeTest TransposeTestValF; TEST_P(TransposeTestValF, Result) { ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), data_trans.data(), - params.len, + params.n_row * params.n_col, raft::CompareApproxAbs(params.tolerance))); - ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), - data.data(), - params.len, - raft::CompareApproxAbs(params.tolerance))); + if (params.n_row == params.n_col) { + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); + } } typedef TransposeTest TransposeTestValD; @@ -107,59 +180,47 @@ TEST_P(TransposeTestValD, Result) { ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), data_trans.data(), - params.len, - raft::CompareApproxAbs(params.tolerance))); - - ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), - data.data(), - params.len, + params.n_row * params.n_col, raft::CompareApproxAbs(params.tolerance))); -} - -bool validate_half(const half* h_ref, const half* h_result, half tolerance, int len) -{ - bool success = true; - for (int i = 0; i < len; ++i) { - if (raft::abs(__half2float(h_result[i]) - __half2float(h_ref[i])) >= __half2float(tolerance)) { - success = false; - break; - } - if (!success) break; + if (params.n_row == params.n_col) { + ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(), + data.data(), + params.n_row * params.n_col, + raft::CompareApproxAbs(params.tolerance))); } - return success; } typedef TransposeTest TransposeTestValH; TEST_P(TransposeTestValH, Result) { - half data_trans_ref_h[params.len]; - half data_trans_h[params.len]; - half data_h[params.len]; + auto len = params.n_row * params.n_col; - RAFT_CUDA_TRY(cudaMemcpyAsync(data_trans_ref_h, - data_trans_ref.data(), - params.len * sizeof(half), - cudaMemcpyDeviceToHost, - stream)); - - RAFT_CUDA_TRY(cudaMemcpyAsync( - data_trans_h, data_trans.data(), params.len * sizeof(half), cudaMemcpyDeviceToHost, stream)); - RAFT_CUDA_TRY(cudaMemcpyAsync( - data_h, data.data(), params.len * sizeof(half), cudaMemcpyDeviceToHost, stream)); + std::vector data_trans_ref_h(len); + std::vector data_trans_h(len); + std::vector data_h(len); + raft::copy( + data_trans_ref_h.data(), data_trans_ref.data(), len, resource::get_cuda_stream(handle)); + raft::copy(data_trans_h.data(), data_trans.data(), len, resource::get_cuda_stream(handle)); + raft::copy(data_h.data(), data.data(), len, resource::get_cuda_stream(handle)); resource::sync_stream(handle, stream); - ASSERT_TRUE(validate_half(data_trans_ref_h, data_trans_h, params.tolerance, params.len)); - ASSERT_TRUE(validate_half(data_trans_ref_h, data_h, params.tolerance, params.len)); + ASSERT_TRUE(validate_half( + data_trans_ref_h.data(), data_trans_h.data(), params.tolerance, params.n_row * params.n_col)); + + if (params.n_row == params.n_col) { + ASSERT_TRUE(validate_half( + data_trans_ref_h.data(), data_h.data(), params.tolerance, params.n_row * params.n_col)); + } } INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2)); - INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2)); - INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValH, ::testing::ValuesIn(inputsh2)); +} // namespace transpose_regular_test + +namespace transpose_extra_test { -namespace { /** * We hide these functions in tests for now until we have a heterogeneous mdarray * implementation. 
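For reference, the call pattern these benchmarks and tests exercise looks roughly like the following minimal sketch. It uses the same pointer-based `raft::linalg::transpose` overload invoked in the benchmark above; the shapes and the helper name `transpose_example` are illustrative, not part of the patch.

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/transpose.cuh>

// Transpose a 300 x 4100 row-major device matrix into a 4100 x 300 one,
// using the pointer-based overload the benchmark above times.
void transpose_example(raft::resources const& handle)
{
  auto in  = raft::make_device_matrix<float, int>(handle, 300, 4100);
  auto out = raft::make_device_matrix<float, int>(handle, 4100, 300);
  // ... fill `in` on the device ...
  raft::linalg::transpose(handle,
                          in.data_handle(),
                          out.data_handle(),
                          300,   // n_rows of `in`
                          4100,  // n_cols of `in`
                          raft::resource::get_cuda_stream(handle));
}
```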
@@ -230,79 +291,225 @@ template } } +template +struct TransposeMdspanInputs { + int n_row; + int n_col; + T tolerance = T{0.01}; +}; + template -void test_transpose_with_mdspan() +void test_transpose_with_mdspan(const TransposeMdspanInputs& param) { + auto len = param.n_row * param.n_col; + std::vector in_h(len); + std::vector out_ref_h(len); + + initialize_array(in_h.data(), len); + raft::resources handle; - auto v = make_device_matrix(handle, 32, 3); - T k{0}; - for (size_t i = 0; i < v.extent(0); ++i) { - for (size_t j = 0; j < v.extent(1); ++j) { - v(i, j) = k++; - } + auto stream = resource::get_cuda_stream(handle); + auto in = make_device_matrix(handle, param.n_row, param.n_col); + auto out_ref = make_device_matrix(handle, param.n_row, param.n_col); + resource::sync_stream(handle, stream); + if constexpr (std::is_same_v) { + cpu_transpose_row_major(in_h.data(), out_ref_h.data(), param.n_row, param.n_col); + } else { + cpu_transpose_col_major(in_h.data(), out_ref_h.data(), param.n_row, param.n_col); } - auto out = transpose(handle, v.view()); - static_assert(std::is_same_v); - ASSERT_EQ(out.extent(0), v.extent(1)); - ASSERT_EQ(out.extent(1), v.extent(0)); + raft::copy(in.data_handle(), in_h.data(), len, resource::get_cuda_stream(handle)); + raft::copy(out_ref.data_handle(), out_ref_h.data(), len, resource::get_cuda_stream(handle)); - k = 0; - for (size_t i = 0; i < out.extent(1); ++i) { - for (size_t j = 0; j < out.extent(0); ++j) { - ASSERT_EQ(out(j, i), k++); - } + auto out = transpose(handle, in.view()); + static_assert(std::is_same_v); + ASSERT_EQ(out.extent(0), in.extent(1)); + ASSERT_EQ(out.extent(1), in.extent(0)); + if constexpr (std::is_same_v) { + std::vector out_h(len); + raft::copy(out_h.data(), out.data_handle(), len, resource::get_cuda_stream(handle)); + ASSERT_TRUE(validate_half(out_ref_h.data(), out_h.data(), param.tolerance, len)); + } else { + ASSERT_TRUE(raft::devArrMatch( + out_ref.data_handle(), out.data_handle(), len, raft::CompareApproxAbs(param.tolerance))); } } -} // namespace -TEST(TransposeTest, MDSpan) +const std::vector> inputs_mdspan_f = {{3, 3}, + {3, 4}, + {300, 300}, + {300, 4100}, + {1, 13000}, + {3, 130001}, + {4100, 300}, + {13000, 1}, + {130001, 3}}; +const std::vector> inputs_mdspan_d = {{3, 3}, + {3, 4}, + {300, 300}, + {300, 4100}, + {1, 13000}, + {3, 130001}, + {4100, 300}, + {13000, 1}, + {130001, 3}}; +const std::vector> inputs_mdspan_h = {{3, 3}, + {3, 4}, + {300, 300}, + {300, 4100}, + {1, 13000}, + {3, 130001}, + {4100, 300}, + {13000, 1}, + {130001, 3}}; + +TEST(TransposeTest, MDSpanFloat) { - test_transpose_with_mdspan(); - test_transpose_with_mdspan(); - - test_transpose_with_mdspan(); - test_transpose_with_mdspan(); + for (const auto& p : inputs_mdspan_f) { + test_transpose_with_mdspan(p); + test_transpose_with_mdspan(p); + } +} +TEST(TransposeTest, MDSpanDouble) +{ + for (const auto& p : inputs_mdspan_d) { + test_transpose_with_mdspan(p); + test_transpose_with_mdspan(p); + } +} +TEST(TransposeTest, MDSpanHalf) +{ + for (const auto& p : inputs_mdspan_h) { + test_transpose_with_mdspan(p); + test_transpose_with_mdspan(p); + } } -namespace { +template +struct TransposeSubmatrixInputs { + int n_row; + int n_col; + int row_beg; + int row_end; + int col_beg; + int col_end; + T tolerance = T{0.01}; +}; + template -void test_transpose_submatrix() +void test_transpose_submatrix(const TransposeSubmatrixInputs& param) { + auto len = param.n_row * param.n_col; + auto sub_len = (param.row_end - param.row_beg) * (param.col_end - param.col_beg); + + 
std::vector in_h(len); + std::vector out_ref_h(sub_len); + + initialize_array(in_h.data(), len); + raft::resources handle; - auto v = make_device_matrix(handle, 32, 33); - T k{0}; - size_t row_beg{3}, row_end{13}, col_beg{2}, col_end{11}; - for (size_t i = row_beg; i < row_end; ++i) { - for (size_t j = col_beg; j < col_end; ++j) { - v(i, j) = k++; - } + auto stream = resource::get_cuda_stream(handle); + + auto in = make_device_matrix(handle, param.n_row, param.n_col); + auto out_ref = make_device_matrix( + handle, (param.row_end - param.row_beg), (param.col_end - param.col_beg)); + + if constexpr (std::is_same_v) { + auto offset = param.row_beg * param.n_col + param.col_beg; + cpu_transpose_row_major(in_h.data() + offset, + out_ref_h.data(), + (param.row_end - param.row_beg), + (param.col_end - param.col_beg), + in.extent(1), + (param.row_end - param.row_beg)); + } else { + auto offset = param.col_beg * param.n_row + param.row_beg; + cpu_transpose_col_major(in_h.data() + offset, + out_ref_h.data(), + (param.row_end - param.row_beg), + (param.col_end - param.col_beg), + in.extent(0), + (param.col_end - param.col_beg)); } - auto vv = v.view(); - auto submat = std::experimental::submdspan( - vv, std::make_tuple(row_beg, row_end), std::make_tuple(col_beg, col_end)); - static_assert(std::is_same_v); + raft::copy(in.data_handle(), in_h.data(), len, resource::get_cuda_stream(handle)); + raft::copy(out_ref.data_handle(), out_ref_h.data(), sub_len, resource::get_cuda_stream(handle)); + resource::sync_stream(handle, stream); - auto out = transpose(handle, submat); - ASSERT_EQ(out.extent(0), submat.extent(1)); - ASSERT_EQ(out.extent(1), submat.extent(0)); + auto in_submat = std::experimental::submdspan(in.view(), + std::make_tuple(param.row_beg, param.row_end), + std::make_tuple(param.col_beg, param.col_end)); - k = 0; - for (size_t i = 0; i < out.extent(1); ++i) { - for (size_t j = 0; j < out.extent(0); ++j) { - ASSERT_EQ(out(j, i), k++); - } + static_assert(std::is_same_v); + auto out = transpose(handle, in_submat); + + ASSERT_EQ(out.extent(0), in_submat.extent(1)); + ASSERT_EQ(out.extent(1), in_submat.extent(0)); + + if constexpr (std::is_same_v) { + std::vector out_h(sub_len); + + raft::copy(out_h.data(), out.data_handle(), sub_len, resource::get_cuda_stream(handle)); + ASSERT_TRUE(validate_half(out_ref_h.data(), out_h.data(), param.tolerance, sub_len)); + } else { + ASSERT_TRUE(raft::devArrMatch(out_ref.data_handle(), + out.data_handle(), + sub_len, + raft::CompareApproxAbs(param.tolerance))); } } -} // namespace - -TEST(TransposeTest, SubMatrix) +const std::vector> inputs_submatrix_f = { + {3, 3, 1, 2, 0, 2}, + {3, 4, 1, 3, 2, 3}, + {300, 300, 1, 299, 2, 239}, + {300, 4100, 3, 299, 101, 4001}, + {2, 13000, 0, 1, 3, 13000}, + {3, 130001, 0, 3, 3999, 129999}, + {4100, 300, 159, 4001, 125, 300}, + {13000, 5, 0, 11111, 0, 3}, + {130001, 3, 19, 130000, 2, 3}}; +const std::vector> inputs_submatrix_d = { + {3, 3, 1, 2, 0, 2}, + {3, 4, 1, 3, 2, 3}, + {300, 300, 1, 299, 2, 239}, + {300, 4100, 3, 299, 101, 4001}, + {2, 13000, 0, 1, 3, 13000}, + {3, 130001, 0, 3, 3999, 129999}, + {4100, 300, 159, 4001, 125, 300}, + {13000, 5, 0, 11111, 0, 3}, + {130001, 3, 19, 130000, 2, 3}}; +const std::vector> inputs_submatrix_h = { + {3, 3, 1, 2, 0, 2}, + {3, 4, 1, 3, 2, 3}, + {300, 300, 1, 299, 2, 239}, + {300, 4100, 3, 299, 101, 4001}, + {2, 13000, 0, 1, 3, 13000}, + {3, 130001, 0, 3, 3999, 129999}, + {4100, 300, 159, 4001, 125, 300}, + {13000, 5, 0, 11111, 0, 3}, + {130001, 3, 19, 130000, 2, 3}}; + 
+TEST(TransposeTest, SubMatrixFloat) { - test_transpose_submatrix(); - test_transpose_submatrix(); - - test_transpose_submatrix(); - test_transpose_submatrix(); + for (const auto& p : inputs_submatrix_f) { + test_transpose_submatrix(p); + test_transpose_submatrix(p); + } } +TEST(TransposeTest, SubMatrixDouble) +{ + for (const auto& p : inputs_submatrix_d) { + test_transpose_submatrix(p); + test_transpose_submatrix(p); + } +} +TEST(TransposeTest, SubMatrixHalf) +{ + for (const auto& p : inputs_submatrix_h) { + test_transpose_submatrix(p); + test_transpose_submatrix(p); + } +} + +} // namespace transpose_extra_test } // end namespace linalg } // end namespace raft From b78a07d9d2a117c12f396f330526a8e8ed23a9ba Mon Sep 17 00:00:00 2001 From: Victor Lafargue Date: Fri, 27 Sep 2024 00:36:44 +0200 Subject: [PATCH 3/5] Adding NCCL clique to the RAFT handle (#2431) Authors: - Victor Lafargue (https://github.com/viclafargue) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2431 --- cpp/include/raft/comms/detail/std_comms.hpp | 8 +- cpp/include/raft/comms/nccl_clique.hpp | 156 ++++++++++++++++++ .../raft/core/resource/nccl_clique.hpp | 66 ++++++++ .../raft/core/resource/resource_types.hpp | 1 + 4 files changed, 227 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/comms/nccl_clique.hpp create mode 100644 cpp/include/raft/core/resource/nccl_clique.hpp diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index c5d64f6a29..ed869e6cae 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -310,13 +310,13 @@ class std_comms : public comms_iface { // Wait for a UCXX progress thread roundtrip, prevent waiting for longer // than 10ms for each operation, will retry in next iteration. ucxx::utils::CallbackNotifier callbackNotifierPre{}; - worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }, - 10000000 /* 10ms */); + (void)worker->registerGenericPre( + [&callbackNotifierPre]() { callbackNotifierPre.set(); }, 10000000 /* 10ms */); callbackNotifierPre.wait(); ucxx::utils::CallbackNotifier callbackNotifierPost{}; - worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }, - 10000000 /* 10ms */); + (void)worker->registerGenericPost( + [&callbackNotifierPost]() { callbackNotifierPost.set(); }, 10000000 /* 10ms */); callbackNotifierPost.wait(); } else { // Causes UCXX to progress through the send/recv message queue diff --git a/cpp/include/raft/comms/nccl_clique.hpp b/cpp/include/raft/comms/nccl_clique.hpp new file mode 100644 index 0000000000..c6520af753 --- /dev/null +++ b/cpp/include/raft/comms/nccl_clique.hpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +/** + * @brief Error checking macro for NCCL runtime API functions. 
+ * + * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an + * exception detailing the NCCL error that occurred + */ +#define RAFT_NCCL_TRY(call) \ + do { \ + ncclResult_t const status = (call); \ + if (ncclSuccess != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "NCCL error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + ncclGetErrorString(status)); \ + throw raft::logic_error(msg); \ + } \ + } while (0); + +namespace raft::comms { +void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank); +} + +namespace raft::comms { + +struct nccl_clique { + using pool_mr = rmm::mr::pool_memory_resource; + + /** + * Instantiates a NCCL clique with all available GPUs + * + * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool + * + */ + nccl_clique(int percent_of_free_memory = 80) + : root_rank_(0), + percent_of_free_memory_(percent_of_free_memory), + per_device_pools_(0), + device_resources_(0) + { + cudaGetDeviceCount(&num_ranks_); + device_ids_.resize(num_ranks_); + std::iota(device_ids_.begin(), device_ids_.end(), 0); + nccl_comms_.resize(num_ranks_); + nccl_clique_init(); + } + + /** + * Instantiates a NCCL clique + * + * Usage example: + * @code{.cpp} + * int n_devices; + * cudaGetDeviceCount(&n_devices); + * std::vector device_ids(n_devices); + * std::iota(device_ids.begin(), device_ids.end(), 0); + * cuvs::neighbors::mg::nccl_clique& clique(device_ids); // first device is the root rank + * @endcode + * + * @param[in] device_ids list of device IDs to be used to initiate the clique + * @param[in] percent_of_free_memory percentage of device memory to pre-allocate as memory pool + * + */ + nccl_clique(const std::vector& device_ids, int percent_of_free_memory = 80) + : root_rank_(0), + num_ranks_(device_ids.size()), + percent_of_free_memory_(percent_of_free_memory), + device_ids_(device_ids), + nccl_comms_(device_ids.size()), + per_device_pools_(0), + device_resources_(0) + { + nccl_clique_init(); + } + + void nccl_clique_init() + { + RAFT_NCCL_TRY(ncclCommInitAll(nccl_comms_.data(), num_ranks_, device_ids_.data())); + + for (int rank = 0; rank < num_ranks_; rank++) { + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank])); + + // create a pool memory resource for each device + auto old_mr = rmm::mr::get_current_device_resource(); + per_device_pools_.push_back(std::make_unique( + old_mr, rmm::percent_of_free_device_memory(percent_of_free_memory_))); + rmm::cuda_device_id id(device_ids_[rank]); + rmm::mr::set_per_device_resource(id, per_device_pools_.back().get()); + + // create a device resource handle for each device + device_resources_.emplace_back(); + + // add NCCL communications to the device resource handle + raft::comms::build_comms_nccl_only( + &device_resources_[rank], nccl_comms_[rank], num_ranks_, rank); + } + + for (int rank = 0; rank < num_ranks_; rank++) { + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[rank])); + raft::resource::sync_stream(device_resources_[rank]); + } + } + + const raft::device_resources& set_current_device_to_root_rank() const + { + int root_device_id = device_ids_[root_rank_]; + RAFT_CUDA_TRY(cudaSetDevice(root_device_id)); + return device_resources_[root_rank_]; + } + + ~nccl_clique() + { +#pragma omp parallel for // necessary to avoid hangs + for (int rank = 0; rank < num_ranks_; rank++) { + cudaSetDevice(device_ids_[rank]); + ncclCommDestroy(nccl_comms_[rank]); + rmm::cuda_device_id id(device_ids_[rank]); + 
rmm::mr::set_per_device_resource(id, nullptr); + } + } + + int root_rank_; + int num_ranks_; + int percent_of_free_memory_; + std::vector device_ids_; + std::vector nccl_comms_; + std::vector> per_device_pools_; + std::vector device_resources_; +}; + +} // namespace raft::comms diff --git a/cpp/include/raft/core/resource/nccl_clique.hpp b/cpp/include/raft/core/resource/nccl_clique.hpp new file mode 100644 index 0000000000..edda5043ae --- /dev/null +++ b/cpp/include/raft/core/resource/nccl_clique.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace raft::resource { + +class nccl_clique_resource : public resource { + public: + nccl_clique_resource() : clique_(std::make_unique()) {} + ~nccl_clique_resource() override {} + void* get_resource() override { return clique_.get(); } + + private: + std::unique_ptr clique_; +}; + +/** Factory that knows how to construct a specific raft::resource to populate the res_t. */ +class nccl_clique_resource_factory : public resource_factory { + public: + resource_type get_resource_type() override { return resource_type::NCCL_CLIQUE; } + resource* make_resource() override { return new nccl_clique_resource(); } +}; + +/** + * @defgroup nccl_clique_resource resource functions + * @{ + */ + +/** + * Retrieves a NCCL clique from raft res if it exists, otherwise initializes it and return it. + * + * @param[in] res the raft resources object + * @return NCCL clique + */ +inline const raft::comms::nccl_clique& get_nccl_clique(resources const& res) +{ + if (!res.has_resource_factory(resource_type::NCCL_CLIQUE)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::NCCL_CLIQUE); +}; + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index d9126251c9..4fa84c3bdb 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -46,6 +46,7 @@ enum resource_type { CUBLASLT_HANDLE, // cublasLt handle CUSTOM, // runtime-shared default-constructible resource LARGE_WORKSPACE_RESOURCE, // rmm device memory resource for somewhat large temporary allocations + NCCL_CLIQUE, // nccl clique LAST_KEY // reserved for the last key }; From 5ee0e79a7aa033d1f312e805fca1194cdf0483b3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 26 Sep 2024 18:22:10 -0500 Subject: [PATCH 4/5] bump NCCL floor to 2.19 (#2458) Follow-up to #2443 As part of the work to support NumPy 2 across RAPIDS, we found reason to upgrade some libraries like `cugraph` to slightly newer NCCL (`>=2.19`). Context: https://github.com/rapidsai/build-planning/issues/102#issuecomment-2375595743 This applies that same bump here, to keep the range of NCCL versions consistent across RAPIDS. 
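For context, `ncclCommSplit()` — the API that motivated the original `>=2.18.1.1` floor in #2443 — is called along these lines. This is a minimal sketch of the NCCL call itself, assuming an already-initialized parent communicator; it is not a RAFT call site.

```cpp
#include <nccl.h>

// Carve a sub-communicator out of an existing one. Ranks passing the
// same `color` land in the same sub-communicator, ordered by `key`;
// a rank can opt out by passing NCCL_SPLIT_NOCOLOR as the color.
// ncclCommSplit() requires nccl >= 2.18.1.1.
ncclResult_t split_comm(ncclComm_t parent, int color, int key, ncclComm_t* sub)
{
  return ncclCommSplit(parent, color, key, sub, /*config=*/nullptr);
}
```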
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/jakirkham - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/2458 --- conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-aarch64.yaml | 2 +- conda/environments/bench_ann_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libraft/conda_build_config.yaml | 2 +- conda/recipes/raft-ann-bench/conda_build_config.yaml | 2 +- conda/recipes/raft-dask/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index de4eb7e690..0b84772fad 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 26f4c1efaa..d1c01f1b16 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 692956502b..4c506f5297 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 133d42bfee..a123950e3a 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index f99cedd627..864eb2130b 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-aarch64=11.8 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 08aea32ab1..5da6eaf17e 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- 
nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 572ad85ab5..65de97c170 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 9fd23edb03..7e1adbc483 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.18.1.1 +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 6dc06648bd..bc0ff1fae7 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.18.1.1" + - ">=2.19" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index bdb4e883ea..47bd730daf 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.18.1.1" + - ">=2.19" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index 58e8ec3c9e..65c589fc0c 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -26,4 +26,4 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.18.1.1" + - ">=2.19" diff --git a/dependencies.yaml b/dependencies.yaml index e833e8519a..d0991f4d04 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - nccl>=2.18.1.1 + - nccl>=2.19 - libucxx==0.40.*,>=0.0.0a0 specific: - output_types: conda From 6c4fdfb3bfa32c3f08d03adad0d461c167f277ce Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 26 Sep 2024 21:25:22 -0400 Subject: [PATCH 5/5] Deprecating vector search APIs and updating README accordingly (#2448) I opted to deprecate just the necessary pieces, such as the `index` classes instead of deprecating every single function. Authors: - Corey J. 
Nolet (https://github.com/cjnolet) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/2448 --- README.md | 111 +-------------- cpp/include/raft/cluster/kmeans.cuh | 63 +++++---- cpp/include/raft/cluster/kmeans_balanced.cuh | 68 ++++----- cpp/include/raft/cluster/single_linkage.cuh | 33 ++--- cpp/include/raft/distance/distance-ext.cuh | 133 +++++++++--------- cpp/include/raft/neighbors/ball_cover.cuh | 1 + .../raft/neighbors/brute_force_types.hpp | 32 +++-- cpp/include/raft/neighbors/cagra_types.hpp | 14 +- cpp/include/raft/neighbors/hnsw_types.hpp | 6 +- cpp/include/raft/neighbors/ivf_flat_types.hpp | 16 ++- cpp/include/raft/neighbors/ivf_pq_types.hpp | 20 +-- .../raft/neighbors/nn_descent_types.hpp | 16 ++- cpp/include/raft/neighbors/refine-ext.cuh | 32 ++--- docs/source/conf.py | 2 +- docs/source/raft_ann_benchmarks.md | 4 + 15 files changed, 239 insertions(+), 312 deletions(-) diff --git a/README.md b/README.md index 25ce059630..af2219fdd1 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ #
 RAFT: Reusable Accelerated Functions and Tools for Vector Search and More
> [!IMPORTANT] -> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.08 (August) release. +> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.10 (October) release and will be removing them altogether in the 24.12 (December) release. ![RAFT tech stack](img/raft-tech-stack-vss.png) @@ -36,7 +36,7 @@ ## What is RAFT? -RAFT contains fundamental widely-used algorithms and primitives for machine learning and information retrieval. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications. +RAFT contains fundamental widely-used algorithms and primitives for machine learning and data mining. The algorithms are CUDA-accelerated and form building blocks for more easily writing high performance applications. By taking a primitives-based approach to algorithm development, RAFT - accelerates algorithm construction time @@ -47,12 +47,10 @@ While not exhaustive, the following general categories help summarize the accele ##### | Category | Accelerated Functions in RAFT | |-----------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| **Nearest Neighbors** | vector search, neighborhood graph construction, epsilon neighborhoods, pairwise distances | -| **Basic Clustering** | spectral clustering, hierarchical clustering, k-means | -| **Solvers** | combinatorial optimization, iterative solvers | | **Data Formats** | sparse & dense, conversions, data generation | | **Dense Operations** | linear algebra, matrix and vector operations, reductions, slicing, norms, factorization, least squares, svd & eigenvalue problems | | **Sparse Operations** | linear algebra, eigenvalue problems, slicing, norms, reductions, factorization, symmetrization, components & labeling | +| **Solvers** | combinatorial optimization, iterative solvers | | **Statistics** | sampling, moments and summary statistics, metrics, model evaluation | | **Tools & Utilities** | common tools and utilities for developing CUDA applications, multi-node multi-gpu infrastructure | @@ -67,42 +65,6 @@ In addition being a C++ library, RAFT also provides 2 Python libraries: ![RAFT is a C++ header-only template library with optional shared library and lightweight Python wrappers](img/arch.png) -## Use cases - -### Vector Similarity Search - -RAFT contains state-of-the-art implementations of approximate nearest neighbors search (ANNS) algorithms on the GPU, such as: - -* [Brute force](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#brute-force). Performs a brute force nearest neighbors search without an index. -* [IVF-Flat](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-flat) and [IVF-PQ](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#ivf-pq). Use an inverted file index structure to map contents to their locations. 
IVF-PQ additionally uses product quantization to reduce the memory usage of vectors. These methods were originally popularized by the [FAISS](https://github.com/facebookresearch/faiss) library. -* [CAGRA](https://docs.rapids.ai/api/raft/nightly/pylibraft_api/neighbors/#cagra) (Cuda Anns GRAph-based). Uses a fast ANNS graph construction and search implementation optimized for the GPU. CAGRA outperforms state-of-the art CPU methods (i.e. HNSW) for large batch queries, single queries, and graph construction time. - -Projects that use the RAFT ANNS algorithms for accelerating vector search include: [Milvus](https://milvus.io/), [Redis](https://redis.io/), and [Faiss](https://github.com/facebookresearch/faiss). - -Please see the example [Jupyter notebook](https://github.com/rapidsai/raft/blob/HEAD/notebooks/VectorSearch_QuestionRetrieval.ipynb) to get started RAFT for vector search in Python. - - - -### Information Retrieval - -RAFT contains a catalog of reusable primitives for composing algorithms that require fast neighborhood computations, such as - -1. Computing distances between vectors and computing kernel gramm matrices -2. Performing ball radius queries for constructing epsilon neighborhoods -3. Clustering points to partition a space for smaller and faster searches -4. Constructing neighborhood "connectivities" graphs from dense vectors - -### Machine Learning - -RAFT's primitives are used in several RAPIDS libraries, including [cuML](https://github.com/rapidsai/cuml), [cuGraph](https://github.com/rapidsai/cugraph), and [cuOpt](https://github.com/rapidsai/cuopt) to build many end-to-end machine learning algorithms that span a large spectrum of different applications, including -- data generation -- model evaluation -- classification and regression -- clustering -- manifold learning -- dimensionality reduction. - -RAFT is also used by the popular collaborative filtering library [implicit](https://github.com/benfred/implicit) for recommender systems. ## Is RAFT right for me? @@ -327,70 +289,3 @@ When citing RAFT generally, please consider referencing this Github project. year={2022} } ``` -If citing the sparse pairwise distances API, please consider using the following bibtex: -```bibtex -@article{nolet2021semiring, - title={Semiring primitives for sparse neighborhood methods on the gpu}, - author={Nolet, Corey J and Gala, Divye and Raff, Edward and Eaton, Joe and Rees, Brad and Zedlewski, John and Oates, Tim}, - journal={arXiv preprint arXiv:2104.06357}, - year={2021} -} -``` - -If citing the single-linkage agglomerative clustering APIs, please consider the following bibtex: -```bibtex -@misc{nolet2023cuslink, - title={cuSLINK: Single-linkage Agglomerative Clustering on the GPU}, - author={Corey J. 
Nolet and Divye Gala and Alex Fender and Mahesh Doijade and Joe Eaton and Edward Raff and John Zedlewski and Brad Rees and Tim Oates}, - year={2023}, - eprint={2306.16354}, - archivePrefix={arXiv}, - primaryClass={cs.LG} -} -``` - -If citing CAGRA, please consider the following bibtex: -```bibtex -@misc{ootomo2023cagra, - title={CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search for GPUs}, - author={Hiroyuki Ootomo and Akira Naruse and Corey Nolet and Ray Wang and Tamas Feher and Yong Wang}, - year={2024}, - series = {ICDE '24} -} -``` - -If citing the k-selection routines, please consider the following bibtex: - -```bibtex -@proceedings{10.1145/3581784, - title = {Parallel Top-K Algorithms on GPU: A Comprehensive Study and New Methods}, - author={Jingrong Zhang, Akira Naruse, Xipeng Li, and Yong Wang}, - year = {2023}, - isbn = {9798400701092}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - location = {Denver, CO, USA}, - series = {SC '23} -} -``` - -If citing the nearest neighbors descent API, please consider the following bibtex: -```bibtex -@inproceedings{10.1145/3459637.3482344, - author = {Wang, Hui and Zhao, Wan-Lei and Zeng, Xiangxiang and Yang, Jianye}, - title = {Fast K-NN Graph Construction by GPU Based NN-Descent}, - year = {2021}, - isbn = {9781450384469}, - publisher = {Association for Computing Machinery}, - address = {New York, NY, USA}, - url = {https://doi.org/10.1145/3459637.3482344}, - doi = {10.1145/3459637.3482344}, - abstract = {NN-Descent is a classic k-NN graph construction approach. It is still widely employed in machine learning, computer vision, and information retrieval tasks due to its efficiency and genericness. However, the current design only works well on CPU. In this paper, NN-Descent has been redesigned to adapt to the GPU architecture. A new graph update strategy called selective update is proposed. It reduces the data exchange between GPU cores and GPU global memory significantly, which is the processing bottleneck under GPU computation architecture. This redesign leads to full exploitation of the parallelism of the GPU hardware. In the meantime, the genericness, as well as the simplicity of NN-Descent, are well-preserved. Moreover, a procedure that allows to k-NN graph to be merged efficiently on GPU is proposed. It makes the construction of high-quality k-NN graphs for out-of-GPU-memory datasets tractable. Our approach is 100-250\texttimes{} faster than the single-thread NN-Descent and is 2.5-5\texttimes{} faster than the existing GPU-based approaches as we tested on million as well as billion scale datasets.}, - booktitle = {Proceedings of the 30th ACM International Conference on Information \& Knowledge Management}, - pages = {1929–1938}, - numpages = {10}, - keywords = {high-dimensional, nn-descent, gpu, k-nearest neighbor graph}, - location = {Virtual Event, Queensland, Australia}, - series = {CIKM '21} -} -``` diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index eb28cc1626..38318e8ec8 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -86,13 +86,14 @@ using KeyValueIndexOp = detail::KeyValueIndexOp; * @param[out] n_iter Number of iterations run. 
diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh
index eb28cc1626..38318e8ec8 100644
--- a/cpp/include/raft/cluster/kmeans.cuh
+++ b/cpp/include/raft/cluster/kmeans.cuh
@@ -86,13 +86,14 @@ using KeyValueIndexOp = detail::KeyValueIndexOp;
  * @param[out] n_iter               Number of iterations run.
  */
 template
-void fit(raft::resources const& handle,
-         const KMeansParams& params,
-         raft::device_matrix_view X,
-         std::optional> sample_weight,
-         raft::device_matrix_view centroids,
-         raft::host_scalar_view inertia,
-         raft::host_scalar_view n_iter)
+[[deprecated("Use cuVS instead")]] void fit(
+  raft::resources const& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view X,
+  std::optional> sample_weight,
+  raft::device_matrix_view centroids,
+  raft::host_scalar_view inertia,
+  raft::host_scalar_view n_iter)
 {
   detail::kmeans_fit(handle, params, X, sample_weight, centroids, inertia, n_iter);
 }
@@ -150,14 +151,15 @@ void fit(raft::resources const& handle,
  *                                  their closest cluster center.
  */
 template
-void predict(raft::resources const& handle,
-             const KMeansParams& params,
-             raft::device_matrix_view X,
-             std::optional> sample_weight,
-             raft::device_matrix_view centroids,
-             raft::device_vector_view labels,
-             bool normalize_weight,
-             raft::host_scalar_view inertia)
+[[deprecated("Use cuVS instead")]] void predict(
+  raft::resources const& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view X,
+  std::optional> sample_weight,
+  raft::device_matrix_view centroids,
+  raft::device_vector_view labels,
+  bool normalize_weight,
+  raft::host_scalar_view inertia)
 {
   detail::kmeans_predict(
     handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia);
@@ -213,14 +215,15 @@ void predict(raft::resources const& handle,
  * @param[out] n_iter               Number of iterations run.
  */
 template
-void fit_predict(raft::resources const& handle,
-                 const KMeansParams& params,
-                 raft::device_matrix_view X,
-                 std::optional> sample_weight,
-                 std::optional> centroids,
-                 raft::device_vector_view labels,
-                 raft::host_scalar_view inertia,
-                 raft::host_scalar_view n_iter)
+[[deprecated("Use cuVS instead")]] void fit_predict(
+  raft::resources const& handle,
+  const KMeansParams& params,
+  raft::device_matrix_view X,
+  std::optional> sample_weight,
+  std::optional> centroids,
+  raft::device_vector_view labels,
+  raft::host_scalar_view inertia,
+  raft::host_scalar_view n_iter)
 {
   detail::kmeans_fit_predict(
     handle, params, X, sample_weight, centroids, labels, inertia, n_iter);
@@ -252,13 +255,13 @@ void transform(raft::resources const& handle,
 }
 
 template
-void transform(raft::resources const& handle,
-               const KMeansParams& params,
-               const DataT* X,
-               const DataT* centroids,
-               IndexT n_samples,
-               IndexT n_features,
-               DataT* X_new)
+[[deprecated("Use cuVS instead")]] void transform(raft::resources const& handle,
+                                                  const KMeansParams& params,
+                                                  const DataT* X,
+                                                  const DataT* centroids,
+                                                  IndexT n_samples,
+                                                  IndexT n_features,
+                                                  DataT* X_new)
 {
   detail::kmeans_transform(
     handle, params, X, centroids, n_samples, n_features, X_new);
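Call sites need no source changes; they only start emitting the warning. A sketch of an existing caller against the 24.x headers, where the mdspan template arguments and the `KMeansParams` spelling are assumptions:

```cpp
#include <optional>

#include <raft/cluster/kmeans.cuh>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/resources.hpp>

// Fit k-means on a [n_samples, n_features] device matrix. Compiles and runs
// as before; each call to the deprecated API now warns "Use cuVS instead".
void run_kmeans(raft::resources const& handle,
                raft::device_matrix_view<const float, int> X)
{
  raft::cluster::KMeansParams params;  // defaults; set params.n_clusters etc. as needed
  auto centroids =
    raft::make_device_matrix<float, int>(handle, params.n_clusters, X.extent(1));
  float inertia = 0.0f;
  int n_iter    = 0;
  raft::cluster::kmeans::fit(handle,
                             params,
                             X,
                             std::nullopt,  // no sample weights
                             centroids.view(),
                             raft::make_host_scalar_view(&inertia),
                             raft::make_host_scalar_view(&n_iter));
}
```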
diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh
index a1a182608b..7479047fce 100644
--- a/cpp/include/raft/cluster/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/kmeans_balanced.cuh
@@ -73,11 +73,11 @@ namespace raft::cluster::kmeans_balanced {
  *                       datatype. If DataT == MathT, this must be the identity.
  */
 template
-void fit(const raft::resources& handle,
-         kmeans_balanced_params const& params,
-         raft::device_matrix_view X,
-         raft::device_matrix_view centroids,
-         MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void fit(const raft::resources& handle,
+                                            kmeans_balanced_params const& params,
+                                            raft::device_matrix_view X,
+                                            raft::device_matrix_view centroids,
+                                            MappingOpT mapping_op = raft::identity_op())
 {
   RAFT_EXPECTS(X.extent(1) == centroids.extent(1),
                "Number of features in dataset and centroids are different");
@@ -131,12 +131,13 @@ template
-void predict(const raft::resources& handle,
-             kmeans_balanced_params const& params,
-             raft::device_matrix_view X,
-             raft::device_matrix_view centroids,
-             raft::device_vector_view labels,
-             MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void predict(
+  const raft::resources& handle,
+  kmeans_balanced_params const& params,
+  raft::device_matrix_view X,
+  raft::device_matrix_view centroids,
+  raft::device_vector_view labels,
+  MappingOpT mapping_op = raft::identity_op())
 {
   RAFT_EXPECTS(X.extent(0) == labels.extent(0),
                "Number of rows in dataset and labels are different");
@@ -196,12 +197,13 @@ template
-void fit_predict(const raft::resources& handle,
-                 kmeans_balanced_params const& params,
-                 raft::device_matrix_view X,
-                 raft::device_matrix_view centroids,
-                 raft::device_vector_view labels,
-                 MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void fit_predict(
+  const raft::resources& handle,
+  kmeans_balanced_params const& params,
+  raft::device_matrix_view X,
+  raft::device_matrix_view centroids,
+  raft::device_vector_view labels,
+  MappingOpT mapping_op = raft::identity_op())
 {
   auto centroids_const = raft::make_device_matrix_view(
     centroids.data_handle(), centroids.extent(0), centroids.extent(1));
@@ -255,14 +257,15 @@ template
-void build_clusters(const raft::resources& handle,
-                    const kmeans_balanced_params& params,
-                    raft::device_matrix_view X,
-                    raft::device_matrix_view centroids,
-                    raft::device_vector_view labels,
-                    raft::device_vector_view cluster_sizes,
-                    MappingOpT mapping_op = raft::identity_op(),
-                    std::optional> X_norm = std::nullopt)
+[[deprecated("Use cuVS instead")]] void build_clusters(
+  const raft::resources& handle,
+  const kmeans_balanced_params& params,
+  raft::device_matrix_view X,
+  raft::device_matrix_view centroids,
+  raft::device_vector_view labels,
+  raft::device_vector_view cluster_sizes,
+  MappingOpT mapping_op = raft::identity_op(),
+  std::optional> X_norm = std::nullopt)
 {
   RAFT_EXPECTS(X.extent(0) == labels.extent(0),
                "Number of rows in dataset and labels are different");
@@ -334,13 +337,14 @@ template
-void calc_centers_and_sizes(const raft::resources& handle,
-                            raft::device_matrix_view X,
-                            raft::device_vector_view labels,
-                            raft::device_matrix_view centroids,
-                            raft::device_vector_view cluster_sizes,
-                            bool reset_counters = true,
-                            MappingOpT mapping_op = raft::identity_op())
+[[deprecated("Use cuVS instead")]] void calc_centers_and_sizes(
+  const raft::resources& handle,
+  raft::device_matrix_view X,
+  raft::device_vector_view labels,
+  raft::device_matrix_view centroids,
+  raft::device_vector_view cluster_sizes,
+  bool reset_counters = true,
+  MappingOpT mapping_op = raft::identity_op())
 {
   RAFT_EXPECTS(X.extent(0) == labels.extent(0),
                "Number of rows in dataset and labels are different");
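`kmeans_balanced` is the variant RAFT's IVF indexes use for coarse-quantizer training, and its migration target is the same. A short usage sketch of the deprecated entry point (mdspan template arguments and the parameter-struct fields are assumptions based on the 24.x headers):

```cpp
#include <raft/cluster/kmeans_balanced.cuh>
#include <raft/core/resources.hpp>

// Train balanced cluster centers: X is [n_rows, dim], centroids is [n_lists, dim].
void train_coarse_centers(raft::resources const& handle,
                          raft::device_matrix_view<const float, int> X,
                          raft::device_matrix_view<float, int> centroids)
{
  raft::cluster::kmeans_balanced_params params;
  params.n_iters = 20;  // balanced k-means typically converges in a few sweeps
  raft::cluster::kmeans_balanced::fit(handle, params, X, centroids);  // warns
}
```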
diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh
index d9eba6edc5..067445c542 100644
--- a/cpp/include/raft/cluster/single_linkage.cuh
+++ b/cpp/include/raft/cluster/single_linkage.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,14 +50,14 @@ namespace raft::cluster {
 template
-void single_linkage(raft::resources const& handle,
-                    const value_t* X,
-                    size_t m,
-                    size_t n,
-                    raft::distance::DistanceType metric,
-                    linkage_output* out,
-                    int c,
-                    size_t n_clusters)
+[[deprecated("Use cuVS instead")]] void single_linkage(raft::resources const& handle,
+                                                       const value_t* X,
+                                                       size_t m,
+                                                       size_t n,
+                                                       raft::distance::DistanceType metric,
+                                                       linkage_output* out,
+                                                       int c,
+                                                       size_t n_clusters)
 {
   detail::single_linkage(
     handle, X, m, n, metric, out, c, n_clusters);
@@ -87,13 +87,14 @@ constexpr int DEFAULT_CONST_C = 15;
  *                           control of k. The algorithm will set `k = log(n) + c`
  */
 template
-void single_linkage(raft::resources const& handle,
-                    raft::device_matrix_view X,
-                    raft::device_matrix_view dendrogram,
-                    raft::device_vector_view labels,
-                    raft::distance::DistanceType metric,
-                    size_t n_clusters,
-                    std::optional c = std::make_optional(DEFAULT_CONST_C))
+[[deprecated("Use cuVS instead")]] void single_linkage(
+  raft::resources const& handle,
+  raft::device_matrix_view X,
+  raft::device_matrix_view dendrogram,
+  raft::device_vector_view labels,
+  raft::distance::DistanceType metric,
+  size_t n_clusters,
+  std::optional c = std::make_optional(DEFAULT_CONST_C))
 {
   linkage_output out_arrs;
   out_arrs.children = dendrogram.data_handle();
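Because `single_linkage` and the other clustering entry points stay in use inside RAPIDS during the transition, downstream projects may prefer to acknowledge the deprecation in one place rather than at every call site. A generic scoped-suppression sketch for GCC/Clang (the wrapper name is hypothetical; MSVC would use `#pragma warning(suppress : 4996)` instead):

```cpp
#include <utility>

#include <raft/cluster/single_linkage.cuh>

// Fence the deprecated call in one helper so the rest of the build still
// surfaces new deprecation warnings unsuppressed.
template <typename... Args>
void single_linkage_compat(Args&&... args)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
  raft::cluster::single_linkage(std::forward<Args>(args)...);
#pragma GCC diagnostic pop
}
```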
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 2d41e029fe..dcbfbfdbc3 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -35,42 +35,43 @@ template
-void distance(raft::resources const& handle,
-              const DataT* x,
-              const DataT* y,
-              OutT* dist,
-              IdxT m,
-              IdxT n,
-              IdxT k,
-              void* workspace,
-              size_t worksize,
-              FinalLambda fin_op,
-              bool isRowMajor  = true,
-              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle,
+                                                 const DataT* x,
+                                                 const DataT* y,
+                                                 OutT* dist,
+                                                 IdxT m,
+                                                 IdxT n,
+                                                 IdxT k,
+                                                 void* workspace,
+                                                 size_t worksize,
+                                                 FinalLambda fin_op,
+                                                 bool isRowMajor  = true,
+                                                 DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
 template
-void distance(raft::resources const& handle,
-              const DataT* x,
-              const DataT* y,
-              OutT* dist,
-              IdxT m,
-              IdxT n,
-              IdxT k,
-              void* workspace,
-              size_t worksize,
-              bool isRowMajor  = true,
-              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle,
+                                                 const DataT* x,
+                                                 const DataT* y,
+                                                 OutT* dist,
+                                                 IdxT m,
+                                                 IdxT n,
+                                                 IdxT k,
+                                                 void* workspace,
+                                                 size_t worksize,
+                                                 bool isRowMajor  = true,
+                                                 DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
 template
-size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] size_t getWorkspaceSize(
+  const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT;
 
 template
-void distance(raft::resources const& handle,
-              const DataT* x,
-              const DataT* y,
-              OutT* dist,
-              IdxT m,
-              IdxT n,
-              IdxT k,
-              bool isRowMajor  = true,
-              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(raft::resources const& handle,
+                                                 const DataT* x,
+                                                 const DataT* y,
+                                                 OutT* dist,
+                                                 IdxT m,
+                                                 IdxT n,
+                                                 IdxT k,
+                                                 bool isRowMajor  = true,
+                                                 DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
 template
-void pairwise_distance(raft::resources const& handle,
-                       const Type* x,
-                       const Type* y,
-                       Type* dist,
-                       IdxT m,
-                       IdxT n,
-                       IdxT k,
-                       rmm::device_uvector& workspace,
-                       raft::distance::DistanceType metric,
-                       bool isRowMajor = true,
-                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void pairwise_distance(raft::resources const& handle,
+                                                          const Type* x,
+                                                          const Type* y,
+                                                          Type* dist,
+                                                          IdxT m,
+                                                          IdxT n,
+                                                          IdxT k,
+                                                          rmm::device_uvector& workspace,
+                                                          raft::distance::DistanceType metric,
+                                                          bool isRowMajor = true,
+                                                          Type metric_arg = 2.0f) RAFT_EXPLICIT;
 
 template
-void pairwise_distance(raft::resources const& handle,
-                       const Type* x,
-                       const Type* y,
-                       Type* dist,
-                       IdxT m,
-                       IdxT n,
-                       IdxT k,
-                       raft::distance::DistanceType metric,
-                       bool isRowMajor = true,
-                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void pairwise_distance(raft::resources const& handle,
+                                                          const Type* x,
+                                                          const Type* y,
+                                                          Type* dist,
+                                                          IdxT m,
+                                                          IdxT n,
+                                                          IdxT k,
+                                                          raft::distance::DistanceType metric,
+                                                          bool isRowMajor = true,
+                                                          Type metric_arg = 2.0f) RAFT_EXPLICIT;
 
 template
-void distance(raft::resources const& handle,
-              raft::device_matrix_view const x,
-              raft::device_matrix_view const y,
-              raft::device_matrix_view dist,
-              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void distance(
+  raft::resources const& handle,
+  raft::device_matrix_view const x,
+  raft::device_matrix_view const y,
+  raft::device_matrix_view dist,
+  DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
 template
-void pairwise_distance(raft::resources const& handle,
-                       device_matrix_view const x,
-                       device_matrix_view const y,
-                       device_matrix_view dist,
-                       raft::distance::DistanceType metric,
-                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void pairwise_distance(
+  raft::resources const& handle,
+  device_matrix_view const x,
+  device_matrix_view const y,
+  device_matrix_view dist,
+  raft::distance::DistanceType metric,
+  Type metric_arg = 2.0f) RAFT_EXPLICIT;
 
 };  // namespace distance
 };  // namespace raft
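The mdspan overload above is the most common entry point for pairwise distances. A sketch of computing a dense all-pairs distance matrix (the public header path and the template arguments on the views are assumptions based on the 24.x API):

```cpp
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resources.hpp>
#include <raft/distance/distance.cuh>

// All-pairs L2 distances between two row-major device matrices:
// x is [m, k], y is [n, k], dist is [m, n].
void l2_all_pairs(raft::resources const& handle,
                  raft::device_matrix_view<const float, int> x,
                  raft::device_matrix_view<const float, int> y,
                  raft::device_matrix_view<float, int> dist)
{
  raft::distance::pairwise_distance(
    handle, x, y, dist, raft::distance::DistanceType::L2Expanded);  // warns
}
```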
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh
index 20c88f3318..09938020b9 100644
--- a/cpp/include/raft/neighbors/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/ball_cover.cuh
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 #ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
 #include "ball_cover-inl.cuh"
diff --git a/cpp/include/raft/neighbors/brute_force_types.hpp b/cpp/include/raft/neighbors/brute_force_types.hpp
index a8f073edc6..4511f8d8ba 100644
--- a/cpp/include/raft/neighbors/brute_force_types.hpp
+++ b/cpp/include/raft/neighbors/brute_force_types.hpp
@@ -94,12 +94,14 @@ struct index : ann::index {
    * the dataset. If the dataset is in host memory, it will be copied to the device and the
    * index will own the device memory.
    */
+
   template
-  index(raft::resources const& res,
-        mdspan, row_major, data_accessor> dataset,
-        std::optional>&& norms,
-        raft::distance::DistanceType metric,
-        T metric_arg = 0.0)
+  [[deprecated("Use cuVS instead")]] index(
+    raft::resources const& res,
+    mdspan, row_major, data_accessor> dataset,
+    std::optional>&& norms,
+    raft::distance::DistanceType metric,
+    T metric_arg = 0.0)
     : ann::index(),
       metric_(metric),
       dataset_(make_device_matrix(res, 0, 0)),
@@ -116,11 +118,12 @@ struct index : ann::index {
    * This class stores a non-owning reference to the dataset and norms here.
    * Having precomputed norms gives us a performance advantage at query time.
    */
-  index(raft::resources const& res,
-        raft::device_matrix_view dataset_view,
-        std::optional> norms_view,
-        raft::distance::DistanceType metric,
-        T metric_arg = 0.0)
+  [[deprecated("Use cuVS instead")]] index(
+    raft::resources const& res,
+    raft::device_matrix_view dataset_view,
+    std::optional> norms_view,
+    raft::distance::DistanceType metric,
+    T metric_arg = 0.0)
     : ann::index(),
       metric_(metric),
       dataset_(make_device_matrix(res, 0, 0)),
@@ -131,10 +134,11 @@ struct index : ann::index {
   }
 
   template
-  index(raft::resources const& res,
-        index_params const& params,
-        mdspan, row_major, data_accessor> dataset,
-        std::optional>&& norms = std::nullopt)
+  [[deprecated("Use cuVS instead")]] index(
+    raft::resources const& res,
+    index_params const& params,
+    mdspan, row_major, data_accessor> dataset,
+    std::optional>&& norms = std::nullopt)
     : ann::index(),
       metric_(params.metric),
       dataset_(make_device_matrix(res, 0, 0)),
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 97c9c0d098..bc7c380db1 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -201,8 +201,9 @@ struct index : ann::index {
   ~index() = default;
 
   /** Construct an empty index. */
-  index(raft::resources const& res,
-        raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
+  [[deprecated("Use cuVS instead")]] index(
+    raft::resources const& res,
+    raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
     : ann::index(),
       metric_(metric),
       graph_(make_device_matrix(res, 0, 0)),
@@ -265,10 +266,11 @@ struct index : ann::index {
    *
    */
   template
-  index(raft::resources const& res,
-        raft::distance::DistanceType metric,
-        mdspan, row_major, data_accessor> dataset,
-        mdspan, row_major, graph_accessor> knn_graph)
+  [[deprecated("Use cuVS instead")]] index(
+    raft::resources const& res,
+    raft::distance::DistanceType metric,
+    mdspan, row_major, data_accessor> dataset,
+    mdspan, row_major, graph_accessor> knn_graph)
     : ann::index(),
       metric_(metric),
       graph_(make_device_matrix(res, 0, 0)),
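Note that only the index-type constructors are deprecated here; the intended pattern has always been to obtain these objects from the factory functions, which construct them internally. A sketch of building and searching a CAGRA index in C++ (the parameter structs, template arguments, and index types below are assumptions based on the 24.x `raft::neighbors::cagra` API and may differ in detail):

```cpp
#include <cstdint>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/resources.hpp>
#include <raft/neighbors/cagra.cuh>

// dataset and queries are row-major device matrices of shape [rows, dim].
void cagra_demo(raft::resources const& res,
                raft::device_matrix_view<const float, int64_t> dataset,
                raft::device_matrix_view<const float, int64_t> queries)
{
  raft::neighbors::cagra::index_params build_params;
  auto index = raft::neighbors::cagra::build(res, build_params, dataset);

  int64_t topk   = 10;
  auto neighbors = raft::make_device_matrix<uint32_t, int64_t>(res, queries.extent(0), topk);
  auto distances = raft::make_device_matrix<float, int64_t>(res, queries.extent(0), topk);
  raft::neighbors::cagra::search_params search_params;
  raft::neighbors::cagra::search(
    res, search_params, index, queries, neighbors.view(), distances.view());
}
```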
diff --git a/cpp/include/raft/neighbors/hnsw_types.hpp b/cpp/include/raft/neighbors/hnsw_types.hpp
index f90de6f01b..f78571f491 100644
--- a/cpp/include/raft/neighbors/hnsw_types.hpp
+++ b/cpp/include/raft/neighbors/hnsw_types.hpp
@@ -38,7 +38,6 @@ struct search_params : ann::search_params {
   int num_threads = 0;  // number of host threads to use for concurrent searches. Value of 0
                         // automatically maximizes parallelism
 };
-
 template
 struct index : ann::index {
  public:
@@ -51,7 +50,10 @@ struct index : ann::index {
    * @param[in] dim dimensions of the training dataset
    * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct")
    */
-  index(int dim, raft::distance::DistanceType metric) : dim_{dim}, metric_{metric} {}
+  [[deprecated("Use cuVS instead")]] index(int dim, raft::distance::DistanceType metric)
+    : dim_{dim}, metric_{metric}
+  {
+  }
 
   /** @brief Get underlying index
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 7605bd82a3..2cafceb512 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -261,12 +261,12 @@ struct index : ann::index {
   ~index() = default;
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(raft::resources const& res,
-        raft::distance::DistanceType metric,
-        uint32_t n_lists,
-        bool adaptive_centers,
-        bool conservative_memory_allocation,
-        uint32_t dim)
+  [[deprecated("Use cuVS instead")]] index(raft::resources const& res,
+                                           raft::distance::DistanceType metric,
+                                           uint32_t n_lists,
+                                           bool adaptive_centers,
+                                           bool conservative_memory_allocation,
+                                           uint32_t dim)
     : ann::index(),
       veclen_(calculate_veclen(dim)),
       metric_(metric),
@@ -285,7 +285,9 @@ struct index : ann::index {
   }
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(raft::resources const& res, const index_params& params, uint32_t dim)
+  [[deprecated("Use cuVS instead")]] index(raft::resources const& res,
+                                           const index_params& params,
+                                           uint32_t dim)
     : index(res,
             params.metric,
             params.n_lists,
diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp
index 3ee350c6fb..d5906d621c 100644
--- a/cpp/include/raft/neighbors/ivf_pq_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -361,14 +361,14 @@ struct index : ann::index {
   ~index() = default;
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(raft::resources const& handle,
-        raft::distance::DistanceType metric,
-        codebook_gen codebook_kind,
-        uint32_t n_lists,
-        uint32_t dim,
-        uint32_t pq_bits = 8,
-        uint32_t pq_dim  = 0,
-        bool conservative_memory_allocation = false)
+  [[deprecated("Use cuVS instead")]] index(raft::resources const& handle,
+                                           raft::distance::DistanceType metric,
+                                           codebook_gen codebook_kind,
+                                           uint32_t n_lists,
+                                           uint32_t dim,
+                                           uint32_t pq_bits = 8,
+                                           uint32_t pq_dim  = 0,
+                                           bool conservative_memory_allocation = false)
     : ann::index(),
       metric_(metric),
      codebook_kind_(codebook_kind),
@@ -391,7 +391,9 @@ struct index : ann::index {
   }
 
   /** Construct an empty index. It needs to be trained and then populated. */
-  index(raft::resources const& handle, const index_params& params, uint32_t dim)
+  [[deprecated("Use cuVS instead")]] index(raft::resources const& handle,
+                                           const index_params& params,
+                                           uint32_t dim)
     : index(handle,
             params.metric,
             params.codebook_kind,
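As with CAGRA, the IVF index objects are normally produced by `build()` rather than constructed directly. A sketch for IVF-PQ, the memory-efficient index described earlier (names follow the 24.x `raft::neighbors::ivf_pq` API; template arguments and parameter fields are assumptions):

```cpp
#include <cstdint>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/resources.hpp>
#include <raft/neighbors/ivf_pq.cuh>

void ivf_pq_demo(raft::resources const& res,
                 raft::device_matrix_view<const float, int64_t> dataset,
                 raft::device_matrix_view<const float, int64_t> queries)
{
  raft::neighbors::ivf_pq::index_params build_params;
  build_params.n_lists = 1024;  // number of coarse IVF partitions
  auto index = raft::neighbors::ivf_pq::build(res, build_params, dataset);

  int64_t topk   = 10;
  auto neighbors = raft::make_device_matrix<int64_t, int64_t>(res, queries.extent(0), topk);
  auto distances = raft::make_device_matrix<float, int64_t>(res, queries.extent(0), topk);
  raft::neighbors::ivf_pq::search_params search_params;
  search_params.n_probes = 64;  // partitions scanned per query
  raft::neighbors::ivf_pq::search(
    res, search_params, index, queries, neighbors.view(), distances.view());
}
```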
diff --git a/cpp/include/raft/neighbors/nn_descent_types.hpp b/cpp/include/raft/neighbors/nn_descent_types.hpp
index eb01a423be..9decf47f39 100644
--- a/cpp/include/raft/neighbors/nn_descent_types.hpp
+++ b/cpp/include/raft/neighbors/nn_descent_types.hpp
@@ -101,7 +101,10 @@ struct index : ann::index {
    * @param n_cols number of cols in knn-graph
    * @param return_distances whether to allocate and get distances information
    */
-  index(raft::resources const& res, int64_t n_rows, int64_t n_cols, bool return_distances = false)
+  [[deprecated("Use cuVS instead")]] index(raft::resources const& res,
+                                           int64_t n_rows,
+                                           int64_t n_cols,
+                                           bool return_distances = false)
     : ann::index(),
       res_{res},
       metric_{raft::distance::DistanceType::L2Expanded},
@@ -128,11 +131,12 @@ struct index : ann::index {
    *        storing knn-graph distances
    * @param return_distances whether to allocate and get distances information
    */
-  index(raft::resources const& res,
-        raft::host_matrix_view graph_view,
-        std::optional> distances_view =
-          std::nullopt,
-        bool return_distances = false)
+  [[deprecated("Use cuVS instead")]] index(
+    raft::resources const& res,
+    raft::host_matrix_view graph_view,
+    std::optional> distances_view =
+      std::nullopt,
+    bool return_distances = false)
     : ann::index(),
       res_{res},
       metric_{raft::distance::DistanceType::L2Expanded},
diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh
index 7948a0e4f2..216e1b9ab5 100644
--- a/cpp/include/raft/neighbors/refine-ext.cuh
+++ b/cpp/include/raft/neighbors/refine-ext.cuh
@@ -29,24 +29,24 @@ namespace raft::neighbors {
 template
-void refine(raft::resources const& handle,
-            raft::device_matrix_view dataset,
-            raft::device_matrix_view queries,
-            raft::device_matrix_view neighbor_candidates,
-            raft::device_matrix_view indices,
-            raft::device_matrix_view distances,
-            raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded)
-  RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void refine(
+  raft::resources const& handle,
+  raft::device_matrix_view dataset,
+  raft::device_matrix_view queries,
+  raft::device_matrix_view neighbor_candidates,
+  raft::device_matrix_view indices,
+  raft::device_matrix_view distances,
+  raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
 
 template
-void refine(raft::resources const& handle,
-            raft::host_matrix_view dataset,
-            raft::host_matrix_view queries,
-            raft::host_matrix_view neighbor_candidates,
-            raft::host_matrix_view indices,
-            raft::host_matrix_view distances,
-            raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded)
-  RAFT_EXPLICIT;
+[[deprecated("Use cuVS instead")]] void refine(
+  raft::resources const& handle,
+  raft::host_matrix_view dataset,
+  raft::host_matrix_view queries,
+  raft::host_matrix_view neighbor_candidates,
+  raft::host_matrix_view indices,
+  raft::host_matrix_view distances,
+  raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
 
 }  // namespace raft::neighbors
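`refine()` re-ranks approximate candidates against the original vectors, a common last step after an IVF-PQ or CAGRA search. A sketch of the device overload deprecated above (the public header path and mdspan template arguments are assumptions):

```cpp
#include <cstdint>

#include <raft/core/resources.hpp>
#include <raft/neighbors/refine.cuh>

// Re-rank k0 approximate candidates per query down to the k exact-best:
// dataset [n, dim], queries [q, dim], candidates [q, k0] -> indices/distances [q, k].
void rerank(raft::resources const& handle,
            raft::device_matrix_view<const float, int64_t> dataset,
            raft::device_matrix_view<const float, int64_t> queries,
            raft::device_matrix_view<const int64_t, int64_t> candidates,
            raft::device_matrix_view<int64_t, int64_t> indices,
            raft::device_matrix_view<float, int64_t> distances)
{
  raft::neighbors::refine(handle, dataset, queries, candidates, indices, distances,
                          raft::distance::DistanceType::L2Expanded);  // warns
}
```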
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8b2040baa2..7a287b689f 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -71,7 +71,7 @@
 .. attention::
 
-    The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called `cuVS <https://github.com/rapidsai/cuvs>`_. We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by RAPIDS 24.08 (August) release.
+    The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called `cuVS <https://github.com/rapidsai/cuvs>`_. We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by the RAPIDS 24.10 (October) release, and they will be removed from RAFT altogether in the 24.12 (December) release.
 
 """
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index fc11a56ac8..7bac2047fc 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -2,6 +2,10 @@
 
 This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU.
 
+> [!IMPORTANT]
+> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). As a result, `raft-ann-bench` is being migrated to `cuvs-bench` and will be removed from RAFT altogether in the 24.12 (December) release.
+
+
 ## Table of Contents
 
 - [Installing the benchmarks](#installing-the-benchmarks)