From 52760956341d149e3616e58cc272ac78853e10c7 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 Nov 2021 12:56:00 +0100
Subject: [PATCH 1/8] Fix coalesced access checks via a new CoalescedAccess
 helper struct.

---
 cpp/include/raft/linalg/matrix_vector_op.cuh | 44 +++++++--------
 cpp/include/raft/vectorized.cuh              | 56 ++++++++++++++++++++
 2 files changed, 79 insertions(+), 21 deletions(-)
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index e948c3e673..944dee012f 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,18 @@
 namespace raft {
 namespace linalg {
 
+namespace {
+template <size_t VecBytes>
+struct AlignedAccess {
+  template <typename T>
+  static inline bool test(const T *matrix, size_t strideBytes) {
+    return CoalescedAccess<VecBytes>::isAligned(matrix) &&
+           CoalescedAccess<VecBytes>::isAligned(strideBytes) &&
+           CoalescedAccess<sizeof(T)>::isAligned(VecBytes);
+  }
+};
+};  // namespace
+
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
 __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
                                      const Type *vector, IdxType D, IdxType N,
@@ -101,24 +113,19 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
   IdxType stride = rowMajor ? D : N;
   size_t stride_bytes = stride * sizeof(Type);
 
-  auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
-    return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
-           reinterpret_cast<uintptr_t>(matrix) % sizeof(Type);
-  };
-
-  if (test_aligned_access(16)) {
+  if (AlignedAccess<16>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (test_aligned_access(8)) {
+  } else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (test_aligned_access(4)) {
+  } else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (test_aligned_access(2)) {
+  } else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (1 / sizeof(Type)) {
+  } else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
   } else {
@@ -209,24 +216,19 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
   IdxType stride = rowMajor ? D : N;
   size_t stride_bytes = stride * sizeof(Type);
 
-  auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
-    return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
-           reinterpret_cast<uintptr_t>(matrix) % sizeof(Type);
-  };
-
-  if (test_aligned_access(16)) {
+  if (AlignedAccess<16>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (test_aligned_access(8)) {
+  } else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (test_aligned_access(4)) {
+  } else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (test_aligned_access(2)) {
+  } else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (1 / sizeof(Type)) {
+  } else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
     matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
   } else {
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
index ceffbcca78..1e3e8a0253 100644
--- a/cpp/include/raft/vectorized.cuh
+++ b/cpp/include/raft/vectorized.cuh
@@ -21,6 +21,62 @@
 
 namespace raft {
 
+/**
+ * @brief Check pointers for byte alignment.
+ *
+ * @tparam VecBytes size of the alignment in bytes.
+ */
+template <std::size_t VecBytes>
+struct CoalescedAccess {
+ private:
+  static constexpr std::size_t VecMod = VecBytes - 1;
+
+#define CoalescedAccess_CHECK_TYPE(T)                                    \
+  static_assert(std::is_pointer<T>::value || std::is_integral<T>::value, \
+                "Only pointer or integral types make sense here")
+
+ public:
+  static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two.");
+
+  /** Number of elements fitting in a chunk of memory of size VecBytes. */
+  template <typename T>
+  static constexpr std::size_t nElems = VecBytes / sizeof(T);
+
+  /** Tell whether the pointer is VecBytes-aligned. */
+  template <typename PtrT>
+  static HDI bool isAligned(PtrT p) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    return (reinterpret_cast<std::size_t>(p) & VecMod) == 0;
+  }
+
+  /** Tell whether two pointers have the same address modulo VecBytes. */
+  template <typename PtrT, typename PtrS>
+  static HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    CoalescedAccess_CHECK_TYPE(PtrS);
+    auto x = reinterpret_cast<std::size_t>(a);
+    auto y = reinterpret_cast<std::size_t>(b);
+    return (x & VecMod) == (y & VecMod);
+  }
+
+  /** Get this or next VecBytes-aligned address. */
+  template <typename PtrT>
+  static HDI PtrT roundUp(PtrT p) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    auto x = reinterpret_cast<std::size_t>(p);
+    return reinterpret_cast<PtrT>(x + VecMod - ((x - 1) & VecMod));
+  }
+
+  /** Get this or previous VecBytes-aligned address. */
+  template <typename PtrT>
+  static HDI PtrT roundDown(PtrT p) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    auto x = reinterpret_cast<std::size_t>(p);
+    return reinterpret_cast<PtrT>(x - (x & VecMod));
+  }
+#undef CoalescedAccess_CHECK_TYPE
+};
+
 template <typename math_, int VecLen>
 struct IOType {};
 template <>

From 469bd69fd11240ae4d9be98aecb8758bf642f754 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 Nov 2021 13:24:43 +0100
Subject: [PATCH 2/8] Put CoalescedAccess in a separate file

---
 cpp/include/raft/coalesced_access.cuh        | 77 ++++++++++++++++++++
 cpp/include/raft/linalg/matrix_vector_op.cuh |  1 +
 cpp/include/raft/vectorized.cuh              | 56 --------------
 3 files changed, 78 insertions(+), 56 deletions(-)
 create mode 100644 cpp/include/raft/coalesced_access.cuh

diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh
new file mode 100644
index 0000000000..b459a3378d
--- /dev/null
+++ b/cpp/include/raft/coalesced_access.cuh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cuda_utils.cuh"
+
+namespace raft {
+
+/**
+ * @brief Check pointers for byte alignment.
+ *
+ * @tparam VecBytes size of the alignment in bytes.
+ */
+template <std::size_t VecBytes>
+struct CoalescedAccess {
+ private:
+  static constexpr std::size_t VecMod = VecBytes - 1;
+
+#define CoalescedAccess_CHECK_TYPE(T)                                    \
+  static_assert(std::is_pointer<T>::value || std::is_integral<T>::value, \
+                "Only pointer or integral types make sense here")
+
+ public:
+  static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two.");
+
+  /** Number of elements fitting in a chunk of memory of size VecBytes. */
+  template <typename T>
+  static constexpr std::size_t nElems = VecBytes / sizeof(T);
+
+  /** Tell whether the pointer is VecBytes-aligned. */
+  template <typename PtrT>
+  static constexpr HDI bool isAligned(PtrT p) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    return (reinterpret_cast<std::size_t>(p) & VecMod) == 0;
+  }
+
+  /** Tell whether two pointers have the same address modulo VecBytes. */
+  template <typename PtrT, typename PtrS>
+  static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    CoalescedAccess_CHECK_TYPE(PtrS);
+    auto x = reinterpret_cast<std::size_t>(a);
+    auto y = reinterpret_cast<std::size_t>(b);
+    return (x & VecMod) == (y & VecMod);
+  }
+
+  /** Get this or next VecBytes-aligned address. */
+  template <typename PtrT>
+  static constexpr HDI PtrT roundUp(PtrT p) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    auto x = reinterpret_cast<std::size_t>(p);
+    return reinterpret_cast<PtrT>(x + VecMod - ((x - 1) & VecMod));
+  }
+
+  /** Get this or previous VecBytes-aligned address. */
+  template <typename PtrT>
+  static constexpr HDI PtrT roundDown(PtrT p) noexcept {
+    CoalescedAccess_CHECK_TYPE(PtrT);
+    auto x = reinterpret_cast<std::size_t>(p);
+    return reinterpret_cast<PtrT>(x - (x & VecMod));
+  }
+#undef CoalescedAccess_CHECK_TYPE
+};
+
+};  // namespace raft
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 944dee012f..9b43584c93 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/coalesced_access.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/vectorized.cuh>
 
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
index 1e3e8a0253..ceffbcca78 100644
--- a/cpp/include/raft/vectorized.cuh
+++ b/cpp/include/raft/vectorized.cuh
@@ -21,62 +21,6 @@
 
 namespace raft {
 
-/**
- * @brief Check pointers for byte alignment.
- *
- * @tparam VecBytes size of the alignment in bytes.
- */
-template <std::size_t VecBytes>
-struct CoalescedAccess {
- private:
-  static constexpr std::size_t VecMod = VecBytes - 1;
-
-#define CoalescedAccess_CHECK_TYPE(T)                                    \
-  static_assert(std::is_pointer<T>::value || std::is_integral<T>::value, \
-                "Only pointer or integral types make sense here")
-
- public:
-  static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two.");
-
-  /** Number of elements fitting in a chunk of memory of size VecBytes. */
-  template <typename T>
-  static constexpr std::size_t nElems = VecBytes / sizeof(T);
-
-  /** Tell whether the pointer is VecBytes-aligned. */
-  template <typename PtrT>
-  static HDI bool isAligned(PtrT p) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    return (reinterpret_cast<std::size_t>(p) & VecMod) == 0;
-  }
-
-  /** Tell whether two pointers have the same address modulo VecBytes. */
-  template <typename PtrT, typename PtrS>
-  static HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    CoalescedAccess_CHECK_TYPE(PtrS);
-    auto x = reinterpret_cast<std::size_t>(a);
-    auto y = reinterpret_cast<std::size_t>(b);
-    return (x & VecMod) == (y & VecMod);
-  }
-
-  /** Get this or next VecBytes-aligned address. */
-  template <typename PtrT>
-  static HDI PtrT roundUp(PtrT p) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    auto x = reinterpret_cast<std::size_t>(p);
-    return reinterpret_cast<PtrT>(x + VecMod - ((x - 1) & VecMod));
-  }
-
-  /** Get this or previous VecBytes-aligned address. */
-  template <typename PtrT>
-  static HDI PtrT roundDown(PtrT p) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    auto x = reinterpret_cast<std::size_t>(p);
-    return reinterpret_cast<PtrT>(x - (x & VecMod));
-  }
-#undef CoalescedAccess_CHECK_TYPE
-};
-
 template <typename math_, int VecLen>
 struct IOType {};
 template <>

From dabd1d5f1925aadde12e4c9dd023213914b3f43e Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 Nov 2021 13:50:24 +0100
Subject: [PATCH 3/8] Fixed couple small things

---
 cpp/include/raft/coalesced_access.cuh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh
index b459a3378d..a9fc921dd1 100644
--- a/cpp/include/raft/coalesced_access.cuh
+++ b/cpp/include/raft/coalesced_access.cuh
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include "cuda_utils.cuh"
 
 namespace raft {
@@ -37,7 +39,9 @@ struct CoalescedAccess {
 
   /** Number of elements fitting in a chunk of memory of size VecBytes. */
   template <typename T>
-  static constexpr std::size_t nElems = VecBytes / sizeof(T);
+  static constexpr HDI std::size_t nElems() {
+    return VecBytes / sizeof(T);
+  }
 
   /** Tell whether the pointer is VecBytes-aligned. */
   template <typename PtrT>

From 6cd9edacf0804318ee1eef7d1af6e8da0ffd06f5 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Tue, 9 Nov 2021 18:07:57 +0100
Subject: [PATCH 4/8] Remove nElems member, as it's syntactically unusable
 anyway.

---
 cpp/include/raft/coalesced_access.cuh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh
index a9fc921dd1..946ed4030d 100644
--- a/cpp/include/raft/coalesced_access.cuh
+++ b/cpp/include/raft/coalesced_access.cuh
@@ -37,12 +37,6 @@ struct CoalescedAccess {
  public:
   static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two.");
 
-  /** Number of elements fitting in a chunk of memory of size VecBytes. */
-  template <typename T>
-  static constexpr HDI std::size_t nElems() {
-    return VecBytes / sizeof(T);
-  }
-
   /** Tell whether the pointer is VecBytes-aligned. */
   template <typename PtrT>
   static constexpr HDI bool isAligned(PtrT p) noexcept {

From 9e9a1ee1950a98e6aa08b212791ea4f28831cd8a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 15 Nov 2021 13:23:09 +0100
Subject: [PATCH 5/8] Make a more flexible and generic version of
 CoalescedAccess

---
 cpp/include/raft/coalesced_access.cuh | 75 -----------------------
 cpp/include/raft/pow2_utils.cuh       | 85 +++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 75 deletions(-)
 delete mode 100644 cpp/include/raft/coalesced_access.cuh
 create mode 100644 cpp/include/raft/pow2_utils.cuh

diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh
deleted file mode 100644
index 946ed4030d..0000000000
--- a/cpp/include/raft/coalesced_access.cuh
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "cuda_utils.cuh"
-
-namespace raft {
-
-/**
- * @brief Check pointers for byte alignment.
- *
- * @tparam VecBytes size of the alignment in bytes.
- */
-template <std::size_t VecBytes>
-struct CoalescedAccess {
- private:
-  static constexpr std::size_t VecMod = VecBytes - 1;
-
-#define CoalescedAccess_CHECK_TYPE(T)                                    \
-  static_assert(std::is_pointer<T>::value || std::is_integral<T>::value, \
-                "Only pointer or integral types make sense here")
-
- public:
-  static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two.");
-
-  /** Tell whether the pointer is VecBytes-aligned. */
-  template <typename PtrT>
-  static constexpr HDI bool isAligned(PtrT p) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    return (reinterpret_cast<std::size_t>(p) & VecMod) == 0;
-  }
-
-  /** Tell whether two pointers have the same address modulo VecBytes. */
-  template <typename PtrT, typename PtrS>
-  static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    CoalescedAccess_CHECK_TYPE(PtrS);
-    auto x = reinterpret_cast<std::size_t>(a);
-    auto y = reinterpret_cast<std::size_t>(b);
-    return (x & VecMod) == (y & VecMod);
-  }
-
-  /** Get this or next VecBytes-aligned address. */
-  template <typename PtrT>
-  static constexpr HDI PtrT roundUp(PtrT p) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    auto x = reinterpret_cast<std::size_t>(p);
-    return reinterpret_cast<PtrT>(x + VecMod - ((x - 1) & VecMod));
-  }
-
-  /** Get this or previous VecBytes-aligned address. */
-  template <typename PtrT>
-  static constexpr HDI PtrT roundDown(PtrT p) noexcept {
-    CoalescedAccess_CHECK_TYPE(PtrT);
-    auto x = reinterpret_cast<std::size_t>(p);
-    return reinterpret_cast<PtrT>(x - (x & VecMod));
-  }
-#undef CoalescedAccess_CHECK_TYPE
-};
-
-};  // namespace raft
diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh
new file mode 100644
index 0000000000..faad3d9a60
--- /dev/null
+++ b/cpp/include/raft/pow2_utils.cuh
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cuda_utils.cuh"
+
+namespace raft {
+
+/**
+ * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time.
+ *
+ * @tparam Value a compile-time value representable as a power-of-two.
+ */
+template <auto Value>
+struct Pow2 {
+ public:
+  typedef decltype(Value) Type;
+  static constexpr Type Log2 = log2(Value);
+  static constexpr Type Mask = Value - 1;
+
+  static_assert(std::is_integral<Type>::value, "Value must be integral.");
+  static_assert(Value && !(Value & Mask), "Value must be power of two.");
+
+  /** Compute (x % Value). */
+  static constexpr HDI Type mod(Type x) noexcept { return x & Mask; }
+  /** Compute (x / Value). */
+  static constexpr HDI Type div(Type x) noexcept { return x >> Log2; }
+
+#define Pow2_CHECK_TYPE(T)                                               \
+  static_assert(std::is_pointer<T>::value || std::is_integral<T>::value, \
+                "Only pointer or integral types make sense here")
+
+  /**
+   * Tell whether the pointer or integral is Value-aligned.
+   * NB: for pointers, the alignment is checked in bytes, not in elements.
+   */
+  template <typename PtrT>
+  static constexpr HDI bool isAligned(PtrT p) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    return mod(reinterpret_cast<Type>(p)) == 0;
+  }
+
+  /** Tell whether two pointers have the same address modulo Value. */
+  template <typename PtrT, typename PtrS>
+  static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    Pow2_CHECK_TYPE(PtrS);
+    auto x = reinterpret_cast<Type>(a);
+    auto y = reinterpret_cast<Type>(b);
+    return mod(x) == mod(y);
+  }
+
+  /** Get this or next Value-aligned address (in bytes) or integral. */
+  template <typename PtrT>
+  static constexpr HDI PtrT roundUp(PtrT p) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    auto x = reinterpret_cast<Type>(p);
+    return reinterpret_cast<PtrT>(x + Mask - mod(x - 1));
+  }
+
+  /** Get this or previous Value-aligned address (in bytes) or integral. */
+  template <typename PtrT>
+  static constexpr HDI PtrT roundDown(PtrT p) noexcept {
+    Pow2_CHECK_TYPE(PtrT);
+    auto x = reinterpret_cast<Type>(p);
+    return reinterpret_cast<PtrT>(x - mod(x));
+  }
+#undef Pow2_CHECK_TYPE
+};
+
+};  // namespace raft

From 6846b81515639901bc87d83f2f7fdb014720a211 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Mon, 15 Nov 2021 13:24:58 +0100
Subject: [PATCH 6/8] Make a more flexible and generic version of
 CoalescedAccess

---
 cpp/include/raft/linalg/matrix_vector_op.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 9b43584c93..93f2d746fa 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include <raft/coalesced_access.cuh>
 #include <raft/cuda_utils.cuh>
+#include <raft/pow2_utils.cuh>
 #include <raft/vectorized.cuh>
 
 namespace raft {
@@ -28,9 +28,9 @@ template <size_t VecBytes>
 struct AlignedAccess {
   template <typename T>
   static inline bool test(const T *matrix, size_t strideBytes) {
-    return CoalescedAccess<VecBytes>::isAligned(matrix) &&
-           CoalescedAccess<VecBytes>::isAligned(strideBytes) &&
-           CoalescedAccess<sizeof(T)>::isAligned(VecBytes);
+    return Pow2<VecBytes>::isAligned(matrix) &&
+           Pow2<VecBytes>::isAligned(strideBytes) &&
+           Pow2<sizeof(T)>::isAligned(VecBytes);
   }
 };
 };  // namespace

From 3e987550b0e8ba13ce1aadc3fa39b0e69b7e4e55 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 17 Nov 2021 17:13:52 +0100
Subject: [PATCH 7/8] Make pow2 utils more flexible and add tests.

---
 cpp/include/raft/pow2_utils.cuh | 108 +++++++++++++++++++++++++++-----
 cpp/test/CMakeLists.txt         |   1 +
 cpp/test/pow2_utils.cu          | 108 ++++++++++++++++++++++++++++++++
 3 files changed, 201 insertions(+), 16 deletions(-)
 create mode 100644 cpp/test/pow2_utils.cu

diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh
index faad3d9a60..45e2e4d768 100644
--- a/cpp/include/raft/pow2_utils.cuh
+++ b/cpp/include/raft/pow2_utils.cuh
@@ -23,22 +23,81 @@ namespace raft {
 /**
  * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time.
  *
- * @tparam Value a compile-time value representable as a power-of-two.
+ * @tparam Value_ a compile-time value representable as a power-of-two.
  */
-template <auto Value>
+template <auto Value_>
 struct Pow2 {
- public:
-  typedef decltype(Value) Type;
+  typedef decltype(Value_) Type;
+  static constexpr Type Value = Value_;
   static constexpr Type Log2 = log2(Value);
   static constexpr Type Mask = Value - 1;
 
   static_assert(std::is_integral<Type>::value, "Value must be integral.");
   static_assert(Value && !(Value & Mask), "Value must be power of two.");
 
-  /** Compute (x % Value). */
-  static constexpr HDI Type mod(Type x) noexcept { return x & Mask; }
-  /** Compute (x / Value). */
-  static constexpr HDI Type div(Type x) noexcept { return x >> Log2; }
+#define Pow2_IsRepresentableAs(I) \
+  (std::is_integral<I>::value && Type(I(Value)) == Value)
+
+  /**
+   * Integer division by Value truncated toward zero
+   * (same as `x / Value` in C++).
+   *
+   *  Invariant: `x = Value * quot(x) + rem(x)`
+   */
+  template <typename I>
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> quot(
+    I x) noexcept {
+    if constexpr (std::is_signed<I>::value)
+      return (x >> I(Log2)) + (x < 0 && (x & I(Mask)));
+    if constexpr (std::is_unsigned<I>::value) return x >> I(Log2);
+  }
+
+  /**
+   *  Remainder of integer division by Value truncated toward zero
+   *  (same as `x % Value` in C++).
+   *
+   *  Invariant: `x = Value * quot(x) + rem(x)`.
+   */
+  template <typename I>
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> rem(
+    I x) noexcept {
+    if constexpr (std::is_signed<I>::value)
+      return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask));
+    if constexpr (std::is_unsigned<I>::value) return x & I(Mask);
+  }
+
+  /**
+   * Integer division by Value truncated toward negative infinity
+   * (same as `x // Value` in Python).
+   *
+   * Invariant: `x = Value * div(x) + mod(x)`.
+   *
+   * Note, `div` and `mod` for negative values are slightly faster
+   * than `quot` and `rem`, but behave slightly different
+   * compared to normal C++ operators `/` and `%`.
+   */
+  template <typename I>
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> div(
+    I x) noexcept {
+    return x >> I(Log2);
+  }
+
+  /**
+   * x modulo Value operation (remainder of the `div(x)`)
+   * (same as `x % Value` in Python).
+   *
+   * Invariant: `mod(x) >= 0`
+   * Invariant: `x = Value * div(x) + mod(x)`.
+   *
+   * Note, `div` and `mod` for negative values are slightly faster
+   * than `quot` and `rem`, but behave slightly different
+   * compared to normal C++ operators `/` and `%`.
+   */
+  template <typename I>
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> mod(
+    I x) noexcept {
+    return x & I(Mask);
+  }
 
 #define Pow2_CHECK_TYPE(T)                                               \
   static_assert(std::is_pointer<T>::value || std::is_integral<T>::value, \
@@ -51,7 +110,9 @@ struct Pow2 {
   template <typename PtrT>
   static constexpr HDI bool isAligned(PtrT p) noexcept {
     Pow2_CHECK_TYPE(PtrT);
-    return mod(reinterpret_cast<Type>(p)) == 0;
+    if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0;
+    if constexpr (!Pow2_IsRepresentableAs(PtrT))
+      return mod(reinterpret_cast<Type>(p)) == 0;
   }
 
   /** Tell whether two pointers have the same address modulo Value. */
@@ -59,27 +120,42 @@ struct Pow2 {
   static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
     Pow2_CHECK_TYPE(PtrT);
     Pow2_CHECK_TYPE(PtrS);
-    auto x = reinterpret_cast<Type>(a);
-    auto y = reinterpret_cast<Type>(b);
-    return mod(x) == mod(y);
+    Type x, y;
+    if constexpr (Pow2_IsRepresentableAs(PtrT))
+      x = Type(mod(a));
+    else
+      x = mod(reinterpret_cast<Type>(a));
+    if constexpr (Pow2_IsRepresentableAs(PtrS))
+      y = Type(mod(b));
+    else
+      y = mod(reinterpret_cast<Type>(b));
+    return x == y;
   }
 
   /** Get this or next Value-aligned address (in bytes) or integral. */
   template <typename PtrT>
   static constexpr HDI PtrT roundUp(PtrT p) noexcept {
     Pow2_CHECK_TYPE(PtrT);
-    auto x = reinterpret_cast<Type>(p);
-    return reinterpret_cast<PtrT>(x + Mask - mod(x - 1));
+    if constexpr (Pow2_IsRepresentableAs(PtrT))
+      return p + PtrT(Mask) - mod(p - PtrT(Mask));
+    if constexpr (!Pow2_IsRepresentableAs(PtrT)) {
+      auto x = reinterpret_cast<Type>(p);
+      return reinterpret_cast<PtrT>(x + Mask - mod(x + Mask));
+    }
   }
 
   /** Get this or previous Value-aligned address (in bytes) or integral. */
   template <typename PtrT>
   static constexpr HDI PtrT roundDown(PtrT p) noexcept {
     Pow2_CHECK_TYPE(PtrT);
-    auto x = reinterpret_cast<Type>(p);
-    return reinterpret_cast<PtrT>(x - mod(x));
+    if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p);
+    if constexpr (!Pow2_IsRepresentableAs(PtrT)) {
+      auto x = reinterpret_cast<Type>(p);
+      return reinterpret_cast<PtrT>(x - mod(x));
+    }
   }
 #undef Pow2_CHECK_TYPE
+#undef Pow2_IsRepresentableAs
 };
 
 };  // namespace raft
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 43e1c65695..4a89fd3273 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -36,6 +36,7 @@ add_executable(test_raft
     test/eigen_solvers.cu
     test/handle.cpp
     test/integer_utils.cpp
+    test/pow2_utils.cu
     test/label/label.cu
     test/label/merge_labels.cu
     test/lap/lap.cu
diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu
new file mode 100644
index 0000000000..79556e2e87
--- /dev/null
+++ b/cpp/test/pow2_utils.cu
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <raft/pow2_utils.cuh>
+
+namespace raft {
+
+template <auto Val, typename TargetT>
+struct Pow2Test : public ::testing::Test {
+  typedef Pow2<Val> P;
+  std::vector<TargetT> data;
+
+  void SetUp() override {
+    std::vector<TargetT> pos = {0, 1, 2, 7, 15, 16, 17, 35, 1024, 1623};
+    data.insert(data.end(), pos.begin(), pos.end());
+    if constexpr (std::is_signed<TargetT>::value) {
+      std::vector<TargetT> neg = {-0, -1, -2, -5, -15, -16, -17, -156};
+      data.insert(data.end(), neg.begin(), neg.end());
+    }
+    data.push_back(std::numeric_limits<TargetT>::min());
+    data.push_back(std::numeric_limits<TargetT>::max());
+  }
+
+  void quotRem() {
+    for (auto x : data) {
+      ASSERT_EQ(P::quot(x), x / P::Value) << "  where x = " << x;
+      ASSERT_EQ(P::rem(x), x % P::Value) << "  where x = " << x;
+      ASSERT_EQ(x, P::quot(x) * P::Value + P::rem(x));
+    }
+  }
+
+  void divMod() {
+    for (auto x : data) {
+      ASSERT_GE(P::mod(x), 0) << "  where x = " << x;
+      ASSERT_EQ(x, P::div(x) * P::Value + P::mod(x));
+    }
+  }
+
+  void round() {
+    for (auto x : data) {
+      if (x <= std::numeric_limits<TargetT>::max() - TargetT(P::Value))
+        ASSERT_GE(P::roundUp(x), x);
+      if (x >= std::numeric_limits<TargetT>::min() + TargetT(P::Value))
+        ASSERT_LE(P::roundDown(x), x);
+      ASSERT_LE(P::roundUp(x) - x, P::Value);
+      ASSERT_LE(x - P::roundDown(x), P::Value);
+    }
+  }
+
+  void alignment() {
+    for (auto x : data) {
+      ASSERT_TRUE(P::areSameAlignOffsets(x, x));
+      if (x <= std::numeric_limits<TargetT>::max() - TargetT(P::Value)) {
+        ASSERT_TRUE(P::areSameAlignOffsets(x, x + TargetT(P::Value)));
+        int aligned_count = 0;
+        int same_aligned_count = 0;
+        for (int i = 0; i < int(P::Value); i++) {
+          aligned_count += P::isAligned(x + i);
+          same_aligned_count += P::areSameAlignOffsets(x, x + i);
+        }
+        ASSERT_EQ(aligned_count, 1) << "  where x = " << x;
+        ASSERT_EQ(same_aligned_count, 1) << "  where x = " << x;
+      }
+    }
+  }
+};
+
+#define TEST_IT(T)                 \
+  TEST_F(T, quotRem) { divMod(); } \
+  TEST_F(T, divMod) { divMod(); }  \
+  TEST_F(T, round) { round(); }    \
+  TEST_F(T, alignment) { alignment(); }
+
+typedef Pow2Test<16, int> Pow2_i32_i32_16;
+typedef Pow2Test<1UL, uint64_t> Pow2_u64_u64_1;
+typedef Pow2Test<128UL, int> Pow2_u64_i32_128;
+typedef Pow2Test<32LL, uint16_t> Pow2_ll_u16_32;
+typedef Pow2Test<16, uint64_t> Pow2_i32_u64_16;
+TEST_IT(Pow2_i32_i32_16);
+TEST_IT(Pow2_u64_u64_1);
+TEST_IT(Pow2_u64_i32_128);
+TEST_IT(Pow2_ll_u16_32);
+TEST_IT(Pow2_i32_u64_16);
+
+TEST(Pow2, pointers) {
+  typedef Pow2<32UL> P;
+  for (ptrdiff_t i = 0; i <= ptrdiff_t(P::Value); i++) {
+    auto *p = reinterpret_cast<float *>(16345 + i);
+    ASSERT_GE(P::roundUp(p), p);
+    ASSERT_LE(P::roundDown(p), p);
+  }
+}
+
+}  // namespace raft

From 5700dfd57163b5c032a3a83b84c43ca82b9db681 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 17 Nov 2021 17:24:19 +0100
Subject: [PATCH 8/8] Fix a typo and add a corresponding test.

---
 cpp/include/raft/pow2_utils.cuh | 2 +-
 cpp/test/pow2_utils.cu          | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh
index 45e2e4d768..de5fc46452 100644
--- a/cpp/include/raft/pow2_utils.cuh
+++ b/cpp/include/raft/pow2_utils.cuh
@@ -137,7 +137,7 @@ struct Pow2 {
   static constexpr HDI PtrT roundUp(PtrT p) noexcept {
     Pow2_CHECK_TYPE(PtrT);
     if constexpr (Pow2_IsRepresentableAs(PtrT))
-      return p + PtrT(Mask) - mod(p - PtrT(Mask));
+      return p + PtrT(Mask) - mod(p + PtrT(Mask));
     if constexpr (!Pow2_IsRepresentableAs(PtrT)) {
       auto x = reinterpret_cast<Type>(p);
       return reinterpret_cast<PtrT>(x + Mask - mod(x + Mask));
diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu
index 79556e2e87..92976e5c61 100644
--- a/cpp/test/pow2_utils.cu
+++ b/cpp/test/pow2_utils.cu
@@ -25,7 +25,7 @@ struct Pow2Test : public ::testing::Test {
   std::vector<TargetT> data;
 
   void SetUp() override {
-    std::vector<TargetT> pos = {0, 1, 2, 7, 15, 16, 17, 35, 1024, 1623};
+    std::vector<TargetT> pos = {0, 1, 2, 7, 15, 16, 17, 31, 35, 1024, 1623};
     data.insert(data.end(), pos.begin(), pos.end());
     if constexpr (std::is_signed<TargetT>::value) {
       std::vector<TargetT> neg = {-0, -1, -2, -5, -15, -16, -17, -156};
@@ -56,8 +56,9 @@ struct Pow2Test : public ::testing::Test {
         ASSERT_GE(P::roundUp(x), x);
       if (x >= std::numeric_limits<TargetT>::min() + TargetT(P::Value))
         ASSERT_LE(P::roundDown(x), x);
-      ASSERT_LE(P::roundUp(x) - x, P::Value);
-      ASSERT_LE(x - P::roundDown(x), P::Value);
+      ASSERT_EQ(x - P::roundDown(x), P::mod(x)) << "  where x = " << x;
+      ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0)
+        << "  where x = " << x;
     }
   }