From 52760956341d149e3616e58cc272ac78853e10c7 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 Nov 2021 12:56:00 +0100 Subject: [PATCH 1/8] Fix coalesced access checks via a new CoalescedAccess helper struct. --- cpp/include/raft/linalg/matrix_vector_op.cuh | 44 +++++++-------- cpp/include/raft/vectorized.cuh | 56 ++++++++++++++++++++ 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index e948c3e673..944dee012f 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,18 @@ namespace raft { namespace linalg { +namespace { +template +struct AlignedAccess { + template + static inline bool test(const T *matrix, size_t strideBytes) { + return CoalescedAccess::isAligned(matrix) && + CoalescedAccess::isAligned(strideBytes) && + CoalescedAccess::isAligned(VecBytes); + } +}; +}; // namespace + template __global__ void matrixVectorOpKernel(Type *out, const Type *matrix, const Type *vector, IdxType D, IdxType N, @@ -101,24 +113,19 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, IdxType stride = rowMajor ? D : N; size_t stride_bytes = stride * sizeof(Type); - auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && - reinterpret_cast(matrix) % sizeof(Type); - }; - - if (test_aligned_access(16)) { + if (AlignedAccess<16>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(8)) { + } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(4)) { + } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(2)) { + } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (1 / sizeof(Type)) { + } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } else { @@ -209,24 +216,19 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, IdxType stride = rowMajor ? D : N; size_t stride_bytes = stride * sizeof(Type); - auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && - reinterpret_cast(matrix) % sizeof(Type); - }; - - if (test_aligned_access(16)) { + if (AlignedAccess<16>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(8)) { + } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(4)) { + } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (test_aligned_access(2)) { + } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (1 / sizeof(Type)) { + } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } else { diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index ceffbcca78..1e3e8a0253 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -21,6 +21,62 @@ namespace raft { +/** + * @brief Check pointers for byte alignment. + * + * @tparam VecBytes size of the alignment in bytes. + */ +template +struct CoalescedAccess { + private: + static constexpr std::size_t VecMod = VecBytes - 1; + +#define CoalescedAccess_CHECK_TYPE(T) \ + static_assert(std::is_pointer::value || std::is_integral::value, \ + "Only pointer or integral types make sense here") + + public: + static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two."); + + /** Number of elements fitting in a chunk of memory of size VecBytes. */ + template + static constexpr std::size_t nElems = VecBytes / sizeof(T); + + /** Tell whether the pointer is VecBytes-aligned. */ + template + static HDI bool isAligned(PtrT p) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + return (reinterpret_cast(p) & VecMod) == 0; + } + + /** Tell whether two pointers have the same address modulo VecBytes. */ + template + static HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + CoalescedAccess_CHECK_TYPE(PtrS); + auto x = reinterpret_cast(a); + auto y = reinterpret_cast(b); + return (x & VecMod) == (y & VecMod); + } + + /** Get this or next VecBytes-aligned address. */ + template + static HDI PtrT roundUp(PtrT p) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + auto x = reinterpret_cast(p); + return reinterpret_cast(x + VecMod - ((x - 1) & VecMod)); + } + + /** Get this or previous VecBytes-aligned address. */ + template + static HDI PtrT roundDown(PtrT p) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + auto x = reinterpret_cast(p); + return reinterpret_cast(x - (x & VecMod)); + } +#undef CoalescedAccess_CHECK_TYPE +}; + template struct IOType {}; template <> From 469bd69fd11240ae4d9be98aecb8758bf642f754 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 Nov 2021 13:24:43 +0100 Subject: [PATCH 2/8] Put CoalescedAccess in a separate file --- cpp/include/raft/coalesced_access.cuh | 77 ++++++++++++++++++++ cpp/include/raft/linalg/matrix_vector_op.cuh | 1 + cpp/include/raft/vectorized.cuh | 56 -------------- 3 files changed, 78 insertions(+), 56 deletions(-) create mode 100644 cpp/include/raft/coalesced_access.cuh diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh new file mode 100644 index 0000000000..b459a3378d --- /dev/null +++ b/cpp/include/raft/coalesced_access.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuda_utils.cuh" + +namespace raft { + +/** + * @brief Check pointers for byte alignment. + * + * @tparam VecBytes size of the alignment in bytes. + */ +template +struct CoalescedAccess { + private: + static constexpr std::size_t VecMod = VecBytes - 1; + +#define CoalescedAccess_CHECK_TYPE(T) \ + static_assert(std::is_pointer::value || std::is_integral::value, \ + "Only pointer or integral types make sense here") + + public: + static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two."); + + /** Number of elements fitting in a chunk of memory of size VecBytes. */ + template + static constexpr std::size_t nElems = VecBytes / sizeof(T); + + /** Tell whether the pointer is VecBytes-aligned. */ + template + static constexpr HDI bool isAligned(PtrT p) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + return (reinterpret_cast(p) & VecMod) == 0; + } + + /** Tell whether two pointers have the same address modulo VecBytes. */ + template + static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + CoalescedAccess_CHECK_TYPE(PtrS); + auto x = reinterpret_cast(a); + auto y = reinterpret_cast(b); + return (x & VecMod) == (y & VecMod); + } + + /** Get this or next VecBytes-aligned address. */ + template + static constexpr HDI PtrT roundUp(PtrT p) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + auto x = reinterpret_cast(p); + return reinterpret_cast(x + VecMod - ((x - 1) & VecMod)); + } + + /** Get this or previous VecBytes-aligned address. */ + template + static constexpr HDI PtrT roundDown(PtrT p) noexcept { + CoalescedAccess_CHECK_TYPE(PtrT); + auto x = reinterpret_cast(p); + return reinterpret_cast(x - (x & VecMod)); + } +#undef CoalescedAccess_CHECK_TYPE +}; + +}; // namespace raft diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 944dee012f..9b43584c93 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index 1e3e8a0253..ceffbcca78 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -21,62 +21,6 @@ namespace raft { -/** - * @brief Check pointers for byte alignment. - * - * @tparam VecBytes size of the alignment in bytes. - */ -template -struct CoalescedAccess { - private: - static constexpr std::size_t VecMod = VecBytes - 1; - -#define CoalescedAccess_CHECK_TYPE(T) \ - static_assert(std::is_pointer::value || std::is_integral::value, \ - "Only pointer or integral types make sense here") - - public: - static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two."); - - /** Number of elements fitting in a chunk of memory of size VecBytes. */ - template - static constexpr std::size_t nElems = VecBytes / sizeof(T); - - /** Tell whether the pointer is VecBytes-aligned. */ - template - static HDI bool isAligned(PtrT p) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - return (reinterpret_cast(p) & VecMod) == 0; - } - - /** Tell whether two pointers have the same address modulo VecBytes. */ - template - static HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - CoalescedAccess_CHECK_TYPE(PtrS); - auto x = reinterpret_cast(a); - auto y = reinterpret_cast(b); - return (x & VecMod) == (y & VecMod); - } - - /** Get this or next VecBytes-aligned address. */ - template - static HDI PtrT roundUp(PtrT p) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - auto x = reinterpret_cast(p); - return reinterpret_cast(x + VecMod - ((x - 1) & VecMod)); - } - - /** Get this or previous VecBytes-aligned address. */ - template - static HDI PtrT roundDown(PtrT p) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - auto x = reinterpret_cast(p); - return reinterpret_cast(x - (x & VecMod)); - } -#undef CoalescedAccess_CHECK_TYPE -}; - template struct IOType {}; template <> From dabd1d5f1925aadde12e4c9dd023213914b3f43e Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 Nov 2021 13:50:24 +0100 Subject: [PATCH 3/8] Fixed couple small things --- cpp/include/raft/coalesced_access.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh index b459a3378d..a9fc921dd1 100644 --- a/cpp/include/raft/coalesced_access.cuh +++ b/cpp/include/raft/coalesced_access.cuh @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include "cuda_utils.cuh" namespace raft { @@ -37,7 +39,9 @@ struct CoalescedAccess { /** Number of elements fitting in a chunk of memory of size VecBytes. */ template - static constexpr std::size_t nElems = VecBytes / sizeof(T); + static constexpr HDI std::size_t nElems() { + return VecBytes / sizeof(T); + } /** Tell whether the pointer is VecBytes-aligned. */ template From 6cd9edacf0804318ee1eef7d1af6e8da0ffd06f5 Mon Sep 17 00:00:00 2001 From: achirkin Date: Tue, 9 Nov 2021 18:07:57 +0100 Subject: [PATCH 4/8] Remove nElems member, as it's syntactically unusable anyway. --- cpp/include/raft/coalesced_access.cuh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh index a9fc921dd1..946ed4030d 100644 --- a/cpp/include/raft/coalesced_access.cuh +++ b/cpp/include/raft/coalesced_access.cuh @@ -37,12 +37,6 @@ struct CoalescedAccess { public: static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two."); - /** Number of elements fitting in a chunk of memory of size VecBytes. */ - template - static constexpr HDI std::size_t nElems() { - return VecBytes / sizeof(T); - } - /** Tell whether the pointer is VecBytes-aligned. */ template static constexpr HDI bool isAligned(PtrT p) noexcept { From 9e9a1ee1950a98e6aa08b212791ea4f28831cd8a Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 15 Nov 2021 13:23:09 +0100 Subject: [PATCH 5/8] Make a more flexible and generic version of CoalescedAccess --- cpp/include/raft/coalesced_access.cuh | 75 ----------------------- cpp/include/raft/pow2_utils.cuh | 85 +++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 75 deletions(-) delete mode 100644 cpp/include/raft/coalesced_access.cuh create mode 100644 cpp/include/raft/pow2_utils.cuh diff --git a/cpp/include/raft/coalesced_access.cuh b/cpp/include/raft/coalesced_access.cuh deleted file mode 100644 index 946ed4030d..0000000000 --- a/cpp/include/raft/coalesced_access.cuh +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "cuda_utils.cuh" - -namespace raft { - -/** - * @brief Check pointers for byte alignment. - * - * @tparam VecBytes size of the alignment in bytes. - */ -template -struct CoalescedAccess { - private: - static constexpr std::size_t VecMod = VecBytes - 1; - -#define CoalescedAccess_CHECK_TYPE(T) \ - static_assert(std::is_pointer::value || std::is_integral::value, \ - "Only pointer or integral types make sense here") - - public: - static_assert((VecBytes & VecMod) == 0, "VecBytes must be power of two."); - - /** Tell whether the pointer is VecBytes-aligned. */ - template - static constexpr HDI bool isAligned(PtrT p) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - return (reinterpret_cast(p) & VecMod) == 0; - } - - /** Tell whether two pointers have the same address modulo VecBytes. */ - template - static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - CoalescedAccess_CHECK_TYPE(PtrS); - auto x = reinterpret_cast(a); - auto y = reinterpret_cast(b); - return (x & VecMod) == (y & VecMod); - } - - /** Get this or next VecBytes-aligned address. */ - template - static constexpr HDI PtrT roundUp(PtrT p) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - auto x = reinterpret_cast(p); - return reinterpret_cast(x + VecMod - ((x - 1) & VecMod)); - } - - /** Get this or previous VecBytes-aligned address. */ - template - static constexpr HDI PtrT roundDown(PtrT p) noexcept { - CoalescedAccess_CHECK_TYPE(PtrT); - auto x = reinterpret_cast(p); - return reinterpret_cast(x - (x & VecMod)); - } -#undef CoalescedAccess_CHECK_TYPE -}; - -}; // namespace raft diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh new file mode 100644 index 0000000000..faad3d9a60 --- /dev/null +++ b/cpp/include/raft/pow2_utils.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_utils.cuh" + +namespace raft { + +/** + * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time. + * + * @tparam Value a compile-time value representable as a power-of-two. + */ +template +struct Pow2 { + public: + typedef decltype(Value) Type; + static constexpr Type Log2 = log2(Value); + static constexpr Type Mask = Value - 1; + + static_assert(std::is_integral::value, "Value must be integral."); + static_assert(Value && !(Value & Mask), "Value must be power of two."); + + /** Compute (x % Value). */ + static constexpr HDI Type mod(Type x) noexcept { return x & Mask; } + /** Compute (x / Value). */ + static constexpr HDI Type div(Type x) noexcept { return x >> Log2; } + +#define Pow2_CHECK_TYPE(T) \ + static_assert(std::is_pointer::value || std::is_integral::value, \ + "Only pointer or integral types make sense here") + + /** + * Tell whether the pointer or integral is Value-aligned. + * NB: for pointers, the alignment is checked in bytes, not in elements. + */ + template + static constexpr HDI bool isAligned(PtrT p) noexcept { + Pow2_CHECK_TYPE(PtrT); + return mod(reinterpret_cast(p)) == 0; + } + + /** Tell whether two pointers have the same address modulo Value. */ + template + static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { + Pow2_CHECK_TYPE(PtrT); + Pow2_CHECK_TYPE(PtrS); + auto x = reinterpret_cast(a); + auto y = reinterpret_cast(b); + return mod(x) == mod(y); + } + + /** Get this or next Value-aligned address (in bytes) or integral. */ + template + static constexpr HDI PtrT roundUp(PtrT p) noexcept { + Pow2_CHECK_TYPE(PtrT); + auto x = reinterpret_cast(p); + return reinterpret_cast(x + Mask - mod(x - 1)); + } + + /** Get this or previous Value-aligned address (in bytes) or integral. */ + template + static constexpr HDI PtrT roundDown(PtrT p) noexcept { + Pow2_CHECK_TYPE(PtrT); + auto x = reinterpret_cast(p); + return reinterpret_cast(x - mod(x)); + } +#undef Pow2_CHECK_TYPE +}; + +}; // namespace raft From 6846b81515639901bc87d83f2f7fdb014720a211 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 15 Nov 2021 13:24:58 +0100 Subject: [PATCH 6/8] Make a more flexible and generic version of CoalescedAccess --- cpp/include/raft/linalg/matrix_vector_op.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 9b43584c93..93f2d746fa 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include namespace raft { @@ -28,9 +28,9 @@ template struct AlignedAccess { template static inline bool test(const T *matrix, size_t strideBytes) { - return CoalescedAccess::isAligned(matrix) && - CoalescedAccess::isAligned(strideBytes) && - CoalescedAccess::isAligned(VecBytes); + return Pow2::isAligned(matrix) && + Pow2::isAligned(strideBytes) && + Pow2::isAligned(VecBytes); } }; }; // namespace From 3e987550b0e8ba13ce1aadc3fa39b0e69b7e4e55 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 17 Nov 2021 17:13:52 +0100 Subject: [PATCH 7/8] Make pow2 utils more flexible and add tests. --- cpp/include/raft/pow2_utils.cuh | 108 +++++++++++++++++++++++++++----- cpp/test/CMakeLists.txt | 1 + cpp/test/pow2_utils.cu | 108 ++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 16 deletions(-) create mode 100644 cpp/test/pow2_utils.cu diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index faad3d9a60..45e2e4d768 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -23,22 +23,81 @@ namespace raft { /** * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time. * - * @tparam Value a compile-time value representable as a power-of-two. + * @tparam Value_ a compile-time value representable as a power-of-two. */ -template +template struct Pow2 { - public: - typedef decltype(Value) Type; + typedef decltype(Value_) Type; + static constexpr Type Value = Value_; static constexpr Type Log2 = log2(Value); static constexpr Type Mask = Value - 1; static_assert(std::is_integral::value, "Value must be integral."); static_assert(Value && !(Value & Mask), "Value must be power of two."); - /** Compute (x % Value). */ - static constexpr HDI Type mod(Type x) noexcept { return x & Mask; } - /** Compute (x / Value). */ - static constexpr HDI Type div(Type x) noexcept { return x >> Log2; } +#define Pow2_IsRepresentableAs(I) \ + (std::is_integral::value && Type(I(Value)) == Value) + + /** + * Integer division by Value truncated toward zero + * (same as `x / Value` in C++). + * + * Invariant: `x = Value * quot(x) + rem(x)` + */ + template + static constexpr HDI std::enable_if_t quot( + I x) noexcept { + if constexpr (std::is_signed::value) + return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); + if constexpr (std::is_unsigned::value) return x >> I(Log2); + } + + /** + * Remainder of integer division by Value truncated toward zero + * (same as `x % Value` in C++). + * + * Invariant: `x = Value * quot(x) + rem(x)`. + */ + template + static constexpr HDI std::enable_if_t rem( + I x) noexcept { + if constexpr (std::is_signed::value) + return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); + if constexpr (std::is_unsigned::value) return x & I(Mask); + } + + /** + * Integer division by Value truncated toward negative infinity + * (same as `x // Value` in Python). + * + * Invariant: `x = Value * div(x) + mod(x)`. + * + * Note, `div` and `mod` for negative values are slightly faster + * than `quot` and `rem`, but behave slightly different + * compared to normal C++ operators `/` and `%`. + */ + template + static constexpr HDI std::enable_if_t div( + I x) noexcept { + return x >> I(Log2); + } + + /** + * x modulo Value operation (remainder of the `div(x)`) + * (same as `x % Value` in Python). + * + * Invariant: `mod(x) >= 0` + * Invariant: `x = Value * div(x) + mod(x)`. + * + * Note, `div` and `mod` for negative values are slightly faster + * than `quot` and `rem`, but behave slightly different + * compared to normal C++ operators `/` and `%`. + */ + template + static constexpr HDI std::enable_if_t mod( + I x) noexcept { + return x & I(Mask); + } #define Pow2_CHECK_TYPE(T) \ static_assert(std::is_pointer::value || std::is_integral::value, \ @@ -51,7 +110,9 @@ struct Pow2 { template static constexpr HDI bool isAligned(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - return mod(reinterpret_cast(p)) == 0; + if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0; + if constexpr (!Pow2_IsRepresentableAs(PtrT)) + return mod(reinterpret_cast(p)) == 0; } /** Tell whether two pointers have the same address modulo Value. */ @@ -59,27 +120,42 @@ struct Pow2 { static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { Pow2_CHECK_TYPE(PtrT); Pow2_CHECK_TYPE(PtrS); - auto x = reinterpret_cast(a); - auto y = reinterpret_cast(b); - return mod(x) == mod(y); + Type x, y; + if constexpr (Pow2_IsRepresentableAs(PtrT)) + x = Type(mod(a)); + else + x = mod(reinterpret_cast(a)); + if constexpr (Pow2_IsRepresentableAs(PtrS)) + y = Type(mod(b)); + else + y = mod(reinterpret_cast(b)); + return x == y; } /** Get this or next Value-aligned address (in bytes) or integral. */ template static constexpr HDI PtrT roundUp(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - auto x = reinterpret_cast(p); - return reinterpret_cast(x + Mask - mod(x - 1)); + if constexpr (Pow2_IsRepresentableAs(PtrT)) + return p + PtrT(Mask) - mod(p - PtrT(Mask)); + if constexpr (!Pow2_IsRepresentableAs(PtrT)) { + auto x = reinterpret_cast(p); + return reinterpret_cast(x + Mask - mod(x + Mask)); + } } /** Get this or previous Value-aligned address (in bytes) or integral. */ template static constexpr HDI PtrT roundDown(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - auto x = reinterpret_cast(p); - return reinterpret_cast(x - mod(x)); + if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p); + if constexpr (!Pow2_IsRepresentableAs(PtrT)) { + auto x = reinterpret_cast(p); + return reinterpret_cast(x - mod(x)); + } } #undef Pow2_CHECK_TYPE +#undef Pow2_IsRepresentableAs }; }; // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 43e1c65695..4a89fd3273 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -36,6 +36,7 @@ add_executable(test_raft test/eigen_solvers.cu test/handle.cpp test/integer_utils.cpp + test/pow2_utils.cu test/label/label.cu test/label/merge_labels.cu test/lap/lap.cu diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu new file mode 100644 index 0000000000..79556e2e87 --- /dev/null +++ b/cpp/test/pow2_utils.cu @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft { + +template +struct Pow2Test : public ::testing::Test { + typedef Pow2 P; + std::vector data; + + void SetUp() override { + std::vector pos = {0, 1, 2, 7, 15, 16, 17, 35, 1024, 1623}; + data.insert(data.end(), pos.begin(), pos.end()); + if constexpr (std::is_signed::value) { + std::vector neg = {-0, -1, -2, -5, -15, -16, -17, -156}; + data.insert(data.end(), neg.begin(), neg.end()); + } + data.push_back(std::numeric_limits::min()); + data.push_back(std::numeric_limits::max()); + } + + void quotRem() { + for (auto x : data) { + ASSERT_EQ(P::quot(x), x / P::Value) << " where x = " << x; + ASSERT_EQ(P::rem(x), x % P::Value) << " where x = " << x; + ASSERT_EQ(x, P::quot(x) * P::Value + P::rem(x)); + } + } + + void divMod() { + for (auto x : data) { + ASSERT_GE(P::mod(x), 0) << " where x = " << x; + ASSERT_EQ(x, P::div(x) * P::Value + P::mod(x)); + } + } + + void round() { + for (auto x : data) { + if (x <= std::numeric_limits::max() - TargetT(P::Value)) + ASSERT_GE(P::roundUp(x), x); + if (x >= std::numeric_limits::min() + TargetT(P::Value)) + ASSERT_LE(P::roundDown(x), x); + ASSERT_LE(P::roundUp(x) - x, P::Value); + ASSERT_LE(x - P::roundDown(x), P::Value); + } + } + + void alignment() { + for (auto x : data) { + ASSERT_TRUE(P::areSameAlignOffsets(x, x)); + if (x <= std::numeric_limits::max() - TargetT(P::Value)) { + ASSERT_TRUE(P::areSameAlignOffsets(x, x + TargetT(P::Value))); + int aligned_count = 0; + int same_aligned_count = 0; + for (int i = 0; i < int(P::Value); i++) { + aligned_count += P::isAligned(x + i); + same_aligned_count += P::areSameAlignOffsets(x, x + i); + } + ASSERT_EQ(aligned_count, 1) << " where x = " << x; + ASSERT_EQ(same_aligned_count, 1) << " where x = " << x; + } + } + } +}; + +#define TEST_IT(T) \ + TEST_F(T, quotRem) { divMod(); } \ + TEST_F(T, divMod) { divMod(); } \ + TEST_F(T, round) { round(); } \ + TEST_F(T, alignment) { alignment(); } + +typedef Pow2Test<16, int> Pow2_i32_i32_16; +typedef Pow2Test<1UL, uint64_t> Pow2_u64_u64_1; +typedef Pow2Test<128UL, int> Pow2_u64_i32_128; +typedef Pow2Test<32LL, uint16_t> Pow2_ll_u16_32; +typedef Pow2Test<16, uint64_t> Pow2_i32_u64_16; +TEST_IT(Pow2_i32_i32_16); +TEST_IT(Pow2_u64_u64_1); +TEST_IT(Pow2_u64_i32_128); +TEST_IT(Pow2_ll_u16_32); +TEST_IT(Pow2_i32_u64_16); + +TEST(Pow2, pointers) { + typedef Pow2<32UL> P; + for (ptrdiff_t i = 0; i <= ptrdiff_t(P::Value); i++) { + auto *p = reinterpret_cast(16345 + i); + ASSERT_GE(P::roundUp(p), p); + ASSERT_LE(P::roundDown(p), p); + } +} + +} // namespace raft From 5700dfd57163b5c032a3a83b84c43ca82b9db681 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 17 Nov 2021 17:24:19 +0100 Subject: [PATCH 8/8] Fix a typo and add a corresponding test. --- cpp/include/raft/pow2_utils.cuh | 2 +- cpp/test/pow2_utils.cu | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index 45e2e4d768..de5fc46452 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -137,7 +137,7 @@ struct Pow2 { static constexpr HDI PtrT roundUp(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); if constexpr (Pow2_IsRepresentableAs(PtrT)) - return p + PtrT(Mask) - mod(p - PtrT(Mask)); + return p + PtrT(Mask) - mod(p + PtrT(Mask)); if constexpr (!Pow2_IsRepresentableAs(PtrT)) { auto x = reinterpret_cast(p); return reinterpret_cast(x + Mask - mod(x + Mask)); diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu index 79556e2e87..92976e5c61 100644 --- a/cpp/test/pow2_utils.cu +++ b/cpp/test/pow2_utils.cu @@ -25,7 +25,7 @@ struct Pow2Test : public ::testing::Test { std::vector data; void SetUp() override { - std::vector pos = {0, 1, 2, 7, 15, 16, 17, 35, 1024, 1623}; + std::vector pos = {0, 1, 2, 7, 15, 16, 17, 31, 35, 1024, 1623}; data.insert(data.end(), pos.begin(), pos.end()); if constexpr (std::is_signed::value) { std::vector neg = {-0, -1, -2, -5, -15, -16, -17, -156}; @@ -56,8 +56,9 @@ struct Pow2Test : public ::testing::Test { ASSERT_GE(P::roundUp(x), x); if (x >= std::numeric_limits::min() + TargetT(P::Value)) ASSERT_LE(P::roundDown(x), x); - ASSERT_LE(P::roundUp(x) - x, P::Value); - ASSERT_LE(x - P::roundDown(x), P::Value); + ASSERT_EQ(x - P::roundDown(x), P::mod(x)) << " where x = " << x; + ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0) + << " where x = " << x; } }