Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-24.10 into branch-24.12 #2456

Merged
merged 1 commit into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions cpp/bench/prims/util/popc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,9 @@ struct popc_bench : public fixture {
auto bits_view =
raft::make_device_vector_view<const bits_t, index_t>(bits_d.data_handle(), bits_d.size());

index_t max_len = params.n_rows * params.n_cols;
auto max_len_view = raft::make_host_scalar_view<index_t>(&max_len);
auto nnz_actual_view =
nnz_actual_d.view(); // raft::make_device_scalar_view<index_t>(nnz_actual_d.data_handle());
index_t max_len = params.n_rows * params.n_cols;
auto max_len_view = raft::make_host_scalar_view<const index_t, index_t>(&max_len);
auto nnz_actual_view = nnz_actual_d.view();
raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view);
});
}
Expand Down
107 changes: 106 additions & 1 deletion cpp/include/raft/core/bitset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include <raft/util/device_atomics.cuh>
#include <raft/util/popc.cuh>

#include <rmm/device_scalar.hpp>

#include <thrust/for_each.h>

namespace raft::core {
Expand Down Expand Up @@ -60,6 +62,109 @@ _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index
}
}

template <typename bitset_t, typename index_t>
void bitset_view<bitset_t, index_t>::count(const raft::resources& res,
raft::device_scalar_view<index_t> count_gpu_scalar) const
{
auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
auto values = raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
raft::popc(res, values, max_len, count_gpu_scalar);
}

template <typename bitset_t, typename index_t>
RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src,
bitset_t* output,
index_t src_bit_len,
index_t repeat_times)
{
constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
int output_idx = blockIdx.x * blockDim.x + threadIdx.x;

index_t total_bits = src_bit_len * repeat_times;
index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;
index_t src_size = (src_bit_len + bits_per_element - 1) / bits_per_element;

if (output_idx < output_size) {
bitset_t result = 0;
index_t bit_written = 0;

index_t start_bit = output_idx * bits_per_element;

while (bit_written < bits_per_element && start_bit + bit_written < total_bits) {
index_t bit_idx = (start_bit + bit_written) % src_bit_len;
index_t src_word_idx = bit_idx / bits_per_element;
index_t src_offset = bit_idx % bits_per_element;

index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx);

bitset_t src_value = (src[src_word_idx] >> src_offset);

if (src_offset + remaining_bits > bits_per_element) {
bitset_t next_value = src[(src_word_idx + 1) % src_size];
src_value |= (next_value << (bits_per_element - src_offset));
}
src_value &= ((bitset_t{1} << remaining_bits) - 1);
result |= (src_value << bit_written);
bit_written += remaining_bits;
}
output[output_idx] = result;
}
}

template <typename bitset_t, typename index_t>
void bitset_repeat(raft::resources const& handle,
const bitset_t* d_src,
bitset_t* d_output,
index_t src_bit_len,
index_t repeat_times)
{
if (src_bit_len == 0 || repeat_times == 0) return;
auto stream = resource::get_cuda_stream(handle);

constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
const index_t total_bits = src_bit_len * repeat_times;
const index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;

int threadsPerBlock = 128;
int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;
bitset_repeat_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
d_src, d_output, src_bit_len, repeat_times);

return;
}

template <typename bitset_t, typename index_t>
void bitset_view<bitset_t, index_t>::repeat(const raft::resources& res,
index_t times,
bitset_t* output_device_ptr) const
{
auto thrust_policy = raft::resource::get_thrust_policy(res);
constexpr index_t bits_per_element = sizeof(bitset_t) * 8;

if (bitset_len_ % bits_per_element == 0) {
index_t num_elements_to_copy = bitset_len_ / bits_per_element;

for (index_t i = 0; i < times; ++i) {
raft::copy(output_device_ptr + i * num_elements_to_copy,
bitset_ptr_,
num_elements_to_copy,
raft::resource::get_cuda_stream(res));
}
} else {
bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times);
}
}

template <typename bitset_t, typename index_t>
double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) const
{
index_t size_h = this->size();
if (0 == size_h) { return static_cast<double>(1.0); }
index_t count_h = this->count(res);

return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
}

template <typename bitset_t, typename index_t>
bitset<bitset_t, index_t>::bitset(const raft::resources& res,
raft::device_vector_view<const index_t, index_t> mask_index,
Expand Down Expand Up @@ -155,7 +260,7 @@ template <typename bitset_t, typename index_t>
void bitset<bitset_t, index_t>::count(const raft::resources& res,
raft::device_scalar_view<index_t> count_gpu_scalar)
{
auto max_len = raft::make_host_scalar_view<index_t>(&bitset_len_);
auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
auto values =
raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
raft::popc(res, values, max_len, count_gpu_scalar);
Expand Down
76 changes: 76 additions & 0 deletions cpp/include/raft/core/bitset.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <raft/core/resources.hpp>
#include <raft/util/integer_utils.hpp>

#include <cmath>

namespace raft::core {
/**
* @defgroup bitset Bitset
Expand Down Expand Up @@ -103,6 +105,80 @@ struct bitset_view {
{
return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
}
/**
* @brief Returns the number of bits set to true in count_gpu_scalar.
*
* @param[in] res RAFT resources
* @param[out] count_gpu_scalar Device scalar to store the count
*/
void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar) const;
/**
* @brief Returns the number of bits set to true.
*
* @param res RAFT resources
* @return index_t Number of bits set to true
*/
auto count(const raft::resources& res) const -> index_t
{
auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
count(res, count_gpu_scalar.view());
index_t count_cpu = 0;
raft::update_host(
&count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
resource::sync_stream(res);
return count_cpu;
}

/**
* @brief Repeats the bitset data and copies it to the output device pointer.
*
* This function takes the original bitset data stored in the device memory
* and repeats it a specified number of times into a new location in the device memory.
* The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_)
* is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are
* tightly packed without any gaps between rows.
*
* @param res RAFT resources for managing CUDA streams and execution policies.
* @param times Number of times the bitset data should be repeated in the output.
* @param output_device_ptr Device pointer where the repeated bitset data will be stored.
*
* The caller must ensure that the output device pointer has enough memory allocated
* to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original
* bitset. This function uses Thrust parallel algorithms to efficiently perform the operation on
* the GPU.
*/
void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const;

/**
* @brief Calculate the sparsity (fraction of 0s) of the bitset.
*
* This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s)
* to the total number of bits in the set. If the total number of bits is zero, the function
* returns 1.0, indicating the set is fully sparse.
*
* @param res RAFT resources for managing CUDA streams and execution policies.
* @return double The sparsity of the bitset, i.e., the fraction of unset bits.
*
* This API will synchronize on the stream of `res`.
*/
double sparsity(const raft::resources& res) const;

/**
* @brief Calculates the number of `bitset_t` elements required to store a bitset.
*
* This function computes the number of `bitset_t` elements needed to store a bitset, ensuring
* that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size
* (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t`
* element.
*
* @param bitset_len The total length of the bitset in bits.
* @return size_t The number of `bitset_t` elements required to store the bitset.
*/
static inline size_t eval_n_elements(size_t bitset_len)
{
const size_t bits_per_element = sizeof(bitset_t) * 8;
return (bitset_len + bits_per_element - 1) / bits_per_element;
}

private:
bitset_t* bitset_ptr_;
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/raft/util/detail/popc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ namespace raft::detail {
*/
template <typename value_t, typename index_t>
void popc(const raft::resources& res,
device_vector_view<value_t, index_t> values,
raft::host_scalar_view<index_t> max_len,
device_vector_view<const value_t, index_t> values,
raft::host_scalar_view<const index_t, index_t> max_len,
raft::device_scalar_view<index_t> counter)
{
auto values_size = values.size();
auto values_matrix = raft::make_device_matrix_view<value_t, index_t, col_major>(
auto values_matrix = raft::make_device_matrix_view<const value_t, index_t, col_major>(
values.data_handle(), values_size, 1);
auto counter_vector = raft::make_device_vector_view<index_t, index_t>(counter.data_handle(), 1);

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/util/popc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ namespace raft {
*/
template <typename value_t, typename index_t>
void popc(const raft::resources& res,
device_vector_view<value_t, index_t> values,
raft::host_scalar_view<index_t> max_len,
device_vector_view<const value_t, index_t> values,
raft::host_scalar_view<const index_t, index_t> max_len,
raft::device_scalar_view<index_t> counter)
{
detail::popc(res, values, max_len, counter);
Expand Down
Loading
Loading