Skip to content

Commit

Permalink
Large strings support for cudf::interleave_columns (#15544)
Browse files Browse the repository at this point in the history
Updates the `cudf::interleave_columns` logic to use gather-based `make_strings_column` instead of the `make_strings_children` since the gather-based function already efficiently supports longs.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #15544
  • Loading branch information
davidwendt authored May 3, 2024
1 parent ce6902f commit 09f8ff3
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 126 deletions.
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
# ---------------------------------------------------------------------------------
ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)

# ##################################################################################################
# * reshape benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)

add_custom_target(
run_benchmarks
DEPENDS CUDF_BENCHMARKS
Expand Down
59 changes: 59 additions & 0 deletions cpp/benchmarks/reshape/interleave.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/reshape.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_interleave(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) * num_cols >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const str_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING);
auto const source_table = create_random_table(types, row_count{num_rows}, str_profile);

auto const source_view = source_table->view();
auto const stream = cudf::get_default_stream();

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) +
cudf::strings_column_view(source_view.column(1)).chars_size(stream);
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto result = cudf::interleave_columns(source_view);
});
}

NVBENCH_BENCH(bench_interleave)
.set_name("interleave_strings")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("columns", {2, 10, 100});
100 changes: 37 additions & 63 deletions cpp/src/lists/interleave_columns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down Expand Up @@ -128,40 +128,39 @@ std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const
return std::move(result->release()[0]);
}

// Error case when no other overload or specialization is available
template <typename T, typename Enable = void>
struct interleave_list_entries_impl {
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...)
{
CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
}
};

/**
* @brief Compute string sizes, string validities, and interleave string lists functor.
* @brief Interleave array of string_index_pair objects for a list of strings
*
* This functor is executed twice. In the first pass, the sizes and validities of the output strings
* will be computed. In the second pass, this will interleave the lists of strings of the given
* table containing those lists.
* Each thread processes the strings for the corresponding list row
*/
struct compute_string_sizes_and_interleave_lists_fn {
table_device_view const table_dv;

// Store list offsets of the output lists column.
size_type const* const dst_list_offsets;

// Flag to specify whether to compute string validities.
bool const has_null_mask;

// Store offsets of the strings.
size_type* d_offsets{nullptr};

// If d_chars == nullptr: only compute sizes and validities of the output strings.
// If d_chars != nullptr: only interleave lists of strings.
char* d_chars{nullptr};

// We need to set `1` or `0` for the validities of the strings in the child column.
int8_t* d_validities{nullptr};
using string_index_pair = cudf::strings::detail::string_index_pair;
string_index_pair* indices; // output

// thread per list row per column
__device__ void operator()(size_type const idx)
{
auto const num_cols = table_dv.num_columns();
auto const col_id = idx % num_cols;
auto const list_id = idx / num_cols;

auto const& lists_col = table_dv.column(col_id);
if (has_null_mask and lists_col.is_null(list_id)) { return; }
if (lists_col.is_null(list_id)) { return; }

auto const list_offsets =
lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() +
Expand All @@ -181,65 +180,40 @@ struct compute_string_sizes_and_interleave_lists_fn {
// read_idx and write_idx are indices of string elements.
size_type write_idx = dst_list_offsets[idx];

if (not d_chars) { // just compute sizes and validities of strings within a list
for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
if (has_null_mask) {
d_validities[write_idx] = static_cast<int8_t>(str_col.is_valid(read_idx));
}
d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx];
}
} else { // just copy the entire memory region containing all strings in the list
// start_byte and end_byte are indices of character of the string elements.
auto const start_byte = str_offsets[start_str_idx];
auto const end_byte = str_offsets[end_str_idx];
if (start_byte < end_byte) {
auto const input_ptr = str_col.template head<char>() + start_byte;
auto const output_ptr = d_chars + d_offsets[write_idx];
thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr);
for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
auto const offset = str_offsets[read_idx];
auto const size = str_offsets[read_idx + 1] - offset;
string_index_pair result = {nullptr, size};
if (str_col.is_valid(read_idx)) {
result.first = size > 0 ? str_col.template head<char>() + offset : "";
}
indices[write_idx] = result;
}
}
};

// Error case when no other overload or specialization is available
template <typename T, typename Enable = void>
struct interleave_list_entries_impl {
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...)
{
CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
}
};

template <typename T>
struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
std::unique_ptr<column> operator()(table_view const& input,
column_view const& output_list_offsets,
size_type num_output_lists,
size_type num_output_entries,
bool data_has_null_mask,
bool,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr) const noexcept
{
auto const table_dv_ptr = table_device_view::create(input, stream);
auto comp_fn = compute_string_sizes_and_interleave_lists_fn{
*table_dv_ptr, output_list_offsets.template begin<size_type>(), data_has_null_mask};

auto validities =
rmm::device_uvector<int8_t>(data_has_null_mask ? num_output_entries : 0, stream);
comp_fn.d_validities = validities.data();

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
comp_fn, num_output_lists, num_output_entries, stream, mr);

auto [null_mask, null_count] =
cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);

return make_strings_column(num_output_entries,
std::move(offsets_column),
chars.release(),
null_count,
std::move(null_mask));
auto const table_dv_ptr = table_device_view::create(input, stream);
auto const d_list_offsets = output_list_offsets.template begin<size_type>();

rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_output_entries,
stream);
auto comp_fn =
compute_string_sizes_and_interleave_lists_fn{*table_dv_ptr, d_list_offsets, indices.data()};
thrust::for_each_n(rmm::exec_policy_nosync(stream),
thrust::counting_iterator<size_type>(0),
num_output_lists,
comp_fn);
return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
}
};

Expand Down
95 changes: 32 additions & 63 deletions cpp/src/reshape/interleave_columns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/detail/valid_if.cuh>
#include <cudf/lists/detail/interleave_columns.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table_device_view.cuh>
Expand Down Expand Up @@ -140,85 +141,53 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc
}
};

struct interleave_strings_fn {
using string_index_pair = cudf::strings::detail::string_index_pair;
table_device_view d_table;

__device__ string_index_pair operator()(size_type idx)
{
auto const num_columns = d_table.num_columns();
auto const source_col_idx = idx % num_columns;
auto const source_row_idx = idx / num_columns;
auto const col = d_table.column(source_col_idx);
if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; }
auto const d_str = col.element<string_view>(source_row_idx);
// ensures an empty string is not identified as a null row
return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
: string_index_pair{"", 0};
}
};

template <typename T>
struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
bool create_mask,
bool,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto num_columns = strings_columns.num_columns();
if (num_columns == 1) // Single strings column returns a copy
if (num_columns == 1) { // Single strings column returns a copy
return std::make_unique<column>(*(strings_columns.begin()), stream, mr);
}

auto strings_count = strings_columns.num_rows();
if (strings_count == 0) // All columns have 0 rows
if (strings_count == 0) { // All columns have 0 rows
return make_empty_column(type_id::STRING);
}

// Create device views from the strings columns.
auto table = table_device_view::create(strings_columns, stream);
auto d_table = *table;
auto d_table = table_device_view::create(strings_columns, stream);
auto num_strings = num_columns * strings_count;

std::pair<rmm::device_buffer, size_type> valid_mask{};
if (create_mask) {
// Create resulting null mask
valid_mask = cudf::detail::valid_if(
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_strings),
[num_columns, d_table] __device__(size_type idx) {
auto source_row_idx = idx % num_columns;
auto source_col_idx = idx / num_columns;
return !d_table.column(source_row_idx).is_null(source_col_idx);
},
stream,
mr);
}

auto const null_count = valid_mask.second;

// Build offsets column by computing sizes of each string in the output
auto offsets_transformer =
cuda::proclaim_return_type<size_type>([num_columns, d_table] __device__(size_type idx) {
// First compute the column and the row this item belongs to
auto source_row_idx = idx % num_columns;
auto source_col_idx = idx / num_columns;
return d_table.column(source_row_idx).is_valid(source_col_idx)
? d_table.column(source_row_idx).element<string_view>(source_col_idx).size_bytes()
: 0;
});
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr);
auto d_results_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// Create the chars column
rmm::device_uvector<char> chars(bytes, stream, mr);
auto d_results_chars = chars.data();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
num_strings,
[num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) {
auto source_row_idx = idx % num_columns;
auto source_col_idx = idx / num_columns;

// Do not write to buffer if the column value for this row is null
if (d_table.column(source_row_idx).is_null(source_col_idx)) return;

size_type offset = d_results_offsets[idx];
char* d_buffer = d_results_chars + offset;
strings::detail::copy_string(
d_buffer, d_table.column(source_row_idx).element<string_view>(source_col_idx));
});
rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_strings, stream);
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_strings),
indices.begin(),
interleave_strings_fn{*d_table});

return make_strings_column(num_strings,
std::move(offsets_column),
chars.release(),
null_count,
std::move(valid_mask.first));
return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
}
};

Expand Down

0 comments on commit 09f8ff3

Please sign in to comment.