Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Large strings support for cudf::interleave_columns #15544

Merged
merged 16 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
# ---------------------------------------------------------------------------------
ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)

# ##################################################################################################
# * reshape benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)

add_custom_target(
run_benchmarks
DEPENDS CUDF_BENCHMARKS
Expand Down
59 changes: 59 additions & 0 deletions cpp/benchmarks/reshape/interleave.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/reshape.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

static void bench_interleave(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) * num_cols >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Skip benchmarks greater than size_type limit");
}

data_profile const str_profile = data_profile_builder().distribution(
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING);
auto const source_table = create_random_table(types, row_count{num_rows}, str_profile);

auto const source_view = source_table->view();
auto const stream = cudf::get_default_stream();

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) +
cudf::strings_column_view(source_view.column(1)).chars_size(stream);
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto result = cudf::interleave_columns(source_view);
});
}

NVBENCH_BENCH(bench_interleave)
.set_name("interleave_strings")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("columns", {2, 10, 100});
100 changes: 37 additions & 63 deletions cpp/src/lists/interleave_columns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down Expand Up @@ -128,40 +128,39 @@ std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const
return std::move(result->release()[0]);
}

// Error case when no other overload or specialization is available
template <typename T, typename Enable = void>
struct interleave_list_entries_impl {
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...)
{
CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
}
};
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was just moved from below.


/**
* @brief Compute string sizes, string validities, and interleave string lists functor.
* @brief Interleave array of string_index_pair objects for a list of strings
*
* This functor is executed twice. In the first pass, the sizes and validities of the output strings
* will be computed. In the second pass, this will interleave the lists of strings of the given
* table containing those lists.
* Each thread processes the strings for the corresponding list row
*/
struct compute_string_sizes_and_interleave_lists_fn {
table_device_view const table_dv;

// Store list offsets of the output lists column.
size_type const* const dst_list_offsets;

// Flag to specify whether to compute string validities.
bool const has_null_mask;

// Store offsets of the strings.
size_type* d_offsets{nullptr};

// If d_chars == nullptr: only compute sizes and validities of the output strings.
// If d_chars != nullptr: only interleave lists of strings.
char* d_chars{nullptr};

// We need to set `1` or `0` for the validities of the strings in the child column.
int8_t* d_validities{nullptr};
using string_index_pair = cudf::strings::detail::string_index_pair;
string_index_pair* indices; // output

// thread per list row per column
__device__ void operator()(size_type const idx)
{
auto const num_cols = table_dv.num_columns();
auto const col_id = idx % num_cols;
auto const list_id = idx / num_cols;

auto const& lists_col = table_dv.column(col_id);
if (has_null_mask and lists_col.is_null(list_id)) { return; }
if (lists_col.is_null(list_id)) { return; }

auto const list_offsets =
lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() +
Expand All @@ -181,65 +180,40 @@ struct compute_string_sizes_and_interleave_lists_fn {
// read_idx and write_idx are indices of string elements.
size_type write_idx = dst_list_offsets[idx];

if (not d_chars) { // just compute sizes and validities of strings within a list
for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
if (has_null_mask) {
d_validities[write_idx] = static_cast<int8_t>(str_col.is_valid(read_idx));
}
d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx];
}
} else { // just copy the entire memory region containing all strings in the list
// start_byte and end_byte are indices of character of the string elements.
auto const start_byte = str_offsets[start_str_idx];
auto const end_byte = str_offsets[end_str_idx];
if (start_byte < end_byte) {
auto const input_ptr = str_col.template head<char>() + start_byte;
auto const output_ptr = d_chars + d_offsets[write_idx];
thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr);
for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
auto const offset = str_offsets[read_idx];
auto const size = str_offsets[read_idx + 1] - offset;
string_index_pair result = {nullptr, size};
if (str_col.is_valid(read_idx)) {
result.first = size > 0 ? str_col.template head<char>() + offset : "";
vuule marked this conversation as resolved.
Show resolved Hide resolved
}
indices[write_idx] = result;
}
}
};

// Error case when no other overload or specialization is available
template <typename T, typename Enable = void>
struct interleave_list_entries_impl {
template <typename... Args>
std::unique_ptr<column> operator()(Args&&...)
{
CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
}
};

template <typename T>
struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
std::unique_ptr<column> operator()(table_view const& input,
column_view const& output_list_offsets,
size_type num_output_lists,
size_type num_output_entries,
bool data_has_null_mask,
bool,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr) const noexcept
{
auto const table_dv_ptr = table_device_view::create(input, stream);
auto comp_fn = compute_string_sizes_and_interleave_lists_fn{
*table_dv_ptr, output_list_offsets.template begin<size_type>(), data_has_null_mask};

auto validities =
rmm::device_uvector<int8_t>(data_has_null_mask ? num_output_entries : 0, stream);
comp_fn.d_validities = validities.data();

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
comp_fn, num_output_lists, num_output_entries, stream, mr);

auto [null_mask, null_count] =
cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);

return make_strings_column(num_output_entries,
std::move(offsets_column),
chars.release(),
null_count,
std::move(null_mask));
auto const table_dv_ptr = table_device_view::create(input, stream);
auto const d_list_offsets = output_list_offsets.template begin<size_type>();

rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_output_entries,
stream);
auto comp_fn =
compute_string_sizes_and_interleave_lists_fn{*table_dv_ptr, d_list_offsets, indices.data()};
thrust::for_each_n(rmm::exec_policy_nosync(stream),
thrust::counting_iterator<size_type>(0),
num_output_lists,
comp_fn);
return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
}
};

Expand Down
95 changes: 32 additions & 63 deletions cpp/src/reshape/interleave_columns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/detail/valid_if.cuh>
#include <cudf/lists/detail/interleave_columns.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/structs/structs_column_view.hpp>
#include <cudf/table/table_device_view.cuh>
Expand Down Expand Up @@ -140,85 +141,53 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc
}
};

struct interleave_strings_fn {
using string_index_pair = cudf::strings::detail::string_index_pair;
table_device_view d_table;

__device__ string_index_pair operator()(size_type idx)
{
auto const num_columns = d_table.num_columns();
auto const source_col_idx = idx % num_columns;
auto const source_row_idx = idx / num_columns;
Comment on lines +151 to +152
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This access pattern makes me wonder if a kernel would be significantlty faster.
But I assume this is light-weigth either way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean faster than thrust::transform?
The lambda here should be very fast since it only operates on the bitmask and the offsets in a very coalesced access pattern.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the reason I thought about this is because threads with adjacent indices access different columns.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes. That is a good point.

Copy link
Contributor

@bdice bdice May 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I almost made the same comment as @vuule but then I wondered if the point was that interleaving would have coalesced writes (not reads)? I didn’t look too closely at whether that was true but my intuition was that swapping these might be worthwhile. At least worth benchmarking.

Copy link
Contributor Author

@davidwendt davidwendt May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I switched the order from coalesced write to coalesced read and wrote a benchmark with different number of columns. The performance did suffer 10% (for 2 columns) to 35% (for 100 columns).
This probably could be mitigated with some extra work to use shared-memory to minimize the non-coalesced writes.
But I think this kind of effort should also encompass the non-strings code paths as well (which also do coalesced writes). So I feel this may be a bit out of scope for this PR.
I will include the benchmark code in this PR since it has already been created.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Amazing. Thanks for measuring.

auto const col = d_table.column(source_col_idx);
if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; }
auto const d_str = col.element<string_view>(source_row_idx);
// ensures an empty string is not identified as a null row
return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
: string_index_pair{"", 0};
}
};

template <typename T>
struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
bool create_mask,
bool,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto num_columns = strings_columns.num_columns();
if (num_columns == 1) // Single strings column returns a copy
if (num_columns == 1) { // Single strings column returns a copy
return std::make_unique<column>(*(strings_columns.begin()), stream, mr);
}

auto strings_count = strings_columns.num_rows();
if (strings_count == 0) // All columns have 0 rows
if (strings_count == 0) { // All columns have 0 rows
return make_empty_column(type_id::STRING);
}

// Create device views from the strings columns.
auto table = table_device_view::create(strings_columns, stream);
auto d_table = *table;
auto d_table = table_device_view::create(strings_columns, stream);
auto num_strings = num_columns * strings_count;

std::pair<rmm::device_buffer, size_type> valid_mask{};
if (create_mask) {
// Create resulting null mask
valid_mask = cudf::detail::valid_if(
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_strings),
[num_columns, d_table] __device__(size_type idx) {
auto source_row_idx = idx % num_columns;
auto source_col_idx = idx / num_columns;
return !d_table.column(source_row_idx).is_null(source_col_idx);
},
stream,
mr);
}

auto const null_count = valid_mask.second;

// Build offsets column by computing sizes of each string in the output
auto offsets_transformer =
cuda::proclaim_return_type<size_type>([num_columns, d_table] __device__(size_type idx) {
// First compute the column and the row this item belongs to
auto source_row_idx = idx % num_columns;
auto source_col_idx = idx / num_columns;
return d_table.column(source_row_idx).is_valid(source_col_idx)
? d_table.column(source_row_idx).element<string_view>(source_col_idx).size_bytes()
: 0;
});
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr);
auto d_results_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// Create the chars column
rmm::device_uvector<char> chars(bytes, stream, mr);
auto d_results_chars = chars.data();
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
num_strings,
[num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) {
auto source_row_idx = idx % num_columns;
auto source_col_idx = idx / num_columns;

// Do not write to buffer if the column value for this row is null
if (d_table.column(source_row_idx).is_null(source_col_idx)) return;

size_type offset = d_results_offsets[idx];
char* d_buffer = d_results_chars + offset;
strings::detail::copy_string(
d_buffer, d_table.column(source_row_idx).element<string_view>(source_col_idx));
});
rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_strings, stream);
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(num_strings),
indices.begin(),
interleave_strings_fn{*d_table});

return make_strings_column(num_strings,
std::move(offsets_column),
chars.release(),
null_count,
std::move(valid_mask.first));
return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
}
};

Expand Down
Loading