-
Notifications
You must be signed in to change notification settings - Fork 894
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Large strings support for cudf::interleave_columns #15544
Changes from all commits
3ba24d6
f913bb9
83b854b
cafe925
d225883
f0dbe2c
064d317
eab5142
a75b86f
2620f67
87485db
d375fc0
ddf80e0
e250ba5
a5ed384
5a43775
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <benchmarks/common/generate_input.hpp> | ||
|
||
#include <cudf/reshape.hpp> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
#include <cudf/utilities/default_stream.hpp> | ||
|
||
#include <nvbench/nvbench.cuh> | ||
|
||
static void bench_interleave(nvbench::state& state) | ||
{ | ||
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); | ||
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); | ||
auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns")); | ||
|
||
if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) * num_cols >= | ||
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) { | ||
state.skip("Skip benchmarks greater than size_type limit"); | ||
} | ||
|
||
data_profile const str_profile = data_profile_builder().distribution( | ||
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); | ||
std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING); | ||
auto const source_table = create_random_table(types, row_count{num_rows}, str_profile); | ||
|
||
auto const source_view = source_table->view(); | ||
auto const stream = cudf::get_default_stream(); | ||
|
||
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); | ||
auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) + | ||
cudf::strings_column_view(source_view.column(1)).chars_size(stream); | ||
state.add_global_memory_reads<nvbench::int8_t>(chars_size); // all bytes are read | ||
state.add_global_memory_writes<nvbench::int8_t>(chars_size); // all bytes are written | ||
|
||
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { | ||
[[maybe_unused]] auto result = cudf::interleave_columns(source_view); | ||
}); | ||
} | ||
|
||
NVBENCH_BENCH(bench_interleave) | ||
.set_name("interleave_strings") | ||
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) | ||
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) | ||
.add_int64_axis("columns", {2, 10, 100}); |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
#include <cudf/detail/valid_if.cuh> | ||
#include <cudf/lists/detail/interleave_columns.hpp> | ||
#include <cudf/strings/detail/strings_children.cuh> | ||
#include <cudf/strings/detail/strings_column_factories.cuh> | ||
#include <cudf/strings/detail/utilities.cuh> | ||
#include <cudf/structs/structs_column_view.hpp> | ||
#include <cudf/table/table_device_view.cuh> | ||
|
@@ -140,85 +141,53 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc | |
} | ||
}; | ||
|
||
struct interleave_strings_fn { | ||
using string_index_pair = cudf::strings::detail::string_index_pair; | ||
table_device_view d_table; | ||
|
||
__device__ string_index_pair operator()(size_type idx) | ||
{ | ||
auto const num_columns = d_table.num_columns(); | ||
auto const source_col_idx = idx % num_columns; | ||
auto const source_row_idx = idx / num_columns; | ||
Comment on lines
+151
to
+152
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This access pattern makes me wonder if a kernel would be significantlty faster. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean faster than There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the reason I thought about this is because threads with adjacent indices access different columns. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah yes. That is a good point. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I almost made the same comment as @vuule but then I wondered if the point was that interleaving would have coalesced writes (not reads)? I didn’t look too closely at whether that was true but my intuition was that swapping these might be worthwhile. At least worth benchmarking. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I switched the order from coalesced write to coalesced read and wrote a benchmark with different number of columns. The performance did suffer 10% (for 2 columns) to 35% (for 100 columns). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Amazing. Thanks for measuring. |
||
auto const col = d_table.column(source_col_idx); | ||
if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; } | ||
auto const d_str = col.element<string_view>(source_row_idx); | ||
// ensures an empty string is not identified as a null row | ||
return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()} | ||
: string_index_pair{"", 0}; | ||
} | ||
}; | ||
|
||
template <typename T> | ||
struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> { | ||
std::unique_ptr<cudf::column> operator()(table_view const& strings_columns, | ||
bool create_mask, | ||
bool, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
auto num_columns = strings_columns.num_columns(); | ||
if (num_columns == 1) // Single strings column returns a copy | ||
if (num_columns == 1) { // Single strings column returns a copy | ||
return std::make_unique<column>(*(strings_columns.begin()), stream, mr); | ||
} | ||
|
||
auto strings_count = strings_columns.num_rows(); | ||
if (strings_count == 0) // All columns have 0 rows | ||
if (strings_count == 0) { // All columns have 0 rows | ||
return make_empty_column(type_id::STRING); | ||
} | ||
|
||
// Create device views from the strings columns. | ||
auto table = table_device_view::create(strings_columns, stream); | ||
auto d_table = *table; | ||
auto d_table = table_device_view::create(strings_columns, stream); | ||
auto num_strings = num_columns * strings_count; | ||
|
||
std::pair<rmm::device_buffer, size_type> valid_mask{}; | ||
if (create_mask) { | ||
// Create resulting null mask | ||
valid_mask = cudf::detail::valid_if( | ||
thrust::make_counting_iterator<size_type>(0), | ||
thrust::make_counting_iterator<size_type>(num_strings), | ||
[num_columns, d_table] __device__(size_type idx) { | ||
auto source_row_idx = idx % num_columns; | ||
auto source_col_idx = idx / num_columns; | ||
return !d_table.column(source_row_idx).is_null(source_col_idx); | ||
}, | ||
stream, | ||
mr); | ||
} | ||
|
||
auto const null_count = valid_mask.second; | ||
|
||
// Build offsets column by computing sizes of each string in the output | ||
auto offsets_transformer = | ||
cuda::proclaim_return_type<size_type>([num_columns, d_table] __device__(size_type idx) { | ||
// First compute the column and the row this item belongs to | ||
auto source_row_idx = idx % num_columns; | ||
auto source_col_idx = idx / num_columns; | ||
return d_table.column(source_row_idx).is_valid(source_col_idx) | ||
? d_table.column(source_row_idx).element<string_view>(source_col_idx).size_bytes() | ||
: 0; | ||
}); | ||
auto offsets_transformer_itr = thrust::make_transform_iterator( | ||
thrust::make_counting_iterator<size_type>(0), offsets_transformer); | ||
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( | ||
offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr); | ||
auto d_results_offsets = | ||
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); | ||
|
||
// Create the chars column | ||
rmm::device_uvector<char> chars(bytes, stream, mr); | ||
auto d_results_chars = chars.data(); | ||
thrust::for_each_n( | ||
rmm::exec_policy(stream), | ||
thrust::make_counting_iterator<size_type>(0), | ||
num_strings, | ||
[num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) { | ||
auto source_row_idx = idx % num_columns; | ||
auto source_col_idx = idx / num_columns; | ||
|
||
// Do not write to buffer if the column value for this row is null | ||
if (d_table.column(source_row_idx).is_null(source_col_idx)) return; | ||
|
||
size_type offset = d_results_offsets[idx]; | ||
char* d_buffer = d_results_chars + offset; | ||
strings::detail::copy_string( | ||
d_buffer, d_table.column(source_row_idx).element<string_view>(source_col_idx)); | ||
}); | ||
rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_strings, stream); | ||
thrust::transform(rmm::exec_policy_nosync(stream), | ||
thrust::make_counting_iterator<size_type>(0), | ||
thrust::make_counting_iterator<size_type>(num_strings), | ||
indices.begin(), | ||
interleave_strings_fn{*d_table}); | ||
|
||
return make_strings_column(num_strings, | ||
std::move(offsets_column), | ||
chars.release(), | ||
null_count, | ||
std::move(valid_mask.first)); | ||
return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr); | ||
} | ||
}; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was just moved from below.