From 3ba24d67223472d6e7794b64b6f5968247045629 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 16 Apr 2024 17:02:58 -0400 Subject: [PATCH 1/4] Large strings support for cudf::interleave_columns --- cpp/src/lists/interleave_columns.cu | 100 ++++++++++---------------- cpp/src/reshape/interleave_columns.cu | 94 ++++++++---------------- 2 files changed, 68 insertions(+), 126 deletions(-) diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 478b6c9a209..9ce14bcd0ef 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -127,12 +127,20 @@ std::unique_ptr concatenate_and_gather_lists(host_spanrelease()[0]); } +// Error case when no other overload or specialization is available +template +struct interleave_list_entries_impl { + template + std::unique_ptr operator()(Args&&...) + { + CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types."); + } +}; + /** - * @brief Compute string sizes, string validities, and interleave string lists functor. + * @brief Interleave array of string_index_pair objects for a list of strings * - * This functor is executed twice. In the first pass, the sizes and validities of the output strings - * will be computed. In the second pass, this will interleave the lists of strings of the given - * table containing those lists. + * Each thread processes the strings for the corresponding list row */ struct compute_string_sizes_and_interleave_lists_fn { table_device_view const table_dv; @@ -140,19 +148,10 @@ struct compute_string_sizes_and_interleave_lists_fn { // Store list offsets of the output lists column. size_type const* const dst_list_offsets; - // Flag to specify whether to compute string validities. - bool const has_null_mask; - - // Store offsets of the strings. - size_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes and validities of the output strings. - // If d_chars != nullptr: only interleave lists of strings. - char* d_chars{nullptr}; - - // We need to set `1` or `0` for the validities of the strings in the child column. - int8_t* d_validities{nullptr}; + using string_index_pair = cudf::strings::detail::string_index_pair; + string_index_pair* indices; // output + // thread per list row per column __device__ void operator()(size_type const idx) { auto const num_cols = table_dv.num_columns(); @@ -160,7 +159,7 @@ struct compute_string_sizes_and_interleave_lists_fn { auto const list_id = idx / num_cols; auto const& lists_col = table_dv.column(col_id); - if (has_null_mask and lists_col.is_null(list_id)) { return; } + if (lists_col.is_null(list_id)) { return; } auto const list_offsets = lists_col.child(lists_column_view::offsets_column_index).template data() + @@ -180,65 +179,40 @@ struct compute_string_sizes_and_interleave_lists_fn { // read_idx and write_idx are indices of string elements. size_type write_idx = dst_list_offsets[idx]; - if (not d_chars) { // just compute sizes and validities of strings within a list - for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { - if (has_null_mask) { - d_validities[write_idx] = static_cast(str_col.is_valid(read_idx)); - } - d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx]; - } - } else { // just copy the entire memory region containing all strings in the list - // start_byte and end_byte are indices of character of the string elements. - auto const start_byte = str_offsets[start_str_idx]; - auto const end_byte = str_offsets[end_str_idx]; - if (start_byte < end_byte) { - auto const input_ptr = str_col.template head() + start_byte; - auto const output_ptr = d_chars + d_offsets[write_idx]; - thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); + for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { + auto const offset = str_offsets[read_idx]; + auto const size = str_offsets[read_idx + 1] - offset; + string_index_pair result = {nullptr, size}; + if (str_col.is_valid(read_idx)) { + result.first = size > 0 ? str_col.template head() + offset : ""; } + indices[write_idx] = result; } } }; -// Error case when no other overload or specialization is available -template -struct interleave_list_entries_impl { - template - std::unique_ptr operator()(Args&&...) - { - CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types."); - } -}; - template struct interleave_list_entries_impl>> { std::unique_ptr operator()(table_view const& input, column_view const& output_list_offsets, size_type num_output_lists, size_type num_output_entries, - bool data_has_null_mask, + bool, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const noexcept { - auto const table_dv_ptr = table_device_view::create(input, stream); - auto comp_fn = compute_string_sizes_and_interleave_lists_fn{ - *table_dv_ptr, output_list_offsets.template begin(), data_has_null_mask}; - - auto validities = - rmm::device_uvector(data_has_null_mask ? num_output_entries : 0, stream); - comp_fn.d_validities = validities.data(); - - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - comp_fn, num_output_lists, num_output_entries, stream, mr); - - auto [null_mask, null_count] = - cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); - - return make_strings_column(num_output_entries, - std::move(offsets_column), - chars.release(), - null_count, - std::move(null_mask)); + auto const table_dv_ptr = table_device_view::create(input, stream); + auto const d_list_offsets = output_list_offsets.template begin(); + + rmm::device_uvector indices(num_output_entries, + stream); + auto comp_fn = + compute_string_sizes_and_interleave_lists_fn{*table_dv_ptr, d_list_offsets, indices.data()}; + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + num_output_lists, + comp_fn); + return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr); } }; diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 72227ab5dda..379778b296e 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -139,85 +140,52 @@ struct interleave_columns_impl(source_row_idx); + return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()} + : string_index_pair{"", 0}; + } +}; + template struct interleave_columns_impl>> { std::unique_ptr operator()(table_view const& strings_columns, - bool create_mask, + bool, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); - if (num_columns == 1) // Single strings column returns a copy + if (num_columns == 1) { // Single strings column returns a copy return std::make_unique(*(strings_columns.begin()), stream, mr); + } auto strings_count = strings_columns.num_rows(); - if (strings_count == 0) // All columns have 0 rows + if (strings_count == 0) { // All columns have 0 rows return make_empty_column(type_id::STRING); + } // Create device views from the strings columns. - auto table = table_device_view::create(strings_columns, stream); - auto d_table = *table; + auto d_table = table_device_view::create(strings_columns, stream); auto num_strings = num_columns * strings_count; - std::pair valid_mask{}; - if (create_mask) { - // Create resulting null mask - valid_mask = cudf::detail::valid_if( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_strings), - [num_columns, d_table] __device__(size_type idx) { - auto source_row_idx = idx % num_columns; - auto source_col_idx = idx / num_columns; - return !d_table.column(source_row_idx).is_null(source_col_idx); - }, - stream, - mr); - } - - auto const null_count = valid_mask.second; - - // Build offsets column by computing sizes of each string in the output - auto offsets_transformer = - cuda::proclaim_return_type([num_columns, d_table] __device__(size_type idx) { - // First compute the column and the row this item belongs to - auto source_row_idx = idx % num_columns; - auto source_col_idx = idx / num_columns; - return d_table.column(source_row_idx).is_valid(source_col_idx) - ? d_table.column(source_row_idx).element(source_col_idx).size_bytes() - : 0; - }); - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), offsets_transformer); - auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr); - auto d_results_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); - - // Create the chars column - rmm::device_uvector chars(bytes, stream, mr); - auto d_results_chars = chars.data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_strings, - [num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) { - auto source_row_idx = idx % num_columns; - auto source_col_idx = idx / num_columns; - - // Do not write to buffer if the column value for this row is null - if (d_table.column(source_row_idx).is_null(source_col_idx)) return; - - size_type offset = d_results_offsets[idx]; - char* d_buffer = d_results_chars + offset; - strings::detail::copy_string( - d_buffer, d_table.column(source_row_idx).element(source_col_idx)); - }); + rmm::device_uvector indices(num_strings, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_strings), + indices.begin(), + interleave_strings_fn{*d_table}); - return make_strings_column(num_strings, - std::move(offsets_column), - chars.release(), - null_count, - std::move(valid_mask.first)); + return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr); } }; From eab514252b9f1efd5a371c9f4810a6924ac3bd18 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 23 Apr 2024 10:26:41 -0400 Subject: [PATCH 2/4] add empty string comment --- cpp/src/reshape/interleave_columns.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index e52b05d4038..580db0e24c5 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -153,6 +153,7 @@ struct interleave_strings_fn { auto const col = d_table.column(source_col_idx); if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; } auto const d_str = col.element(source_row_idx); + // ensures an empty string is not identified as a null row return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()} : string_index_pair{"", 0}; } From e250ba5d089fc3e87d1934894289180abb52919f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 2 May 2024 11:12:50 -0400 Subject: [PATCH 3/4] add benchmark --- cpp/benchmarks/CMakeLists.txt | 5 +++ cpp/benchmarks/reshape/interleave.cpp | 59 +++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 cpp/benchmarks/reshape/interleave.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5fd328dfc68..7e61d881f07 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB) # --------------------------------------------------------------------------------- ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) +# ################################################################################################## +# * reshape benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/reshape/interleave.cpp b/cpp/benchmarks/reshape/interleave.cpp new file mode 100644 index 00000000000..8f1f70017b2 --- /dev/null +++ b/cpp/benchmarks/reshape/interleave.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +static void bench_interleave(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_cols = static_cast(state.get_int64("columns")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const str_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + std::vector types(num_cols, cudf::type_id::STRING); + auto const source_table = create_random_table(types, row_count{num_rows}, str_profile); + + auto const source_view = source_table->view(); + auto const stream = cudf::get_default_stream(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) + + cudf::strings_column_view(source_view.column(1)).chars_size(stream); + state.add_global_memory_reads(chars_size); // all bytes are read + state.add_global_memory_writes(chars_size); // all bytes are written + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto result = cudf::interleave_columns(source_view); + }); +} + +NVBENCH_BENCH(bench_interleave) + .set_name("interleave_strings") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("columns", {2, 10, 100}); From 5a437750d713c13c4d2a0200a01076f6b349d1b7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 3 May 2024 13:34:42 -0400 Subject: [PATCH 4/4] fix benchmark to include num_cols in limit check --- cpp/benchmarks/reshape/interleave.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/reshape/interleave.cpp b/cpp/benchmarks/reshape/interleave.cpp index 8f1f70017b2..4499e34af77 100644 --- a/cpp/benchmarks/reshape/interleave.cpp +++ b/cpp/benchmarks/reshape/interleave.cpp @@ -28,7 +28,7 @@ static void bench_interleave(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const num_cols = static_cast(state.get_int64("columns")); - if (static_cast(num_rows) * static_cast(row_width) >= + if (static_cast(num_rows) * static_cast(row_width) * num_cols >= static_cast(std::numeric_limits::max())) { state.skip("Skip benchmarks greater than size_type limit"); }