rapidsai · rapids-bot · May 3, 2024 · Apr 16, 2024 · Apr 16, 2024 · Apr 16, 2024
@@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 
+# ##################################################################################################
+# * reshape benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS

@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/reshape.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_interleave(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("columns"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) * num_cols >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const str_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING);
+  auto const source_table = create_random_table(types, row_count{num_rows}, str_profile);
+
+  auto const source_view = source_table->view();
+  auto const stream      = cudf::get_default_stream();
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) +
+                    cudf::strings_column_view(source_view.column(1)).chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto result = cudf::interleave_columns(source_view);
+  });
+}
+
+NVBENCH_BENCH(bench_interleave)
+  .set_name("interleave_strings")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("columns", {2, 10, 100});
@@ -22,7 +22,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -128,40 +128,39 @@ std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const
   return std::move(result->release()[0]);
 }
 
+// Error case when no other overload or specialization is available
+template <typename T, typename Enable = void>
+struct interleave_list_entries_impl {
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
+  }
+};
+
 /**
- * @brief Compute string sizes, string validities, and interleave string lists functor.
+ * @brief Interleave array of string_index_pair objects for a list of strings
  *
- * This functor is executed twice. In the first pass, the sizes and validities of the output strings
- * will be computed. In the second pass, this will interleave the lists of strings of the given
- * table containing those lists.
+ * Each thread processes the strings for the corresponding list row
  */
 struct compute_string_sizes_and_interleave_lists_fn {
   table_device_view const table_dv;
 
   // Store list offsets of the output lists column.
   size_type const* const dst_list_offsets;
 
-  // Flag to specify whether to compute string validities.
-  bool const has_null_mask;
-
-  // Store offsets of the strings.
-  size_type* d_offsets{nullptr};
-
-  // If d_chars == nullptr: only compute sizes and validities of the output strings.
-  // If d_chars != nullptr: only interleave lists of strings.
-  char* d_chars{nullptr};
-
-  // We need to set `1` or `0` for the validities of the strings in the child column.
-  int8_t* d_validities{nullptr};
+  using string_index_pair = cudf::strings::detail::string_index_pair;
+  string_index_pair* indices;  // output
 
+  // thread per list row per column
   __device__ void operator()(size_type const idx)
   {
     auto const num_cols = table_dv.num_columns();
     auto const col_id   = idx % num_cols;
     auto const list_id  = idx / num_cols;
 
     auto const& lists_col = table_dv.column(col_id);
-    if (has_null_mask and lists_col.is_null(list_id)) { return; }
+    if (lists_col.is_null(list_id)) { return; }
 
     auto const list_offsets =
       lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() +
@@ -181,65 +180,40 @@ struct compute_string_sizes_and_interleave_lists_fn {
     // read_idx and write_idx are indices of string elements.
     size_type write_idx = dst_list_offsets[idx];
 
-    if (not d_chars) {  // just compute sizes and validities of strings within a list
-      for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
-        if (has_null_mask) {
-          d_validities[write_idx] = static_cast<int8_t>(str_col.is_valid(read_idx));
-        }
-        d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx];
-      }
-    } else {  // just copy the entire memory region containing all strings in the list
-      // start_byte and end_byte are indices of character of the string elements.
-      auto const start_byte = str_offsets[start_str_idx];
-      auto const end_byte   = str_offsets[end_str_idx];
-      if (start_byte < end_byte) {
-        auto const input_ptr  = str_col.template head<char>() + start_byte;
-        auto const output_ptr = d_chars + d_offsets[write_idx];
-        thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr);
+    for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
+      auto const offset        = str_offsets[read_idx];
+      auto const size          = str_offsets[read_idx + 1] - offset;
+      string_index_pair result = {nullptr, size};
+      if (str_col.is_valid(read_idx)) {
+        result.first = size > 0 ? str_col.template head<char>() + offset : "";
       }
+      indices[write_idx] = result;
     }
   }
 };
 
-// Error case when no other overload or specialization is available
-template <typename T, typename Enable = void>
-struct interleave_list_entries_impl {
-  template <typename... Args>
-  std::unique_ptr<column> operator()(Args&&...)
-  {
-    CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
-  }
-};
-
 template <typename T>
 struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
   std::unique_ptr<column> operator()(table_view const& input,
                                      column_view const& output_list_offsets,
                                      size_type num_output_lists,
                                      size_type num_output_entries,
-                                     bool data_has_null_mask,
+                                     bool,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const noexcept
   {
-    auto const table_dv_ptr = table_device_view::create(input, stream);
-    auto comp_fn            = compute_string_sizes_and_interleave_lists_fn{
-      *table_dv_ptr, output_list_offsets.template begin<size_type>(), data_has_null_mask};
-
-    auto validities =
-      rmm::device_uvector<int8_t>(data_has_null_mask ? num_output_entries : 0, stream);
-    comp_fn.d_validities = validities.data();
-
-    auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-      comp_fn, num_output_lists, num_output_entries, stream, mr);
-
-    auto [null_mask, null_count] =
-      cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);
-
-    return make_strings_column(num_output_entries,
-                               std::move(offsets_column),
-                               chars.release(),
-                               null_count,
-                               std::move(null_mask));
+    auto const table_dv_ptr   = table_device_view::create(input, stream);
+    auto const d_list_offsets = output_list_offsets.template begin<size_type>();
+
+    rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_output_entries,
+                                                                          stream);
+    auto comp_fn =
+      compute_string_sizes_and_interleave_lists_fn{*table_dv_ptr, d_list_offsets, indices.data()};
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       num_output_lists,
+                       comp_fn);
+    return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
   }
 };
 

@@ -22,6 +22,7 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/detail/interleave_columns.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -140,85 +141,53 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc
   }
 };
 
+struct interleave_strings_fn {
+  using string_index_pair = cudf::strings::detail::string_index_pair;
+  table_device_view d_table;
+
+  __device__ string_index_pair operator()(size_type idx)
+  {
+    auto const num_columns    = d_table.num_columns();
+    auto const source_col_idx = idx % num_columns;
+    auto const source_row_idx = idx / num_columns;
+    auto const col            = d_table.column(source_col_idx);
+    if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; }
+    auto const d_str = col.element<string_view>(source_row_idx);
+    // ensures an empty string is not identified as a null row
+    return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
+                          : string_index_pair{"", 0};
+  }
+};
+
 template <typename T>
 struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
   std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
-                                           bool create_mask,
+                                           bool,
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
     auto num_columns = strings_columns.num_columns();
-    if (num_columns == 1)  // Single strings column returns a copy
+    if (num_columns == 1) {  // Single strings column returns a copy
       return std::make_unique<column>(*(strings_columns.begin()), stream, mr);
+    }
 
     auto strings_count = strings_columns.num_rows();
-    if (strings_count == 0)  // All columns have 0 rows
+    if (strings_count == 0) {  // All columns have 0 rows
       return make_empty_column(type_id::STRING);
+    }
 
     // Create device views from the strings columns.
-    auto table       = table_device_view::create(strings_columns, stream);
-    auto d_table     = *table;
+    auto d_table     = table_device_view::create(strings_columns, stream);
     auto num_strings = num_columns * strings_count;
 
-    std::pair<rmm::device_buffer, size_type> valid_mask{};
-    if (create_mask) {
-      // Create resulting null mask
-      valid_mask = cudf::detail::valid_if(
-        thrust::make_counting_iterator<size_type>(0),
-        thrust::make_counting_iterator<size_type>(num_strings),
-        [num_columns, d_table] __device__(size_type idx) {
-          auto source_row_idx = idx % num_columns;
-          auto source_col_idx = idx / num_columns;
-          return !d_table.column(source_row_idx).is_null(source_col_idx);
-        },
-        stream,
-        mr);
-    }
-
-    auto const null_count = valid_mask.second;
-
-    // Build offsets column by computing sizes of each string in the output
-    auto offsets_transformer =
-      cuda::proclaim_return_type<size_type>([num_columns, d_table] __device__(size_type idx) {
-        // First compute the column and the row this item belongs to
-        auto source_row_idx = idx % num_columns;
-        auto source_col_idx = idx / num_columns;
-        return d_table.column(source_row_idx).is_valid(source_col_idx)
-                 ? d_table.column(source_row_idx).element<string_view>(source_col_idx).size_bytes()
-                 : 0;
-      });
-    auto offsets_transformer_itr = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<size_type>(0), offsets_transformer);
-    auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
-      offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr);
-    auto d_results_offsets =
-      cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
-
-    // Create the chars column
-    rmm::device_uvector<char> chars(bytes, stream, mr);
-    auto d_results_chars = chars.data();
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<size_type>(0),
-      num_strings,
-      [num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) {
-        auto source_row_idx = idx % num_columns;
-        auto source_col_idx = idx / num_columns;
-
-        // Do not write to buffer if the column value for this row is null
-        if (d_table.column(source_row_idx).is_null(source_col_idx)) return;
-
-        size_type offset = d_results_offsets[idx];
-        char* d_buffer   = d_results_chars + offset;
-        strings::detail::copy_string(
-          d_buffer, d_table.column(source_row_idx).element<string_view>(source_col_idx));
-      });
+    rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_strings, stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(num_strings),
+                      indices.begin(),
+                      interleave_strings_fn{*d_table});
 
-    return make_strings_column(num_strings,
-                               std::move(offsets_column),
-                               chars.release(),
-                               null_count,
-                               std::move(valid_mask.first));
+    return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
   }
 };