Skip to content

Commit

Permalink
Use experimental make_strings_children for strings join/url_encode/slice (#15598)
Browse files Browse the repository at this point in the history

Updates the following strings APIs to use the new experimental `make_strings_children`, which supports building large strings:
- `cudf::strings::join_strings`
- `cudf::strings::join_list_elements`
- `cudf::strings::slice_strings`
- `cudf::strings::format_list_column`
- `cudf::strings::url_encode`

Reference #15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #15598
  • Loading branch information
davidwendt authored Apr 30, 2024
1 parent 1fd3db8 commit 2439dee
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 25 deletions.
8 changes: 5 additions & 3 deletions cpp/src/strings/combine/join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
Expand Down Expand Up @@ -84,8 +85,9 @@ struct join_base_fn {
* This functor is suitable for make_strings_children
*/
struct join_fn : public join_base_fn {
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

join_fn(column_device_view const d_strings,
string_view d_separator,
Expand All @@ -106,7 +108,7 @@ struct join_fn : public join_base_fn {
} else {
bytes += d_str.size_bytes() + d_sep.size_bytes();
}
if (!d_chars) { d_offsets[idx] = bytes; }
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand Down Expand Up @@ -148,7 +150,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
if ((input.size() == input.null_count()) ||
((input.chars_size(stream) / (input.size() - input.null_count())) <=
AVG_CHAR_BYTES_THRESHOLD)) {
return std::get<1>(make_strings_children(
return std::get<1>(experimental::make_strings_children(
join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr))
.release();
}
Expand Down
13 changes: 7 additions & 6 deletions cpp/src/strings/combine/join_list_elements.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -60,11 +60,12 @@ struct compute_size_and_concatenate_fn {
separator_on_nulls const separate_nulls;
output_if_empty_list const empty_list_policy;

size_type* d_offsets{nullptr};
size_type* d_sizes{nullptr};

// If d_chars == nullptr: only compute sizes and validities of the output strings.
// If d_chars != nullptr: only concatenate strings.
char* d_chars{nullptr};
cudf::detail::input_offsetalator d_offsets;

[[nodiscard]] __device__ bool output_is_null(size_type const idx,
size_type const start_idx,
Expand All @@ -84,7 +85,7 @@ struct compute_size_and_concatenate_fn {
auto const end_idx = list_offsets[idx + 1];

if (!d_chars && output_is_null(idx, start_idx, end_idx)) {
d_offsets[idx] = 0;
d_sizes[idx] = 0;
return;
}

Expand Down Expand Up @@ -120,7 +121,7 @@ struct compute_size_and_concatenate_fn {

// If there are all null elements, the output should be the same as having an empty list input:
// a null or an empty string
if (!d_chars) { d_offsets[idx] = has_valid_element ? size_bytes : 0; }
if (!d_chars) { d_sizes[idx] = has_valid_element ? size_bytes : 0; }
}
};

Expand Down Expand Up @@ -208,7 +209,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
separate_nulls,
empty_list_policy};

auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
auto [null_mask, null_count] =
cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
Expand Down Expand Up @@ -283,7 +284,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
separate_nulls,
empty_list_policy};

auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
auto [null_mask, null_count] =
cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
Expand Down
9 changes: 5 additions & 4 deletions cpp/src/strings/convert/convert_lists.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/convert/convert_lists.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/utilities/default_stream.hpp>
Expand Down Expand Up @@ -66,8 +66,9 @@ struct format_lists_fn {
string_view const d_na_rep;
stack_item* d_stack;
size_type const max_depth;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ column_device_view get_nested_child(size_type idx)
{
Expand Down Expand Up @@ -184,7 +185,7 @@ struct format_lists_fn {
}
}

if (!d_chars) d_offsets[idx] = bytes;
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand Down Expand Up @@ -217,7 +218,7 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
auto const d_separators = column_device_view::create(separators.parent(), stream);
auto const d_na_rep = na_rep.value(stream);

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = experimental::make_strings_children(
format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth},
input.size(),
stream,
Expand Down
13 changes: 7 additions & 6 deletions cpp/src/strings/convert/convert_urls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/convert/convert_urls.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -50,8 +50,9 @@ namespace {
//
struct url_encoder_fn {
column_device_view const d_strings;
size_type* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

// utility to create 2-byte hex characters from single binary byte
__device__ void byte_to_hex(uint8_t byte, char* hex)
Expand Down Expand Up @@ -80,7 +81,7 @@ struct url_encoder_fn {
__device__ void operator()(size_type idx)
{
if (d_strings.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}

Expand Down Expand Up @@ -117,7 +118,7 @@ struct url_encoder_fn {
}
}
}
if (!d_chars) d_offsets[idx] = nbytes;
if (!d_chars) { d_sizes[idx] = nbytes; }
}
};

Expand All @@ -132,8 +133,8 @@ std::unique_ptr<column> url_encode(strings_column_view const& input,

auto d_column = column_device_view::create(input.parent(), stream);

auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
url_encoder_fn{*d_column}, input.size(), stream, mr);
auto [offsets_column, chars] =
experimental::make_strings_children(url_encoder_fn{*d_column}, input.size(), stream, mr);

return make_strings_column(input.size(),
std::move(offsets_column),
Expand Down
13 changes: 7 additions & 6 deletions cpp/src/strings/slice.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/strings_children_ex.cuh>
#include <cudf/strings/slice.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -79,19 +79,20 @@ struct substring_fn {
numeric_scalar_device_view<size_type> const d_start;
numeric_scalar_device_view<size_type> const d_stop;
numeric_scalar_device_view<size_type> const d_step;
int32_t* d_offsets{};
size_type* d_sizes{};
char* d_chars{};
cudf::detail::input_offsetalator d_offsets;

__device__ void operator()(size_type idx)
{
if (d_column.is_null(idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const d_str = d_column.template element<string_view>(idx);
auto const length = d_str.length();
if (length == 0) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
size_type const step = d_step.is_valid() ? d_step.value() : 1;
Expand Down Expand Up @@ -131,7 +132,7 @@ struct substring_fn {
}
itr += step;
}
if (!d_chars) d_offsets[idx] = bytes;
if (!d_chars) { d_sizes[idx] = bytes; }
}
};

Expand Down Expand Up @@ -205,7 +206,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
auto const d_stop = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
auto const d_step = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));

auto [offsets, chars] = make_strings_children(
auto [offsets, chars] = experimental::make_strings_children(
substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr);

return make_strings_column(strings.size(),
Expand Down

0 comments on commit 2439dee

Please sign in to comment.