Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opt](parse) optimize parsing string to datetime #38385

Merged
merged 1 commit into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 32 additions & 49 deletions be/src/vec/functions/function_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -967,9 +967,9 @@ struct NameToDateTime {
static constexpr auto name = "toDateTime";
};

template <typename DataType, typename Additions = void*, typename FromDataType = void*>
template <typename DataType, typename FromDataType = void*>
bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionContext* context,
Additions additions [[maybe_unused]] = Additions()) {
UInt32 scale [[maybe_unused]] = 0) {
if constexpr (IsDateTimeType<DataType>) {
return try_read_datetime_text(x, rb, context->state()->timezone_obj());
}
Expand All @@ -983,7 +983,6 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionCon
}

if constexpr (IsDateTimeV2Type<DataType>) {
UInt32 scale = additions;
return try_read_datetime_v2_text(x, rb, context->state()->timezone_obj(), scale);
}

Expand Down Expand Up @@ -1021,7 +1020,6 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionCon

template <typename DataType, typename Additions = void*>
StringParser::ParseResult try_parse_decimal_impl(typename DataType::FieldType& x, ReadBuffer& rb,
const cctz::time_zone& local_time_zone,
Additions additions
[[maybe_unused]] = Additions()) {
if constexpr (IsDataTypeDecimalV2<DataType>) {
Expand Down Expand Up @@ -1450,15 +1448,9 @@ class PreparedFunctionCast : public PreparedFunctionImpl {
const char* name;
};

// Name tag whose `name` member is the literal "CAST".
// NOTE(review): presumably supplied as the `Name` template argument of the
// conversion templates, which read `Name::name` for error messages — confirm.
struct NameCast {
    static constexpr const char* name = "CAST";
};

template <typename FromDataType, typename ToDataType, typename Name>
struct ConvertThroughParsing {
static_assert(std::is_same_v<FromDataType, DataTypeString>,
"ConvertThroughParsing is only applicable for String or FixedString data types");

// always from DataTypeString
template <typename ToDataType, typename Name>
struct StringParsing {
using ToFieldType = typename ToDataType::FieldType;

static bool is_all_read(ReadBuffer& in) { return in.eof(); }
Expand All @@ -1471,57 +1463,46 @@ struct ConvertThroughParsing {
ColumnDecimal<ToFieldType>, ColumnVector<ToFieldType>>;

const IColumn* col_from = block.get_by_position(arguments[0]).column.get();
const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from);
const auto* col_from_string = check_and_get_column<ColumnString>(col_from);

if (std::is_same_v<FromDataType, DataTypeString> && !col_from_string) {
if (!col_from_string) {
return Status::RuntimeError("Illegal column {} of first argument of function {}",
col_from->get_name(), Name::name);
}

size_t size = input_rows_count;
size_t row = input_rows_count;
typename ColVecTo::MutablePtr col_to = nullptr;

if constexpr (IsDataTypeDecimal<ToDataType>) {
UInt32 scale = ((PrecisionScaleArg)additions).scale;
ToDataType::check_type_scale(scale);
col_to = ColVecTo::create(size, scale);
col_to = ColVecTo::create(row, scale);
} else {
col_to = ColVecTo::create(size);
col_to = ColVecTo::create(row);
}

typename ColVecTo::Container& vec_to = col_to->get_data();

ColumnUInt8::MutablePtr col_null_map_to;
ColumnUInt8::Container* vec_null_map_to [[maybe_unused]] = nullptr;
col_null_map_to = ColumnUInt8::create(size);
col_null_map_to = ColumnUInt8::create(row);
vec_null_map_to = &col_null_map_to->get_data();

const ColumnString::Chars* chars = nullptr;
const IColumn::Offsets* offsets = nullptr;
size_t fixed_string_size = 0;

if constexpr (std::is_same_v<FromDataType, DataTypeString>) {
chars = &col_from_string->get_chars();
offsets = &col_from_string->get_offsets();
}
const ColumnString::Chars* chars = &col_from_string->get_chars();
const IColumn::Offsets* offsets = &col_from_string->get_offsets();

size_t current_offset = 0;
for (size_t i = 0; i < size; ++i) {
size_t next_offset = std::is_same_v<FromDataType, DataTypeString>
? (*offsets)[i]
: (current_offset + fixed_string_size);
size_t string_size = std::is_same_v<FromDataType, DataTypeString>
? next_offset - current_offset
: fixed_string_size;
for (size_t i = 0; i < row; ++i) {
size_t next_offset = (*offsets)[i];
size_t string_size = next_offset - current_offset;

ReadBuffer read_buffer(&(*chars)[current_offset], string_size);

bool parsed;
if constexpr (IsDataTypeDecimal<ToDataType>) {
ToDataType::check_type_precision((PrecisionScaleArg(additions).precision));
StringParser::ParseResult res = try_parse_decimal_impl<ToDataType>(
vec_to[i], read_buffer, context->state()->timezone_obj(),
PrecisionScaleArg(additions));
vec_to[i], read_buffer, PrecisionScaleArg(additions));
parsed = (res == StringParser::PARSE_SUCCESS ||
res == StringParser::PARSE_OVERFLOW ||
res == StringParser::PARSE_UNDERFLOW);
Expand All @@ -1531,8 +1512,8 @@ struct ConvertThroughParsing {
parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, context,
type->get_scale());
} else {
parsed = try_parse_impl<ToDataType, void*, FromDataType>(vec_to[i], read_buffer,
context);
parsed =
try_parse_impl<ToDataType, DataTypeString>(vec_to[i], read_buffer, context);
}
(*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer);
current_offset = next_offset;
Expand All @@ -1546,25 +1527,27 @@ struct ConvertThroughParsing {

template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal32>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal32>, Name> {};
: StringParsing<DataTypeDecimal<Decimal32>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal64>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal64>, Name> {};
: StringParsing<DataTypeDecimal<Decimal64>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V2>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V2>, Name> {};
: StringParsing<DataTypeDecimal<Decimal128V2>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V3>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V3>, Name> {};
: StringParsing<DataTypeDecimal<Decimal128V3>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal256>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal256>, Name> {};
: StringParsing<DataTypeDecimal<Decimal256>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeIPv4, Name>
: ConvertThroughParsing<DataTypeString, DataTypeIPv4, Name> {};
struct ConvertImpl<DataTypeString, DataTypeIPv4, Name> : StringParsing<DataTypeIPv4, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeIPv6, Name>
: ConvertThroughParsing<DataTypeString, DataTypeIPv6, Name> {};
struct ConvertImpl<DataTypeString, DataTypeIPv6, Name> : StringParsing<DataTypeIPv6, Name> {};

// Name tag whose `name` member is the literal "CAST".
// NOTE(review): presumably supplied as the `Name` template argument of the
// conversion templates, which read `Name::name` for error messages — confirm.
struct NameCast {
    static constexpr const char* name = "CAST";
};

template <typename ToDataType, typename Name>
class FunctionConvertFromString : public IFunction {
Expand Down Expand Up @@ -1599,8 +1582,8 @@ class FunctionConvertFromString : public IFunction {
const IDataType* from_type = block.get_by_position(arguments[0]).type.get();

if (check_and_get_data_type<DataTypeString>(from_type)) {
return ConvertThroughParsing<DataTypeString, ToDataType, Name>::execute(
context, block, arguments, result, input_rows_count);
return StringParsing<ToDataType, Name>::execute(context, block, arguments, result,
input_rows_count);
}

return Status::RuntimeError(
Expand Down
29 changes: 19 additions & 10 deletions be/src/vec/runtime/vdatetime_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ uint8_t mysql_week_mode(uint32_t mode) {
return mode;
}

// Returns true when `ch` is a space (' ') or one of the control whitespace
// characters \t, \n, \v, \f, \r, whose codes are 9 through 13 respectively.
// Implemented with direct comparisons rather than a call to isspace().
// NOTE(review): UNLIKELY is a project macro, presumably a branch-prediction
// hint marking whitespace as the rare case — confirm.
static bool check_space(char ch) {
    return UNLIKELY(ch == ' ' || (ch >= 9 && ch <= 13));
}

// Returns true when `ch` is neither a digit nor an alphabetic character, as
// classified by isdigit()/isalpha(). The date parsers in this file use it,
// together with check_space(), to skip separators between fields.
// NOTE(review): UNLIKELY is a project macro, presumably a branch-prediction
// hint — confirm.
static bool check_date_punct(char ch) {
    // The <cctype> classifiers have undefined behavior for arguments that are
    // neither EOF nor representable as unsigned char. A plain `char` holding a
    // byte >= 0x80 is negative where `char` is signed, so convert it first.
    const auto uch = static_cast<unsigned char>(ch);
    return UNLIKELY(!(isdigit(uch) || isalpha(uch)));
}

static bool time_zone_begins(const char* ptr, const char* end) {
return *ptr == '+' || (*ptr == '-' && ptr + 3 < end && *(ptr + 3) == ':') ||
(isalpha(*ptr) && *ptr != 'T');
Expand Down Expand Up @@ -104,7 +113,7 @@ bool VecDateTimeValue::from_date_str_base(const char* date_str, int len,

_neg = false;
// Skip space character
while (ptr < end && isspace(*ptr)) {
while (ptr < end && check_space(*ptr)) {
ptr++;
}
if (ptr == end || !isdigit(*ptr)) {
Expand Down Expand Up @@ -202,8 +211,8 @@ bool VecDateTimeValue::from_date_str_base(const char* date_str, int len,
continue;
}
// escape separator
while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) {
if (isspace(*ptr)) {
while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
if (check_space(*ptr)) {
if (((1 << field_idx) & allow_space_mask) == 0) {
return false;
}
Expand Down Expand Up @@ -1235,7 +1244,7 @@ bool VecDateTimeValue::from_date_format_str(const char* format, int format_len,
auto [year, month, day, hour, minute, second] = std::tuple {0, 0, 0, 0, 0, 0};
while (ptr < end && val < val_end) {
// Skip space character
while (val < val_end && isspace(*val)) {
while (val < val_end && check_space(*val)) {
val++;
}
if (val >= val_end) {
Expand Down Expand Up @@ -1500,7 +1509,7 @@ bool VecDateTimeValue::from_date_format_str(const char* format, int format_len,
default:
return false;
}
} else if (!isspace(*ptr)) {
} else if (!check_space(*ptr)) {
if (*ptr != *val) {
return false;
}
Expand Down Expand Up @@ -1987,13 +1996,13 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale /* =
bool convert_zero) {
return from_date_str_base(date_str, len, scale, nullptr, convert_zero);
}
// Overload of from_date_str that takes an explicit local time zone.
// Forwards `date_str`, `len`, `scale` and `convert_zero` unchanged to
// from_date_str_base, passing the address of `local_time_zone` as the
// time-zone pointer, and returns that call's result.
template <typename T>
bool DateV2Value<T>::from_date_str(const char* date_str, int len,
                                   const cctz::time_zone& local_time_zone, int scale /* = -1*/,
                                   bool convert_zero) {
    const cctz::time_zone* time_zone_ptr = &local_time_zone;
    return from_date_str_base(date_str, len, scale, time_zone_ptr, convert_zero);
}
// If local_time_zone is null, only time strings without a timezone can be parsed.
template <typename T>
bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale,
zclllyybb marked this conversation as resolved.
Show resolved Hide resolved
const cctz::time_zone* local_time_zone, bool convert_zero) {
Expand All @@ -2005,7 +2014,7 @@ bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale
int32_t date_len[MAX_DATE_PARTS] = {0};

// Skip space character
while (ptr < end && isspace(*ptr)) {
while (ptr < end && check_space(*ptr)) {
ptr++;
}
if (ptr == end || !isdigit(*ptr)) {
Expand Down Expand Up @@ -2153,8 +2162,8 @@ bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale
continue;
}
// escape separator
while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) {
if (isspace(*ptr)) {
while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
if (check_space(*ptr)) {
if (((1 << field_idx) & allow_space_mask) == 0) {
return false;
}
Expand Down Expand Up @@ -2286,7 +2295,7 @@ bool DateV2Value<T>::from_date_format_str(const char* format, int format_len, co
auto [year, month, day, hour, minute, second, microsecond] = std::tuple {0, 0, 0, 0, 0, 0, 0};
while (ptr < end && val < val_end) {
// Skip space character
while (val < val_end && isspace(*val)) {
while (val < val_end && check_space(*val)) {
val++;
}
if (val >= val_end) {
Expand Down
Loading