Skip to content

Commit

Permalink
[opt](parse) optimize parsing string to datetime (#38385)
Browse files Browse the repository at this point in the history
  • Loading branch information
zclllyybb authored Jul 29, 2024
1 parent 64b55f0 commit bb5b05b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 59 deletions.
81 changes: 32 additions & 49 deletions be/src/vec/functions/function_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -978,9 +978,9 @@ struct NameToDateTime {
static constexpr auto name = "toDateTime";
};

template <typename DataType, typename Additions = void*, typename FromDataType = void*>
template <typename DataType, typename FromDataType = void*>
bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionContext* context,
Additions additions [[maybe_unused]] = Additions()) {
UInt32 scale [[maybe_unused]] = 0) {
if constexpr (IsDateTimeType<DataType>) {
return try_read_datetime_text(x, rb, context->state()->timezone_obj());
}
Expand All @@ -994,7 +994,6 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionCon
}

if constexpr (IsDateTimeV2Type<DataType>) {
UInt32 scale = additions;
return try_read_datetime_v2_text(x, rb, context->state()->timezone_obj(), scale);
}

Expand Down Expand Up @@ -1032,7 +1031,6 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionCon

template <typename DataType, typename Additions = void*>
StringParser::ParseResult try_parse_decimal_impl(typename DataType::FieldType& x, ReadBuffer& rb,
const cctz::time_zone& local_time_zone,
Additions additions
[[maybe_unused]] = Additions()) {
if constexpr (IsDataTypeDecimalV2<DataType>) {
Expand Down Expand Up @@ -1461,15 +1459,9 @@ class PreparedFunctionCast : public PreparedFunctionImpl {
const char* name;
};

// Tag type that carries the display name "CAST".
// NOTE(review): presumably supplied as the `Name` template argument of the
// conversion templates, which read `Name::name` for error messages — confirm.
struct NameCast {
    static constexpr const char* name = "CAST";
};

template <typename FromDataType, typename ToDataType, typename Name>
struct ConvertThroughParsing {
static_assert(std::is_same_v<FromDataType, DataTypeString>,
"ConvertThroughParsing is only applicable for String or FixedString data types");

// always from DataTypeString
template <typename ToDataType, typename Name>
struct StringParsing {
using ToFieldType = typename ToDataType::FieldType;

static bool is_all_read(ReadBuffer& in) { return in.eof(); }
Expand All @@ -1482,57 +1474,46 @@ struct ConvertThroughParsing {
ColumnDecimal<ToFieldType>, ColumnVector<ToFieldType>>;

const IColumn* col_from = block.get_by_position(arguments[0]).column.get();
const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from);
const auto* col_from_string = check_and_get_column<ColumnString>(col_from);

if (std::is_same_v<FromDataType, DataTypeString> && !col_from_string) {
if (!col_from_string) {
return Status::RuntimeError("Illegal column {} of first argument of function {}",
col_from->get_name(), Name::name);
}

size_t size = input_rows_count;
size_t row = input_rows_count;
typename ColVecTo::MutablePtr col_to = nullptr;

if constexpr (IsDataTypeDecimal<ToDataType>) {
UInt32 scale = ((PrecisionScaleArg)additions).scale;
ToDataType::check_type_scale(scale);
col_to = ColVecTo::create(size, scale);
col_to = ColVecTo::create(row, scale);
} else {
col_to = ColVecTo::create(size);
col_to = ColVecTo::create(row);
}

typename ColVecTo::Container& vec_to = col_to->get_data();

ColumnUInt8::MutablePtr col_null_map_to;
ColumnUInt8::Container* vec_null_map_to [[maybe_unused]] = nullptr;
col_null_map_to = ColumnUInt8::create(size);
col_null_map_to = ColumnUInt8::create(row);
vec_null_map_to = &col_null_map_to->get_data();

const ColumnString::Chars* chars = nullptr;
const IColumn::Offsets* offsets = nullptr;
size_t fixed_string_size = 0;

if constexpr (std::is_same_v<FromDataType, DataTypeString>) {
chars = &col_from_string->get_chars();
offsets = &col_from_string->get_offsets();
}
const ColumnString::Chars* chars = &col_from_string->get_chars();
const IColumn::Offsets* offsets = &col_from_string->get_offsets();

size_t current_offset = 0;
for (size_t i = 0; i < size; ++i) {
size_t next_offset = std::is_same_v<FromDataType, DataTypeString>
? (*offsets)[i]
: (current_offset + fixed_string_size);
size_t string_size = std::is_same_v<FromDataType, DataTypeString>
? next_offset - current_offset
: fixed_string_size;
for (size_t i = 0; i < row; ++i) {
size_t next_offset = (*offsets)[i];
size_t string_size = next_offset - current_offset;

ReadBuffer read_buffer(&(*chars)[current_offset], string_size);

bool parsed;
if constexpr (IsDataTypeDecimal<ToDataType>) {
ToDataType::check_type_precision((PrecisionScaleArg(additions).precision));
StringParser::ParseResult res = try_parse_decimal_impl<ToDataType>(
vec_to[i], read_buffer, context->state()->timezone_obj(),
PrecisionScaleArg(additions));
vec_to[i], read_buffer, PrecisionScaleArg(additions));
parsed = (res == StringParser::PARSE_SUCCESS ||
res == StringParser::PARSE_OVERFLOW ||
res == StringParser::PARSE_UNDERFLOW);
Expand All @@ -1542,8 +1523,8 @@ struct ConvertThroughParsing {
parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, context,
type->get_scale());
} else {
parsed = try_parse_impl<ToDataType, void*, FromDataType>(vec_to[i], read_buffer,
context);
parsed =
try_parse_impl<ToDataType, DataTypeString>(vec_to[i], read_buffer, context);
}
(*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer);
current_offset = next_offset;
Expand All @@ -1557,25 +1538,27 @@ struct ConvertThroughParsing {

template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal32>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal32>, Name> {};
: StringParsing<DataTypeDecimal<Decimal32>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal64>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal64>, Name> {};
: StringParsing<DataTypeDecimal<Decimal64>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V2>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V2>, Name> {};
: StringParsing<DataTypeDecimal<Decimal128V2>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V3>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V3>, Name> {};
: StringParsing<DataTypeDecimal<Decimal128V3>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal256>, Name>
: ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal256>, Name> {};
: StringParsing<DataTypeDecimal<Decimal256>, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeIPv4, Name>
: ConvertThroughParsing<DataTypeString, DataTypeIPv4, Name> {};
struct ConvertImpl<DataTypeString, DataTypeIPv4, Name> : StringParsing<DataTypeIPv4, Name> {};
template <typename Name>
struct ConvertImpl<DataTypeString, DataTypeIPv6, Name>
: ConvertThroughParsing<DataTypeString, DataTypeIPv6, Name> {};
struct ConvertImpl<DataTypeString, DataTypeIPv6, Name> : StringParsing<DataTypeIPv6, Name> {};

// Tag type that carries the display name "CAST".
// NOTE(review): presumably supplied as the `Name` template argument of the
// conversion templates, which read `Name::name` for error messages — confirm.
struct NameCast {
    static constexpr const char* name = "CAST";
};

template <typename ToDataType, typename Name>
class FunctionConvertFromString : public IFunction {
Expand Down Expand Up @@ -1610,8 +1593,8 @@ class FunctionConvertFromString : public IFunction {
const IDataType* from_type = block.get_by_position(arguments[0]).type.get();

if (check_and_get_data_type<DataTypeString>(from_type)) {
return ConvertThroughParsing<DataTypeString, ToDataType, Name>::execute(
context, block, arguments, result, input_rows_count);
return StringParsing<ToDataType, Name>::execute(context, block, arguments, result,
input_rows_count);
}

return Status::RuntimeError(
Expand Down
29 changes: 19 additions & 10 deletions be/src/vec/runtime/vdatetime_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ uint8_t mysql_week_mode(uint32_t mode) {
return mode;
}

// Reports whether `ch` is an ASCII whitespace character: a space or one of
// \t, \n, \v, \f, \r. The result is marked UNLIKELY as a branch hint.
static bool check_space(char ch) {
    // \t, \n, \v, \f, \r occupy the contiguous codes 9 through 13, so a single
    // range comparison covers all five.
    const bool is_control_whitespace = ch >= '\t' && ch <= '\r';
    return UNLIKELY(is_control_whitespace || ch == ' ');
}

// Reports whether `ch` is anything other than a digit or a letter. The date
// string parsers use this (together with check_space) to skip separator
// characters between date/time fields. The result is marked UNLIKELY as a
// branch hint.
static bool check_date_punct(char ch) {
    // The <cctype> classifiers require an argument representable as
    // unsigned char (or EOF). Plain `char` may be signed, so a byte >= 0x80
    // would be passed as a negative int, which is undefined behaviour.
    // Convert to unsigned char first.
    const auto byte = static_cast<unsigned char>(ch);
    return UNLIKELY(!(isdigit(byte) || isalpha(byte)));
}

static bool time_zone_begins(const char* ptr, const char* end) {
return *ptr == '+' || (*ptr == '-' && ptr + 3 < end && *(ptr + 3) == ':') ||
(isalpha(*ptr) && *ptr != 'T');
Expand Down Expand Up @@ -104,7 +113,7 @@ bool VecDateTimeValue::from_date_str_base(const char* date_str, int len,

_neg = false;
// Skip space character
while (ptr < end && isspace(*ptr)) {
while (ptr < end && check_space(*ptr)) {
ptr++;
}
if (ptr == end || !isdigit(*ptr)) {
Expand Down Expand Up @@ -202,8 +211,8 @@ bool VecDateTimeValue::from_date_str_base(const char* date_str, int len,
continue;
}
// escape separator
while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) {
if (isspace(*ptr)) {
while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
if (check_space(*ptr)) {
if (((1 << field_idx) & allow_space_mask) == 0) {
return false;
}
Expand Down Expand Up @@ -1235,7 +1244,7 @@ bool VecDateTimeValue::from_date_format_str(const char* format, int format_len,
auto [year, month, day, hour, minute, second] = std::tuple {0, 0, 0, 0, 0, 0};
while (ptr < end && val < val_end) {
// Skip space character
while (val < val_end && isspace(*val)) {
while (val < val_end && check_space(*val)) {
val++;
}
if (val >= val_end) {
Expand Down Expand Up @@ -1500,7 +1509,7 @@ bool VecDateTimeValue::from_date_format_str(const char* format, int format_len,
default:
return false;
}
} else if (!isspace(*ptr)) {
} else if (!check_space(*ptr)) {
if (*ptr != *val) {
return false;
}
Expand Down Expand Up @@ -1987,13 +1996,13 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale /* =
bool convert_zero) {
return from_date_str_base(date_str, len, scale, nullptr, convert_zero);
}
// Overload of from_date_str that takes a local time zone. It does no parsing
// itself: every argument is handed to from_date_str_base, with the time zone
// passed by address.
template <typename T>
bool DateV2Value<T>::from_date_str(const char* date_str, int len,
                                   const cctz::time_zone& local_time_zone, int scale /* = -1*/,
                                   bool convert_zero) {
    const cctz::time_zone* time_zone_ptr = &local_time_zone;
    return from_date_str_base(date_str, len, scale, time_zone_ptr, convert_zero);
}
// If local_time_zone is null, only times without a timezone can be parsed.
template <typename T>
bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale,
const cctz::time_zone* local_time_zone, bool convert_zero) {
Expand All @@ -2005,7 +2014,7 @@ bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale
int32_t date_len[MAX_DATE_PARTS] = {0};

// Skip space character
while (ptr < end && isspace(*ptr)) {
while (ptr < end && check_space(*ptr)) {
ptr++;
}
if (ptr == end || !isdigit(*ptr)) {
Expand Down Expand Up @@ -2153,8 +2162,8 @@ bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale
continue;
}
// escape separator
while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) {
if (isspace(*ptr)) {
while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
if (check_space(*ptr)) {
if (((1 << field_idx) & allow_space_mask) == 0) {
return false;
}
Expand Down Expand Up @@ -2286,7 +2295,7 @@ bool DateV2Value<T>::from_date_format_str(const char* format, int format_len, co
auto [year, month, day, hour, minute, second, microsecond] = std::tuple {0, 0, 0, 0, 0, 0, 0};
while (ptr < end && val < val_end) {
// Skip space character
while (val < val_end && isspace(*val)) {
while (val < val_end && check_space(*val)) {
val++;
}
if (val >= val_end) {
Expand Down

0 comments on commit bb5b05b

Please sign in to comment.