From 92033ec88cb05c664273293f8cfcc2061e86a10b Mon Sep 17 00:00:00 2001 From: AEsir777 Date: Fri, 6 Oct 2023 11:20:00 -0400 Subject: [PATCH] commit --- dataset/load-from-test/bracket_fail.csv | 1 + dataset/load-from-test/change_config.csv | 4 + dataset/load-from-test/delim_fail.csv | 1 + dataset/load-from-test/quote_fail.csv | 1 + dataset/load-from-test/schema.cypher | 0 dataset/load-from-test/should_pass.csv | 7 + src/binder/bind/bind_graph_pattern.cpp | 8 +- src/common/type_utils.cpp | 10 - src/function/CMakeLists.txt | 1 + src/function/cast_utils.cpp | 57 ++++ src/include/common/type_utils.h | 1 - src/include/function/cast/cast_functions.h | 47 ++-- .../cast/{numeric_cast.h => cast_utils.h} | 254 ++++++++---------- .../function/cast/vector_cast_functions.h | 8 +- .../in_mem_column_chunk.h | 4 +- src/include/storage/store/column_chunk.h | 4 +- src/parser/transform/transform_expression.cpp | 8 +- .../operator/persistent/reader/csv/driver.cpp | 172 +++++++++++- .../in_mem_storage_structure/in_mem_lists.cpp | 2 +- src/storage/store/table_copy_utils.cpp | 117 ++++---- .../tinysnb/cast/cast_string_to_list.test | 34 +++ 21 files changed, 485 insertions(+), 256 deletions(-) create mode 100644 dataset/load-from-test/bracket_fail.csv create mode 100644 dataset/load-from-test/change_config.csv create mode 100644 dataset/load-from-test/delim_fail.csv create mode 100644 dataset/load-from-test/quote_fail.csv create mode 100644 dataset/load-from-test/schema.cypher create mode 100644 dataset/load-from-test/should_pass.csv create mode 100644 src/function/cast_utils.cpp rename src/include/function/cast/{numeric_cast.h => cast_utils.h} (56%) create mode 100644 test/test_files/tinysnb/cast/cast_string_to_list.test diff --git a/dataset/load-from-test/bracket_fail.csv b/dataset/load-from-test/bracket_fail.csv new file mode 100644 index 00000000000..31bc67112d2 --- /dev/null +++ b/dataset/load-from-test/bracket_fail.csv @@ -0,0 +1 @@ +"(()" \ No newline at end of file diff --git 
a/dataset/load-from-test/change_config.csv b/dataset/load-from-test/change_config.csv new file mode 100644 index 00000000000..3523ad9ffff --- /dev/null +++ b/dataset/load-from-test/change_config.csv @@ -0,0 +1,4 @@ +list | str +'(this | is a word | normal | )'|'try escape ~~' +'(escape | is escape success? ~~)'|' ~' ( ) do not need to escape sepeical | ()' +'(~~ ~' not work also this "~'" )'|'th' diff --git a/dataset/load-from-test/delim_fail.csv b/dataset/load-from-test/delim_fail.csv new file mode 100644 index 00000000000..e1690440c52 --- /dev/null +++ b/dataset/load-from-test/delim_fail.csv @@ -0,0 +1 @@ +"((hello),(bdfadf),)" \ No newline at end of file diff --git a/dataset/load-from-test/quote_fail.csv b/dataset/load-from-test/quote_fail.csv new file mode 100644 index 00000000000..4af2a3d5c46 --- /dev/null +++ b/dataset/load-from-test/quote_fail.csv @@ -0,0 +1 @@ +'[23, 432, 234]' \ No newline at end of file diff --git a/dataset/load-from-test/schema.cypher b/dataset/load-from-test/schema.cypher new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dataset/load-from-test/should_pass.csv b/dataset/load-from-test/should_pass.csv new file mode 100644 index 00000000000..c54fd34a34a --- /dev/null +++ b/dataset/load-from-test/should_pass.csv @@ -0,0 +1,7 @@ +"[[1,3,423,124,43242],[432]]" +" [ [ 1 ,3, 423 , 124,43242 ] , [ 432 ]]" +"[ [ 1 ,3, 423 , 124,43242 ] , [432]] " +"[,[], [1, 2, 3]]" +"[null, NULL, Null, nUll, nuLl, nulL, nuLL, NUll, NuLl, NulL, [1, 2, 3]]" +"[[], [], [] ]" +"[, ,]" diff --git a/src/binder/bind/bind_graph_pattern.cpp b/src/binder/bind/bind_graph_pattern.cpp index ce511310519..fea27facfec 100644 --- a/src/binder/bind/bind_graph_pattern.cpp +++ b/src/binder/bind/bind_graph_pattern.cpp @@ -11,6 +11,7 @@ #include "catalog/rel_table_schema.h" #include "common/exception/binder.h" #include "common/string_utils.h" +#include "function/cast/cast_utils.h" #include "main/client_context.h" using namespace kuzu::common; @@ -424,10 +425,13 @@ 
std::shared_ptr Binder::createRecursiveQueryRel(const parser::Rel std::pair Binder::bindVariableLengthRelBound( const kuzu::parser::RelPattern& relPattern) { auto recursiveInfo = relPattern.getRecursiveInfo(); - auto lowerBound = TypeUtils::convertToUint32(recursiveInfo->lowerBound.c_str()); + uint32_t lowerBound; + function::StringCastUtils::simpleIntegerCast(recursiveInfo->lowerBound.c_str(), + recursiveInfo->lowerBound.length(), lowerBound); auto upperBound = clientContext->varLengthExtendMaxDepth; if (!recursiveInfo->upperBound.empty()) { - upperBound = TypeUtils::convertToUint32(recursiveInfo->upperBound.c_str()); + function::StringCastUtils::simpleIntegerCast(recursiveInfo->upperBound.c_str(), + recursiveInfo->upperBound.length(), upperBound); } if (lowerBound > upperBound) { throw BinderException( diff --git a/src/common/type_utils.cpp b/src/common/type_utils.cpp index c72a20b6807..0e4970345e8 100644 --- a/src/common/type_utils.cpp +++ b/src/common/type_utils.cpp @@ -8,16 +8,6 @@ namespace kuzu { namespace common { -uint32_t TypeUtils::convertToUint32(const char* data) { - std::istringstream iss(data); - uint32_t val; - if (!(iss >> val)) { - throw ConversionException( - StringUtils::string_format("Failed to convert {} to uint32_t", data)); - } - return val; -} - std::string TypeUtils::castValueToString( const LogicalType& dataType, uint8_t* value, void* vector) { auto valueVector = reinterpret_cast(vector); diff --git a/src/function/CMakeLists.txt b/src/function/CMakeLists.txt index 49a33436391..5ca76cee0e5 100644 --- a/src/function/CMakeLists.txt +++ b/src/function/CMakeLists.txt @@ -7,6 +7,7 @@ add_library(kuzu_function built_in_aggregate_functions.cpp built_in_vector_functions.cpp built_in_table_functions.cpp + cast_utils.cpp comparison_functions.cpp find_function.cpp scalar_macro_function.cpp diff --git a/src/function/cast_utils.cpp b/src/function/cast_utils.cpp new file mode 100644 index 00000000000..66ceb3453ed --- /dev/null +++ 
b/src/function/cast_utils.cpp @@ -0,0 +1,57 @@ +#include "function/cast/cast_utils.h" + +namespace kuzu { +namespace function { + +bool StringCastUtils::tryCastToBool(const char* input, uint64_t len, bool& result) { + common::StringUtils::removeCStringWhiteSpaces(input, len); + + switch (len) { + case 1: { + char c = std::tolower(*input); + if (c == 't' || c == '1') { + result = true; + return true; + } else if (c == 'f' || c == '0') { + result = false; + return true; + } + return false; + } + case 4: { + auto t = std::tolower(input[0]); + auto r = std::tolower(input[1]); + auto u = std::tolower(input[2]); + auto e = std::tolower(input[3]); + if (t == 't' && r == 'r' && u == 'u' && e == 'e') { + result = true; + return true; + } + return false; + } + case 5: { + auto f = std::tolower(input[0]); + auto a = std::tolower(input[1]); + auto l = std::tolower(input[2]); + auto s = std::tolower(input[3]); + auto e = std::tolower(input[4]); + if (f == 'f' && a == 'a' && l == 'l' && s == 's' && e == 'e') { + result = false; + return true; + } + return false; + } + default: + return false; + } +} + +void StringCastUtils::castStringToBool(const char* input, uint64_t len, bool& result) { + if (!tryCastToBool(input, len, result)) { + throw common::ConversionException( + "Cast failed. 
" + std::string{input, len} + " is not in BOOL range."); + } +} + +} // namespace function +} // namespace kuzu \ No newline at end of file diff --git a/src/include/common/type_utils.h b/src/include/common/type_utils.h index bed4a66d261..dc2af071699 100644 --- a/src/include/common/type_utils.h +++ b/src/include/common/type_utils.h @@ -15,7 +15,6 @@ namespace common { class TypeUtils { public: - static uint32_t convertToUint32(const char* data); template static inline std::string toString(const T& val, void* valueVector = nullptr) { static_assert(std::is_same::value || std::is_same::value || diff --git a/src/include/function/cast/cast_functions.h b/src/include/function/cast/cast_functions.h index 10d63d36085..e9a64d22c49 100644 --- a/src/include/function/cast/cast_functions.h +++ b/src/include/function/cast/cast_functions.h @@ -2,12 +2,12 @@ #include +#include "cast_utils.h" #include "common/exception/runtime.h" #include "common/string_utils.h" #include "common/type_utils.h" #include "common/types/blob.h" #include "common/vector/value_vector.h" -#include "numeric_cast.h" namespace kuzu { namespace function { @@ -88,7 +88,8 @@ inline std::string CastToString::castToStringWithVector( struct CastToBool { static inline void operation(common::ku_string_t& input, bool& result) { - if (!tryCastToBool(reinterpret_cast(input.getData()), input.len, result)) { + if (!StringCastUtils::tryCastToBool( + reinterpret_cast(input.getData()), input.len, result)) { throw common::ConversionException{common::StringUtils::string_format( "Value {} is not a valid boolean", input.getAsString())}; } @@ -107,13 +108,13 @@ struct CastToDouble { template<> inline void CastToDouble::operation(char*& input, double_t& result) { - doubleCast( + StringCastUtils::doubleCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::DOUBLE}); } template<> inline void CastToDouble::operation(common::ku_string_t& input, double_t& result) { - doubleCast((char*)input.getData(), input.len, 
result, + StringCastUtils::doubleCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::DOUBLE}); } @@ -129,13 +130,13 @@ struct CastToFloat { template<> inline void CastToFloat::operation(char*& input, float_t& result) { - doubleCast( + StringCastUtils::doubleCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::FLOAT}); } template<> inline void CastToFloat::operation(common::ku_string_t& input, float_t& result) { - doubleCast((char*)input.getData(), input.len, result, + StringCastUtils::doubleCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::FLOAT}); } @@ -151,13 +152,13 @@ struct CastToInt64 { template<> inline void CastToInt64::operation(char*& input, int64_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::INT64}); } template<> inline void CastToInt64::operation(common::ku_string_t& input, int64_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::INT64}); } @@ -173,7 +174,7 @@ struct CastToSerial { template<> inline void CastToSerial::operation(common::ku_string_t& input, int64_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::INT64}); } @@ -189,13 +190,13 @@ struct CastToInt32 { template<> inline void CastToInt32::operation(char*& input, int32_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::INT32}); } template<> inline void CastToInt32::operation(common::ku_string_t& input, int32_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + 
StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::INT32}); } @@ -211,13 +212,13 @@ struct CastToInt16 { template<> inline void CastToInt16::operation(common::ku_string_t& input, int16_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::INT16}); } template<> inline void CastToInt16::operation(char*& input, int16_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::INT16}); } @@ -233,13 +234,13 @@ struct CastToInt8 { template<> inline void CastToInt8::operation(common::ku_string_t& input, int8_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::INT8}); } template<> inline void CastToInt8::operation(char*& input, int8_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::INT8}); } @@ -255,13 +256,13 @@ struct CastToUInt64 { template<> inline void CastToUInt64::operation(common::ku_string_t& input, uint64_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::UINT64}); } template<> inline void CastToUInt64::operation(char*& input, uint64_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::UINT64}); } @@ -277,13 +278,13 @@ struct CastToUInt32 { template<> inline void CastToUInt32::operation(common::ku_string_t& input, uint32_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, 
+ StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::UINT32}); } template<> inline void CastToUInt32::operation(char*& input, uint32_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::UINT32}); } @@ -299,13 +300,13 @@ struct CastToUInt16 { template<> inline void CastToUInt16::operation(common::ku_string_t& input, uint16_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::UINT16}); } template<> inline void CastToUInt16::operation(char*& input, uint16_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::UINT16}); } @@ -321,13 +322,13 @@ struct CastToUInt8 { template<> inline void CastToUInt8::operation(common::ku_string_t& input, uint8_t& result) { - simpleIntegerCast((char*)input.getData(), input.len, result, + StringCastUtils::simpleIntegerCast((char*)input.getData(), input.len, result, common::LogicalType{common::LogicalTypeID::UINT8}); } template<> inline void CastToUInt8::operation(char*& input, uint8_t& result) { - simpleIntegerCast( + StringCastUtils::simpleIntegerCast( input, strlen(input), result, common::LogicalType{common::LogicalTypeID::UINT8}); } diff --git a/src/include/function/cast/numeric_cast.h b/src/include/function/cast/cast_utils.h similarity index 56% rename from src/include/function/cast/numeric_cast.h rename to src/include/function/cast/cast_utils.h index 36ce5886c0b..e86864392f3 100644 --- a/src/include/function/cast/numeric_cast.h +++ b/src/include/function/cast/cast_utils.h @@ -6,55 +6,14 @@ #include "common/string_utils.h" #include "common/type_utils.h" #include "common/types/ku_string.h" +#include "common/vector/value_vector.h" #include 
"fast_float.h" #include "numeric_limits.h" namespace kuzu { namespace function { -static bool tryCastToBool(const char* input, uint64_t len, bool& result) { - common::StringUtils::removeCStringWhiteSpaces(input, len); - - switch (len) { - case 1: { - char c = std::tolower(*input); - if (c == 't' || c == '1') { - result = true; - return true; - } else if (c == 'f' || c == '0') { - result = false; - return true; - } - return false; - } - case 4: { - auto t = std::tolower(input[0]); - auto r = std::tolower(input[1]); - auto u = std::tolower(input[2]); - auto e = std::tolower(input[3]); - if (t == 't' && r == 'r' && u == 'u' && e == 'e') { - result = true; - return true; - } - return false; - } - case 5: { - auto f = std::tolower(input[0]); - auto a = std::tolower(input[1]); - auto l = std::tolower(input[2]); - auto s = std::tolower(input[3]); - auto e = std::tolower(input[4]); - if (f == 'f' && a == 'a' && l == 'l' && s == 's' && e == 'e') { - result = false; - return true; - } - return false; - } - default: - return false; - } -} - +// cast string to numerical template struct IntegerCastData { using Result = T; @@ -86,162 +45,169 @@ struct IntegerCastOperation { } }; -// TODO: support exponent + decimal -template -static bool integerCastLoop(const char* input, uint64_t len, T& result) { - int64_t start_pos = 0; - if (NEGATIVE) { - start_pos = 1; - } - int64_t pos = start_pos; - while (pos < len) { - if (!common::StringUtils::CharacterIsDigit(input[pos])) { - // TODO: exponent and decimals - return false; +namespace StringCastUtils { + + // cast string to bool + bool tryCastToBool(const char* input, uint64_t len, bool& result); + void castStringToBool(const char* input, uint64_t len, bool& result); + + // cast to numerical values + // TODO: support exponent + decimal + template + static bool integerCastLoop(const char* input, uint64_t len, T& result) { + int64_t start_pos = 0; + if (NEGATIVE) { + start_pos = 1; } - uint8_t digit = input[pos++] - '0'; - if 
(!OP::template handleDigit(result, digit)) { + int64_t pos = start_pos; + while (pos < len) { + if (!common::StringUtils::CharacterIsDigit(input[pos])) { + // TODO: exponent and decimals + return false; + } + uint8_t digit = input[pos++] - '0'; + if (!OP::template handleDigit(result, digit)) { + return false; + } + } // append all digits to result + if (!OP::template finalize(result)) { return false; } - } // append all digits to result - if (!OP::template finalize(result)) { - return false; + return pos > start_pos; // false if no digits "" or "-" } - return pos > start_pos; // false if no digits "" or "-" -} -template -static bool tryIntegerCast(const char* input, uint64_t& len, T& result) { - common::StringUtils::removeCStringWhiteSpaces(input, len); - if (len == 0) { - return false; - } + template + static bool tryIntegerCast(const char* input, uint64_t& len, T& result) { + common::StringUtils::removeCStringWhiteSpaces(input, len); + if (len == 0) { + return false; + } - // negative - if (*input == '-') { - if constexpr (!IS_SIGNED) { // unsigned if not -0 - uint64_t pos = 1; - while (pos < len) { - if (input[pos++] != '0') { - return false; + // negative + if (*input == '-') { + if constexpr (!IS_SIGNED) { // unsigned if not -0 + uint64_t pos = 1; + while (pos < len) { + if (input[pos++] != '0') { + return false; + } } } + // decimal separator is default to "." + return integerCastLoop(input, len, result); } - // decimal separator is default to "." 
- return integerCastLoop(input, len, result); - } - - // not allow leading 0 - if (len > 1 && *input == '0') { - return false; - } - return integerCastLoop(input, len, result); -} + // not allow leading 0 + if (len > 1 && *input == '0') { + return false; + } -template -static bool trySimpleIntegerCast(const char* input, uint64_t len, T& result) { - IntegerCastData data; - data.result = 0; - if (tryIntegerCast, IS_SIGNED>(input, len, data)) { - result = data.result; - return true; + return integerCastLoop(input, len, result); } - return false; -} -template -static void simpleIntegerCast(const char* input, uint64_t len, T& result, - const common::LogicalType& type = common::LogicalType{common::LogicalTypeID::ANY}) { - if (!trySimpleIntegerCast(input, len, result)) { - throw common::ConversionException( - "Cast failed. " + std::string{input, len} + " is not in " + - common::LogicalTypeUtils::dataTypeToString(type) + " range."); + template + static bool trySimpleIntegerCast(const char* input, uint64_t len, T& result) { + IntegerCastData data; + data.result = 0; + if (tryIntegerCast, IS_SIGNED>(input, len, data)) { + result = data.result; + return true; + } + return false; } -} -template -static bool tryDoubleCast(const char* input, uint64_t len, T& result) { - common::StringUtils::removeCStringWhiteSpaces(input, len); - if (len == 0) { - return false; + template + static void simpleIntegerCast(const char* input, uint64_t len, T& result, + const common::LogicalType& type = common::LogicalType{common::LogicalTypeID::ANY}) { + if (!trySimpleIntegerCast(input, len, result)) { + throw common::ConversionException( + "Cast failed. 
" + std::string{input, len} + " is not in " + + common::LogicalTypeUtils::dataTypeToString(type) + " range."); + } } - // not allow leading 0 - if (len > 1 && *input == '0') { - if (common::StringUtils::CharacterIsDigit(input[1])) { + + template + static bool tryDoubleCast(const char* input, uint64_t len, T& result) { + common::StringUtils::removeCStringWhiteSpaces(input, len); + if (len == 0) { return false; } + // not allow leading 0 + if (len > 1 && *input == '0') { + if (common::StringUtils::CharacterIsDigit(input[1])) { + return false; + } + } + auto end = input + len; + auto parse_result = kuzu_fast_float::from_chars(input, end, result); + if (parse_result.ec != std::errc()) { + return false; + } + return parse_result.ptr == end; } - auto end = input + len; - auto parse_result = kuzu_fast_float::from_chars(input, end, result); - if (parse_result.ec != std::errc()) { - return false; - } - return parse_result.ptr == end; -} -template -static void doubleCast(const char* input, uint64_t len, T& result, - const common::LogicalType& type = common::LogicalType{common::LogicalTypeID::ANY}) { - if (!tryDoubleCast(input, len, result)) { - throw common::ConversionException( - "Cast failed. " + std::string{input, len} + " is not in " + - common::LogicalTypeUtils::dataTypeToString(type) + " range."); + template + static void doubleCast(const char* input, uint64_t len, T& result, + const common::LogicalType& type = common::LogicalType{common::LogicalTypeID::ANY}) { + if (!tryDoubleCast(input, len, result)) { + throw common::ConversionException( + "Cast failed. " + std::string{input, len} + " is not in " + + common::LogicalTypeUtils::dataTypeToString(type) + " range."); + } } -} -static void castStringToBool(const char* input, uint64_t len, bool& result) { - if (!tryCastToBool(input, len, result)) { - throw common::ConversionException( - "Cast failed. 
" + std::string{input, len} + " is not in BOOL range."); + template + static inline T castStringToNum(const char* input, uint64_t len, + const common::LogicalType& type = common::LogicalType{common::LogicalTypeID::ANY}) { + T result; + simpleIntegerCast(input, len, result, type); + return result; } -} - -template -static inline T castStringToNum(const char* input, uint64_t len, - const common::LogicalType& type = common::LogicalType{common::LogicalTypeID::ANY}) { - T result; - simpleIntegerCast(input, len, result, type); - return result; -} +}; template<> -inline uint64_t castStringToNum(const char* input, uint64_t len, const common::LogicalType& type) { +inline uint64_t StringCastUtils::castStringToNum( + const char* input, uint64_t len, const common::LogicalType& type) { uint64_t result; simpleIntegerCast(input, len, result, type); return result; } template<> -inline uint32_t castStringToNum(const char* input, uint64_t len, const common::LogicalType& type) { +inline uint32_t StringCastUtils::castStringToNum( + const char* input, uint64_t len, const common::LogicalType& type) { uint32_t result; simpleIntegerCast(input, len, result, type); return result; } template<> -inline uint16_t castStringToNum(const char* input, uint64_t len, const common::LogicalType& type) { +inline uint16_t StringCastUtils::castStringToNum( + const char* input, uint64_t len, const common::LogicalType& type) { uint16_t result; simpleIntegerCast(input, len, result, type); return result; } template<> -inline uint8_t castStringToNum(const char* input, uint64_t len, const common::LogicalType& type) { +inline uint8_t StringCastUtils::castStringToNum( + const char* input, uint64_t len, const common::LogicalType& type) { uint8_t result; simpleIntegerCast(input, len, result, type); return result; } template<> -inline double_t castStringToNum(const char* input, uint64_t len, const common::LogicalType& type) { +inline double_t StringCastUtils::castStringToNum( + const char* input, uint64_t len, const 
common::LogicalType& type) { double_t result; doubleCast(input, len, result, type); return result; } template<> -inline float_t castStringToNum(const char* input, uint64_t len, const common::LogicalType& type) { +inline float_t StringCastUtils::castStringToNum( + const char* input, uint64_t len, const common::LogicalType& type) { float_t result; doubleCast(input, len, result, type); return result; diff --git a/src/include/function/cast/vector_cast_functions.h b/src/include/function/cast/vector_cast_functions.h index b2ea5c43d52..389e20cf290 100644 --- a/src/include/function/cast/vector_cast_functions.h +++ b/src/include/function/cast/vector_cast_functions.h @@ -125,6 +125,10 @@ struct CastToBlobVectorFunction : public VectorCastFunction { static vector_function_definitions getDefinitions(); }; +struct CastToBoolVectorFunction : public VectorCastFunction { + static vector_function_definitions getDefinitions(); +}; + struct CastToDoubleVectorFunction : public VectorCastFunction { static vector_function_definitions getDefinitions(); }; @@ -169,9 +173,5 @@ struct CastToUInt8VectorFunction : public VectorCastFunction { static vector_function_definitions getDefinitions(); }; -struct CastToBoolVectorFunction : public VectorCastFunction { - static vector_function_definitions getDefinitions(); -}; - } // namespace function } // namespace kuzu diff --git a/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h b/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h index f9b9ada5554..df429f92c90 100644 --- a/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h +++ b/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h @@ -1,7 +1,7 @@ #pragma once #include "common/types/types.h" -#include "function/cast/numeric_cast.h" +#include "function/cast/cast_utils.h" #include "storage/storage_structure/in_mem_file.h" #include "storage/store/table_copy_utils.h" #include @@ -53,7 +53,7 @@ class InMemColumnChunk { template void 
setValueFromString( const char* value, uint64_t length, common::offset_t pos, Args... args) { - auto val = function::castStringToNum(value, length); + auto val = function::StringCastUtils::castStringToNum(value, length); setValue(val, pos); } diff --git a/src/include/storage/store/column_chunk.h b/src/include/storage/store/column_chunk.h index 47d62ce93aa..575e9c5a7df 100644 --- a/src/include/storage/store/column_chunk.h +++ b/src/include/storage/store/column_chunk.h @@ -5,7 +5,7 @@ #include "common/types/types.h" #include "common/vector/value_vector.h" #include "compression.h" -#include "function/cast/numeric_cast.h" +#include "function/cast/cast_utils.h" #include "storage/buffer_manager/bm_file_handle.h" #include "storage/wal/wal.h" #include "transaction/transaction.h" @@ -103,7 +103,7 @@ class ColumnChunk { template void setValueFromString(const char* value, uint64_t length, common::offset_t pos) { - setValue(function::castStringToNum(value, length), pos); + setValue(function::StringCastUtils::castStringToNum(value, length), pos); } static inline common::page_idx_t getNumPagesForBytes(uint64_t numBytes) { diff --git a/src/parser/transform/transform_expression.cpp b/src/parser/transform/transform_expression.cpp index 5a09a9d1cd0..576c031a78d 100644 --- a/src/parser/transform/transform_expression.cpp +++ b/src/parser/transform/transform_expression.cpp @@ -1,5 +1,5 @@ #include "common/string_utils.h" -#include "function/cast/numeric_cast.h" +#include "function/cast/cast_utils.h" #include "parser/expression/parsed_case_expression.h" #include "parser/expression/parsed_function_expression.h" #include "parser/expression/parsed_literal_expression.h" @@ -569,8 +569,8 @@ std::string Transformer::transformPropertyKeyName(CypherParser::OC_PropertyKeyNa std::unique_ptr Transformer::transformIntegerLiteral( CypherParser::OC_IntegerLiteralContext& ctx) { auto text = ctx.DecimalInteger()->getText(); - auto value = - std::make_unique(function::castStringToNum(text.c_str(), 
text.length())); + auto value = std::make_unique( + function::StringCastUtils::castStringToNum(text.c_str(), text.length())); return std::make_unique(std::move(value), ctx.getText()); } @@ -578,7 +578,7 @@ std::unique_ptr Transformer::transformDoubleLiteral( CypherParser::OC_DoubleLiteralContext& ctx) { auto text = ctx.RegularDecimalReal()->getText(); auto value = - std::make_unique(function::castStringToNum(text.c_str(), text.length())); + std::make_unique(function::StringCastUtils::castStringToNum(text.c_str(), text.length())); return std::make_unique(std::move(value), ctx.getText()); } diff --git a/src/processor/operator/persistent/reader/csv/driver.cpp b/src/processor/operator/persistent/reader/csv/driver.cpp index f818225e7a9..d8e5cab7433 100644 --- a/src/processor/operator/persistent/reader/csv/driver.cpp +++ b/src/processor/operator/persistent/reader/csv/driver.cpp @@ -7,7 +7,7 @@ #include "common/type_utils.h" #include "common/types/blob.h" #include "common/types/value/value.h" -#include "function/cast/numeric_cast.h" +#include "function/cast/cast_utils.h" #include "processor/operator/persistent/reader/csv/parallel_csv_reader.h" #include "processor/operator/persistent/reader/csv/serial_csv_reader.h" #include "storage/store/table_copy_utils.h" @@ -19,6 +19,148 @@ namespace processor { ParsingDriver::ParsingDriver(common::DataChunk& chunk) : chunk(chunk), rowEmpty(false) {} +void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view strVal, + CSVReaderConfig csvReaderConfig); + +static void skipWhitespace(const char*& input, const char* end) { + while (input != end && isspace(*input)) { + input++; + } +} + +bool skipToCloseQuotes(const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig) { + input++; // skip the first ", ' + bool escaped = false; + + while (input != end) { + if (*input == csvReaderConfig.escapeChar) { + escaped = !escaped; + } else { + if (*input == csvReaderConfig.quoteChar && !escaped) { + return 
true; + } + escaped = false; + } + input++; + } + return false; +} + +static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char target, + const CSVReaderConfig& csvReaderConfig) { + input++; + while (input != end) { + if (*input == csvReaderConfig.escapeChar) { + input++; + } else if (*input == '{') { // must have closing brackets for { and [ if they are not quoted + if (!skipToClose(input, end, lvl, '}', csvReaderConfig)) { + return false; + } + } else if (*input == csvReaderConfig.listBeginChar) { + if (!skipToClose(input, end, lvl, csvReaderConfig.listEndChar, csvReaderConfig)) { + return false; + } + lvl++; // nested one more level + } else if (*input == target) { + if (target == ']') { + lvl--; + } + return true; + } + input++; + } + return false; // no corresponding closing bracket +} + +struct CountPartOperation { + uint64_t count = 0; + + void HandleValue(const char* start, const char* end, const CSVReaderConfig& config) { + count++; + } +}; + +struct SplitStringListOperation { + SplitStringListOperation(uint64_t& offset, common::ValueVector* resultVector) + : offset(offset), resultVector(resultVector) { + } + + uint64_t& offset; + ValueVector* resultVector; + + void HandleValue(const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) { + // NULL + skipWhitespace(start, end); + if ((*start == 'N' || *start == 'n') && (*(start + 1) == 'U' || *(start + 1) == 'u') && + (*(start + 2) == 'L' || *(start + 2) == 'l') && + (*(start + 3) == 'L' || *(start + 3) == 'l')) { + start = end; + } + copyStringToVector(resultVector, offset, std::string_view{start, end}, csvReaderConfig); + offset++; + } +}; + +template +static bool splitCString(const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) { + auto end = input + len; + uint64_t lvl = 1; + bool seen_value = false; + + // locate [ + skipWhitespace(input, end); + if (input == end || *input != csvReaderConfig.listBeginChar) { + return false; + } + 
input++; + + // TODO: test if not skipping any white space works for all data type + auto start_ptr = input; + while (input < end) { + auto ch = *input; + if (ch == csvReaderConfig.listBeginChar) { + if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) { + return false; + } + } else if (ch == '{') { + uint64_t struct_lvl = 0; + skipToClose(input, end, struct_lvl, '}', csvReaderConfig); + } else if (ch == csvReaderConfig.delimiter || ch == csvReaderConfig.listEndChar) { // split + // allow empty string? + if (ch != csvReaderConfig.listEndChar || start_ptr < input || seen_value) { + state.HandleValue(start_ptr, input, csvReaderConfig); + seen_value = true; + } + if (ch == csvReaderConfig.listEndChar) { // last ] + lvl--; + break; + } + start_ptr = ++input; + continue; + } + input++; + } + skipWhitespace(++input, end); + return (input == end && lvl == 0); +} + +static void castStringToList(const char* input, uint64_t len, ValueVector* vector, uint64_t rowToAdd, const CSVReaderConfig& csvReaderConfig) { + // calculate the number of elements in array + CountPartOperation state; + splitCString(input, len, state, csvReaderConfig); + + auto list_entry = ListVector::addList(vector, state.count); + vector->setValue(rowToAdd, list_entry); + auto listDataVector = common::ListVector::getDataVector(vector); + + SplitStringListOperation split{list_entry.offset, listDataVector}; + if (!splitCString(input, len, split, csvReaderConfig)) { + throw common::ConversionException( + "Cast failed. 
" + std::string{input, len} + " is not in " + + common::LogicalTypeUtils::dataTypeToString(vector->dataType) + " range."); + } +} + void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view strVal, CSVReaderConfig csvReaderConfig) { auto& type = vector->dataType; @@ -31,57 +173,57 @@ void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view switch (type.getLogicalTypeID()) { case LogicalTypeID::INT64: { int64_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::INT32: { int32_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::INT16: { int16_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::INT8: { int8_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::UINT64: { uint64_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::UINT32: { uint32_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::UINT16: { uint16_t val; - 
function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::UINT8: { uint8_t val; - function::simpleIntegerCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::simpleIntegerCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::FLOAT: { float_t val; - function::doubleCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::doubleCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::DOUBLE: { double_t val; - function::doubleCast(strVal.data(), strVal.length(), val, type); + function::StringCastUtils::doubleCast(strVal.data(), strVal.length(), val, type); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::BOOL: { bool val; - function::castStringToBool(strVal.data(), strVal.length(), val); + function::StringCastUtils::castStringToBool(strVal.data(), strVal.length(), val); vector->setValue(rowToAdd, val); } break; case LogicalTypeID::BLOB: { @@ -106,12 +248,14 @@ void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view case LogicalTypeID::INTERVAL: { vector->setValue(rowToAdd, Interval::fromCString(strVal.data(), strVal.length())); } break; - case LogicalTypeID::MAP: - case LogicalTypeID::VAR_LIST: { + case LogicalTypeID::MAP: { auto value = storage::TableCopyUtils::getVarListValue( strVal, 1, strVal.length() - 2, type, csvReaderConfig); vector->copyFromValue(rowToAdd, *value); } break; + case LogicalTypeID::VAR_LIST: { + castStringToList(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig); + } break; case LogicalTypeID::FIXED_LIST: { auto fixedListVal = storage::TableCopyUtils::getArrowFixedListVal( strVal, 1, strVal.length() - 2, type, csvReaderConfig); diff --git 
a/src/storage/in_mem_storage_structure/in_mem_lists.cpp b/src/storage/in_mem_storage_structure/in_mem_lists.cpp index f17ef17ac61..77aa4ee331f 100644 --- a/src/storage/in_mem_storage_structure/in_mem_lists.cpp +++ b/src/storage/in_mem_storage_structure/in_mem_lists.cpp @@ -163,7 +163,7 @@ void InMemLists::setValue(offset_t nodeOffset, uint64_t pos, uint8_t* val) { template void InMemLists::setValueFromString( offset_t nodeOffset, uint64_t pos, const char* val, uint64_t length) { - auto numericVal = function::castStringToNum(val, length); + auto numericVal = function::StringCastUtils::castStringToNum(val, length); setValue(nodeOffset, pos, (uint8_t*)&numericVal); } diff --git a/src/storage/store/table_copy_utils.cpp b/src/storage/store/table_copy_utils.cpp index d58ad33c677..776ba992b7e 100644 --- a/src/storage/store/table_copy_utils.cpp +++ b/src/storage/store/table_copy_utils.cpp @@ -4,7 +4,7 @@ #include "common/exception/copy.h" #include "common/exception/parser.h" #include "common/string_utils.h" -#include "function/cast/numeric_cast.h" +#include "function/cast/cast_utils.h" #include "storage/storage_structure/lists/lists.h" #include #include @@ -171,65 +171,71 @@ std::unique_ptr TableCopyUtils::getArrowFixedList(std::string_view l, switch (childDataType->getLogicalTypeID()) { case LogicalTypeID::INT64: { int64_t val; - function::simpleIntegerCast(element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), + val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(int64_t), &val, sizeof(int64_t)); numElementsRead++; } break; case LogicalTypeID::INT32: { int32_t val; - function::simpleIntegerCast(element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), + val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(int32_t), &val, sizeof(int32_t)); numElementsRead++; } break; case LogicalTypeID::INT16: { int16_t 
val; - function::simpleIntegerCast(element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), + val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(int16_t), &val, sizeof(int16_t)); numElementsRead++; } break; case LogicalTypeID::INT8: { int8_t val; - function::simpleIntegerCast(element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), + val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(int8_t), &val, sizeof(int8_t)); numElementsRead++; } break; case LogicalTypeID::UINT64: { uint64_t val; - function::simpleIntegerCast( - element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(uint64_t), &val, sizeof(uint64_t)); numElementsRead++; } case LogicalTypeID::UINT32: { uint32_t val; - function::simpleIntegerCast( - element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(uint32_t), &val, sizeof(uint32_t)); numElementsRead++; } break; case LogicalTypeID::UINT16: { uint16_t val; - function::simpleIntegerCast( - element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(uint16_t), &val, sizeof(uint16_t)); numElementsRead++; } break; case LogicalTypeID::UINT8: { uint8_t val; - function::simpleIntegerCast( - element.data(), element.length(), val, dataType); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, dataType); memcpy(listVal.get() + numElementsRead * sizeof(uint8_t), &val, sizeof(uint8_t)); numElementsRead++; } break; case LogicalTypeID::DOUBLE: { 
double_t val; - function::doubleCast(element.data(), element.length(), val, dataType); + function::StringCastUtils::doubleCast(element.data(), element.length(), val, + dataType); memcpy(listVal.get() + numElementsRead * sizeof(double_t), &val, sizeof(double_t)); numElementsRead++; } break; case LogicalTypeID::FLOAT: { float_t val; - function::doubleCast(element.data(), element.length(), val, dataType); + function::StringCastUtils::doubleCast(element.data(), element.length(), val, + dataType); memcpy(listVal.get() + numElementsRead * sizeof(float_t), &val, sizeof(float_t)); numElementsRead++; } break; @@ -306,52 +312,56 @@ std::shared_ptr TableCopyUtils::toArrowDataType(const LogicalTy } } -bool TableCopyUtils::tryCast( - const common::LogicalType& targetType, const char* value, uint64_t length) { +bool TableCopyUtils::tryCast(const common::LogicalType& targetType, const char* value, + uint64_t length) { switch (targetType.getLogicalTypeID()) { case LogicalTypeID::BOOL: { bool result; - return function::tryCastToBool(value, length, result); + return function::StringCastUtils::tryCastToBool(value, length, result); } case LogicalTypeID::INT64: { int64_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, result); } case LogicalTypeID::INT32: { int32_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, result); } case LogicalTypeID::INT16: { int16_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, result); } case LogicalTypeID::INT8: { int8_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, result); } case LogicalTypeID::UINT64: { uint64_t result; - return function::trySimpleIntegerCast(value, length, 
result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, + result); } case LogicalTypeID::UINT32: { uint32_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, + result); } case LogicalTypeID::UINT16: { uint16_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, + result); } case LogicalTypeID::UINT8: { uint8_t result; - return function::trySimpleIntegerCast(value, length, result); + return function::StringCastUtils::trySimpleIntegerCast(value, length, + result); } case LogicalTypeID::DOUBLE: { double_t result; - return function::tryDoubleCast(value, length, result); + return function::StringCastUtils::tryDoubleCast(value, length, result); } case LogicalTypeID::FLOAT: { float_t result; - return function::tryDoubleCast(value, length, result); + return function::StringCastUtils::tryDoubleCast(value, length, result); } case LogicalTypeID::DATE: { date_t result; @@ -371,8 +381,8 @@ bool TableCopyUtils::tryCast( } } -std::vector TableCopyUtils::parseStructFieldNameAndValues( - LogicalType& type, std::string_view structString, const CSVReaderConfig& csvReaderConfig) { +std::vector TableCopyUtils::parseStructFieldNameAndValues(LogicalType& type, + std::string_view structString, const CSVReaderConfig& csvReaderConfig) { std::vector structFieldIdxAndValueParis; uint64_t curPos = 0u; while (curPos < structString.length()) { @@ -387,63 +397,72 @@ std::vector TableCopyUtils::parseStructFieldNameAndValue return structFieldIdxAndValueParis; } -std::unique_ptr TableCopyUtils::convertStringToValue( - std::string_view element, const LogicalType& type, const CSVReaderConfig& csvReaderConfig) { +std::unique_ptr TableCopyUtils::convertStringToValue(std::string_view element, + const LogicalType& type, const CSVReaderConfig& csvReaderConfig) { std::unique_ptr value; switch 
(type.getLogicalTypeID()) { case LogicalTypeID::INT64: { int64_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), val, + type); value = std::make_unique(val); } break; case LogicalTypeID::INT32: { int32_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), val, + type); value = std::make_unique(val); } break; case LogicalTypeID::INT16: { int16_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), val, + type); value = std::make_unique(val); } break; case LogicalTypeID::INT8: { int8_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), element.length(), val, + type); value = std::make_unique(val); } break; case LogicalTypeID::UINT64: { uint64_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, type); value = std::make_unique(val); } break; case LogicalTypeID::UINT32: { uint32_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, type); value = std::make_unique(val); } break; case LogicalTypeID::UINT16: { uint16_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), + element.length(), val, type); value = std::make_unique(val); } break; case LogicalTypeID::UINT8: { uint8_t val; - function::simpleIntegerCast(element.data(), element.length(), val, type); + function::StringCastUtils::simpleIntegerCast(element.data(), + 
element.length(), val, type); value = std::make_unique(val); } break; case LogicalTypeID::FLOAT: { float_t val; - function::doubleCast(element.data(), element.length(), val, type); + function::StringCastUtils::doubleCast(element.data(), element.length(), val, type); value = std::make_unique(val); } break; case LogicalTypeID::DOUBLE: { double_t val; - function::doubleCast(element.data(), element.length(), val, type); + function::StringCastUtils::doubleCast(element.data(), element.length(), val, + type); value = std::make_unique(val); } break; case LogicalTypeID::BOOL: { bool val; - function::castStringToBool(element.data(), element.length(), val); + function::StringCastUtils::castStringToBool(element.data(), element.length(), val); value = std::make_unique(val); } break; case LogicalTypeID::STRING: { @@ -519,15 +538,15 @@ std::unique_ptr TableCopyUtils::parseMap(std::string_view l, int64_t from // parsing. structFields[0] = convertStringToValue(key, *structFieldTypes[0], csvReaderConfig); structFields[1] = convertStringToValue(value, *structFieldTypes[1], csvReaderConfig); - values.push_back(std::make_unique( - *VarListType::getChildType(&dataType), std::move(structFields))); + values.push_back(std::make_unique(*VarListType::getChildType(&dataType), + std::move(structFields))); } } return make_unique(dataType, std::move(values)); } -std::pair TableCopyUtils::parseMapFields( - std::string_view l, int64_t from, int64_t length, const CSVReaderConfig& csvReaderConfig) { +std::pair TableCopyUtils::parseMapFields(std::string_view l, int64_t from, + int64_t length, const CSVReaderConfig& csvReaderConfig) { std::string key; std::string value; auto numListBeginChars = 0u; @@ -587,8 +606,8 @@ std::string TableCopyUtils::parseStructFieldName(std::string_view structString, throw ParserException{"Invalid struct string: " + std::string(structString)}; } -std::string TableCopyUtils::parseStructFieldValue( - std::string_view structString, uint64_t& curPos, const CSVReaderConfig& 
csvReaderConfig) { +std::string TableCopyUtils::parseStructFieldValue(std::string_view structString, uint64_t& curPos, + const CSVReaderConfig& csvReaderConfig) { auto numListBeginChars = 0u; auto numStructBeginChars = 0u; auto numDoubleQuotes = 0u; diff --git a/test/test_files/tinysnb/cast/cast_string_to_list.test b/test/test_files/tinysnb/cast/cast_string_to_list.test new file mode 100644 index 00000000000..ec6b3867347 --- /dev/null +++ b/test/test_files/tinysnb/cast/cast_string_to_list.test @@ -0,0 +1,34 @@ +-GROUP TinySnbReadTest +-DATASET CSV load-from-test + +-- + +-CASE CastStringToList +-STATEMENT LOAD WITH HEADERS (list INT64[][]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/should_pass.csv" RETURN *; +---- 7 +[[1,3,423,124,43242],[432]] +[[1,3,423,124,43242],[432]] +[[1,3,423,124,43242],[432]] +[,[],[1,2,3]] +[,,,,,,,,,,[1,2,3]] +[[],[],[]] +[,,] + +-CASE ChangeCSVConfig +-STATEMENT LOAD WITH HEADERS (list STRING[], str STRING) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/change_config.csv" (HEADER=true, DELIM="|", ESCAPE="~", QUOTE="'", LIST_BEGIN="(", LIST_END=")") RETURN * ; +---- 3 +[escape ,is escape success? ~]| ' ( ) do not need to escape sepeical | () +[this ,is a word ,normal ,]|try escape ~ +[~ ' not work also this "'" ]|th + +-CASE ErrorTest +-STATEMENT LOAD WITH HEADERS (list STRING[][]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/delim_fail.csv" (DELIM="|", ESCAPE="~", QUOTE="'", LIST_BEGIN="(", LIST_END=")") RETURN * ; +---- error +Conversion exception: Cast failed. "((hello),(bdfadf),)" is not in STRING[][] range. +-STATEMENT LOAD WITH HEADERS (list STRING[][]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/bracket_fail.csv" (DELIM="|", ESCAPE="~", LIST_BEGIN="(", LIST_END=")") RETURN * ; +---- error +Conversion exception: Cast failed. (() is not in STRING[][] range. 
+-STATEMENT LOAD WITH HEADERS (list INT32[]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/quote_fail.csv" RETURN * ; +---- error +Binder exception: Number of columns mismatch. Detect 3 but expect 1. +