diff --git a/dataset/load-from-test/change_config.csv b/dataset/load-from-test/change_config.csv index 562be141266..d2eb00cfce8 100644 --- a/dataset/load-from-test/change_config.csv +++ b/dataset/load-from-test/change_config.csv @@ -1,4 +1,4 @@ list | str | struct '(this | is a word | normal not escape | ~' ( ) | , ~~ ~' )'|'try escape ~~'|' { a : { b : {c:( 3432423 | -43423 | 31231 | NULL )} } } ' '(escape | is escape success? ~~)'|' ~' ( ) do not need to escape sepeical | ()'|'{a:{}}' -'(~~ ~' not work also this "~'" )'|'nu'|'{a:{b:{}}}' +'(~~ ~' not work also this ~'"" )'|'nu'|'{a:{b:{}}}' diff --git a/dataset/load-from-test/map_correct.csv b/dataset/load-from-test/map_correct.csv new file mode 100644 index 00000000000..39e17b9dd4c --- /dev/null +++ b/dataset/load-from-test/map_correct.csv @@ -0,0 +1,7 @@ +map +"{[432> 24> 12] = {c: okay}> [90> 11> 43> 54] = {c: bad}> [-0] = {c: good}}" +"{[]={}}" +"{[11> 43> NULL ]={c: this is a struct}}" +"{[]={c: this is not empty}}" +"{[1000> 143245> 432> 43241> -43214324> -432423> -4324324> -41412> -131242143> 0> -4324324> -0]={}}" +"{={c: '~~ have some > >'}}" diff --git a/dataset/load-from-test/map_incorrect_type.csv b/dataset/load-from-test/map_incorrect_type.csv new file mode 100644 index 00000000000..c15e2b4c604 --- /dev/null +++ b/dataset/load-from-test/map_incorrect_type.csv @@ -0,0 +1 @@ +"{234=-3}" diff --git a/dataset/load-from-test/map_no_closing1.csv b/dataset/load-from-test/map_no_closing1.csv new file mode 100644 index 00000000000..e349e5bfe3e --- /dev/null +++ b/dataset/load-from-test/map_no_closing1.csv @@ -0,0 +1 @@ +"{10=10, 20=20," diff --git a/dataset/load-from-test/map_no_closing2.csv b/dataset/load-from-test/map_no_closing2.csv new file mode 100644 index 00000000000..9acbf4477f7 --- /dev/null +++ b/dataset/load-from-test/map_no_closing2.csv @@ -0,0 +1 @@ +"{" diff --git a/dataset/load-from-test/map_no_closing3.csv b/dataset/load-from-test/map_no_closing3.csv new file mode 100644 index 00000000000..7ffa97e5d7d --- /dev/null +++ b/dataset/load-from-test/map_no_closing3.csv @@ -0,0 +1 @@ +"{{3=3}" diff --git a/dataset/load-from-test/map_no_closing4.csv b/dataset/load-from-test/map_no_closing4.csv new file mode 100644 index 00000000000..86868d0563d --- /dev/null +++ b/dataset/load-from-test/map_no_closing4.csv @@ -0,0 +1 @@ +"{432=[432,432}" diff --git a/dataset/load-from-test/map_no_quote.csv b/dataset/load-from-test/map_no_quote.csv new file mode 100644 index 00000000000..3eab7ac32f2 --- /dev/null +++ b/dataset/load-from-test/map_no_quote.csv @@ -0,0 +1 @@ +"{324=\"}" diff --git a/dataset/load-from-test/map_no_val.csv b/dataset/load-from-test/map_no_val.csv new file mode 100644 index 00000000000..1bba4ffc880 --- /dev/null +++ b/dataset/load-from-test/map_no_val.csv @@ -0,0 +1 @@ +"{4324}" diff --git a/dataset/load-from-test/nested_map2.csv b/dataset/load-from-test/nested_map2.csv new file mode 100644 index 00000000000..f2a32ec260d --- /dev/null +++ b/dataset/load-from-test/nested_map2.csv @@ -0,0 +1,7 @@ +"{{30099=30099, 1=0}={3mTEf=3mTEf, 3mTEf=3mTEf, 3mTEf=3mTEf}}" +"{{18046=18046, 2=321, 3= 423421}={dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y}}" +"{{}={YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y}}" +"{{31395=31395}={}}" +"{{}={}}" +"{=}" +"{}" diff --git a/dataset/load-from-test/nested_map_correct.csv b/dataset/load-from-test/nested_map_correct.csv new file mode 100644 index 00000000000..4cccb1d8329 --- /dev/null +++ b/dataset/load-from-test/nested_map_correct.csv @@ -0,0 +1,3 @@ +" { c= {a = 3423 }, b = { g = 3421 } } " +"{}" +"{d = {}}" diff --git a/dataset/load-from-test/struct_close_fail.csv b/dataset/load-from-test/struct_close_fail.csv index 6a644d10184..d94aa719c18 100644 --- a/dataset/load-from-test/struct_close_fail.csv +++ b/dataset/load-from-test/struct_close_fail.csv @@ -1 +1 @@ -"{ c : 423 } still stuff here" +"{ c : 423 } c: 479," diff --git a/dataset/load-from-test/struct_noclose.csv b/dataset/load-from-test/struct_noclose.csv new file mode 100644 index 00000000000..98232c64fce --- /dev/null +++ b/dataset/load-from-test/struct_noclose.csv @@ -0,0 +1 @@ +{ diff --git a/src/common/string_utils.cpp b/src/common/string_utils.cpp index 966bef6ecbd..b1226ccd907 100644 --- a/src/common/string_utils.cpp +++ b/src/common/string_utils.cpp @@ -5,6 +5,26 @@ namespace kuzu { namespace common { +std::vector StringUtils::splitComma( + const std::string& input, bool ignoreEmptyStringParts) { + auto result = std::vector(); + auto currentPos = 0u; + auto lvl = 0u; + while (currentPos < input.length()) { + if (input[currentPos] == '(') { + lvl++; + } else if (input[currentPos] == ')') { + lvl--; + } else if (lvl == 0 && input[currentPos] == ',') { + break; + } + currentPos++; + } + result.push_back(input.substr(0, currentPos)); + result.push_back(input.substr(currentPos + 1)); + return result; +} + std::vector StringUtils::split( const std::string& input, const std::string& delimiter, bool ignoreEmptyStringParts) { auto result = std::vector(); diff --git a/src/common/types/types.cpp b/src/common/types/types.cpp index f234203828a..d69a947530e 100644 --- a/src/common/types/types.cpp +++ b/src/common/types/types.cpp @@ -836,7 +836,7 @@ std::unique_ptr LogicalTypeUtils::parseMapType(const std::string& t throw Exception("Cannot parse map type: " + trimmedStr); } auto mapTypeStr = trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1); - auto keyValueTypes = StringUtils::split(mapTypeStr, ","); + auto keyValueTypes = StringUtils::splitComma(mapTypeStr); return MapType::createMapType( std::make_unique(dataTypeFromString(keyValueTypes[0])), std::make_unique(dataTypeFromString(keyValueTypes[1]))); diff --git a/src/include/common/string_utils.h b/src/include/common/string_utils.h index 857a5db31ed..d6a653ea1ea 100644 --- a/src/include/common/string_utils.h +++ b/src/include/common/string_utils.h @@ -20,6 +20,9 @@ class StringUtils { return fmt::format(fmt::runtime(format), args...); } + static std::vector splitComma( + const std::string& input, bool ignoreEmptyStringParts = true); + static std::vector split( const std::string& input, const std::string& delimiter, bool ignoreEmptyStringParts = true); diff --git a/src/processor/operator/persistent/reader/csv/driver.cpp b/src/processor/operator/persistent/reader/csv/driver.cpp index a9749e57c30..1ef2b039ede 100644 --- a/src/processor/operator/persistent/reader/csv/driver.cpp +++ b/src/processor/operator/persistent/reader/csv/driver.cpp @@ -65,7 +65,7 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char } lvl++; // nested one more level } else if (*input == target) { - if (target == ']') { + if (target == csvReaderConfig.listEndChar) { lvl--; } return true; @@ -78,11 +78,17 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char struct CountPartOperation { uint64_t count = 0; - void handleValue(const char* start, const char* end, const CSVReaderConfig& config) { count++; } + static inline bool handleKey( + const char* start, const char* end, const CSVReaderConfig& config) { + return true; + } + inline void handleValue(const char* start, const char* end, const CSVReaderConfig& config) { + count++; + } }; struct SplitStringListOperation { - SplitStringListOperation(uint64_t& offset, common::ValueVector* resultVector) + SplitStringListOperation(uint64_t& offset, ValueVector* resultVector) : offset(offset), resultVector(resultVector) {} uint64_t& offset; @@ -90,7 +96,7 @@ struct SplitStringListOperation { void handleValue(const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) { // NULL - auto start_copy = start; + auto startCopy = start; skipWhitespace(start, end); if (end - start >= 4 && (*start == 'N' || *start == 'n') && (*(start + 1) == 'U' || *(start + 1) == 'u') && @@ -99,20 +105,43 @@ struct SplitStringListOperation { auto null_end = start + 4; skipWhitespace(null_end, end); if (null_end == end) { - start_copy = end; + startCopy = end; } } if (start == end) { // check if its empty string - NULL - start_copy = end; + startCopy = end; } copyStringToVector(resultVector, offset, - std::string_view{start_copy, (uint32_t)(end - start_copy)}, csvReaderConfig); + std::string_view{startCopy, (uint32_t)(end - startCopy)}, csvReaderConfig); offset++; } }; +struct SplitStringMapOperation { + SplitStringMapOperation(uint64_t& offset, ValueVector* resultVector) + : offset(offset), resultVector(resultVector) {} + + uint64_t& offset; + ValueVector* resultVector; + + inline bool handleKey( + const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) { + trimRightWhitespace(start, end); + copyStringToVector(StructVector::getFieldVector(resultVector, 0).get(), offset, + std::string_view{start, (uint32_t)(end - start)}, csvReaderConfig); + return true; + } + + inline void handleValue( + const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) { + trimRightWhitespace(start, end); + copyStringToVector(StructVector::getFieldVector(resultVector, 1).get(), offset++, + std::string_view{start, (uint32_t)(end - start)}, csvReaderConfig); + } +}; + template -static bool splitCString( +static bool splitCStringList( const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) { auto end = input + len; uint64_t lvl = 1; @@ -132,7 +161,7 @@ static bool splitCString( if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) { return false; } - } else if (ch == '\'') { + } else if (ch == '\'' || ch == '"') { if (!skipToCloseQuotes(input, end)) { return false; } @@ -161,14 +190,100 @@ static void castStringToList(const char* input, uint64_t len, ValueVector* vecto uint64_t rowToAdd, const CSVReaderConfig& csvReaderConfig) { // calculate the number of elements in array CountPartOperation state; - splitCString(input, len, state, csvReaderConfig); + splitCStringList(input, len, state, csvReaderConfig); auto list_entry = ListVector::addList(vector, state.count); vector->setValue(rowToAdd, list_entry); auto listDataVector = common::ListVector::getDataVector(vector); SplitStringListOperation split{list_entry.offset, listDataVector}; - if (!splitCString(input, len, split, csvReaderConfig)) { + if (!splitCStringList(input, len, split, csvReaderConfig)) { + throw ConversionException("Cast failed. " + std::string{input, len} + " is not in " + + LogicalTypeUtils::dataTypeToString(vector->dataType) + " range."); + } +} + +template +static bool parseKeyOrValue(const char*& input, const char* end, T& state, bool isKey, + bool& closeBracket, const CSVReaderConfig& csvReaderConfig) { + auto start = input; + uint64_t lvl = 0; + + while (input < end) { + if (*input == '"' || *input == '\'') { + if (!skipToCloseQuotes(input, end)) { + return false; + }; + } else if (*input == '{') { + if (!skipToClose(input, end, lvl, '}', csvReaderConfig)) { + return false; + } + } else if (*input == csvReaderConfig.listBeginChar) { + if (!skipToClose(input, end, lvl, csvReaderConfig.listEndChar, csvReaderConfig)) { + return false; + }; + } else if (isKey && *input == '=') { + return state.handleKey(start, input, csvReaderConfig); + } else if (!isKey && (*input == csvReaderConfig.delimiter || *input == '}')) { + state.handleValue(start, input, csvReaderConfig); + if (*input == '}') { + closeBracket = true; + } + return true; + } + input++; + } + return false; +} + +// {a=12,b=13} +template +static bool splitCStringMap( + const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) { + auto end = input + len; + bool closeBracket = false; + + skipWhitespace(input, end); + if (input == end || *input != '{') { // start with { + return false; + } + skipWhitespace(++input, end); + if (input == end) { + return false; + } + if (*input == '}') { + skipWhitespace(++input, end); // empty + return input == end; + } + + while (input < end) { + if (!parseKeyOrValue(input, end, state, true, closeBracket, csvReaderConfig)) { + return false; + } + skipWhitespace(++input, end); + if (!parseKeyOrValue(input, end, state, false, closeBracket, csvReaderConfig)) { + return false; + } + skipWhitespace(++input, end); + if (closeBracket) { + return (input == end); + } + } + return false; +} + +static void castStringToMap(const char* input, uint64_t len, ValueVector* vector, uint64_t rowToAdd, + const CSVReaderConfig& csvReaderConfig) { + // count the number of maps in map + CountPartOperation state; + splitCStringMap(input, len, state, csvReaderConfig); + + auto list_entry = ListVector::addList(vector, state.count); + vector->setValue(rowToAdd, list_entry); + auto structVector = common::ListVector::getDataVector(vector); + + SplitStringMapOperation split{list_entry.offset, structVector}; + if (!splitCStringMap(input, len, split, csvReaderConfig)) { throw ConversionException("Cast failed. " + std::string{input, len} + " is not in " + LogicalTypeUtils::dataTypeToString(vector->dataType) + " range."); } @@ -185,7 +300,7 @@ static bool parseStructFieldName(const char*& input, const char* end) { } static bool parseStructFieldValue( - const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig) { + const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig, bool& closeBrack) { uint64_t lvl = 0; while (input < end) { if (*input == '"' || *input == '\'') { @@ -197,16 +312,14 @@ static bool parseStructFieldValue( return false; } } else if (*input == csvReaderConfig.listBeginChar) { - if (!skipToClose(input, end, lvl, csvReaderConfig.listEndChar, csvReaderConfig)) { + if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) { return false; } } else if (*input == csvReaderConfig.delimiter || *input == '}') { if (*input == '}') { - auto copy = input + 1; - skipWhitespace(copy, end); - return (copy == end); + closeBrack = true; } - return true; + return (lvl == 0); } input++; } @@ -224,11 +337,15 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector* } skipWhitespace(++input, end); + if (input == end) { // no closing bracket + return false; + } if (*input == '}') { skipWhitespace(++input, end); return input == end; } + bool closeBracket = false; while (input < end) { auto keyStart = input; if (!parseStructFieldName(input, end)) { // find key @@ -243,7 +360,7 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector* skipWhitespace(++input, end); auto valStart = input; - if (!parseStructFieldValue(input, end, csvReaderConfig)) { // find value + if (!parseStructFieldValue(input, end, csvReaderConfig, closeBracket)) { // find value return false; } auto valEnd = input; @@ -251,9 +368,13 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector* skipWhitespace(++input, end); copyStringToVector(StructVector::getFieldVector(vector, fieldIdx).get(), rowToAdd, - std::string_view{valStart, (size_t)(valEnd - valStart)}, csvReaderConfig); + std::string_view{valStart, (uint32_t)(valEnd - valStart)}, csvReaderConfig); + + if (closeBracket) { + return (input == end); + } } - return true; + return false; } static void castStringToStruct(const char* input, uint64_t len, ValueVector* vector, @@ -352,9 +473,7 @@ void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view vector->setValue(rowToAdd, Interval::fromCString(strVal.data(), strVal.length())); } break; case LogicalTypeID::MAP: { - auto value = storage::TableCopyUtils::getVarListValue( - strVal, 1, strVal.length() - 2, type, csvReaderConfig); - vector->copyFromValue(rowToAdd, *value); + castStringToMap(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig); } break; case LogicalTypeID::VAR_LIST: { castStringToList(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig); diff --git a/test/test_files/tinysnb/cast/cast_string_to_other_type.test b/test/test_files/tinysnb/cast/cast_string_to_other_type.test index 6168b6256f3..256bc70204d 100644 --- a/test/test_files/tinysnb/cast/cast_string_to_other_type.test +++ b/test/test_files/tinysnb/cast/cast_string_to_other_type.test @@ -17,7 +17,29 @@ ---- 3 [escape , is escape success? ~]| ' ( ) do not need to escape sepeical | ()|{a: {b: {c: []}}} [this , is a word , normal not escape , ' ( ) | , ~ ' ]|try escape ~|{a: {b: {c: [3432423,-43423,31231,]}}} -[~ ' not work also this "'" ]|nu|{a: {b: {c: []}}} +[~ ' not work also this '"" ]|nu|{a: {b: {c: []}}} +-STATEMENT LOAD WITH HEADERS (map MAP(STRING, MAP(STRING, INT16))) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/nested_map_correct.csv" RETURN *; +---- 3 +{c={a=3423}, b={g=3421}} +{d={}} +{} +-STATEMENT LOAD WITH HEADERS (map MAP(INT32[], STRUCT(c STRING))) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_correct.csv" (HEADER=true, DELIM=">") RETURN *; +---- 6 +{[432,24,12]={c: okay}, [90,11,43,54]={c: bad}, [0]={c: good}} +{[]={c: }} +{[11,43,]={c: this is a struct}} +{[]={c: this is not empty}} +{[1000,143245,432,43241,-43214324,-432423,-4324324,-41412,-131242143,0,-4324324,0]={c: }} +{={c: '~~ have some > >'}} +-STATEMENT LOAD WITH HEADERS (map MAP(MAP(INT64, INT64), MAP(STRING, STRING))) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/nested_map2.csv" RETURN *; +---- 7 +{{18046=18046, 2=321, 3=423421}={dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y}} +{{30099=30099, 1=0}={3mTEf=3mTEf, 3mTEf=3mTEf, 3mTEf=3mTEf}} +{{31395=31395}={}} +{{}={YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y}} +{{}={}} +{=} +{} -CASE ErrorTest -STATEMENT LOAD WITH HEADERS (list STRING[][]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/delim_fail.csv" (DELIM="|", ESCAPE="~", QUOTE="'", LIST_BEGIN="(", LIST_END=")") RETURN * ; @@ -49,7 +71,7 @@ Conversion exception: Cast failed. {a: {b: fsdf is not in STRUCT(a:STRUCT(b:STRI Conversion exception: Cast failed. {a: {b: fds} is not in STRUCT(a:STRUCT(b:STRING)) range. -STATEMENT LOAD WITH HEADERS (a STRUCT(c STRING)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/struct_close_fail.csv" RETURN *; ---- error -Conversion exception: Cast failed. { c : 423 } still stuff here is not in STRUCT(c:STRING) range. +Conversion exception: Cast failed. { c : 423 } c: 479, is not in STRUCT(c:STRING) range. -STATEMENT LOAD WITH HEADERS (a STRUCT(c INT32)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/invalid_field_name.csv" RETURN *; ---- error Parser exception: Invalid struct field name: d @@ -59,3 +81,27 @@ Conversion exception: Cast failed. {c 3423} is not in STRUCT(c:INT32) range. -STATEMENT LOAD WITH HEADERS (a STRUCT(c INT32)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/struct_quote_fail.csv" RETURN *; ---- error Conversion exception: Cast failed. { c: 'fdsfs } is not in STRUCT(c:INT32) range. +-STATEMENT LOAD WITH HEADERS (a STRUCT(c INT32)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/struct_noclose.csv" RETURN *; +---- error +Conversion exception: Cast failed. { is not in STRUCT(c:INT32) range. +-STATEMENT LOAD WITH HEADERS (a MAP(UINT8, UINT8)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_incorrect_type.csv" RETURN *; +---- error +Conversion exception: Cast failed. -3 is not in UINT8 range. +-STATEMENT LOAD WITH HEADERS (a MAP(UINT8, UINT8)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_no_val.csv" RETURN *; +---- error +Conversion exception: Cast failed. {4324} is not in MAP(UINT8: UINT8) range. +-STATEMENT LOAD WITH HEADERS (a MAP(UINT8, UINT8)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_no_closing1.csv" RETURN *; +---- error +Conversion exception: Cast failed. {10=10, 20=20, is not in MAP(UINT8: UINT8) range. +-STATEMENT LOAD WITH HEADERS (a MAP(UINT8, UINT8)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_no_closing2.csv" RETURN *; +---- error +Conversion exception: Cast failed. { is not in MAP(UINT8: UINT8) range. +-STATEMENT LOAD WITH HEADERS (a MAP(STRING, UINT8)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_no_quote.csv" RETURN *; +---- error +Conversion exception: Cast failed. {324="} is not in MAP(STRING: UINT8) range. +-STATEMENT LOAD WITH HEADERS (a MAP(UINT8, UINT8)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_no_closing3.csv" RETURN *; +---- error +Conversion exception: Cast failed. {{3=3} is not in MAP(UINT8: UINT8) range. +-STATEMENT LOAD WITH HEADERS (a MAP(UINT16, UINT16[])) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/map_no_closing4.csv" RETURN *; +---- error +Conversion exception: Cast failed. {432=[432,432} is not in MAP(UINT16: UINT16[]) range.