Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

finish cast string to map #2201

Merged
merged 1 commit into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dataset/load-from-test/change_config.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
list | str | struct
'(this | is a word | normal not escape | ~' ( ) | , ~~ ~' )'|'try escape ~~'|' { a : { b : {c:( 3432423 | -43423 | 31231 | NULL )} } } '
'(escape | is escape success? ~~)'|' ~' ( ) do not need to escape sepeical | ()'|'{a:{}}'
'(~~ ~' not work also this "~'" )'|'nu'|'{a:{b:{}}}'
'(~~ ~' not work also this ~'"" )'|'nu'|'{a:{b:{}}}'
7 changes: 7 additions & 0 deletions dataset/load-from-test/map_correct.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
map
"{[432> 24> 12] = {c: okay}> [90> 11> 43> 54] = {c: bad}> [-0] = {c: good}}"
"{[]={}}"
"{[11> 43> NULL ]={c: this is a struct}}"
"{[]={c: this is not empty}}"
"{[1000> 143245> 432> 43241> -43214324> -432423> -4324324> -41412> -131242143> 0> -4324324> -0]={}}"
"{={c: '~~ have some > >'}}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_incorrect_type.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{234=-3}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{10=10, 20=20,"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{{3=3}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{432=[432,432}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_quote.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{324=\"}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_val.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{4324}"
7 changes: 7 additions & 0 deletions dataset/load-from-test/nested_map2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"{{30099=30099, 1=0}={3mTEf=3mTEf, 3mTEf=3mTEf, 3mTEf=3mTEf}}"
"{{18046=18046, 2=321, 3= 423421}={dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y}}"
"{{}={YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y}}"
"{{31395=31395}={}}"
"{{}={}}"
"{=}"
"{}"
3 changes: 3 additions & 0 deletions dataset/load-from-test/nested_map_correct.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
" { c= {a = 3423 }, b = { g = 3421 } } "
"{}"
"{d = {}}"
2 changes: 1 addition & 1 deletion dataset/load-from-test/struct_close_fail.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
"{ c : 423 } still stuff here"
"{ c : 423 } c: 479,"
1 change: 1 addition & 0 deletions dataset/load-from-test/struct_noclose.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{
20 changes: 20 additions & 0 deletions src/common/string_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@
namespace kuzu {
namespace common {

// Splits `input` at the first comma that is not enclosed in parentheses and always returns
// exactly two strings: the text before that comma and the text after it. Nested types such as
// "MAP(INT64, STRING), INT64" are therefore split at the outer comma only.
//
// If `input` contains no top-level comma, the first element is the whole input and the second
// element is empty. The second parameter (ignoreEmptyStringParts) is accepted for signature
// parity with split() but is not consulted: the result always has two elements.
std::vector<std::string> StringUtils::splitComma(
    const std::string& input, bool /*ignoreEmptyStringParts*/) {
    // size_t rather than unsigned so the index has the same width as input.length().
    size_t splitPos = 0;
    size_t depth = 0;
    for (; splitPos < input.length(); ++splitPos) {
        const char ch = input[splitPos];
        if (ch == '(') {
            ++depth;
        } else if (ch == ')') {
            // An unmatched ')' must not wrap the unsigned depth counter around.
            if (depth > 0) {
                --depth;
            }
        } else if (ch == ',' && depth == 0) {
            break;
        }
    }

    std::vector<std::string> result;
    result.reserve(2);
    result.push_back(input.substr(0, splitPos));
    // substr(pos) throws std::out_of_range when pos > size(), which is what splitPos + 1 would be
    // when no top-level comma was found; return an empty second part in that case instead.
    result.push_back(splitPos < input.length() ? input.substr(splitPos + 1) : std::string());
    return result;
}

Check warning on line 26 in src/common/string_utils.cpp

View check run for this annotation

Codecov / codecov/patch

src/common/string_utils.cpp#L26

Added line #L26 was not covered by tests

std::vector<std::string> StringUtils::split(
const std::string& input, const std::string& delimiter, bool ignoreEmptyStringParts) {
auto result = std::vector<std::string>();
Expand Down
2 changes: 1 addition & 1 deletion src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -836,7 +836,7 @@ std::unique_ptr<LogicalType> LogicalTypeUtils::parseMapType(const std::string& t
throw Exception("Cannot parse map type: " + trimmedStr);
}
auto mapTypeStr = trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto keyValueTypes = StringUtils::split(mapTypeStr, ",");
auto keyValueTypes = StringUtils::splitComma(mapTypeStr);
return MapType::createMapType(
std::make_unique<LogicalType>(dataTypeFromString(keyValueTypes[0])),
std::make_unique<LogicalType>(dataTypeFromString(keyValueTypes[1])));
Expand Down
3 changes: 3 additions & 0 deletions src/include/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ class StringUtils {
return fmt::format(fmt::runtime(format), args...);
}

static std::vector<std::string> splitComma(
const std::string& input, bool ignoreEmptyStringParts = true);

static std::vector<std::string> split(
const std::string& input, const std::string& delimiter, bool ignoreEmptyStringParts = true);

Expand Down
165 changes: 142 additions & 23 deletions src/processor/operator/persistent/reader/csv/driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char
}
lvl++; // nested one more level
} else if (*input == target) {
if (target == ']') {
if (target == csvReaderConfig.listEndChar) {
lvl--;
}
return true;
Expand All @@ -78,19 +78,25 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char
struct CountPartOperation {
uint64_t count = 0;

void handleValue(const char* start, const char* end, const CSVReaderConfig& config) { count++; }
static inline bool handleKey(
const char* start, const char* end, const CSVReaderConfig& config) {
return true;
}
inline void handleValue(const char* start, const char* end, const CSVReaderConfig& config) {
count++;
}
};

struct SplitStringListOperation {
SplitStringListOperation(uint64_t& offset, common::ValueVector* resultVector)
SplitStringListOperation(uint64_t& offset, ValueVector* resultVector)
: offset(offset), resultVector(resultVector) {}

uint64_t& offset;
ValueVector* resultVector;

void handleValue(const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) {
// NULL
auto start_copy = start;
auto startCopy = start;
skipWhitespace(start, end);
if (end - start >= 4 && (*start == 'N' || *start == 'n') &&
(*(start + 1) == 'U' || *(start + 1) == 'u') &&
Expand All @@ -99,20 +105,43 @@ struct SplitStringListOperation {
auto null_end = start + 4;
skipWhitespace(null_end, end);
if (null_end == end) {
start_copy = end;
startCopy = end;
}
}
if (start == end) { // check if its empty string - NULL
start_copy = end;
startCopy = end;
}
copyStringToVector(resultVector, offset,
std::string_view{start_copy, (uint32_t)(end - start_copy)}, csvReaderConfig);
std::string_view{startCopy, (uint32_t)(end - startCopy)}, csvReaderConfig);
offset++;
}
};

// Splitter callback state used while parsing a map literal: each key is written to field 0 and
// each value to field 1 of `resultVector`, both at position `offset`. `offset` is advanced once
// per entry, after the value has been handed off.
struct SplitStringMapOperation {
    SplitStringMapOperation(uint64_t& offset, ValueVector* resultVector)
        : offset(offset), resultVector(resultVector) {}

    uint64_t& offset;
    ValueVector* resultVector;

    // Stores the key text [start, end), with trailing whitespace trimmed, at the current offset.
    // Always reports success.
    bool handleKey(const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) {
        trimRightWhitespace(start, end);
        const auto& keyVector = StructVector::getFieldVector(resultVector, 0);
        const std::string_view keyText{start, static_cast<uint32_t>(end - start)};
        copyStringToVector(keyVector.get(), offset, keyText, csvReaderConfig);
        return true;
    }

    // Stores the value text [start, end), with trailing whitespace trimmed, at the current offset
    // and then moves the offset on to the next entry.
    void handleValue(const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) {
        trimRightWhitespace(start, end);
        const auto& valueVector = StructVector::getFieldVector(resultVector, 1);
        const std::string_view valueText{start, static_cast<uint32_t>(end - start)};
        copyStringToVector(valueVector.get(), offset++, valueText, csvReaderConfig);
    }
};

template<typename T>
static bool splitCString(
static bool splitCStringList(
const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) {
auto end = input + len;
uint64_t lvl = 1;
Expand All @@ -132,7 +161,7 @@ static bool splitCString(
if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) {
return false;
}
} else if (ch == '\'') {
} else if (ch == '\'' || ch == '"') {
if (!skipToCloseQuotes(input, end)) {
return false;
}
Expand Down Expand Up @@ -161,14 +190,100 @@ static void castStringToList(const char* input, uint64_t len, ValueVector* vecto
uint64_t rowToAdd, const CSVReaderConfig& csvReaderConfig) {
// calculate the number of elements in array
CountPartOperation state;
splitCString(input, len, state, csvReaderConfig);
splitCStringList(input, len, state, csvReaderConfig);

auto list_entry = ListVector::addList(vector, state.count);
vector->setValue<list_entry_t>(rowToAdd, list_entry);
auto listDataVector = common::ListVector::getDataVector(vector);

SplitStringListOperation split{list_entry.offset, listDataVector};
if (!splitCString(input, len, split, csvReaderConfig)) {
if (!splitCStringList(input, len, split, csvReaderConfig)) {
throw ConversionException("Cast failed. " + std::string{input, len} + " is not in " +
LogicalTypeUtils::dataTypeToString(vector->dataType) + " range.");
}
}

// Scans forward from `input` for the end of one map key (isKey == true) or one map value
// (isKey == false) and reports the span to `state`.
//
// Quoted sections, `{...}` groups and list groups are skipped through skipToCloseQuotes /
// skipToClose, so terminators inside them are not considered. A key ends at '='; a value ends at
// the configured delimiter or at '}'. On success `input` is left pointing at the terminator, and
// `closeBracket` is set when a value was terminated by '}'.
//
// Returns false if a nested section cannot be closed or if `end` is reached before a terminator;
// for keys, returns whatever state.handleKey returns.
template<typename T>
static bool parseKeyOrValue(const char*& input, const char* end, T& state, bool isKey,
    bool& closeBracket, const CSVReaderConfig& csvReaderConfig) {
    const char* const tokenBegin = input;
    uint64_t depth = 0;

    for (; input < end; ++input) {
        const char ch = *input;

        if (ch == '"' || ch == '\'') {
            if (!skipToCloseQuotes(input, end)) {
                return false;
            }
            continue;
        }
        if (ch == '{') {
            if (!skipToClose(input, end, depth, '}', csvReaderConfig)) {
                return false;
            }
            continue;
        }
        if (ch == csvReaderConfig.listBeginChar) {
            if (!skipToClose(input, end, depth, csvReaderConfig.listEndChar, csvReaderConfig)) {
                return false;
            }
            continue;
        }

        if (isKey) {
            if (ch == '=') {
                return state.handleKey(tokenBegin, input, csvReaderConfig);
            }
            continue;
        }

        const bool atClosingBrace = (ch == '}');
        if (atClosingBrace || ch == csvReaderConfig.delimiter) {
            state.handleValue(tokenBegin, input, csvReaderConfig);
            if (atClosingBrace) {
                closeBracket = true;
            }
            return true;
        }
    }
    return false;
}

// Walks a map literal of the form {a=12,b=13} and feeds every key and value span to `state`.
//
// Leading whitespace before '{' and trailing whitespace after the final '}' are allowed; anything
// else after the closing brace makes the parse fail. An empty map "{}" succeeds without invoking
// `state`. Returns true only when the whole input was consumed as a well-formed map.
template<typename T>
static bool splitCStringMap(
    const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) {
    const char* const end = input + len;

    skipWhitespace(input, end);
    if (input == end || *input != '{') { // must open with '{'
        return false;
    }
    ++input;
    skipWhitespace(input, end);
    if (input == end) {
        return false;
    }
    if (*input == '}') { // empty map
        ++input;
        skipWhitespace(input, end);
        return input == end;
    }

    bool sawClosingBrace = false;
    while (input < end) {
        const bool keyParsed =
            parseKeyOrValue(input, end, state, true, sawClosingBrace, csvReaderConfig);
        if (!keyParsed) {
            return false;
        }
        // Step over the '=' that ended the key.
        ++input;
        skipWhitespace(input, end);

        const bool valueParsed =
            parseKeyOrValue(input, end, state, false, sawClosingBrace, csvReaderConfig);
        if (!valueParsed) {
            return false;
        }
        // Step over the delimiter or '}' that ended the value.
        ++input;
        skipWhitespace(input, end);

        if (sawClosingBrace) {
            return input == end;
        }
    }
    return false;
}

static void castStringToMap(const char* input, uint64_t len, ValueVector* vector, uint64_t rowToAdd,
const CSVReaderConfig& csvReaderConfig) {
// count the number of key-value pairs in the map
CountPartOperation state;
splitCStringMap(input, len, state, csvReaderConfig);

auto list_entry = ListVector::addList(vector, state.count);
vector->setValue<list_entry_t>(rowToAdd, list_entry);
auto structVector = common::ListVector::getDataVector(vector);

SplitStringMapOperation split{list_entry.offset, structVector};
if (!splitCStringMap(input, len, split, csvReaderConfig)) {
throw ConversionException("Cast failed. " + std::string{input, len} + " is not in " +
LogicalTypeUtils::dataTypeToString(vector->dataType) + " range.");
}
Expand All @@ -185,7 +300,7 @@ static bool parseStructFieldName(const char*& input, const char* end) {
}

static bool parseStructFieldValue(
const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig) {
const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig, bool& closeBrack) {
uint64_t lvl = 0;
while (input < end) {
if (*input == '"' || *input == '\'') {
Expand All @@ -197,16 +312,14 @@ static bool parseStructFieldValue(
return false;
}
} else if (*input == csvReaderConfig.listBeginChar) {
if (!skipToClose(input, end, lvl, csvReaderConfig.listEndChar, csvReaderConfig)) {
if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) {
return false;
}
} else if (*input == csvReaderConfig.delimiter || *input == '}') {
if (*input == '}') {
auto copy = input + 1;
skipWhitespace(copy, end);
return (copy == end);
closeBrack = true;
}
return true;
return (lvl == 0);
}
input++;
}
Expand All @@ -224,11 +337,15 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector*
}
skipWhitespace(++input, end);

if (input == end) { // no closing bracket
return false;
}
if (*input == '}') {
skipWhitespace(++input, end);
return input == end;
}

bool closeBracket = false;
while (input < end) {
auto keyStart = input;
if (!parseStructFieldName(input, end)) { // find key
Expand All @@ -243,17 +360,21 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector*

skipWhitespace(++input, end);
auto valStart = input;
if (!parseStructFieldValue(input, end, csvReaderConfig)) { // find value
if (!parseStructFieldValue(input, end, csvReaderConfig, closeBracket)) { // find value
return false;
}
auto valEnd = input;
trimRightWhitespace(valStart, valEnd);
skipWhitespace(++input, end);

copyStringToVector(StructVector::getFieldVector(vector, fieldIdx).get(), rowToAdd,
std::string_view{valStart, (size_t)(valEnd - valStart)}, csvReaderConfig);
std::string_view{valStart, (uint32_t)(valEnd - valStart)}, csvReaderConfig);

if (closeBracket) {
return (input == end);
}
}
return true;
return false;
}

static void castStringToStruct(const char* input, uint64_t len, ValueVector* vector,
Expand Down Expand Up @@ -352,9 +473,7 @@ void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view
vector->setValue(rowToAdd, Interval::fromCString(strVal.data(), strVal.length()));
} break;
case LogicalTypeID::MAP: {
auto value = storage::TableCopyUtils::getVarListValue(
strVal, 1, strVal.length() - 2, type, csvReaderConfig);
vector->copyFromValue(rowToAdd, *value);
castStringToMap(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
} break;
case LogicalTypeID::VAR_LIST: {
castStringToList(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
Expand Down
Loading