Skip to content

Commit

Permalink
finish cast string to map
Browse files Browse the repository at this point in the history
  • Loading branch information
AEsir777 committed Oct 13, 2023
1 parent faa5f98 commit a51e7f8
Show file tree
Hide file tree
Showing 18 changed files with 241 additions and 28 deletions.
2 changes: 1 addition & 1 deletion dataset/load-from-test/change_config.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
list | str | struct
'(this | is a word | normal not escape | ~' ( ) | , ~~ ~' )'|'try escape ~~'|' { a : { b : {c:( 3432423 | -43423 | 31231 | NULL )} } } '
'(escape | is escape success? ~~)'|' ~' ( ) do not need to escape sepeical | ()'|'{a:{}}'
'(~~ ~' not work also this "~'" )'|'nu'|'{a:{b:{}}}'
'(~~ ~' not work also this ~'"" )'|'nu'|'{a:{b:{}}}'
7 changes: 7 additions & 0 deletions dataset/load-from-test/map_correct.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
map
"{[432> 24> 12] = {c: okay}> [90> 11> 43> 54] = {c: bad}> [-0] = {c: good}}"
"{[]={}}"
"{[11> 43> NULL ]={c: this is a struct}}"
"{[]={c: this is not empty}}"
"{[1000> 143245> 432> 43241> -43214324> -432423> -4324324> -41412> -131242143> 0> -4324324> -0]={}}"
"{={c: '~~ have some > >'}}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_incorrect_type.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{234=-3}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{10=10, 20=20,"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{{3=3}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_closing4.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{432=[432,432}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_quote.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{324=\"}"
1 change: 1 addition & 0 deletions dataset/load-from-test/map_no_val.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{4324}"
7 changes: 7 additions & 0 deletions dataset/load-from-test/nested_map2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"{{30099=30099, 1=0}={3mTEf=3mTEf, 3mTEf=3mTEf, 3mTEf=3mTEf}}"
"{{18046=18046, 2=321, 3= 423421}={dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y, dJ3cf6Y=dJ3cf6Y}}"
"{{}={YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y, YQcmYouhyFqD3y=YQcmYouhyFqD3y}}"
"{{31395=31395}={}}"
"{{}={}}"
"{=}"
"{}"
3 changes: 3 additions & 0 deletions dataset/load-from-test/nested_map_correct.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
" { c= {a = 3423 }, b = { g = 3421 } } "
"{}"
"{d = {}}"
2 changes: 1 addition & 1 deletion dataset/load-from-test/struct_close_fail.csv
Original file line number Diff line number Diff line change
@@ -1 +1 @@
"{ c : 423 } still stuff here"
"{ c : 423 } c: 479,"
1 change: 1 addition & 0 deletions dataset/load-from-test/struct_noclose.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{
20 changes: 20 additions & 0 deletions src/common/string_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@
namespace kuzu {
namespace common {

// Splits `input` into exactly two parts at the first comma that is not enclosed in
// parentheses, e.g. "A(x,y),B" -> {"A(x,y)", "B"}. The comma itself is dropped.
//
// @param input                   text to split.
// @param ignoreEmptyStringParts  accepted for signature parity with split(); it is not
//                                consulted, empty parts are always kept.
// @return a vector with exactly two elements. When no top-level comma exists the first
//         element is the whole input and the second is empty, so callers that index [0]
//         and [1] stay in bounds and can report the malformed input themselves.
std::vector<std::string> StringUtils::splitComma(
    const std::string& input, bool ignoreEmptyStringParts) {
    (void)ignoreEmptyStringParts;
    auto result = std::vector<std::string>();
    result.reserve(2);

    // size_t (not unsigned int) so the index can always reach input.length().
    size_t splitPos = 0;
    size_t depth = 0;
    for (; splitPos < input.length(); ++splitPos) {
        const char ch = input[splitPos];
        if (ch == '(') {
            ++depth;
        } else if (ch == ')') {
            // An unmatched ')' must not wrap the unsigned counter; otherwise every later
            // comma would be treated as nested and never selected as the split point.
            if (depth > 0) {
                --depth;
            }
        } else if (ch == ',' && depth == 0) {
            break;
        }
    }

    result.push_back(input.substr(0, splitPos));
    // substr(pos) throws std::out_of_range when pos > size(), which is exactly what
    // splitPos + 1 would be if the loop ran off the end without finding a comma.
    result.push_back(splitPos < input.length() ? input.substr(splitPos + 1) : std::string());
    return result;
}

Check warning on line 26 in src/common/string_utils.cpp

View check run for this annotation

Codecov / codecov/patch

src/common/string_utils.cpp#L26

Added line #L26 was not covered by tests

std::vector<std::string> StringUtils::split(
const std::string& input, const std::string& delimiter, bool ignoreEmptyStringParts) {
auto result = std::vector<std::string>();
Expand Down
2 changes: 1 addition & 1 deletion src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -836,7 +836,7 @@ std::unique_ptr<LogicalType> LogicalTypeUtils::parseMapType(const std::string& t
throw Exception("Cannot parse map type: " + trimmedStr);
}
auto mapTypeStr = trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto keyValueTypes = StringUtils::split(mapTypeStr, ",");
auto keyValueTypes = StringUtils::splitComma(mapTypeStr);
return MapType::createMapType(
std::make_unique<LogicalType>(dataTypeFromString(keyValueTypes[0])),
std::make_unique<LogicalType>(dataTypeFromString(keyValueTypes[1])));
Expand Down
3 changes: 3 additions & 0 deletions src/include/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ class StringUtils {
return fmt::format(fmt::runtime(format), args...);
}

static std::vector<std::string> splitComma(
const std::string& input, bool ignoreEmptyStringParts = true);

static std::vector<std::string> split(
const std::string& input, const std::string& delimiter, bool ignoreEmptyStringParts = true);

Expand Down
165 changes: 142 additions & 23 deletions src/processor/operator/persistent/reader/csv/driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char
}
lvl++; // nested one more level
} else if (*input == target) {
if (target == ']') {
if (target == csvReaderConfig.listEndChar) {
lvl--;
}
return true;
Expand All @@ -78,19 +78,25 @@ static bool skipToClose(const char*& input, const char* end, uint64_t& lvl, char
// Handler used for a counting-only pass over a list/map string: it stores nothing and
// just tallies how many values the splitter reports, so the caller knows how many
// entries to allocate before the real copy pass.
struct CountPartOperation {
// Number of values reported so far.
uint64_t count = 0;

void handleValue(const char* start, const char* end, const CSVReaderConfig& config) { count++; }
// Keys need no work while counting; always reports success. Arguments are ignored.
static inline bool handleKey(
const char* start, const char* end, const CSVReaderConfig& config) {
return true;
}
// Records one more value. The text range and config are ignored.
inline void handleValue(const char* start, const char* end, const CSVReaderConfig& config) {
count++;
}
};

struct SplitStringListOperation {
SplitStringListOperation(uint64_t& offset, common::ValueVector* resultVector)
SplitStringListOperation(uint64_t& offset, ValueVector* resultVector)
: offset(offset), resultVector(resultVector) {}

uint64_t& offset;
ValueVector* resultVector;

void handleValue(const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) {
// NULL
auto start_copy = start;
auto startCopy = start;
skipWhitespace(start, end);
if (end - start >= 4 && (*start == 'N' || *start == 'n') &&
(*(start + 1) == 'U' || *(start + 1) == 'u') &&
Expand All @@ -99,20 +105,43 @@ struct SplitStringListOperation {
auto null_end = start + 4;
skipWhitespace(null_end, end);
if (null_end == end) {
start_copy = end;
startCopy = end;
}
}
if (start == end) { // check if its empty string - NULL
start_copy = end;
startCopy = end;
}
copyStringToVector(resultVector, offset,
std::string_view{start_copy, (uint32_t)(end - start_copy)}, csvReaderConfig);
std::string_view{startCopy, (uint32_t)(end - startCopy)}, csvReaderConfig);
offset++;
}
};

// Copy-pass handler for map parsing. Each key is written into struct field 0 and each
// value into struct field 1 of `resultVector`, both at row `offset`; the row advances
// only after the value has been handed over, so a key and its value share one row.
struct SplitStringMapOperation {
    uint64_t& offset;          // next row to fill; shared with (and advanced for) the caller
    ValueVector* resultVector; // struct vector whose fields 0 / 1 receive key / value

    SplitStringMapOperation(uint64_t& offset, ValueVector* resultVector)
        : offset(offset), resultVector(resultVector) {}

    // Stores the key text [start, end), after trimRightWhitespace has adjusted the range,
    // at the current row without advancing it. Always returns true.
    inline bool handleKey(
        const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) {
        trimRightWhitespace(start, end);
        const std::string_view keyText{start, static_cast<uint32_t>(end - start)};
        const auto& keyField = StructVector::getFieldVector(resultVector, 0);
        copyStringToVector(keyField.get(), offset, keyText, csvReaderConfig);
        return true;
    }

    // Stores the value text [start, end), after trimRightWhitespace has adjusted the
    // range, at the current row and moves on to the next row.
    inline void handleValue(
        const char* start, const char* end, const CSVReaderConfig& csvReaderConfig) {
        trimRightWhitespace(start, end);
        const std::string_view valueText{start, static_cast<uint32_t>(end - start)};
        const auto& valueField = StructVector::getFieldVector(resultVector, 1);
        // Advance before the copy, exactly as a post-increment in the argument list would.
        const uint64_t row = offset++;
        copyStringToVector(valueField.get(), row, valueText, csvReaderConfig);
    }
};

template<typename T>
static bool splitCString(
static bool splitCStringList(
const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) {
auto end = input + len;
uint64_t lvl = 1;
Expand All @@ -132,7 +161,7 @@ static bool splitCString(
if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) {
return false;
}
} else if (ch == '\'') {
} else if (ch == '\'' || ch == '"') {
if (!skipToCloseQuotes(input, end)) {
return false;
}
Expand Down Expand Up @@ -161,14 +190,100 @@ static void castStringToList(const char* input, uint64_t len, ValueVector* vecto
uint64_t rowToAdd, const CSVReaderConfig& csvReaderConfig) {
// calculate the number of elements in array
CountPartOperation state;
splitCString(input, len, state, csvReaderConfig);
splitCStringList(input, len, state, csvReaderConfig);

auto list_entry = ListVector::addList(vector, state.count);
vector->setValue<list_entry_t>(rowToAdd, list_entry);
auto listDataVector = common::ListVector::getDataVector(vector);

SplitStringListOperation split{list_entry.offset, listDataVector};
if (!splitCString(input, len, split, csvReaderConfig)) {
if (!splitCStringList(input, len, split, csvReaderConfig)) {
throw ConversionException("Cast failed. " + std::string{input, len} + " is not in " +
LogicalTypeUtils::dataTypeToString(vector->dataType) + " range.");
}
}

// Scans forward from `input` to the end of one map key (isKey == true) or one map value
// (isKey == false) and reports the token [token start, terminator) to `state`.
//
// A key ends at '='; a value ends at csvReaderConfig.delimiter or '}'. Quoted sections,
// '{...}' groups and list groups are stepped over via skipToCloseQuotes / skipToClose so
// terminators inside them are not considered.
//
// @param input         in/out cursor; on success it is left ON the terminating character.
// @param end           one past the last readable character.
// @param state         handler receiving handleKey / handleValue.
// @param isKey         selects key mode or value mode.
// @param closeBracket  set to true when a value was terminated by '}'; never reset here.
// @return the result of state.handleKey for keys, true for a terminated value, and false
//         if a nested/quoted section could not be skipped or `end` was reached first.
template<typename T>
static bool parseKeyOrValue(const char*& input, const char* end, T& state, bool isKey,
    bool& closeBracket, const CSVReaderConfig& csvReaderConfig) {
    const char* tokenBegin = input;
    uint64_t nesting = 0; // scratch counter required by skipToClose; not inspected here

    for (; input < end; input++) {
        const char ch = *input;

        if (ch == '"' || ch == '\'') {
            if (!skipToCloseQuotes(input, end)) {
                return false;
            }
            continue;
        }
        if (ch == '{') {
            if (!skipToClose(input, end, nesting, '}', csvReaderConfig)) {
                return false;
            }
            continue;
        }
        if (ch == csvReaderConfig.listBeginChar) {
            if (!skipToClose(
                    input, end, nesting, csvReaderConfig.listEndChar, csvReaderConfig)) {
                return false;
            }
            continue;
        }

        if (isKey) {
            if (ch == '=') {
                return state.handleKey(tokenBegin, input, csvReaderConfig);
            }
            continue;
        }

        if (ch == csvReaderConfig.delimiter || ch == '}') {
            state.handleValue(tokenBegin, input, csvReaderConfig);
            if (*input == '}') {
                closeBracket = true;
            }
            return true;
        }
    }
    return false;
}

// Walks a textual map such as `{a=12,b=13}` and feeds every key and value to `state`.
//
// Accepted shape: optional whitespace, '{', then either '}' (empty map) or one or more
// `key = value` entries separated by csvReaderConfig.delimiter, then '}', then nothing
// but whitespace. Individual keys/values are located by parseKeyOrValue.
//
// @param input  start of the text.
// @param len    number of characters available at `input`.
// @param state  handler receiving handleKey / handleValue for each entry.
// @return true only if the whole text matched the shape above; false otherwise. Note that
//         `state` may already have received some entries before a failure is detected.
template<typename T>
static bool splitCStringMap(
    const char* input, uint64_t len, T& state, const CSVReaderConfig& csvReaderConfig) {
    const char* end = input + len;

    // Opening brace.
    skipWhitespace(input, end);
    if (input == end) {
        return false;
    }
    if (*input != '{') {
        return false;
    }
    ++input;
    skipWhitespace(input, end);
    if (input == end) {
        return false;
    }

    // Empty map: only trailing whitespace may follow the closing brace.
    if (*input == '}') {
        ++input;
        skipWhitespace(input, end);
        return input == end;
    }

    bool sawClosingBrace = false;
    while (input < end) {
        const bool keyParsed = parseKeyOrValue(
            input, end, state, true /* isKey */, sawClosingBrace, csvReaderConfig);
        if (!keyParsed) {
            return false;
        }
        ++input; // step over '='
        skipWhitespace(input, end);

        const bool valueParsed = parseKeyOrValue(
            input, end, state, false /* isKey */, sawClosingBrace, csvReaderConfig);
        if (!valueParsed) {
            return false;
        }
        ++input; // step over the delimiter or '}'
        skipWhitespace(input, end);

        if (!sawClosingBrace) {
            continue;
        }
        return input == end;
    }
    // Ran out of input after a delimiter without ever seeing the closing brace.
    return false;
}

static void castStringToMap(const char* input, uint64_t len, ValueVector* vector, uint64_t rowToAdd,
const CSVReaderConfig& csvReaderConfig) {
// count the number of key-value pairs in the map
CountPartOperation state;
splitCStringMap(input, len, state, csvReaderConfig);

auto list_entry = ListVector::addList(vector, state.count);
vector->setValue<list_entry_t>(rowToAdd, list_entry);
auto structVector = common::ListVector::getDataVector(vector);

SplitStringMapOperation split{list_entry.offset, structVector};
if (!splitCStringMap(input, len, split, csvReaderConfig)) {
throw ConversionException("Cast failed. " + std::string{input, len} + " is not in " +
LogicalTypeUtils::dataTypeToString(vector->dataType) + " range.");
}
Expand All @@ -185,7 +300,7 @@ static bool parseStructFieldName(const char*& input, const char* end) {
}

static bool parseStructFieldValue(
const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig) {
const char*& input, const char* end, const CSVReaderConfig& csvReaderConfig, bool& closeBrack) {
uint64_t lvl = 0;
while (input < end) {
if (*input == '"' || *input == '\'') {
Expand All @@ -197,16 +312,14 @@ static bool parseStructFieldValue(
return false;
}
} else if (*input == csvReaderConfig.listBeginChar) {
if (!skipToClose(input, end, lvl, csvReaderConfig.listEndChar, csvReaderConfig)) {
if (!skipToClose(input, end, ++lvl, csvReaderConfig.listEndChar, csvReaderConfig)) {
return false;
}
} else if (*input == csvReaderConfig.delimiter || *input == '}') {
if (*input == '}') {
auto copy = input + 1;
skipWhitespace(copy, end);
return (copy == end);
closeBrack = true;
}
return true;
return (lvl == 0);
}
input++;
}
Expand All @@ -224,11 +337,15 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector*
}
skipWhitespace(++input, end);

if (input == end) { // no closing bracket
return false;
}
if (*input == '}') {
skipWhitespace(++input, end);
return input == end;
}

bool closeBracket = false;
while (input < end) {
auto keyStart = input;
if (!parseStructFieldName(input, end)) { // find key
Expand All @@ -243,17 +360,21 @@ static bool tryCastStringToStruct(const char* input, uint64_t len, ValueVector*

skipWhitespace(++input, end);
auto valStart = input;
if (!parseStructFieldValue(input, end, csvReaderConfig)) { // find value
if (!parseStructFieldValue(input, end, csvReaderConfig, closeBracket)) { // find value
return false;
}
auto valEnd = input;
trimRightWhitespace(valStart, valEnd);
skipWhitespace(++input, end);

copyStringToVector(StructVector::getFieldVector(vector, fieldIdx).get(), rowToAdd,
std::string_view{valStart, (size_t)(valEnd - valStart)}, csvReaderConfig);
std::string_view{valStart, (uint32_t)(valEnd - valStart)}, csvReaderConfig);

if (closeBracket) {
return (input == end);
}
}
return true;
return false;
}

static void castStringToStruct(const char* input, uint64_t len, ValueVector* vector,
Expand Down Expand Up @@ -352,9 +473,7 @@ void copyStringToVector(ValueVector* vector, uint64_t rowToAdd, std::string_view
vector->setValue(rowToAdd, Interval::fromCString(strVal.data(), strVal.length()));
} break;
case LogicalTypeID::MAP: {
auto value = storage::TableCopyUtils::getVarListValue(
strVal, 1, strVal.length() - 2, type, csvReaderConfig);
vector->copyFromValue(rowToAdd, *value);
castStringToMap(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
} break;
case LogicalTypeID::VAR_LIST: {
castStringToList(strVal.data(), strVal.length(), vector, rowToAdd, csvReaderConfig);
Expand Down
Loading

0 comments on commit a51e7f8

Please sign in to comment.