Skip to content

Commit

Permalink
Merge pull request #1563 from kuzudb/struct-of-list
Browse files Browse the repository at this point in the history
Add loader support for struct of list/struct/string
  • Loading branch information
acquamarin committed May 24, 2023
2 parents c08468f + 3ae8639 commit 122d68f
Show file tree
Hide file tree
Showing 22 changed files with 580 additions and 387 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ jobs:
working-directory: ./build/debug/test
continue-on-error: true

- name: Display ASan log
run: cat /tmp/asan.log* || true
- name: Report ASan log
run: ls /tmp/asan.log* 1>/dev/null 2>&1 && (cat /tmp/asan.log*; exit 1) || exit 0
shell: bash

- name: Clean up ASan log
Expand Down
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vOrganisation.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
1,ABFsUni,325,3.7,-2,10 years 5 months 13 hours 24 us,3 years 5 days,1
4,CsWork,934,4.1,-100,2 years 4 days 10 hours,26 years 52 days 48 hours,0.78
6,DEsWork,824,4.1,7,2 years 4 hours 22 us 34 minutes,82 hours 100 milliseconds,0.52
1,ABFsUni,325,3.7,-2,10 years 5 months 13 hours 24 us,3 years 5 days,1,"{revenue: 138, location: ['toronto', 'montr,eal'], stock: {price: [96, 56], volume: 1000}}"
4,CsWork,934,4.1,-100,2 years 4 days 10 hours,26 years 52 days 48 hours,0.78,"{revenue: 152, location: [\"vanco,uver north area\"], stock: {price: [15, 78, 671], volume: 432}}"
6,DEsWork,824,4.1,7,2 years 4 hours 22 us 34 minutes,82 hours 100 milliseconds,0.52,"{revenue: 558, location: ['very long city name', 'new york'], stock: {price: [22], volume: 99}}"
143 changes: 85 additions & 58 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,58 @@ std::unique_ptr<LogicalType> LogicalType::copy() {
return dataType;
}

void LogicalType::setPhysicalType() {
switch (typeID) {
case LogicalTypeID::ANY: {
physicalType = PhysicalTypeID::ANY;
} break;
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
physicalType = PhysicalTypeID::INT64;
} break;
case LogicalTypeID::DATE:
case LogicalTypeID::INT32: {
physicalType = PhysicalTypeID::INT32;
} break;
case LogicalTypeID::INT16: {
physicalType = PhysicalTypeID::INT16;
} break;
case LogicalTypeID::DOUBLE: {
physicalType = PhysicalTypeID::DOUBLE;
} break;
case LogicalTypeID::FLOAT: {
physicalType = PhysicalTypeID::FLOAT;
} break;
case LogicalTypeID::INTERVAL: {
physicalType = PhysicalTypeID::INTERVAL;
} break;
case LogicalTypeID::FIXED_LIST: {
physicalType = PhysicalTypeID::FIXED_LIST;
} break;
case LogicalTypeID::INTERNAL_ID: {
physicalType = PhysicalTypeID::INTERNAL_ID;
} break;
case LogicalTypeID::STRING: {
physicalType = PhysicalTypeID::STRING;
} break;
case LogicalTypeID::RECURSIVE_REL:
case LogicalTypeID::VAR_LIST: {
physicalType = PhysicalTypeID::VAR_LIST;
} break;
case LogicalTypeID::NODE:
case LogicalTypeID::REL:
case LogicalTypeID::STRUCT: {
physicalType = PhysicalTypeID::STRUCT;
} break;
default:
throw NotImplementedException{"LogicalType::setPhysicalType()."};
}
}

LogicalType LogicalTypeUtils::dataTypeFromString(const std::string& dataTypeString) {
LogicalType dataType;
if (dataTypeString.ends_with("[]")) {
Expand All @@ -222,22 +274,21 @@ LogicalType LogicalTypeUtils::dataTypeFromString(const std::string& dataTypeStri
} else if (dataTypeString.starts_with("STRUCT")) {
dataType.typeID = LogicalTypeID::STRUCT;
auto leftBracketPos = dataTypeString.find('(');
auto rightBracketPos = dataTypeString.find(')');
auto rightBracketPos = dataTypeString.find_last_of(')');
if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos) {
throw Exception("Cannot parse struct type: " + dataTypeString);
}
// Remove the leading and trailing brackets.
auto structTypeStr =
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto structFieldsStr = common::StringUtils::split(structTypeStr, ",");
auto structFieldsStr = parseStructFields(structTypeStr);
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& structField : structFieldsStr) {
auto structFieldParts = common::StringUtils::split(structField, " ");
if (structFieldParts.size() != 2) {
throw RuntimeException("Cannot parse struct type: " + dataTypeString);
}
structFields.emplace_back(std::make_unique<StructField>(structFieldParts[0],
std::make_unique<LogicalType>(dataTypeFromString(structFieldParts[1]))));
for (auto& structFieldStr : structFieldsStr) {
auto pos = structFieldStr.find(' ');
auto fieldName = structFieldStr.substr(0, pos);
auto fieldTypeString = structFieldStr.substr(pos + 1);
structFields.emplace_back(std::make_unique<StructField>(
fieldName, std::make_unique<LogicalType>(dataTypeFromString(fieldTypeString))));
}
dataType.extraTypeInfo = std::make_unique<StructTypeInfo>(std::move(structFields));
} else {
Expand Down Expand Up @@ -423,56 +474,32 @@ bool LogicalTypeUtils::isNumerical(const kuzu::common::LogicalType& dataType) {
}
}

void LogicalType::setPhysicalType() {
switch (typeID) {
case LogicalTypeID::ANY: {
physicalType = PhysicalTypeID::ANY;
} break;
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
physicalType = PhysicalTypeID::INT64;
} break;
case LogicalTypeID::DATE:
case LogicalTypeID::INT32: {
physicalType = PhysicalTypeID::INT32;
} break;
case LogicalTypeID::INT16: {
physicalType = PhysicalTypeID::INT16;
} break;
case LogicalTypeID::DOUBLE: {
physicalType = PhysicalTypeID::DOUBLE;
} break;
case LogicalTypeID::FLOAT: {
physicalType = PhysicalTypeID::FLOAT;
} break;
case LogicalTypeID::INTERVAL: {
physicalType = PhysicalTypeID::INTERVAL;
} break;
case LogicalTypeID::FIXED_LIST: {
physicalType = PhysicalTypeID::FIXED_LIST;
} break;
case LogicalTypeID::INTERNAL_ID: {
physicalType = PhysicalTypeID::INTERNAL_ID;
} break;
case LogicalTypeID::STRING: {
physicalType = PhysicalTypeID::STRING;
} break;
case LogicalTypeID::RECURSIVE_REL:
case LogicalTypeID::VAR_LIST: {
physicalType = PhysicalTypeID::VAR_LIST;
} break;
case LogicalTypeID::NODE:
case LogicalTypeID::REL:
case LogicalTypeID::STRUCT: {
physicalType = PhysicalTypeID::STRUCT;
} break;
default:
throw NotImplementedException{"LogicalType::setPhysicalType()."};
std::vector<std::string> LogicalTypeUtils::parseStructFields(const std::string& structTypeStr) {
std::vector<std::string> structFieldsStr;
auto startPos = 0u;
auto curPos = 0u;
auto numOpenBrackets = 0u;
while (curPos < structTypeStr.length()) {
switch (structTypeStr[curPos]) {
case '(': {
numOpenBrackets++;
} break;
case ')': {
numOpenBrackets--;
} break;
case ',': {
if (numOpenBrackets == 0) {
structFieldsStr.push_back(
StringUtils::ltrim(structTypeStr.substr(startPos, curPos - startPos)));
startPos = curPos + 1;
}
} break;
}
curPos++;
}
structFieldsStr.push_back(
StringUtils::ltrim(structTypeStr.substr(startPos, curPos - startPos)));
return structFieldsStr;
}

// Specialized Ser/Deser functions for logical dataTypes.
Expand Down
6 changes: 6 additions & 0 deletions src/include/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <algorithm>
#include <cstring>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
Expand Down Expand Up @@ -50,6 +51,11 @@ class StringUtils {
s.begin(), find_if(s.begin(), s.end(), [](unsigned char ch) { return !isspace(ch); }));
return s;
}

static inline void removeWhiteSpaces(std::string& str) {
std::regex whiteSpacePattern{"\\s"};
str = std::regex_replace(str, whiteSpacePattern, "");
}
};

} // namespace common
Expand Down
1 change: 1 addition & 0 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ class LogicalTypeUtils {

private:
static LogicalTypeID dataTypeIDFromString(const std::string& dataTypeIDString);
static std::vector<std::string> parseStructFields(const std::string& structTypeStr);
};

enum class DBFileType : uint8_t { ORIGINAL = 0, WAL_VERSION = 1 };
Expand Down
7 changes: 1 addition & 6 deletions src/include/storage/copier/node_copier.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,7 @@ class NodeCopier {
std::vector<std::shared_ptr<InMemColumn>> columns,
std::vector<catalog::Property> properties)
: sharedState{std::move(sharedState)}, pkIndex{pkIndex}, copyDesc{copyDesc}, table{table},
pkColumnID{pkColumnID}, columns{std::move(columns)}, properties{std::move(properties)} {
overflowCursors.resize(this->properties.size());
}
pkColumnID{pkColumnID}, columns{std::move(columns)}, properties{std::move(properties)} {}

virtual ~NodeCopier() = default;

Expand All @@ -126,8 +124,6 @@ class NodeCopier {
throw common::CopyException("executeInternal not implemented");
}

void copyArrayIntoColumnChunk(InMemColumnChunk* columnChunk, common::column_id_t columnID,
arrow::Array& arrowArray, common::CopyDescription& copyDescription);
void populatePKIndex(InMemColumnChunk* chunk, InMemOverflowFile* overflowFile,
common::offset_t startOffset, uint64_t numValues);

Expand All @@ -150,7 +146,6 @@ class NodeCopier {
// The properties to be copied into. Each property corresponds to a column.
std::vector<catalog::Property> properties;
std::vector<std::shared_ptr<InMemColumn>> columns;
std::vector<PageByteCursor> overflowCursors;
common::column_id_t pkColumnID;
};

Expand Down
28 changes: 24 additions & 4 deletions src/include/storage/in_mem_storage_structure/in_mem_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

namespace kuzu {
namespace storage {
// TODO(GUODONG): Currently, we have both InMemNodeColumn and InMemColumn. This is a temporary
// solution for now to allow gradual refactorings. Eventually, we should only have InMemColumn.

class InMemColumn {
public:
Expand All @@ -16,13 +14,35 @@ class InMemColumn {

void flushChunk(InMemColumnChunk* chunk);

std::unique_ptr<InMemColumnChunk> getInMemColumnChunk(common::offset_t startNodeOffset,
common::offset_t endNodeOffset, const common::CopyDescription* copyDescription) {
switch (dataType.getLogicalTypeID()) {
case common::LogicalTypeID::STRING:
case common::LogicalTypeID::VAR_LIST: {
return std::make_unique<InMemColumnChunkWithOverflow>(
dataType, startNodeOffset, endNodeOffset, copyDescription, inMemOverflowFile.get());
}
case common::LogicalTypeID::STRUCT: {
auto inMemStructColumnChunk = std::make_unique<InMemStructColumnChunk>(
dataType, startNodeOffset, endNodeOffset, copyDescription);
for (auto& fieldColumn : childColumns) {
inMemStructColumnChunk->addFieldChunk(fieldColumn->getInMemColumnChunk(
startNodeOffset, endNodeOffset, copyDescription));
}
return inMemStructColumnChunk;
}
default: {
return std::make_unique<InMemColumnChunk>(
dataType, startNodeOffset, endNodeOffset, copyDescription);
}
}
}

inline common::LogicalType getDataType() { return dataType; }
inline InMemOverflowFile* getInMemOverflowFile() { return inMemOverflowFile.get(); }
inline uint16_t getNumBytesForValue() const { return numBytesForValue; }

protected:
std::string filePath;
uint16_t numBytesForValue;
std::unique_ptr<FileHandle> fileHandle;
common::LogicalType dataType;
std::unique_ptr<InMemColumn> nullColumn;
Expand Down
Loading

0 comments on commit 122d68f

Please sign in to comment.