Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add loader support for struct of list/struct/string #1563

Merged
merged 1 commit into from
May 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ jobs:
working-directory: ./build/debug/test
continue-on-error: true

- name: Display ASan log
run: cat /tmp/asan.log* || true
- name: Report ASan log
run: ls /tmp/asan.log* 1>/dev/null 2>&1 && (cat /tmp/asan.log*; exit 1) || exit 0
shell: bash

- name: Clean up ASan log
Expand Down
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vOrganisation.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
1,ABFsUni,325,3.7,-2,10 years 5 months 13 hours 24 us,3 years 5 days,1
4,CsWork,934,4.1,-100,2 years 4 days 10 hours,26 years 52 days 48 hours,0.78
6,DEsWork,824,4.1,7,2 years 4 hours 22 us 34 minutes,82 hours 100 milliseconds,0.52
1,ABFsUni,325,3.7,-2,10 years 5 months 13 hours 24 us,3 years 5 days,1,"{revenue: 138, location: ['toronto', 'montr,eal'], stock: {price: [96, 56], volume: 1000}}"
4,CsWork,934,4.1,-100,2 years 4 days 10 hours,26 years 52 days 48 hours,0.78,"{revenue: 152, location: [\"vanco,uver north area\"], stock: {price: [15, 78, 671], volume: 432}}"
6,DEsWork,824,4.1,7,2 years 4 hours 22 us 34 minutes,82 hours 100 milliseconds,0.52,"{revenue: 558, location: ['very long city name', 'new york'], stock: {price: [22], volume: 99}}"
143 changes: 85 additions & 58 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,58 @@ std::unique_ptr<LogicalType> LogicalType::copy() {
return dataType;
}

void LogicalType::setPhysicalType() {
switch (typeID) {
case LogicalTypeID::ANY: {
physicalType = PhysicalTypeID::ANY;
} break;
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
physicalType = PhysicalTypeID::INT64;
} break;
case LogicalTypeID::DATE:
case LogicalTypeID::INT32: {
physicalType = PhysicalTypeID::INT32;
} break;
case LogicalTypeID::INT16: {
physicalType = PhysicalTypeID::INT16;
} break;
case LogicalTypeID::DOUBLE: {
physicalType = PhysicalTypeID::DOUBLE;
} break;
case LogicalTypeID::FLOAT: {
physicalType = PhysicalTypeID::FLOAT;
} break;
case LogicalTypeID::INTERVAL: {
physicalType = PhysicalTypeID::INTERVAL;
} break;
case LogicalTypeID::FIXED_LIST: {
physicalType = PhysicalTypeID::FIXED_LIST;
} break;
case LogicalTypeID::INTERNAL_ID: {
physicalType = PhysicalTypeID::INTERNAL_ID;
} break;
case LogicalTypeID::STRING: {
physicalType = PhysicalTypeID::STRING;
} break;
case LogicalTypeID::RECURSIVE_REL:
case LogicalTypeID::VAR_LIST: {
physicalType = PhysicalTypeID::VAR_LIST;
} break;
case LogicalTypeID::NODE:
case LogicalTypeID::REL:
case LogicalTypeID::STRUCT: {
physicalType = PhysicalTypeID::STRUCT;
} break;
default:
throw NotImplementedException{"LogicalType::setPhysicalType()."};
}
}

LogicalType LogicalTypeUtils::dataTypeFromString(const std::string& dataTypeString) {
LogicalType dataType;
if (dataTypeString.ends_with("[]")) {
Expand All @@ -222,22 +274,21 @@ LogicalType LogicalTypeUtils::dataTypeFromString(const std::string& dataTypeStri
} else if (dataTypeString.starts_with("STRUCT")) {
dataType.typeID = LogicalTypeID::STRUCT;
auto leftBracketPos = dataTypeString.find('(');
auto rightBracketPos = dataTypeString.find(')');
auto rightBracketPos = dataTypeString.find_last_of(')');
if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos) {
throw Exception("Cannot parse struct type: " + dataTypeString);
}
// Remove the leading and trailing brackets.
auto structTypeStr =
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto structFieldsStr = common::StringUtils::split(structTypeStr, ",");
auto structFieldsStr = parseStructFields(structTypeStr);
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& structField : structFieldsStr) {
auto structFieldParts = common::StringUtils::split(structField, " ");
if (structFieldParts.size() != 2) {
throw RuntimeException("Cannot parse struct type: " + dataTypeString);
}
structFields.emplace_back(std::make_unique<StructField>(structFieldParts[0],
std::make_unique<LogicalType>(dataTypeFromString(structFieldParts[1]))));
for (auto& structFieldStr : structFieldsStr) {
auto pos = structFieldStr.find(' ');
auto fieldName = structFieldStr.substr(0, pos);
auto fieldTypeString = structFieldStr.substr(pos + 1);
structFields.emplace_back(std::make_unique<StructField>(
fieldName, std::make_unique<LogicalType>(dataTypeFromString(fieldTypeString))));
}
dataType.extraTypeInfo = std::make_unique<StructTypeInfo>(std::move(structFields));
} else {
Expand Down Expand Up @@ -423,56 +474,32 @@ bool LogicalTypeUtils::isNumerical(const kuzu::common::LogicalType& dataType) {
}
}

void LogicalType::setPhysicalType() {
switch (typeID) {
case LogicalTypeID::ANY: {
physicalType = PhysicalTypeID::ANY;
} break;
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
physicalType = PhysicalTypeID::INT64;
} break;
case LogicalTypeID::DATE:
case LogicalTypeID::INT32: {
physicalType = PhysicalTypeID::INT32;
} break;
case LogicalTypeID::INT16: {
physicalType = PhysicalTypeID::INT16;
} break;
case LogicalTypeID::DOUBLE: {
physicalType = PhysicalTypeID::DOUBLE;
} break;
case LogicalTypeID::FLOAT: {
physicalType = PhysicalTypeID::FLOAT;
} break;
case LogicalTypeID::INTERVAL: {
physicalType = PhysicalTypeID::INTERVAL;
} break;
case LogicalTypeID::FIXED_LIST: {
physicalType = PhysicalTypeID::FIXED_LIST;
} break;
case LogicalTypeID::INTERNAL_ID: {
physicalType = PhysicalTypeID::INTERNAL_ID;
} break;
case LogicalTypeID::STRING: {
physicalType = PhysicalTypeID::STRING;
} break;
case LogicalTypeID::RECURSIVE_REL:
case LogicalTypeID::VAR_LIST: {
physicalType = PhysicalTypeID::VAR_LIST;
} break;
case LogicalTypeID::NODE:
case LogicalTypeID::REL:
case LogicalTypeID::STRUCT: {
physicalType = PhysicalTypeID::STRUCT;
} break;
default:
throw NotImplementedException{"LogicalType::setPhysicalType()."};
std::vector<std::string> LogicalTypeUtils::parseStructFields(const std::string& structTypeStr) {
std::vector<std::string> structFieldsStr;
auto startPos = 0u;
auto curPos = 0u;
auto numOpenBrackets = 0u;
while (curPos < structTypeStr.length()) {
switch (structTypeStr[curPos]) {
case '(': {
numOpenBrackets++;
} break;
case ')': {
numOpenBrackets--;
} break;
case ',': {
if (numOpenBrackets == 0) {
structFieldsStr.push_back(
StringUtils::ltrim(structTypeStr.substr(startPos, curPos - startPos)));
startPos = curPos + 1;
}
} break;
}
curPos++;
}
structFieldsStr.push_back(
StringUtils::ltrim(structTypeStr.substr(startPos, curPos - startPos)));
return structFieldsStr;
}

// Specialized Ser/Deser functions for logical dataTypes.
Expand Down
6 changes: 6 additions & 0 deletions src/include/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <algorithm>
#include <cstring>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
Expand Down Expand Up @@ -50,6 +51,11 @@ class StringUtils {
s.begin(), find_if(s.begin(), s.end(), [](unsigned char ch) { return !isspace(ch); }));
return s;
}

static inline void removeWhiteSpaces(std::string& str) {
std::regex whiteSpacePattern{"\\s"};
str = std::regex_replace(str, whiteSpacePattern, "");
}
};

} // namespace common
Expand Down
1 change: 1 addition & 0 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ class LogicalTypeUtils {

private:
static LogicalTypeID dataTypeIDFromString(const std::string& dataTypeIDString);
static std::vector<std::string> parseStructFields(const std::string& structTypeStr);
};

enum class DBFileType : uint8_t { ORIGINAL = 0, WAL_VERSION = 1 };
Expand Down
7 changes: 1 addition & 6 deletions src/include/storage/copier/node_copier.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,7 @@ class NodeCopier {
std::vector<std::shared_ptr<InMemColumn>> columns,
std::vector<catalog::Property> properties)
: sharedState{std::move(sharedState)}, pkIndex{pkIndex}, copyDesc{copyDesc}, table{table},
pkColumnID{pkColumnID}, columns{std::move(columns)}, properties{std::move(properties)} {
overflowCursors.resize(this->properties.size());
}
pkColumnID{pkColumnID}, columns{std::move(columns)}, properties{std::move(properties)} {}

virtual ~NodeCopier() = default;

Expand All @@ -126,8 +124,6 @@ class NodeCopier {
throw common::CopyException("executeInternal not implemented");
}

void copyArrayIntoColumnChunk(InMemColumnChunk* columnChunk, common::column_id_t columnID,
arrow::Array& arrowArray, common::CopyDescription& copyDescription);
void populatePKIndex(InMemColumnChunk* chunk, InMemOverflowFile* overflowFile,
common::offset_t startOffset, uint64_t numValues);

Expand All @@ -150,7 +146,6 @@ class NodeCopier {
// The properties to be copied into. Each property corresponds to a column.
std::vector<catalog::Property> properties;
std::vector<std::shared_ptr<InMemColumn>> columns;
std::vector<PageByteCursor> overflowCursors;
common::column_id_t pkColumnID;
};

Expand Down
28 changes: 24 additions & 4 deletions src/include/storage/in_mem_storage_structure/in_mem_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

namespace kuzu {
namespace storage {
// TODO(GUODONG): Currently, we have both InMemNodeColumn and InMemColumn. This is a temporary
// solution for now to allow gradual refactorings. Eventually, we should only have InMemColumn.

class InMemColumn {
public:
Expand All @@ -16,13 +14,35 @@ class InMemColumn {

void flushChunk(InMemColumnChunk* chunk);

std::unique_ptr<InMemColumnChunk> getInMemColumnChunk(common::offset_t startNodeOffset,
common::offset_t endNodeOffset, const common::CopyDescription* copyDescription) {
switch (dataType.getLogicalTypeID()) {
case common::LogicalTypeID::STRING:
case common::LogicalTypeID::VAR_LIST: {
return std::make_unique<InMemColumnChunkWithOverflow>(
dataType, startNodeOffset, endNodeOffset, copyDescription, inMemOverflowFile.get());
}
case common::LogicalTypeID::STRUCT: {
auto inMemStructColumnChunk = std::make_unique<InMemStructColumnChunk>(
dataType, startNodeOffset, endNodeOffset, copyDescription);
for (auto& fieldColumn : childColumns) {
inMemStructColumnChunk->addFieldChunk(fieldColumn->getInMemColumnChunk(
startNodeOffset, endNodeOffset, copyDescription));
}
return inMemStructColumnChunk;
}
default: {
return std::make_unique<InMemColumnChunk>(
dataType, startNodeOffset, endNodeOffset, copyDescription);
}
}
}

inline common::LogicalType getDataType() { return dataType; }
inline InMemOverflowFile* getInMemOverflowFile() { return inMemOverflowFile.get(); }
inline uint16_t getNumBytesForValue() const { return numBytesForValue; }

protected:
std::string filePath;
uint16_t numBytesForValue;
std::unique_ptr<FileHandle> fileHandle;
common::LogicalType dataType;
std::unique_ptr<InMemColumn> nullColumn;
Expand Down
Loading