Skip to content

Commit

Permalink
Add loader for map dataType
Browse files Browse the repository at this point in the history
  • Loading branch information
acquamarin committed Aug 7, 2023
1 parent 315d7c7 commit d116825
Show file tree
Hide file tree
Showing 23 changed files with 2,567 additions and 2,342 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ if(MSVC)
# For now, hardcode _AMD64_
# CMAKE_GENERATOR_PLATFORM can be used for visual studio builds, but not for ninja
add_compile_definitions(_AMD64_)
# Non-English Windows systems may use encodings other than UTF-8 (e.g. Chinese systems use GBK).
add_compile_options("/utf-8")
endif()
if(CMAKE_BUILD_TYPE MATCHES Release)
if(MSVC)
Expand Down
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), content BYTEA, PRIMARY KEY (name));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), content BYTEA, audience MAP(STRING, INT64), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}","\\xAA\\xABinteresting\\x0B"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}","\\xAB\\xCD"
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}","pure ascii characters"
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}","\\xAA\\xABinteresting\\x0B","{audience1= 52,audience53= 42}"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}","\\xAB\\xCD",{audience1= 33}
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}","pure ascii characters","{}"
3 changes: 2 additions & 1 deletion src/antlr4/Cypher.g4
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ TO: ( 'T' | 't' ) ( 'O' | 'o' ) ;
kU_DataType
: oC_SymbolicName
| ( oC_SymbolicName kU_ListIdentifiers )
| oC_SymbolicName SP? '(' SP? kU_PropertyDefinitions SP? ')' ;
| oC_SymbolicName SP? '(' SP? kU_PropertyDefinitions SP? ')'
| oC_SymbolicName SP? '(' SP? kU_DataType SP? ',' SP? kU_DataType SP? ')' ;

kU_ListIdentifiers : kU_ListIdentifier ( kU_ListIdentifier )* ;

Expand Down
102 changes: 68 additions & 34 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <stdexcept>

#include "common/constants.h"
#include "common/exception.h"
#include "common/null_buffer.h"
#include "common/ser_deser.h"
Expand Down Expand Up @@ -363,44 +364,18 @@ void LogicalType::setPhysicalType() {

LogicalType LogicalTypeUtils::dataTypeFromString(const std::string& dataTypeString) {
LogicalType dataType;
auto upperDataTypeString = StringUtils::getUpper(dataTypeString);
auto trimmedStr = StringUtils::ltrim(StringUtils::rtrim(dataTypeString));
auto upperDataTypeString = StringUtils::getUpper(trimmedStr);
if (upperDataTypeString.ends_with("[]")) {
dataType.typeID = LogicalTypeID::VAR_LIST;
dataType.extraTypeInfo = std::make_unique<VarListTypeInfo>(std::make_unique<LogicalType>(
dataTypeFromString(dataTypeString.substr(0, dataTypeString.size() - 2))));
dataType = *parseVarListType(trimmedStr);
} else if (upperDataTypeString.ends_with("]")) {
dataType.typeID = LogicalTypeID::FIXED_LIST;
auto leftBracketPos = dataTypeString.find('[');
auto rightBracketPos = dataTypeString.find(']');
auto childType = std::make_unique<LogicalType>(
dataTypeFromString(dataTypeString.substr(0, leftBracketPos)));
auto fixedNumElementsInList = std::strtoll(
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1).c_str(),
nullptr, 0 /* base */);
dataType.extraTypeInfo =
std::make_unique<FixedListTypeInfo>(std::move(childType), fixedNumElementsInList);
dataType = *parseFixedListType(trimmedStr);
} else if (upperDataTypeString.starts_with("STRUCT")) {
dataType.typeID = LogicalTypeID::STRUCT;
auto leftBracketPos = dataTypeString.find('(');
auto rightBracketPos = dataTypeString.find_last_of(')');
if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos) {
throw Exception("Cannot parse struct type: " + dataTypeString);
}
// Remove the leading and trailing brackets.
auto structTypeStr =
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto structFieldsStr = parseStructFields(structTypeStr);
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& structFieldStr : structFieldsStr) {
auto pos = structFieldStr.find(' ');
auto fieldName = structFieldStr.substr(0, pos);
auto fieldTypeString = structFieldStr.substr(pos + 1);
structFields.emplace_back(std::make_unique<StructField>(
fieldName, std::make_unique<LogicalType>(dataTypeFromString(fieldTypeString))));
}
dataType.extraTypeInfo = std::make_unique<StructTypeInfo>(std::move(structFields));
dataType = *parseStructType(trimmedStr);
} else if (upperDataTypeString.starts_with("MAP")) {
dataType = *parseMapType(trimmedStr);
} else {
dataType.typeID = dataTypeIDFromString(dataTypeString);
dataType.typeID = dataTypeIDFromString(upperDataTypeString);
}
dataType.setPhysicalType();
return dataType;
Expand Down Expand Up @@ -675,5 +650,64 @@ std::vector<std::string> LogicalTypeUtils::parseStructFields(const std::string&
return structFieldsStr;
}

std::unique_ptr<LogicalType> LogicalTypeUtils::parseVarListType(const std::string& trimmedStr) {
return std::make_unique<LogicalType>(LogicalTypeID::VAR_LIST,
std::make_unique<VarListTypeInfo>(std::make_unique<LogicalType>(
dataTypeFromString(trimmedStr.substr(0, trimmedStr.size() - 2)))));
}

std::unique_ptr<LogicalType> LogicalTypeUtils::parseFixedListType(const std::string& trimmedStr) {
auto leftBracketPos = trimmedStr.find('[');
auto rightBracketPos = trimmedStr.find(']');
auto childType =
std::make_unique<LogicalType>(dataTypeFromString(trimmedStr.substr(0, leftBracketPos)));
auto fixedNumElementsInList = std::strtoll(
trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1).c_str(),
nullptr, 0 /* base */);
return std::make_unique<LogicalType>(LogicalTypeID::FIXED_LIST,
std::make_unique<FixedListTypeInfo>(std::move(childType), fixedNumElementsInList));
}

std::unique_ptr<LogicalType> LogicalTypeUtils::parseStructType(const std::string& trimmedStr) {
auto leftBracketPos = trimmedStr.find('(');
auto rightBracketPos = trimmedStr.find_last_of(')');
if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos) {
throw Exception("Cannot parse struct type: " + trimmedStr);
}
// Remove the leading and trailing brackets.
auto structTypeStr =
trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto structFieldsStr = parseStructFields(structTypeStr);
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& structFieldStr : structFieldsStr) {
auto pos = structFieldStr.find(' ');
auto fieldName = structFieldStr.substr(0, pos);
auto fieldTypeString = structFieldStr.substr(pos + 1);
structFields.emplace_back(std::make_unique<StructField>(
fieldName, std::make_unique<LogicalType>(dataTypeFromString(fieldTypeString))));
}
return std::make_unique<LogicalType>(
LogicalTypeID::STRUCT, std::make_unique<StructTypeInfo>(std::move(structFields)));
}

// Builds a MAP logical type from a type string such as "MAP(STRING, INT64)".
// The text between the first '(' and the last ')' is split at the first comma that is not
// nested inside another pair of parentheses, so key/value types that themselves contain commas
// (e.g. "MAP(STRING, STRUCT(a INT64, b STRING))") stay intact. Both halves are parsed
// recursively with dataTypeFromString.
// The result is a MAP type whose child is a STRUCT with two fields named
// InternalKeyword::MAP_KEY and InternalKeyword::MAP_VALUE.
// Throws Exception when the brackets are missing or out of order, or when no top-level comma
// separates a key type from a value type.
std::unique_ptr<LogicalType> LogicalTypeUtils::parseMapType(const std::string& trimmedStr) {
    auto leftBracketPos = trimmedStr.find('(');
    auto rightBracketPos = trimmedStr.find_last_of(')');
    if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos ||
        rightBracketPos < leftBracketPos) {
        throw Exception("Cannot parse map type: " + trimmedStr);
    }
    auto mapTypeStr = trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
    // Locate the comma separating key type from value type, skipping commas that belong to a
    // nested parenthesised type.
    auto separatorPos = std::string::npos;
    int64_t nestingDepth = 0;
    for (size_t i = 0; i < mapTypeStr.size(); i++) {
        auto c = mapTypeStr[i];
        if (c == '(') {
            nestingDepth++;
        } else if (c == ')') {
            nestingDepth--;
        } else if (c == ',' && nestingDepth == 0) {
            separatorPos = i;
            break;
        }
    }
    if (separatorPos == std::string::npos) {
        throw Exception("Cannot parse map type: " + trimmedStr);
    }
    auto keyTypeStr = mapTypeStr.substr(0, separatorPos);
    auto valueTypeStr = mapTypeStr.substr(separatorPos + 1);
    std::vector<std::unique_ptr<StructField>> structFields;
    structFields.emplace_back(std::make_unique<StructField>(
        InternalKeyword::MAP_KEY, std::make_unique<LogicalType>(dataTypeFromString(keyTypeStr))));
    structFields.emplace_back(std::make_unique<StructField>(InternalKeyword::MAP_VALUE,
        std::make_unique<LogicalType>(dataTypeFromString(valueTypeStr))));
    auto childType = std::make_unique<LogicalType>(
        LogicalTypeID::STRUCT, std::make_unique<StructTypeInfo>(std::move(structFields)));
    return std::make_unique<LogicalType>(
        LogicalTypeID::MAP, std::make_unique<VarListTypeInfo>(std::move(childType)));
}

} // namespace common
} // namespace kuzu
3 changes: 2 additions & 1 deletion src/common/types/value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,9 @@ std::string Value::toString() const {
result += structVal->children[0]->toString();
result += "=";
result += structVal->children[1]->toString();
result += (i == childrenSize - 1 ? "}" : ", ");
result += (i == childrenSize - 1 ? "" : ", ");
}
result += "}";
return result;
}
case LogicalTypeID::VAR_LIST:
Expand Down
4 changes: 2 additions & 2 deletions src/function/vector_map_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ std::unique_ptr<FunctionBindData> MapCreationVectorFunctions::bindFunc(
auto valueType = common::VarListType::getChildType(&arguments[1]->dataType);
std::vector<std::unique_ptr<common::StructField>> structFields;
structFields.emplace_back(std::make_unique<common::StructField>(
"key", std::make_unique<common::LogicalType>(*keyType)));
common::InternalKeyword::MAP_KEY, std::make_unique<common::LogicalType>(*keyType)));
structFields.emplace_back(std::make_unique<common::StructField>(
"value", std::make_unique<common::LogicalType>(*valueType)));
common::InternalKeyword::MAP_VALUE, std::make_unique<common::LogicalType>(*valueType)));
auto mapStructType = std::make_unique<common::LogicalType>(common::LogicalTypeID::STRUCT,
std::make_unique<common::StructTypeInfo>(std::move(structFields)));
auto resultType = common::LogicalType(common::LogicalTypeID::MAP,
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ struct InternalKeyword {
static constexpr char TAG[] = "_TAG";
static constexpr char STAR[] = "*";
static constexpr char PLACE_HOLDER[] = "_PLACE_HOLDER";
static constexpr char MAP_KEY[] = "KEY";
static constexpr char MAP_VALUE[] = "VALUE";
};

enum PageSizeClass : uint8_t {
Expand Down
6 changes: 5 additions & 1 deletion src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -408,8 +408,12 @@ class LogicalTypeUtils {
static std::vector<LogicalType> getAllValidLogicTypes();

private:
static LogicalTypeID dataTypeIDFromString(const std::string& dataTypeIDString);
static LogicalTypeID dataTypeIDFromString(const std::string& trimmedStr);
static std::vector<std::string> parseStructFields(const std::string& structTypeStr);
static std::unique_ptr<LogicalType> parseVarListType(const std::string& trimmedStr);
static std::unique_ptr<LogicalType> parseFixedListType(const std::string& trimmedStr);
static std::unique_ptr<LogicalType> parseStructType(const std::string& trimmedStr);
static std::unique_ptr<LogicalType> parseMapType(const std::string& trimmedStr);
};

enum class DBFileType : uint8_t { ORIGINAL = 0, WAL_VERSION = 1 };
Expand Down
11 changes: 9 additions & 2 deletions src/include/storage/copier/table_copy_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct FileBlockInfo {
class TableCopyUtils {
public:
static void throwCopyExceptionIfNotOK(const arrow::Status& status);
static std::unique_ptr<common::Value> getArrowVarList(const std::string& l, int64_t from,
static std::unique_ptr<common::Value> getVarListValue(const std::string& l, int64_t from,
int64_t to, const common::LogicalType& dataType,
const common::CopyDescription& copyDescription);
static std::unique_ptr<common::Value> getArrowFixedListVal(const std::string& l, int64_t from,
Expand All @@ -48,7 +48,7 @@ class TableCopyUtils {
catalog::TableSchema* tableSchema,
std::unordered_map<std::string, FileBlockInfo>& fileBlockInfos);

static std::vector<std::pair<int64_t, int64_t>> getListElementPos(const std::string& l,
static std::vector<std::pair<int64_t, int64_t>> splitByDelimiter(const std::string& l,
int64_t from, int64_t to, const common::CopyDescription& copyDescription);

static std::shared_ptr<arrow::DataType> toArrowDataType(const common::LogicalType& dataType);
Expand All @@ -68,6 +68,13 @@ class TableCopyUtils {
static std::vector<std::string> getColumnNamesToRead(catalog::TableSchema* tableSchema);
static void validateNumElementsInList(
uint64_t numElementsRead, const common::LogicalType& type);
static std::unique_ptr<common::Value> parseVarList(const std::string& l, int64_t from,
int64_t to, const common::LogicalType& dataType,
const common::CopyDescription& copyDescription);
static std::unique_ptr<common::Value> parseMap(const std::string& l, int64_t from, int64_t to,
const common::LogicalType& dataType, const common::CopyDescription& copyDescription);
static std::pair<std::string, std::string> parseMapFields(const std::string& l, int64_t from,
int64_t length, const common::CopyDescription& copyDescription);
};

} // namespace storage
Expand Down
4 changes: 4 additions & 0 deletions src/include/storage/copier/var_list_column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ struct VarListDataColumnChunk {
VarListDataColumnChunk(std::unique_ptr<ColumnChunk> dataChunk)
: dataChunk{std::move(dataChunk)}, numValuesInDataChunk{0},
capacityInDataChunk{StorageConstants::NODE_GROUP_SIZE} {}

void reset();
};

class VarListColumnChunk : public ColumnChunk {
Expand All @@ -27,6 +29,8 @@ class VarListColumnChunk : public ColumnChunk {

void setValueFromString(const char* value, uint64_t length, uint64_t pos);

void resetToEmpty() final;

private:
inline common::page_idx_t getNumPages() const final {
return varListDataColumnChunk.dataChunk->getNumPages() + ColumnChunk::getNumPages();
Expand Down
15 changes: 9 additions & 6 deletions src/include/storage/store/var_list_node_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@ class VarListNodeColumn : public NodeColumn {
ColumnChunk* columnChunk, common::page_idx_t startPageIdx, uint64_t nodeGroupIdx) override;

private:
inline common::offset_t readListOffsetInStorage(
transaction::Transaction* transaction, common::offset_t nodeOffset) {
return nodeOffset == 0 ? 0 : readOffset(transaction, nodeOffset - 1);
inline common::offset_t readListOffsetInStorage(transaction::Transaction* transaction,
common::node_group_idx_t nodeGroupIdx, common::offset_t offsetInNodeGroup) {
return offsetInNodeGroup == 0 ?
0 :
readOffset(transaction, nodeGroupIdx, offsetInNodeGroup - 1);
}

void scanUnfiltered(transaction::Transaction* transaction,
Expand All @@ -85,11 +87,12 @@ class VarListNodeColumn : public NodeColumn {

void rollbackInMemory() final;

common::offset_t readOffset(transaction::Transaction* transaction, common::offset_t valuePos);
common::offset_t readOffset(transaction::Transaction* transaction,
common::node_group_idx_t nodeGroupIdx, common::offset_t offsetInNodeGroup);

ListOffsetInfoInStorage getListOffsetInfoInStorage(transaction::Transaction* transaction,
common::node_group_idx_t nodeGroupIdx, common::offset_t startOffset,
common::offset_t endOffset, std::shared_ptr<common::DataChunkState> state);
common::node_group_idx_t nodeGroupIdx, common::offset_t startOffsetInNodeGroup,
common::offset_t endOffsetInNodeGroup, std::shared_ptr<common::DataChunkState> state);

private:
std::unique_ptr<NodeColumn> dataNodeColumn;
Expand Down
27 changes: 12 additions & 15 deletions src/storage/copier/column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,25 +342,22 @@ void FixedListColumnChunk::write(const common::Value& fixedListVal, uint64_t pos

std::unique_ptr<ColumnChunk> ColumnChunkFactory::createColumnChunk(
const LogicalType& dataType, CopyDescription* copyDescription) {
switch (dataType.getLogicalTypeID()) {
case LogicalTypeID::BOOL:
case LogicalTypeID::INT64:
case LogicalTypeID::INT32:
case LogicalTypeID::INT16:
case LogicalTypeID::DOUBLE:
case LogicalTypeID::FLOAT:
case LogicalTypeID::DATE:
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::INTERVAL:
switch (dataType.getPhysicalType()) {
case PhysicalTypeID::BOOL:
case PhysicalTypeID::INT64:
case PhysicalTypeID::INT32:
case PhysicalTypeID::INT16:
case PhysicalTypeID::DOUBLE:
case PhysicalTypeID::FLOAT:
case PhysicalTypeID::INTERVAL:
return std::make_unique<ColumnChunk>(dataType, copyDescription);
case LogicalTypeID::FIXED_LIST:
case PhysicalTypeID::FIXED_LIST:
return std::make_unique<FixedListColumnChunk>(dataType, copyDescription);
case LogicalTypeID::BLOB:
case LogicalTypeID::STRING:
case PhysicalTypeID::STRING:
return std::make_unique<StringColumnChunk>(dataType, copyDescription);
case LogicalTypeID::VAR_LIST:
case PhysicalTypeID::VAR_LIST:
return std::make_unique<VarListColumnChunk>(dataType, copyDescription);
case LogicalTypeID::STRUCT:
case PhysicalTypeID::STRUCT:
return std::make_unique<StructColumnChunk>(dataType, copyDescription);
default: {
throw NotImplementedException("ColumnChunkFactory::createColumnChunk for data type " +
Expand Down
Loading

0 comments on commit d116825

Please sign in to comment.