Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add loader for map dataType #1891

Merged
merged 1 commit into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ if(MSVC)
# For now, hardcode _AMD64_
# CMAKE_GENERATOR_PLATFORM can be used for visual studio builds, but not for ninja
add_compile_definitions(_AMD64_)
# Non-English Windows systems may default to an encoding other than UTF-8 (e.g. GBK on Chinese systems), so force UTF-8.
add_compile_options("/utf-8")
endif()
if(CMAKE_BUILD_TYPE MATCHES Release)
if(MSVC)
Expand Down
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), content BYTEA, PRIMARY KEY (name));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), content BYTEA, audience MAP(STRING, INT64), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}","\\xAA\\xABinteresting\\x0B"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}","\\xAB\\xCD"
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}","pure ascii characters"
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}","\\xAA\\xABinteresting\\x0B","{audience1= 52,audience53= 42}"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}","\\xAB\\xCD",{audience1= 33}
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}","pure ascii characters","{}"
3 changes: 2 additions & 1 deletion src/antlr4/Cypher.g4
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ TO: ( 'T' | 't' ) ( 'O' | 'o' ) ;
kU_DataType
: oC_SymbolicName
| ( oC_SymbolicName kU_ListIdentifiers )
| oC_SymbolicName SP? '(' SP? kU_PropertyDefinitions SP? ')' ;
| oC_SymbolicName SP? '(' SP? kU_PropertyDefinitions SP? ')'
| oC_SymbolicName SP? '(' SP? kU_DataType SP? ',' SP? kU_DataType SP? ')' ;

kU_ListIdentifiers : kU_ListIdentifier ( kU_ListIdentifier )* ;

Expand Down
102 changes: 68 additions & 34 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <stdexcept>

#include "common/constants.h"
#include "common/exception.h"
#include "common/null_buffer.h"
#include "common/ser_deser.h"
Expand Down Expand Up @@ -363,44 +364,18 @@ void LogicalType::setPhysicalType() {

LogicalType LogicalTypeUtils::dataTypeFromString(const std::string& dataTypeString) {
acquamarin marked this conversation as resolved.
Show resolved Hide resolved
LogicalType dataType;
auto upperDataTypeString = StringUtils::getUpper(dataTypeString);
auto trimmedStr = StringUtils::ltrim(StringUtils::rtrim(dataTypeString));
auto upperDataTypeString = StringUtils::getUpper(trimmedStr);
if (upperDataTypeString.ends_with("[]")) {
dataType.typeID = LogicalTypeID::VAR_LIST;
dataType.extraTypeInfo = std::make_unique<VarListTypeInfo>(std::make_unique<LogicalType>(
dataTypeFromString(dataTypeString.substr(0, dataTypeString.size() - 2))));
dataType = *parseVarListType(trimmedStr);
} else if (upperDataTypeString.ends_with("]")) {
dataType.typeID = LogicalTypeID::FIXED_LIST;
auto leftBracketPos = dataTypeString.find('[');
auto rightBracketPos = dataTypeString.find(']');
auto childType = std::make_unique<LogicalType>(
dataTypeFromString(dataTypeString.substr(0, leftBracketPos)));
auto fixedNumElementsInList = std::strtoll(
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1).c_str(),
nullptr, 0 /* base */);
dataType.extraTypeInfo =
std::make_unique<FixedListTypeInfo>(std::move(childType), fixedNumElementsInList);
dataType = *parseFixedListType(trimmedStr);
} else if (upperDataTypeString.starts_with("STRUCT")) {
dataType.typeID = LogicalTypeID::STRUCT;
auto leftBracketPos = dataTypeString.find('(');
auto rightBracketPos = dataTypeString.find_last_of(')');
if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos) {
throw Exception("Cannot parse struct type: " + dataTypeString);
}
// Remove the leading and trailing brackets.
auto structTypeStr =
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto structFieldsStr = parseStructFields(structTypeStr);
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& structFieldStr : structFieldsStr) {
auto pos = structFieldStr.find(' ');
auto fieldName = structFieldStr.substr(0, pos);
auto fieldTypeString = structFieldStr.substr(pos + 1);
structFields.emplace_back(std::make_unique<StructField>(
fieldName, std::make_unique<LogicalType>(dataTypeFromString(fieldTypeString))));
}
dataType.extraTypeInfo = std::make_unique<StructTypeInfo>(std::move(structFields));
dataType = *parseStructType(trimmedStr);
} else if (upperDataTypeString.starts_with("MAP")) {
dataType = *parseMapType(trimmedStr);
} else {
dataType.typeID = dataTypeIDFromString(dataTypeString);
dataType.typeID = dataTypeIDFromString(upperDataTypeString);
}
dataType.setPhysicalType();
return dataType;
Expand Down Expand Up @@ -675,5 +650,64 @@ std::vector<std::string> LogicalTypeUtils::parseStructFields(const std::string&
return structFieldsStr;
}

std::unique_ptr<LogicalType> LogicalTypeUtils::parseVarListType(const std::string& trimmedStr) {
return std::make_unique<LogicalType>(LogicalTypeID::VAR_LIST,
std::make_unique<VarListTypeInfo>(std::make_unique<LogicalType>(
dataTypeFromString(trimmedStr.substr(0, trimmedStr.size() - 2)))));
}

std::unique_ptr<LogicalType> LogicalTypeUtils::parseFixedListType(const std::string& trimmedStr) {
auto leftBracketPos = trimmedStr.find('[');
auto rightBracketPos = trimmedStr.find(']');
auto childType =
std::make_unique<LogicalType>(dataTypeFromString(trimmedStr.substr(0, leftBracketPos)));
auto fixedNumElementsInList = std::strtoll(
trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1).c_str(),
nullptr, 0 /* base */);
return std::make_unique<LogicalType>(LogicalTypeID::FIXED_LIST,
std::make_unique<FixedListTypeInfo>(std::move(childType), fixedNumElementsInList));
}

std::unique_ptr<LogicalType> LogicalTypeUtils::parseStructType(const std::string& trimmedStr) {
auto leftBracketPos = trimmedStr.find('(');
auto rightBracketPos = trimmedStr.find_last_of(')');
if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos) {
throw Exception("Cannot parse struct type: " + trimmedStr);
}
// Remove the leading and trailing brackets.
auto structTypeStr =
trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
auto structFieldsStr = parseStructFields(structTypeStr);
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& structFieldStr : structFieldsStr) {
auto pos = structFieldStr.find(' ');
auto fieldName = structFieldStr.substr(0, pos);
auto fieldTypeString = structFieldStr.substr(pos + 1);
structFields.emplace_back(std::make_unique<StructField>(
fieldName, std::make_unique<LogicalType>(dataTypeFromString(fieldTypeString))));
}
return std::make_unique<LogicalType>(
LogicalTypeID::STRUCT, std::make_unique<StructTypeInfo>(std::move(structFields)));
}

// Builds a MAP logical type from a string of the form `MAP(<keyType>, <valueType>)`.
// The result is a LogicalTypeID::MAP whose VarListTypeInfo child is a STRUCT with two fields,
// InternalKeyword::MAP_KEY and InternalKeyword::MAP_VALUE, holding the parsed key and value types.
// Key and value may themselves be nested types containing commas, e.g.
// `MAP(STRING, STRUCT(a INT64, b STRING))`, so the split is made on the single comma that is
// not enclosed in any inner parentheses.
// Throws Exception when the parentheses are missing or out of order, or when the argument list
// does not contain exactly one top-level comma.
std::unique_ptr<LogicalType> LogicalTypeUtils::parseMapType(const std::string& trimmedStr) {
    auto leftBracketPos = trimmedStr.find('(');
    auto rightBracketPos = trimmedStr.find_last_of(')');
    if (leftBracketPos == std::string::npos || rightBracketPos == std::string::npos ||
        rightBracketPos < leftBracketPos) {
        throw Exception("Cannot parse map type: " + trimmedStr);
    }
    auto mapTypeStr = trimmedStr.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1);
    // Locate the comma separating key type from value type, skipping commas that belong to
    // nested STRUCT(...) / MAP(...) definitions.
    auto separatorPos = std::string::npos;
    int parenDepth = 0;
    for (std::string::size_type i = 0; i < mapTypeStr.size(); ++i) {
        const char c = mapTypeStr[i];
        if (c == '(') {
            ++parenDepth;
        } else if (c == ')') {
            --parenDepth;
        } else if (c == ',' && parenDepth == 0) {
            if (separatorPos != std::string::npos) {
                // More than two top-level type arguments.
                throw Exception("Cannot parse map type: " + trimmedStr);
            }
            separatorPos = i;
        }
    }
    if (separatorPos == std::string::npos) {
        // No key/value separator: indexing a second element would be out of bounds.
        throw Exception("Cannot parse map type: " + trimmedStr);
    }
    // dataTypeFromString trims surrounding whitespace, so the pieces are passed as-is.
    auto keyTypeStr = mapTypeStr.substr(0, separatorPos);
    auto valueTypeStr = mapTypeStr.substr(separatorPos + 1);
    std::vector<std::unique_ptr<StructField>> structFields;
    structFields.emplace_back(std::make_unique<StructField>(InternalKeyword::MAP_KEY,
        std::make_unique<LogicalType>(dataTypeFromString(keyTypeStr))));
    structFields.emplace_back(std::make_unique<StructField>(InternalKeyword::MAP_VALUE,
        std::make_unique<LogicalType>(dataTypeFromString(valueTypeStr))));
    auto childType = std::make_unique<LogicalType>(
        LogicalTypeID::STRUCT, std::make_unique<StructTypeInfo>(std::move(structFields)));
    return std::make_unique<LogicalType>(
        LogicalTypeID::MAP, std::make_unique<VarListTypeInfo>(std::move(childType)));
}

} // namespace common
} // namespace kuzu
3 changes: 2 additions & 1 deletion src/common/types/value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,9 @@ std::string Value::toString() const {
result += structVal->children[0]->toString();
result += "=";
result += structVal->children[1]->toString();
result += (i == childrenSize - 1 ? "}" : ", ");
result += (i == childrenSize - 1 ? "" : ", ");
}
result += "}";
return result;
}
case LogicalTypeID::VAR_LIST:
Expand Down
4 changes: 2 additions & 2 deletions src/function/vector_map_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ std::unique_ptr<FunctionBindData> MapCreationVectorFunctions::bindFunc(
auto valueType = common::VarListType::getChildType(&arguments[1]->dataType);
std::vector<std::unique_ptr<common::StructField>> structFields;
structFields.emplace_back(std::make_unique<common::StructField>(
"key", std::make_unique<common::LogicalType>(*keyType)));
common::InternalKeyword::MAP_KEY, std::make_unique<common::LogicalType>(*keyType)));
structFields.emplace_back(std::make_unique<common::StructField>(
"value", std::make_unique<common::LogicalType>(*valueType)));
common::InternalKeyword::MAP_VALUE, std::make_unique<common::LogicalType>(*valueType)));
auto mapStructType = std::make_unique<common::LogicalType>(common::LogicalTypeID::STRUCT,
std::make_unique<common::StructTypeInfo>(std::move(structFields)));
auto resultType = common::LogicalType(common::LogicalTypeID::MAP,
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ struct InternalKeyword {
static constexpr char TAG[] = "_TAG";
static constexpr char STAR[] = "*";
static constexpr char PLACE_HOLDER[] = "_PLACE_HOLDER";
static constexpr char MAP_KEY[] = "KEY";
static constexpr char MAP_VALUE[] = "VALUE";
};

enum PageSizeClass : uint8_t {
Expand Down
6 changes: 5 additions & 1 deletion src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -408,8 +408,12 @@ class LogicalTypeUtils {
static std::vector<LogicalType> getAllValidLogicTypes();

private:
static LogicalTypeID dataTypeIDFromString(const std::string& dataTypeIDString);
static LogicalTypeID dataTypeIDFromString(const std::string& trimmedStr);
static std::vector<std::string> parseStructFields(const std::string& structTypeStr);
static std::unique_ptr<LogicalType> parseVarListType(const std::string& trimmedStr);
static std::unique_ptr<LogicalType> parseFixedListType(const std::string& trimmedStr);
static std::unique_ptr<LogicalType> parseStructType(const std::string& trimmedStr);
static std::unique_ptr<LogicalType> parseMapType(const std::string& trimmedStr);
};

enum class DBFileType : uint8_t { ORIGINAL = 0, WAL_VERSION = 1 };
Expand Down
11 changes: 9 additions & 2 deletions src/include/storage/copier/table_copy_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ struct FileBlockInfo {
class TableCopyUtils {
public:
static void throwCopyExceptionIfNotOK(const arrow::Status& status);
static std::unique_ptr<common::Value> getArrowVarList(const std::string& l, int64_t from,
static std::unique_ptr<common::Value> getVarListValue(const std::string& l, int64_t from,
int64_t to, const common::LogicalType& dataType,
const common::CopyDescription& copyDescription);
static std::unique_ptr<common::Value> getArrowFixedListVal(const std::string& l, int64_t from,
Expand All @@ -48,7 +48,7 @@ class TableCopyUtils {
catalog::TableSchema* tableSchema,
std::unordered_map<std::string, FileBlockInfo>& fileBlockInfos);

static std::vector<std::pair<int64_t, int64_t>> getListElementPos(const std::string& l,
static std::vector<std::pair<int64_t, int64_t>> splitByDelimiter(const std::string& l,
int64_t from, int64_t to, const common::CopyDescription& copyDescription);

static std::shared_ptr<arrow::DataType> toArrowDataType(const common::LogicalType& dataType);
Expand All @@ -68,6 +68,13 @@ class TableCopyUtils {
static std::vector<std::string> getColumnNamesToRead(catalog::TableSchema* tableSchema);
static void validateNumElementsInList(
uint64_t numElementsRead, const common::LogicalType& type);
static std::unique_ptr<common::Value> parseVarList(const std::string& l, int64_t from,
int64_t to, const common::LogicalType& dataType,
const common::CopyDescription& copyDescription);
static std::unique_ptr<common::Value> parseMap(const std::string& l, int64_t from, int64_t to,
const common::LogicalType& dataType, const common::CopyDescription& copyDescription);
static std::pair<std::string, std::string> parseMapFields(const std::string& l, int64_t from,
int64_t length, const common::CopyDescription& copyDescription);
};

} // namespace storage
Expand Down
4 changes: 4 additions & 0 deletions src/include/storage/copier/var_list_column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ struct VarListDataColumnChunk {
VarListDataColumnChunk(std::unique_ptr<ColumnChunk> dataChunk)
: dataChunk{std::move(dataChunk)}, numValuesInDataChunk{0},
capacityInDataChunk{StorageConstants::NODE_GROUP_SIZE} {}

void reset();
};

class VarListColumnChunk : public ColumnChunk {
Expand All @@ -27,6 +29,8 @@ class VarListColumnChunk : public ColumnChunk {

void setValueFromString(const char* value, uint64_t length, uint64_t pos);

void resetToEmpty() final;

private:
inline common::page_idx_t getNumPages() const final {
return varListDataColumnChunk.dataChunk->getNumPages() + ColumnChunk::getNumPages();
Expand Down
15 changes: 9 additions & 6 deletions src/include/storage/store/var_list_node_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@ class VarListNodeColumn : public NodeColumn {
ColumnChunk* columnChunk, common::page_idx_t startPageIdx, uint64_t nodeGroupIdx) override;

private:
inline common::offset_t readListOffsetInStorage(
transaction::Transaction* transaction, common::offset_t nodeOffset) {
return nodeOffset == 0 ? 0 : readOffset(transaction, nodeOffset - 1);
// Returns 0 when offsetInNodeGroup is 0; otherwise returns the value that readOffset yields for
// the preceding position (offsetInNodeGroup - 1) in the same node group.
inline common::offset_t readListOffsetInStorage(transaction::Transaction* transaction,
    common::node_group_idx_t nodeGroupIdx, common::offset_t offsetInNodeGroup) {
    if (offsetInNodeGroup == 0) {
        return 0;
    }
    return readOffset(transaction, nodeGroupIdx, offsetInNodeGroup - 1);
}

void scanUnfiltered(transaction::Transaction* transaction,
Expand All @@ -85,11 +87,12 @@ class VarListNodeColumn : public NodeColumn {

void rollbackInMemory() final;

common::offset_t readOffset(transaction::Transaction* transaction, common::offset_t valuePos);
common::offset_t readOffset(transaction::Transaction* transaction,
common::node_group_idx_t nodeGroupIdx, common::offset_t offsetInNodeGroup);

ListOffsetInfoInStorage getListOffsetInfoInStorage(transaction::Transaction* transaction,
common::node_group_idx_t nodeGroupIdx, common::offset_t startOffset,
common::offset_t endOffset, std::shared_ptr<common::DataChunkState> state);
common::node_group_idx_t nodeGroupIdx, common::offset_t startOffsetInNodeGroup,
common::offset_t endOffsetInNodeGroup, std::shared_ptr<common::DataChunkState> state);

private:
std::unique_ptr<NodeColumn> dataNodeColumn;
Expand Down
27 changes: 12 additions & 15 deletions src/storage/copier/column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,25 +342,22 @@ void FixedListColumnChunk::write(const common::Value& fixedListVal, uint64_t pos

std::unique_ptr<ColumnChunk> ColumnChunkFactory::createColumnChunk(
const LogicalType& dataType, CopyDescription* copyDescription) {
switch (dataType.getLogicalTypeID()) {
case LogicalTypeID::BOOL:
case LogicalTypeID::INT64:
case LogicalTypeID::INT32:
case LogicalTypeID::INT16:
case LogicalTypeID::DOUBLE:
case LogicalTypeID::FLOAT:
case LogicalTypeID::DATE:
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::INTERVAL:
switch (dataType.getPhysicalType()) {
case PhysicalTypeID::BOOL:
case PhysicalTypeID::INT64:
case PhysicalTypeID::INT32:
case PhysicalTypeID::INT16:
case PhysicalTypeID::DOUBLE:
case PhysicalTypeID::FLOAT:
case PhysicalTypeID::INTERVAL:
return std::make_unique<ColumnChunk>(dataType, copyDescription);
case LogicalTypeID::FIXED_LIST:
case PhysicalTypeID::FIXED_LIST:
return std::make_unique<FixedListColumnChunk>(dataType, copyDescription);
case LogicalTypeID::BLOB:
case LogicalTypeID::STRING:
case PhysicalTypeID::STRING:
return std::make_unique<StringColumnChunk>(dataType, copyDescription);
case LogicalTypeID::VAR_LIST:
case PhysicalTypeID::VAR_LIST:
return std::make_unique<VarListColumnChunk>(dataType, copyDescription);
case LogicalTypeID::STRUCT:
case PhysicalTypeID::STRUCT:
return std::make_unique<StructColumnChunk>(dataType, copyDescription);
default: {
throw NotImplementedException("ColumnChunkFactory::createColumnChunk for data type " +
Expand Down
Loading
Loading