Skip to content

Commit

Permalink
Add loader and storage support for struct
Browse files Browse the repository at this point in the history
  • Loading branch information
acquamarin committed Apr 26, 2023
1 parent 14e1277 commit 14bf9c3
Show file tree
Hide file tree
Showing 45 changed files with 616 additions and 224 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ __pycache__/
*$py.class
cmake-build-debug/
test/unittest_temp/
tools/python_api/test/test_PYTHON_CSV.csv

# antlr4 jar
scripts/antlr4/antlr4.jar
Expand Down
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, PRIMARY KEY (name));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Sóló cón tu párejâ,126, this is a very very good movie
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good
Roma,298,the movie is very interesting and funny
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}"
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}"
10 changes: 4 additions & 6 deletions src/binder/bind_expression/bind_function_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,8 @@ std::shared_ptr<Expression> ExpressionBinder::bindNodeLabelFunction(const Expres
auto nodeTableIDs = catalogContent->getNodeTableIDs();
expression_vector children;
children.push_back(node.getInternalIDProperty());
auto labelsValue =
std::make_unique<Value>(DataType(VAR_LIST, std::make_unique<DataType>(STRING)),
populateLabelValues(nodeTableIDs, *catalogContent));
auto labelsValue = std::make_unique<Value>(DataType(std::make_unique<DataType>(STRING)),
populateLabelValues(nodeTableIDs, *catalogContent));
children.push_back(createLiteralExpression(std::move(labelsValue)));
auto execFunc = function::LabelVectorOperation::execFunction;
auto bindData = std::make_unique<function::FunctionBindData>(DataType(STRING));
Expand All @@ -210,9 +209,8 @@ std::shared_ptr<Expression> ExpressionBinder::bindRelLabelFunction(const Express
auto relTableIDs = catalogContent->getRelTableIDs();
expression_vector children;
children.push_back(rel.getInternalIDProperty());
auto labelsValue =
std::make_unique<Value>(DataType(VAR_LIST, std::make_unique<DataType>(STRING)),
populateLabelValues(relTableIDs, *catalogContent));
auto labelsValue = std::make_unique<Value>(DataType(std::make_unique<DataType>(STRING)),
populateLabelValues(relTableIDs, *catalogContent));
children.push_back(createLiteralExpression(std::move(labelsValue)));
auto execFunc = function::LabelVectorOperation::execFunction;
auto bindData = std::make_unique<function::FunctionBindData>(DataType(STRING));
Expand Down
8 changes: 3 additions & 5 deletions src/c_api/data_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@ kuzu_data_type* kuzu_data_type_create(
} else {
auto child_type_pty =
std::make_unique<DataType>(*static_cast<DataType*>(child_type->_data_type));
data_type =
fixed_num_elements_in_list > 0 ?
new DataType(static_cast<DataTypeID>(data_type_id_u8), std::move(child_type_pty),
fixed_num_elements_in_list) :
new DataType(static_cast<DataTypeID>(data_type_id_u8), std::move(child_type_pty));
data_type = fixed_num_elements_in_list > 0 ?
new DataType(std::move(child_type_pty), fixed_num_elements_in_list) :
new DataType(std::move(child_type_pty));
}
c_data_type->_data_type = data_type;
return c_data_type;
Expand Down
6 changes: 5 additions & 1 deletion src/common/types/timestamp_t.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "common/types/timestamp_t.h"

#include "common/exception.h"
#include "common/string_utils.h"

namespace kuzu {
namespace common {
Expand Down Expand Up @@ -120,10 +121,13 @@ timestamp_t Timestamp::FromCString(const char* str, uint64_t len) {

// Find the string len for date
uint32_t dateStrLen = 0;
// Skip leading spaces.
while (common::StringUtils::CharacterIsSpace(str[dateStrLen])) {
dateStrLen++;
}
while (dateStrLen < len && str[dateStrLen] != ' ' && str[dateStrLen] != 'T') {
dateStrLen++;
}

if (!Date::TryConvertDate(str, dateStrLen, pos, date)) {
throw ConversionException(getTimestampConversionExceptionMsg(str, len));
}
Expand Down
67 changes: 30 additions & 37 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ uint64_t SerDeser::deserializeValue(FixedListTypeInfo& value, FileInfo* fileInfo
template<>
uint64_t SerDeser::serializeValue(
const StructTypeInfo& value, FileInfo* fileInfo, uint64_t offset) {
auto childrenTypesToReturn = value.getChildrenTypes();
return serializeVector(childrenTypesToReturn, fileInfo, offset);
return serializeVector(value.fields, fileInfo, offset);
}

template<>
Expand All @@ -51,15 +50,18 @@ uint64_t SerDeser::deserializeValue(StructTypeInfo& value, FileInfo* fileInfo, u
}

template<>
uint64_t SerDeser::serializeValue(const StructField& value, FileInfo* fileInfo, uint64_t offset) {
offset = serializeValue(value.getName(), fileInfo, offset);
return serializeValue(*value.getType(), fileInfo, offset);
uint64_t SerDeser::serializeValue(
const std::unique_ptr<StructField>& value, FileInfo* fileInfo, uint64_t offset) {
offset = serializeValue<std::string>(value->name, fileInfo, offset);
return serializeValue(*value->getType(), fileInfo, offset);
}

template<>
uint64_t SerDeser::deserializeValue(StructField& value, FileInfo* fileInfo, uint64_t offset) {
offset = deserializeValue(value.name, fileInfo, offset);
return deserializeValue(*value.type, fileInfo, offset);
uint64_t SerDeser::deserializeValue(
std::unique_ptr<StructField>& value, FileInfo* fileInfo, uint64_t offset) {
value = std::make_unique<StructField>();
offset = deserializeValue<std::string>(value->name, fileInfo, offset);
return deserializeValue(*value->type, fileInfo, offset);
}

template<>
Expand Down Expand Up @@ -147,48 +149,37 @@ std::unique_ptr<StructField> StructField::copy() const {
}

std::unique_ptr<ExtraTypeInfo> StructTypeInfo::copy() const {
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& field : fields) {
structFields.emplace_back(field->copy());
std::vector<std::unique_ptr<StructField>> structFields{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
structFields[i] = fields[i]->copy();
}
return std::make_unique<StructTypeInfo>(std::move(structFields));
}

std::vector<DataType*> StructTypeInfo::getChildrenTypes() const {
std::vector<DataType*> childrenTypesToReturn;
for (auto& field : fields) {
childrenTypesToReturn.push_back(field->getType());
std::vector<DataType*> childrenTypesToReturn{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
childrenTypesToReturn[i] = fields[i]->getType();
}
return childrenTypesToReturn;
}

std::vector<std::string> StructTypeInfo::getChildrenNames() const {
std::vector<std::string> childrenNames;
for (auto& field : fields) {
childrenNames.push_back(field->getName());
std::vector<std::string> childrenNames{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
childrenNames[i] = fields[i]->getName();
}
return childrenNames;
}

DataType::DataType() : typeID{ANY}, extraTypeInfo{nullptr} {}

DataType::DataType(DataTypeID typeID) : typeID{typeID}, extraTypeInfo{nullptr} {}

DataType::DataType(DataTypeID typeID, std::unique_ptr<DataType> childType)
: typeID{VAR_LIST}, extraTypeInfo{std::make_unique<VarListTypeInfo>(std::move(childType))} {
assert(typeID == VAR_LIST);
}

DataType::DataType(
DataTypeID typeID, std::unique_ptr<DataType> childType, uint64_t fixedNumElementsInList)
: typeID{FIXED_LIST}, extraTypeInfo{std::make_unique<FixedListTypeInfo>(
std::move(childType), fixedNumElementsInList)} {
assert(typeID == FIXED_LIST);
std::vector<StructField*> StructTypeInfo::getStructFields() const {
std::vector<StructField*> structFields{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
structFields[i] = fields[i].get();
}
return structFields;
}

DataType::DataType(DataTypeID typeID, std::vector<std::unique_ptr<StructField>> childrenTypes)
: typeID{STRUCT}, extraTypeInfo{std::make_unique<StructTypeInfo>(std::move(childrenTypes))} {}

DataType::DataType(const DataType& other) {
typeID = other.typeID;
if (other.extraTypeInfo != nullptr) {
Expand Down Expand Up @@ -297,7 +288,7 @@ DataType Types::dataTypeFromString(const std::string& dataTypeString) {
throw Exception("Cannot parse struct type: " + dataTypeString);
}
std::istringstream iss{
dataTypeString.substr(leftBracketPos, rightBracketPos - leftBracketPos)};
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1)};
std::vector<std::unique_ptr<StructField>> childrenTypes;
std::string field, fieldName, fieldType;
while (getline(iss, field, ',')) {
Expand Down Expand Up @@ -360,10 +351,12 @@ std::string Types::dataTypeToString(const DataType& dataType) {
case STRUCT: {
auto structTypeInfo = reinterpret_cast<StructTypeInfo*>(dataType.extraTypeInfo.get());
std::string dataTypeStr = dataTypeToString(dataType.typeID) + "(";
for (auto& childType : structTypeInfo->getChildrenTypes()) {
dataTypeStr += dataTypeToString(*childType);
auto numFields = structTypeInfo->getChildrenTypes().size();
for (auto i = 0u; i < numFields - 1; i++) {
dataTypeStr += dataTypeToString(*structTypeInfo->getChildrenTypes()[i]);
dataTypeStr += ",";
}
dataTypeStr += dataTypeToString(*structTypeInfo->getChildrenTypes()[numFields - 1]);
return dataTypeStr + ")";
}
case ANY:
Expand Down
4 changes: 2 additions & 2 deletions src/function/built_in_aggregate_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ void BuiltInAggregateFunctions::registerCountStar() {
void BuiltInAggregateFunctions::registerCount() {
std::vector<std::unique_ptr<AggregateFunctionDefinition>> definitions;
for (auto& typeID : DataType::getAllValidTypeIDs()) {
auto inputType = (typeID == VAR_LIST ? DataType(VAR_LIST, std::make_unique<DataType>(ANY)) :
DataType(typeID));
auto inputType =
(typeID == VAR_LIST ? DataType(std::make_unique<DataType>(ANY)) : DataType(typeID));
for (auto isDistinct : std::vector<bool>{true, false}) {
definitions.push_back(std::make_unique<AggregateFunctionDefinition>(COUNT_FUNC_NAME,
std::vector<DataTypeID>{typeID}, INT64,
Expand Down
2 changes: 1 addition & 1 deletion src/function/vector_list_operation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ std::unique_ptr<FunctionBindData> ListCreationVectorOperation::bindFunc(
LIST_CREATION_FUNC_NAME, arguments[0]->getDataType(), arguments[i]->getDataType()));
}
}
auto resultType = DataType(VAR_LIST, std::make_unique<DataType>(arguments[0]->getDataType()));
auto resultType = DataType(std::make_unique<DataType>(arguments[0]->getDataType()));
return std::make_unique<FunctionBindData>(resultType);
}

Expand Down
3 changes: 2 additions & 1 deletion src/function/vector_struct_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ std::unique_ptr<FunctionBindData> StructPackVectorOperations::bindFunc(
fields.emplace_back(std::make_unique<common::StructField>(
argument->getAlias(), argument->getDataType().copy()));
}
auto resultType = common::DataType(common::STRUCT, std::move(fields));
auto resultType = common::DataType(std::move(fields));
return std::make_unique<FunctionBindData>(resultType);
}

Expand All @@ -43,6 +43,7 @@ std::unique_ptr<FunctionBindData> StructExtractVectorOperations::bindFunc(
throw common::BinderException("Key name for struct_extract must be STRING literal.");
}
auto key = ((binder::LiteralExpression&)*arguments[1]).getValue()->getValue<std::string>();
common::StringUtils::toUpper(key);
assert(definition->returnTypeID == common::ANY);
auto childrenTypes = typeInfo->getChildrenTypes();
auto childrenNames = typeInfo->getChildrenNames();
Expand Down
4 changes: 4 additions & 0 deletions src/include/common/ser_deser.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,13 @@ class SerDeser {

template<>
uint64_t SerDeser::serializeValue(const DataType& value, FileInfo* fileInfo, uint64_t offset);
template<>
uint64_t SerDeser::serializeValue(const std::string& value, FileInfo* fileInfo, uint64_t offset);

template<>
uint64_t SerDeser::deserializeValue(DataType& value, FileInfo* fileInfo, uint64_t offset);
template<>
uint64_t SerDeser::deserializeValue(std::string& value, FileInfo* fileInfo, uint64_t offset);

} // namespace common
} // namespace kuzu
31 changes: 20 additions & 11 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <vector>

#include "common/api.h"
#include "common/string_utils.h"

namespace kuzu {
namespace common {
Expand All @@ -29,6 +30,7 @@ constexpr column_id_t INVALID_COLUMN_ID = INVALID_PROPERTY_ID;
using vector_idx_t = uint32_t;
constexpr vector_idx_t INVALID_VECTOR_IDX = UINT32_MAX;
using block_idx_t = uint64_t;
using field_idx_t = uint64_t;

// System representation for a variable-sized overflow value.
struct overflow_value_t {
Expand Down Expand Up @@ -116,8 +118,12 @@ class StructField {
friend class SerDeser;

public:
StructField() : type{std::make_unique<DataType>()} {}
StructField(std::string name, std::unique_ptr<DataType> type)
: name{std::move(name)}, type{std::move(type)} {}
: name{std::move(name)}, type{std::move(type)} {
// Note: struct field name is case-insensitive.
StringUtils::toUpper(this->name);
}
inline bool operator!=(const StructField& other) const { return !(*this == other); }
inline std::string getName() const { return name; }
inline DataType* getType() const { return type.get(); }
Expand All @@ -133,15 +139,13 @@ class StructTypeInfo : public ExtraTypeInfo {
friend class SerDeser;

public:
StructTypeInfo() = default;
explicit StructTypeInfo(std::vector<std::unique_ptr<StructField>> fields)
: fields{std::move(fields)} {}
StructTypeInfo() = default;

inline void addChildType(const std::string& name, const DataType& type) {
fields.emplace_back(std::make_unique<StructField>(name, std::make_unique<DataType>(type)));
}
std::vector<DataType*> getChildrenTypes() const;
std::vector<std::string> getChildrenNames() const;
std::vector<StructField*> getStructFields() const;

bool operator==(const kuzu::common::StructTypeInfo& other) const;
std::unique_ptr<ExtraTypeInfo> copy() const override;
Expand All @@ -152,12 +156,17 @@ class StructTypeInfo : public ExtraTypeInfo {

class DataType {
public:
KUZU_API DataType();
KUZU_API explicit DataType(DataTypeID typeID);
KUZU_API DataType(DataTypeID typeID, std::unique_ptr<DataType> childType);
KUZU_API DataType(
DataTypeID typeID, std::unique_ptr<DataType> childType, uint64_t fixedNumElementsInList);
KUZU_API DataType(DataTypeID typeID, std::vector<std::unique_ptr<StructField>> childrenTypes);
KUZU_API DataType() : typeID{ANY}, extraTypeInfo{nullptr} {};
KUZU_API explicit DataType(DataTypeID typeID) : typeID{typeID}, extraTypeInfo{nullptr} {};
KUZU_API DataType(std::unique_ptr<DataType> childType)
: typeID{VAR_LIST}, extraTypeInfo{std::make_unique<VarListTypeInfo>(std::move(childType))} {
}
KUZU_API DataType(std::unique_ptr<DataType> childType, uint64_t fixedNumElementsInList)
: typeID{FIXED_LIST}, extraTypeInfo{std::make_unique<FixedListTypeInfo>(
std::move(childType), fixedNumElementsInList)} {}
KUZU_API DataType(std::vector<std::unique_ptr<StructField>> childrenTypes)
: typeID{STRUCT}, extraTypeInfo{
std::make_unique<StructTypeInfo>(std::move(childrenTypes))} {};
KUZU_API DataType(const DataType& other);
KUZU_API DataType(DataType&& other) noexcept;

Expand Down
4 changes: 2 additions & 2 deletions src/include/function/aggregate/collect.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ struct CollectFunction {
assert(arguments.size() == 1);
auto aggFuncDefinition = reinterpret_cast<AggregateFunctionDefinition*>(definition);
aggFuncDefinition->aggregateFunction->setInputDataType(arguments[0]->dataType);
auto returnType = common::DataType(
common::VAR_LIST, std::make_unique<common::DataType>(arguments[0]->dataType));
auto returnType =
common::DataType(std::make_unique<common::DataType>(arguments[0]->dataType));
return std::make_unique<FunctionBindData>(returnType);
}
};
Expand Down
16 changes: 10 additions & 6 deletions src/include/storage/copier/table_copy_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ class TableCopyExecutor {
uint64_t copy(processor::ExecutionContext* executionContext);

static void throwCopyExceptionIfNotOK(const arrow::Status& status);

static std::unique_ptr<common::Value> getArrowVarList(const std::string& l, int64_t from,
int64_t to, const common::DataType& dataType, common::CopyDescription& copyDescription);
int64_t to, const common::DataType& dataType,
const common::CopyDescription& copyDescription);
static std::unique_ptr<uint8_t[]> getArrowFixedList(const std::string& l, int64_t from,
int64_t to, const common::DataType& dataType, common::CopyDescription& copyDescription);

int64_t to, const common::DataType& dataType,
const common::CopyDescription& copyDescription);
static std::shared_ptr<arrow::csv::StreamingReader> createCSVReader(const std::string& filePath,
common::CSVReaderConfig* csvReaderConfig, catalog::TableSchema* tableSchema);
static std::unique_ptr<parquet::arrow::FileReader> createParquetReader(
Expand All @@ -72,15 +72,19 @@ class TableCopyExecutor {

void countNumLinesNpy(const std::vector<std::string>& filePaths);

static std::vector<std::pair<int64_t, int64_t>> getListElementPos(
const std::string& l, int64_t from, int64_t to, common::CopyDescription& copyDescription);
static std::vector<std::pair<int64_t, int64_t>> getListElementPos(const std::string& l,
int64_t from, int64_t to, const common::CopyDescription& copyDescription);

inline void updateTableStatistics() {
tablesStatistics->setNumTuplesForTable(tableSchema->tableID, numRows);
}

static std::shared_ptr<arrow::DataType> toArrowDataType(const common::DataType& dataType);

private:
static std::unique_ptr<common::Value> convertStringToValue(std::string element,
const common::DataType& type, const common::CopyDescription& copyDescription);

protected:
std::shared_ptr<spdlog::logger> logger;
common::CopyDescription& copyDescription;
Expand Down
Loading

0 comments on commit 14bf9c3

Please sign in to comment.