Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add loader and storage support for struct #1490

Merged
merged 1 commit into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ __pycache__/
*$py.class
cmake-build-debug/
test/unittest_temp/
tools/python_api/test/test_PYTHON_CSV.csv

# antlr4 jar
scripts/antlr4/antlr4.jar
Expand Down
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (name STRING, length INT32, note STRING, PRIMARY KEY (name));
create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE);
Expand Down
6 changes: 3 additions & 3 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Sóló cón tu párejâ,126, this is a very very good movie
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good
Roma,298,the movie is very interesting and funny
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}"
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}"
10 changes: 4 additions & 6 deletions src/binder/bind_expression/bind_function_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,9 +189,8 @@ std::shared_ptr<Expression> ExpressionBinder::bindNodeLabelFunction(const Expres
auto nodeTableIDs = catalogContent->getNodeTableIDs();
expression_vector children;
children.push_back(node.getInternalIDProperty());
auto labelsValue =
std::make_unique<Value>(DataType(VAR_LIST, std::make_unique<DataType>(STRING)),
populateLabelValues(nodeTableIDs, *catalogContent));
auto labelsValue = std::make_unique<Value>(DataType(std::make_unique<DataType>(STRING)),
populateLabelValues(nodeTableIDs, *catalogContent));
children.push_back(createLiteralExpression(std::move(labelsValue)));
auto execFunc = function::LabelVectorOperation::execFunction;
auto bindData = std::make_unique<function::FunctionBindData>(DataType(STRING));
Expand All @@ -210,9 +209,8 @@ std::shared_ptr<Expression> ExpressionBinder::bindRelLabelFunction(const Express
auto relTableIDs = catalogContent->getRelTableIDs();
expression_vector children;
children.push_back(rel.getInternalIDProperty());
auto labelsValue =
std::make_unique<Value>(DataType(VAR_LIST, std::make_unique<DataType>(STRING)),
populateLabelValues(relTableIDs, *catalogContent));
auto labelsValue = std::make_unique<Value>(DataType(std::make_unique<DataType>(STRING)),
populateLabelValues(relTableIDs, *catalogContent));
children.push_back(createLiteralExpression(std::move(labelsValue)));
auto execFunc = function::LabelVectorOperation::execFunction;
auto bindData = std::make_unique<function::FunctionBindData>(DataType(STRING));
Expand Down
8 changes: 3 additions & 5 deletions src/c_api/data_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@ kuzu_data_type* kuzu_data_type_create(
} else {
auto child_type_pty =
std::make_unique<DataType>(*static_cast<DataType*>(child_type->_data_type));
data_type =
fixed_num_elements_in_list > 0 ?
new DataType(static_cast<DataTypeID>(data_type_id_u8), std::move(child_type_pty),
fixed_num_elements_in_list) :
new DataType(static_cast<DataTypeID>(data_type_id_u8), std::move(child_type_pty));
data_type = fixed_num_elements_in_list > 0 ?
new DataType(std::move(child_type_pty), fixed_num_elements_in_list) :
new DataType(std::move(child_type_pty));
}
c_data_type->_data_type = data_type;
return c_data_type;
Expand Down
6 changes: 5 additions & 1 deletion src/common/types/timestamp_t.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "common/types/timestamp_t.h"

#include "common/exception.h"
#include "common/string_utils.h"

namespace kuzu {
namespace common {
Expand Down Expand Up @@ -120,10 +121,13 @@ timestamp_t Timestamp::FromCString(const char* str, uint64_t len) {

// Find the string len for date
uint32_t dateStrLen = 0;
// Skip leading spaces.
while (common::StringUtils::CharacterIsSpace(str[dateStrLen])) {
dateStrLen++;
}
while (dateStrLen < len && str[dateStrLen] != ' ' && str[dateStrLen] != 'T') {
dateStrLen++;
}

if (!Date::TryConvertDate(str, dateStrLen, pos, date)) {
throw ConversionException(getTimestampConversionExceptionMsg(str, len));
}
Expand Down
67 changes: 30 additions & 37 deletions src/common/types/types.cpp
acquamarin marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ uint64_t SerDeser::deserializeValue(FixedListTypeInfo& value, FileInfo* fileInfo
template<>
uint64_t SerDeser::serializeValue(
const StructTypeInfo& value, FileInfo* fileInfo, uint64_t offset) {
auto childrenTypesToReturn = value.getChildrenTypes();
return serializeVector(childrenTypesToReturn, fileInfo, offset);
return serializeVector(value.fields, fileInfo, offset);
acquamarin marked this conversation as resolved.
Show resolved Hide resolved
}

template<>
Expand All @@ -51,15 +50,18 @@ uint64_t SerDeser::deserializeValue(StructTypeInfo& value, FileInfo* fileInfo, u
}

template<>
uint64_t SerDeser::serializeValue(const StructField& value, FileInfo* fileInfo, uint64_t offset) {
offset = serializeValue(value.getName(), fileInfo, offset);
return serializeValue(*value.getType(), fileInfo, offset);
uint64_t SerDeser::serializeValue(
const std::unique_ptr<StructField>& value, FileInfo* fileInfo, uint64_t offset) {
acquamarin marked this conversation as resolved.
Show resolved Hide resolved
offset = serializeValue<std::string>(value->name, fileInfo, offset);
return serializeValue(*value->getType(), fileInfo, offset);
}

template<>
uint64_t SerDeser::deserializeValue(StructField& value, FileInfo* fileInfo, uint64_t offset) {
offset = deserializeValue(value.name, fileInfo, offset);
return deserializeValue(*value.type, fileInfo, offset);
uint64_t SerDeser::deserializeValue(
std::unique_ptr<StructField>& value, FileInfo* fileInfo, uint64_t offset) {
value = std::make_unique<StructField>();
offset = deserializeValue<std::string>(value->name, fileInfo, offset);
return deserializeValue(*value->type, fileInfo, offset);
}

template<>
Expand Down Expand Up @@ -147,48 +149,37 @@ std::unique_ptr<StructField> StructField::copy() const {
}

std::unique_ptr<ExtraTypeInfo> StructTypeInfo::copy() const {
std::vector<std::unique_ptr<StructField>> structFields;
for (auto& field : fields) {
structFields.emplace_back(field->copy());
std::vector<std::unique_ptr<StructField>> structFields{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
structFields[i] = fields[i]->copy();
}
return std::make_unique<StructTypeInfo>(std::move(structFields));
}

std::vector<DataType*> StructTypeInfo::getChildrenTypes() const {
std::vector<DataType*> childrenTypesToReturn;
for (auto& field : fields) {
childrenTypesToReturn.push_back(field->getType());
std::vector<DataType*> childrenTypesToReturn{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
childrenTypesToReturn[i] = fields[i]->getType();
}
return childrenTypesToReturn;
}

std::vector<std::string> StructTypeInfo::getChildrenNames() const {
std::vector<std::string> childrenNames;
for (auto& field : fields) {
childrenNames.push_back(field->getName());
std::vector<std::string> childrenNames{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
childrenNames[i] = fields[i]->getName();
}
return childrenNames;
}

DataType::DataType() : typeID{ANY}, extraTypeInfo{nullptr} {}

DataType::DataType(DataTypeID typeID) : typeID{typeID}, extraTypeInfo{nullptr} {}

DataType::DataType(DataTypeID typeID, std::unique_ptr<DataType> childType)
: typeID{VAR_LIST}, extraTypeInfo{std::make_unique<VarListTypeInfo>(std::move(childType))} {
assert(typeID == VAR_LIST);
}

DataType::DataType(
DataTypeID typeID, std::unique_ptr<DataType> childType, uint64_t fixedNumElementsInList)
: typeID{FIXED_LIST}, extraTypeInfo{std::make_unique<FixedListTypeInfo>(
std::move(childType), fixedNumElementsInList)} {
assert(typeID == FIXED_LIST);
std::vector<StructField*> StructTypeInfo::getStructFields() const {
std::vector<StructField*> structFields{fields.size()};
for (auto i = 0u; i < fields.size(); i++) {
structFields[i] = fields[i].get();
}
return structFields;
}

DataType::DataType(DataTypeID typeID, std::vector<std::unique_ptr<StructField>> childrenTypes)
: typeID{STRUCT}, extraTypeInfo{std::make_unique<StructTypeInfo>(std::move(childrenTypes))} {}

DataType::DataType(const DataType& other) {
typeID = other.typeID;
if (other.extraTypeInfo != nullptr) {
Expand Down Expand Up @@ -297,7 +288,7 @@ DataType Types::dataTypeFromString(const std::string& dataTypeString) {
throw Exception("Cannot parse struct type: " + dataTypeString);
}
std::istringstream iss{
dataTypeString.substr(leftBracketPos, rightBracketPos - leftBracketPos)};
dataTypeString.substr(leftBracketPos + 1, rightBracketPos - leftBracketPos - 1)};
std::vector<std::unique_ptr<StructField>> childrenTypes;
std::string field, fieldName, fieldType;
while (getline(iss, field, ',')) {
Expand Down Expand Up @@ -360,10 +351,12 @@ std::string Types::dataTypeToString(const DataType& dataType) {
case STRUCT: {
auto structTypeInfo = reinterpret_cast<StructTypeInfo*>(dataType.extraTypeInfo.get());
std::string dataTypeStr = dataTypeToString(dataType.typeID) + "(";
for (auto& childType : structTypeInfo->getChildrenTypes()) {
dataTypeStr += dataTypeToString(*childType);
auto numFields = structTypeInfo->getChildrenTypes().size();
for (auto i = 0u; i < numFields - 1; i++) {
dataTypeStr += dataTypeToString(*structTypeInfo->getChildrenTypes()[i]);
dataTypeStr += ",";
}
dataTypeStr += dataTypeToString(*structTypeInfo->getChildrenTypes()[numFields - 1]);
return dataTypeStr + ")";
}
case ANY:
Expand Down
4 changes: 2 additions & 2 deletions src/function/built_in_aggregate_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ void BuiltInAggregateFunctions::registerCountStar() {
void BuiltInAggregateFunctions::registerCount() {
std::vector<std::unique_ptr<AggregateFunctionDefinition>> definitions;
for (auto& typeID : DataType::getAllValidTypeIDs()) {
auto inputType = (typeID == VAR_LIST ? DataType(VAR_LIST, std::make_unique<DataType>(ANY)) :
DataType(typeID));
auto inputType =
(typeID == VAR_LIST ? DataType(std::make_unique<DataType>(ANY)) : DataType(typeID));
for (auto isDistinct : std::vector<bool>{true, false}) {
definitions.push_back(std::make_unique<AggregateFunctionDefinition>(COUNT_FUNC_NAME,
std::vector<DataTypeID>{typeID}, INT64,
Expand Down
2 changes: 1 addition & 1 deletion src/function/vector_list_operation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ std::unique_ptr<FunctionBindData> ListCreationVectorOperation::bindFunc(
LIST_CREATION_FUNC_NAME, arguments[0]->getDataType(), arguments[i]->getDataType()));
}
}
auto resultType = DataType(VAR_LIST, std::make_unique<DataType>(arguments[0]->getDataType()));
auto resultType = DataType(std::make_unique<DataType>(arguments[0]->getDataType()));
return std::make_unique<FunctionBindData>(resultType);
}

Expand Down
3 changes: 2 additions & 1 deletion src/function/vector_struct_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ std::unique_ptr<FunctionBindData> StructPackVectorOperations::bindFunc(
fields.emplace_back(std::make_unique<common::StructField>(
argument->getAlias(), argument->getDataType().copy()));
}
auto resultType = common::DataType(common::STRUCT, std::move(fields));
auto resultType = common::DataType(std::move(fields));
return std::make_unique<FunctionBindData>(resultType);
}

Expand All @@ -43,6 +43,7 @@ std::unique_ptr<FunctionBindData> StructExtractVectorOperations::bindFunc(
throw common::BinderException("Key name for struct_extract must be STRING literal.");
}
auto key = ((binder::LiteralExpression&)*arguments[1]).getValue()->getValue<std::string>();
common::StringUtils::toUpper(key);
assert(definition->returnTypeID == common::ANY);
auto childrenTypes = typeInfo->getChildrenTypes();
auto childrenNames = typeInfo->getChildrenNames();
Expand Down
4 changes: 4 additions & 0 deletions src/include/common/ser_deser.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,13 @@ class SerDeser {

template<>
uint64_t SerDeser::serializeValue(const DataType& value, FileInfo* fileInfo, uint64_t offset);
template<>
uint64_t SerDeser::serializeValue(const std::string& value, FileInfo* fileInfo, uint64_t offset);

template<>
uint64_t SerDeser::deserializeValue(DataType& value, FileInfo* fileInfo, uint64_t offset);
template<>
uint64_t SerDeser::deserializeValue(std::string& value, FileInfo* fileInfo, uint64_t offset);

} // namespace common
} // namespace kuzu
31 changes: 20 additions & 11 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <vector>

#include "common/api.h"
#include "common/string_utils.h"

namespace kuzu {
namespace common {
Expand All @@ -29,6 +30,7 @@ constexpr column_id_t INVALID_COLUMN_ID = INVALID_PROPERTY_ID;
using vector_idx_t = uint32_t;
constexpr vector_idx_t INVALID_VECTOR_IDX = UINT32_MAX;
using block_idx_t = uint64_t;
using field_idx_t = uint64_t;

// System representation for a variable-sized overflow value.
struct overflow_value_t {
Expand Down Expand Up @@ -116,8 +118,12 @@ class StructField {
friend class SerDeser;

public:
StructField() : type{std::make_unique<DataType>()} {}
StructField(std::string name, std::unique_ptr<DataType> type)
: name{std::move(name)}, type{std::move(type)} {}
: name{std::move(name)}, type{std::move(type)} {
// Note: struct field name is case-insensitive.
StringUtils::toUpper(this->name);
}
inline bool operator!=(const StructField& other) const { return !(*this == other); }
inline std::string getName() const { return name; }
inline DataType* getType() const { return type.get(); }
Expand All @@ -133,15 +139,13 @@ class StructTypeInfo : public ExtraTypeInfo {
friend class SerDeser;

public:
StructTypeInfo() = default;
explicit StructTypeInfo(std::vector<std::unique_ptr<StructField>> fields)
: fields{std::move(fields)} {}
StructTypeInfo() = default;

inline void addChildType(const std::string& name, const DataType& type) {
fields.emplace_back(std::make_unique<StructField>(name, std::make_unique<DataType>(type)));
}
std::vector<DataType*> getChildrenTypes() const;
std::vector<std::string> getChildrenNames() const;
std::vector<StructField*> getStructFields() const;

bool operator==(const kuzu::common::StructTypeInfo& other) const;
std::unique_ptr<ExtraTypeInfo> copy() const override;
Expand All @@ -152,12 +156,17 @@ class StructTypeInfo : public ExtraTypeInfo {

class DataType {
public:
KUZU_API DataType();
KUZU_API explicit DataType(DataTypeID typeID);
KUZU_API DataType(DataTypeID typeID, std::unique_ptr<DataType> childType);
KUZU_API DataType(
DataTypeID typeID, std::unique_ptr<DataType> childType, uint64_t fixedNumElementsInList);
KUZU_API DataType(DataTypeID typeID, std::vector<std::unique_ptr<StructField>> childrenTypes);
KUZU_API DataType() : typeID{ANY}, extraTypeInfo{nullptr} {};
KUZU_API explicit DataType(DataTypeID typeID) : typeID{typeID}, extraTypeInfo{nullptr} {};
KUZU_API DataType(std::unique_ptr<DataType> childType)
: typeID{VAR_LIST}, extraTypeInfo{std::make_unique<VarListTypeInfo>(std::move(childType))} {
}
KUZU_API DataType(std::unique_ptr<DataType> childType, uint64_t fixedNumElementsInList)
: typeID{FIXED_LIST}, extraTypeInfo{std::make_unique<FixedListTypeInfo>(
std::move(childType), fixedNumElementsInList)} {}
KUZU_API DataType(std::vector<std::unique_ptr<StructField>> childrenTypes)
: typeID{STRUCT}, extraTypeInfo{
std::make_unique<StructTypeInfo>(std::move(childrenTypes))} {};
KUZU_API DataType(const DataType& other);
KUZU_API DataType(DataType&& other) noexcept;

Expand Down
4 changes: 2 additions & 2 deletions src/include/function/aggregate/collect.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ struct CollectFunction {
assert(arguments.size() == 1);
auto aggFuncDefinition = reinterpret_cast<AggregateFunctionDefinition*>(definition);
aggFuncDefinition->aggregateFunction->setInputDataType(arguments[0]->dataType);
auto returnType = common::DataType(
common::VAR_LIST, std::make_unique<common::DataType>(arguments[0]->dataType));
auto returnType =
common::DataType(std::make_unique<common::DataType>(arguments[0]->dataType));
return std::make_unique<FunctionBindData>(returnType);
}
};
Expand Down
16 changes: 10 additions & 6 deletions src/include/storage/copier/table_copy_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ class TableCopyExecutor {
uint64_t copy(processor::ExecutionContext* executionContext);

static void throwCopyExceptionIfNotOK(const arrow::Status& status);

static std::unique_ptr<common::Value> getArrowVarList(const std::string& l, int64_t from,
int64_t to, const common::DataType& dataType, common::CopyDescription& copyDescription);
int64_t to, const common::DataType& dataType,
const common::CopyDescription& copyDescription);
static std::unique_ptr<uint8_t[]> getArrowFixedList(const std::string& l, int64_t from,
int64_t to, const common::DataType& dataType, common::CopyDescription& copyDescription);

int64_t to, const common::DataType& dataType,
const common::CopyDescription& copyDescription);
static std::shared_ptr<arrow::csv::StreamingReader> createCSVReader(const std::string& filePath,
common::CSVReaderConfig* csvReaderConfig, catalog::TableSchema* tableSchema);
static std::unique_ptr<parquet::arrow::FileReader> createParquetReader(
Expand All @@ -72,15 +72,19 @@ class TableCopyExecutor {

void countNumLinesNpy(const std::vector<std::string>& filePaths);

static std::vector<std::pair<int64_t, int64_t>> getListElementPos(
const std::string& l, int64_t from, int64_t to, common::CopyDescription& copyDescription);
static std::vector<std::pair<int64_t, int64_t>> getListElementPos(const std::string& l,
int64_t from, int64_t to, const common::CopyDescription& copyDescription);

inline void updateTableStatistics() {
tablesStatistics->setNumTuplesForTable(tableSchema->tableID, numRows);
}

static std::shared_ptr<arrow::DataType> toArrowDataType(const common::DataType& dataType);

private:
static std::unique_ptr<common::Value> convertStringToValue(std::string element,
const common::DataType& type, const common::CopyDescription& copyDescription);

protected:
std::shared_ptr<spdlog::logger> logger;
common::CopyDescription& copyDescription;
Expand Down
Loading