Skip to content

Commit

Permalink
Add Tensor dataType to system
Browse files Browse the repository at this point in the history
  • Loading branch information
acquamarin committed Feb 17, 2023
1 parent 49bb742 commit 5480a9e
Show file tree
Hide file tree
Showing 31 changed files with 2,705 additions and 2,304 deletions.
2 changes: 1 addition & 1 deletion dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades TENSOR(INT64,4),PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (name STRING, PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
Expand Down
18 changes: 9 additions & 9 deletions dataset/tinysnb/vPerson.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
id,fname,Gender,ISStudent,isWorker,age,eyeSight,birthdate,registerTime,lastJobDuration,workedHours,usedNames,courseScoresPerTerm
0,Alice,1,true,false,35,5.0,1900-01-01,2011-08-20 11:25:30Z+00:00,3 years 2 days 13 hours 2 minutes,"[10,5]","[Aida]","[[10,8],[6,7,8]]"
2,Bob,2,true,false,30,5.1,1900-01-01,2008-11-03 13:25:30.000526-02:00,10 years 5 months 13 hours 24 us,"[12,8]","[Bobby]","[[8,9],[9,10]]"
3,Carol,1,false,true,45,5.0,1940-06-22,1911-08-20 02:32:21,48 hours 24 minutes 11 seconds,"[4,5]","[Carmen,Fred]","[[8,10]]"
5,Dan,2,false,true,20,4.8,1950-7-23,2031-11-30 12:25:30Z,10 years 5 months 13 hours 24 us,"[1,9]","[Wolfeschlegelstein,Daniel]","[[7,4],[8,8],[9]]"
7,Elizabeth,1,false,true,20,4.7,1980-10-26,1976-12-23 11:21:42,48 hours 24 minutes 11 seconds,"[2]","[Ein]","[[6],[7],[8]]"
8,Farooq,2,true,false,25,4.5,1980-10-26,1972-07-31 13:22:30.678559,18 minutes 24 milliseconds,"[3,4,5,6,7]","[Fesdwe]","[[8]]"
9,Greg,2,false,false,40,4.9,1980-10-26,1976-12-23 11:21:42Z+06:40,10 years 5 months 13 hours 24 us,"[1]","[Grad]","[[10]]"
10,Hubert Blaine Wolfeschlegelsteinhausenbergerdorff,2,false,true,83,4.9,1990-11-27,2023-02-21 13:25:30,3 years 2 days 13 hours 2 minutes,"[10,11,12,3,4,5,6,7]","[Ad,De,Hi,Kye,Orlan]","[[7],[10],[6,7]]"
id,fname,Gender,ISStudent,isWorker,age,eyeSight,birthdate,registerTime,lastJobDuration,workedHours,usedNames,courseScoresPerTerm,grades
0,Alice,1,true,false,35,5.0,1900-01-01,2011-08-20 11:25:30Z+00:00,3 years 2 days 13 hours 2 minutes,"[10,5]","[Aida]","[[10,8],[6,7,8]]","[96,54,86,92]"
2,Bob,2,true,false,30,5.1,1900-01-01,2008-11-03 13:25:30.000526-02:00,10 years 5 months 13 hours 24 us,"[12,8]","[Bobby]","[[8,9],[9,10]]","[98,42,93,88]"
3,Carol,1,false,true,45,5.0,1940-06-22,1911-08-20 02:32:21,48 hours 24 minutes 11 seconds,"[4,5]","[Carmen,Fred]","[[8,10]]","[91,75,21,95]"
5,Dan,2,false,true,20,4.8,1950-7-23,2031-11-30 12:25:30Z,10 years 5 months 13 hours 24 us,"[1,9]","[Wolfeschlegelstein,Daniel]","[[7,4],[8,8],[9]]","[76,88,99,89]"
7,Elizabeth,1,false,true,20,4.7,1980-10-26,1976-12-23 11:21:42,48 hours 24 minutes 11 seconds,"[2]","[Ein]","[[6],[7],[8]]","[96,59,65,88]"
8,Farooq,2,true,false,25,4.5,1980-10-26,1972-07-31 13:22:30.678559,18 minutes 24 milliseconds,"[3,4,5,6,7]","[Fesdwe]","[[8]]","[80,78,34,83]"
9,Greg,2,false,false,40,4.9,1980-10-26,1976-12-23 11:21:42Z+06:40,10 years 5 months 13 hours 24 us,"[1]","[Grad]","[[10]]","[43,83,67,43]"
10,Hubert Blaine Wolfeschlegelsteinhausenbergerdorff,2,false,true,83,4.9,1990-11-27,2023-02-21 13:25:30,3 years 2 days 13 hours 2 minutes,"[10,11,12,3,4,5,6,7]","[Ad,De,Hi,Kye,Orlan]","[[7],[10],[6,7]]","[77,64,100,54]"
5 changes: 4 additions & 1 deletion src/antlr4/Cypher.g4
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,15 @@ TO: ( 'T' | 't' ) ( 'O' | 'o' ) ;

kU_DataType
: oC_SymbolicName
| ( oC_SymbolicName kU_ListIdentifiers ) ;
| ( oC_SymbolicName kU_ListIdentifiers )
| ( oC_SymbolicName kU_TensorIdentifier );

kU_ListIdentifiers : kU_ListIdentifier ( kU_ListIdentifier )* ;

kU_ListIdentifier : '[' ']' ;

kU_TensorIdentifier : '(' SP? kU_DataType SP? ',' SP? oC_IntegerLiteral SP? ')' ;

oC_AnyCypherOption
: oC_Explain
| oC_Profile ;
Expand Down
6 changes: 4 additions & 2 deletions src/catalog/catalog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ template<>
uint64_t SerDeser::serializeValue<DataType>(
const DataType& value, FileInfo* fileInfo, uint64_t offset) {
offset = SerDeser::serializeValue<DataTypeID>(value.typeID, fileInfo, offset);
offset = SerDeser::serializeValue<uint64_t>(value.numElementsInTensor, fileInfo, offset);
if (value.childType) {
assert(value.typeID == LIST);
assert(value.typeID == LIST || value.typeID == TENSOR);
return SerDeser::serializeValue<DataType>(*value.childType, fileInfo, offset);
}
return offset;
Expand All @@ -49,7 +50,8 @@ template<>
uint64_t SerDeser::deserializeValue<DataType>(
DataType& value, FileInfo* fileInfo, uint64_t offset) {
offset = SerDeser::deserializeValue<DataTypeID>(value.typeID, fileInfo, offset);
if (value.typeID == LIST) {
offset = SerDeser::deserializeValue<uint64_t>(value.numElementsInTensor, fileInfo, offset);
if (value.typeID == LIST || value.typeID == TENSOR) {
auto childDataType = std::make_unique<DataType>();
offset = SerDeser::deserializeValue<DataType>(*childDataType, fileInfo, offset);
value.childType = std::move(childDataType);
Expand Down
7 changes: 3 additions & 4 deletions src/common/arrow/arrow_row_batch.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "common/arrow/arrow_row_batch.h"

#include "common/types/value.h"
#include "processor/result/flat_tuple.h"

namespace kuzu {
namespace common {
Expand Down Expand Up @@ -184,7 +183,7 @@ void ArrowRowBatch::templateCopyNonNullValue<LIST>(
ArrowVector* vector, const main::DataTypeInfo& typeInfo, Value* value, std::int64_t pos) {
vector->data.resize((pos + 2) * sizeof(std::uint32_t));
auto offsets = (std::uint32_t*)vector->data.data();
auto numElements = value->listVal.size();
auto numElements = value->listOrTensorVal.size();
offsets[pos + 1] = offsets[pos] + numElements;
auto numChildElements = offsets[pos + 1] + 1;
auto currentNumBytesForChildValidity = vector->childData[0]->validity.size();
Expand All @@ -199,8 +198,8 @@ void ArrowRowBatch::templateCopyNonNullValue<LIST>(
numChildElements * Types::getDataTypeSize(typeInfo.childrenTypesInfo[0]->typeID));
}
for (auto i = 0u; i < numElements; i++) {
appendValue(
vector->childData[0].get(), *typeInfo.childrenTypesInfo[0], value->listVal[i].get());
appendValue(vector->childData[0].get(), *typeInfo.childrenTypesInfo[0],
value->listOrTensorVal[i].get());
}
}

Expand Down
32 changes: 32 additions & 0 deletions src/common/csv_reader/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,38 @@ std::unique_ptr<Value> CSVReader::getList(const DataType& dataType) {
DataType(LIST, std::make_unique<DataType>(dataType)), std::move(listVal));
}

std::unique_ptr<Value> CSVReader::getTensor(const DataType& dataType) {
std::vector<std::unique_ptr<Value>> tensorVal;
// Move the linePtrStart one character forward, because hasNextToken() will first increment it.
CSVReader listCSVReader(line, linePtrEnd - 1, linePtrStart - 1, config);
uint64_t numElementsRead = 0;
if (dataType.typeID == INT64) {
while (listCSVReader.hasNextToken()) {
if (!listCSVReader.skipTokenIfNull()) {
tensorVal.emplace_back(std::make_unique<Value>(listCSVReader.getInt64()));
numElementsRead++;
}
}
} else if (dataType.typeID == DOUBLE) {
while (listCSVReader.hasNextToken()) {
if (!listCSVReader.skipTokenIfNull()) {
tensorVal.emplace_back(std::make_unique<Value>(listCSVReader.getDouble()));
numElementsRead++;
}
}
} else {
throw ReaderException(
"Unsupported data type " + Types::dataTypeToString(dataType.typeID) + " inside TENSOR");
}
if (numElementsRead != dataType.numElementsInTensor) {
throw ReaderException(StringUtils::string_format(
"Each TENSOR should have fixed number of elements. Expected: %d, Actual: %d.",
dataType.numElementsInTensor, numElementsRead));
}
return std::make_unique<Value>(
DataType(TENSOR, std::make_unique<DataType>(dataType)), std::move(tensorVal));
}

void CSVReader::setNextTokenIsProcessed() {
nextTokenIsNotProcessed = false;
nextTokenLen = UINT64_MAX;
Expand Down
68 changes: 56 additions & 12 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,36 +11,46 @@ namespace common {
DataType::DataType() : typeID{ANY}, childType{nullptr} {}

DataType::DataType(DataTypeID typeID) : typeID{typeID}, childType{nullptr} {
assert(typeID != LIST);
assert(typeID != LIST && typeID != TENSOR);
}

DataType::DataType(DataTypeID typeID, std::unique_ptr<DataType> childType)
: typeID{typeID}, childType{std::move(childType)} {
assert(typeID == LIST);
}

DataType::DataType(const DataType& other) : typeID{other.typeID} {
if (other.childType) {
DataType::DataType(
DataTypeID typeID, std::unique_ptr<DataType> childType, uint64_t numElementsInTensor)
: typeID{typeID}, childType{std::move(childType)}, numElementsInTensor{numElementsInTensor} {
assert(typeID == TENSOR);
}

DataType::DataType(const DataType& other) {
typeID = other.typeID;
if (other.childType != nullptr) {
childType = other.childType->copy();
numElementsInTensor = other.numElementsInTensor;
}
}

DataType::DataType(DataType&& other) noexcept
: typeID{other.typeID}, childType{std::move(other.childType)} {}
: typeID{other.typeID}, childType{std::move(other.childType)}, numElementsInTensor{
other.numElementsInTensor} {}

std::vector<DataTypeID> DataType::getNumericalTypeIDs() {
return std::vector<DataTypeID>{INT64, DOUBLE};
}

std::vector<DataTypeID> DataType::getAllValidTypeIDs() {
return std::vector<DataTypeID>{
INTERNAL_ID, BOOL, INT64, DOUBLE, STRING, DATE, TIMESTAMP, INTERVAL, LIST};
INTERNAL_ID, BOOL, INT64, DOUBLE, STRING, DATE, TIMESTAMP, INTERVAL, LIST, TENSOR};
}

// Copy assignment. Makes *this an independent copy of other: the child type is deep-copied when
// present and cleared otherwise, so a previously held child type cannot linger after assigning
// a type that has none. numElementsInTensor is always carried over.
DataType& DataType::operator=(const DataType& other) {
    if (this == &other) {
        return *this;
    }
    typeID = other.typeID;
    numElementsInTensor = other.numElementsInTensor;
    if (other.childType) {
        childType = other.childType->copy();
    } else {
        childType.reset();
    }
    return *this;
}
Expand All @@ -51,6 +61,9 @@ bool DataType::operator==(const DataType& other) const {
}
if (typeID == LIST && *childType != *other.childType) {
return false;
} else if (typeID == TENSOR && (numElementsInTensor != other.numElementsInTensor ||
*childType != *other.childType)) {
return false;
}
return true;
}
Expand All @@ -62,15 +75,19 @@ bool DataType::operator!=(const DataType& other) const {
// Move assignment: takes ownership of other's child type and copies its plain fields
// (type id and tensor element count).
DataType& DataType::operator=(DataType&& other) noexcept {
    numElementsInTensor = other.numElementsInTensor;
    childType = std::move(other.childType);
    typeID = other.typeID;
    return *this;
}

std::unique_ptr<DataType> DataType::copy() {
if (childType) {
return make_unique<DataType>(typeID, childType->copy());
} else {
return std::make_unique<DataType>(typeID);
if (childType != nullptr) {
if (typeID == LIST) {
return make_unique<DataType>(typeID, childType->copy());
} else if (typeID == TENSOR) {
return make_unique<DataType>(typeID, childType->copy(), numElementsInTensor);
}
}
return std::make_unique<DataType>(typeID);
}

DataTypeID DataType::getTypeID() const {
Expand All @@ -88,12 +105,38 @@ DataType Types::dataTypeFromString(const std::string& dataTypeString) {
dataType.childType = std::make_unique<DataType>(
dataTypeFromString(dataTypeString.substr(0, dataTypeString.size() - 2)));
return dataType;
} else if (dataTypeString.ends_with(")")) {
dataType.typeID = TENSOR;
auto splitPos = dataTypeString.find(',');
auto childTypeStartPos = strlen("TENSOR") + 1; // The child type is followed by "TENSOR(".
dataType.childType = std::make_unique<DataType>(dataTypeFromString(
dataTypeString.substr(childTypeStartPos, splitPos - childTypeStartPos)));
dataType.numElementsInTensor = std::strtoll(
dataTypeString.substr(splitPos + 1, dataTypeString.size() - splitPos - 1).c_str(),
nullptr, 0 /* base */);
validateTensorType(dataType);
return dataType;
} else {
dataType.typeID = dataTypeIDFromString(dataTypeString);
}
return dataType;
}

// Checks that a TENSOR data type is well-formed.
// Throws BinderException when the child type is not one of DataType::getNumericalTypeIDs(),
// or when numElementsInTensor is 0.
void Types::validateTensorType(DataType& dataType) {
    const auto childTypeID = dataType.childType->typeID;
    bool childIsNumeric = false;
    for (const auto numericTypeID : DataType::getNumericalTypeIDs()) {
        if (numericTypeID == childTypeID) {
            childIsNumeric = true;
            break;
        }
    }
    if (!childIsNumeric) {
        throw BinderException("The child type of a tensor must be a numeric type. Given: " +
                              dataTypeToString(*dataType.childType) + ".");
    }
    if (dataType.numElementsInTensor != 0) {
        return;
    }
    // The element count is unsigned, so zero is the only invalid value left to reject here.
    throw BinderException("The number of elements in a tensor must be greater than 0. Given: " +
                          std::to_string(dataType.numElementsInTensor) + ".");
}

DataTypeID Types::dataTypeIDFromString(const std::string& dataTypeIDString) {
if ("INTERNAL_ID" == dataTypeIDString) {
return INTERNAL_ID;
Expand All @@ -117,10 +160,9 @@ DataTypeID Types::dataTypeIDFromString(const std::string& dataTypeIDString) {
}

std::string Types::dataTypeToString(const DataType& dataType) {
if (dataType.typeID == LIST) {
if (dataType.typeID == LIST || dataType.typeID == TENSOR) {
assert(dataType.childType);
auto result = dataTypeToString(*dataType.childType) + "[]";
return result;
return dataTypeToString(*dataType.childType) + "[]";
} else {
return dataTypeToString(dataType.typeID);
}
Expand Down Expand Up @@ -152,6 +194,8 @@ std::string Types::dataTypeToString(DataTypeID dataTypeID) {
return "STRING";
case LIST:
return "LIST";
case TENSOR:
return "TENSOR";
default:
assert(false);
}
Expand Down
42 changes: 33 additions & 9 deletions src/common/types/value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ Value Value::createDefaultValue(const DataType& dataType) {
return Value(std::string(""));
case LIST:
return Value(dataType, std::vector<std::unique_ptr<Value>>{});
case TENSOR:
return Value(dataType, std::vector<std::unique_ptr<Value>>{});
default:
throw RuntimeException("Data type " + Types::dataTypeToString(dataType) +
" is not supported for Value::createDefaultValue");
Expand Down Expand Up @@ -104,7 +106,7 @@ Value::Value(const std::string& val_) : dataType{STRING}, isNull_{false} {

Value::Value(DataType dataType, std::vector<std::unique_ptr<Value>> vals)
: dataType{std::move(dataType)}, isNull_{false} {
listVal = std::move(vals);
listOrTensorVal = std::move(vals);
}

Value::Value(std::unique_ptr<NodeVal> val_) : dataType{NODE}, isNull_{false} {
Expand Down Expand Up @@ -151,7 +153,10 @@ void Value::copyValueFrom(const uint8_t* value) {
strVal = ((ku_string_t*)value)->getAsString();
} break;
case LIST: {
listVal = convertKUListToVector(*(ku_list_t*)value);
listOrTensorVal = convertKUListToVector(*(ku_list_t*)value);
} break;
case TENSOR: {
listOrTensorVal = convertKUTensorToVector(value);
} break;
default:
throw RuntimeException(
Expand Down Expand Up @@ -191,9 +196,10 @@ void Value::copyValueFrom(const Value& other) {
case STRING: {
strVal = other.strVal;
} break;
case LIST: {
for (auto& value : other.listVal) {
listVal.push_back(value->copy());
case LIST:
case TENSOR: {
for (auto& value : other.listOrTensorVal) {
listOrTensorVal.push_back(value->copy());
}
} break;
case NODE: {
Expand All @@ -209,7 +215,7 @@ void Value::copyValueFrom(const Value& other) {
}

const std::vector<std::unique_ptr<Value>>& Value::getListValReference() const {
return listVal;
return listOrTensorVal;
}

std::string Value::toString() const {
Expand All @@ -233,11 +239,12 @@ std::string Value::toString() const {
return TypeUtils::toString(val.internalIDVal);
case STRING:
return strVal;
case TENSOR:
case LIST: {
std::string result = "[";
for (auto i = 0u; i < listVal.size(); ++i) {
result += listVal[i]->toString();
result += (i == listVal.size() - 1 ? "]" : ",");
for (auto i = 0u; i < listOrTensorVal.size(); ++i) {
result += listOrTensorVal[i]->toString();
result += (i == listOrTensorVal.size() - 1 ? "]" : ",");
}
return result;
}
Expand Down Expand Up @@ -279,6 +286,23 @@ std::vector<std::unique_ptr<Value>> Value::convertKUListToVector(ku_list_t& list
return listResultValue;
}

std::vector<std::unique_ptr<Value>> Value::convertKUTensorToVector(const uint8_t* tensor) const {
std::vector<std::unique_ptr<Value>> tensorResultValue;
auto numBytesPerElement = Types::getDataTypeSize(*dataType.childType);
if (dataType.childType->typeID == common::DataTypeID::INT64) {
for (auto i = 0; i < dataType.numElementsInTensor; ++i) {
tensorResultValue.emplace_back(
std::make_unique<Value>(*(int64_t*)(tensor + i * numBytesPerElement)));
}
} else if (dataType.childType->typeID == common::DataTypeID::DOUBLE) {
for (auto i = 0; i < dataType.numElementsInTensor; ++i) {
tensorResultValue.emplace_back(
std::make_unique<Value>(*(double_t*)(tensor + i * numBytesPerElement)));
}
}
return tensorResultValue;
}

static std::string propertiesToString(
const std::vector<std::pair<std::string, std::unique_ptr<Value>>>& properties) {
std::string result = "{";
Expand Down
4 changes: 2 additions & 2 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,13 @@ void ValueVector::copyValue(uint8_t* dest, const Value& value) {
} break;
case LIST: {
auto& entry = *(ku_list_t*)dest;
auto numElements = value.listVal.size();
auto numElements = value.listOrTensorVal.size();
auto elementSize = Types::getDataTypeSize(*dataType.childType);
InMemOverflowBufferUtils::allocateSpaceForList(
entry, numElements * elementSize, getOverflowBuffer());
entry.size = numElements;
for (auto i = 0u; i < numElements; ++i) {
copyValue((uint8_t*)entry.overflowPtr + i * elementSize, *value.listVal[i]);
copyValue((uint8_t*)entry.overflowPtr + i * elementSize, *value.listOrTensorVal[i]);
}
} break;
default:
Expand Down
7 changes: 5 additions & 2 deletions src/function/built_in_aggregate_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,11 @@ void BuiltInAggregateFunctions::registerCountStar() {
void BuiltInAggregateFunctions::registerCount() {
std::vector<std::unique_ptr<AggregateFunctionDefinition>> definitions;
for (auto& typeID : DataType::getAllValidTypeIDs()) {
auto inputType =
typeID == LIST ? DataType(LIST, std::make_unique<DataType>(ANY)) : DataType(typeID);
auto inputType = typeID == common::TENSOR ?
DataType(TENSOR, std::make_unique<DataType>(ANY),
UINT64_MAX /* numElementsInTensor */) :
(typeID == LIST ? DataType(LIST, std::make_unique<DataType>(ANY)) :
DataType(typeID));
for (auto isDistinct : std::vector<bool>{true, false}) {
definitions.push_back(std::make_unique<AggregateFunctionDefinition>(COUNT_FUNC_NAME,
std::vector<DataTypeID>{typeID}, INT64,
Expand Down
Loading

0 comments on commit 5480a9e

Please sign in to comment.