Skip to content

Commit

Permalink
Add null to struct fields
Browse files Browse the repository at this point in the history
  • Loading branch information
acquamarin committed May 11, 2023
1 parent 4f02b2d commit 0628564
Show file tree
Hide file tree
Showing 16 changed files with 149 additions and 67 deletions.
4 changes: 2 additions & 2 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}"
Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: NULL}"
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: NULL, release: 2018-11-13 13:33:11, film: 2014-09-12}"
Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}"
2 changes: 2 additions & 0 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <stdexcept>

#include "common/exception.h"
#include "common/null_buffer.h"
#include "common/ser_deser.h"
#include "common/types/types_include.h"

Expand Down Expand Up @@ -491,6 +492,7 @@ uint32_t Types::getDataTypeSize(const DataType& dataType) {
for (auto& childType : structTypeInfo->getChildrenTypes()) {
size += getDataTypeSize(*childType);
}
size += NullBuffer::getNumBytesForNullValues(structTypeInfo->getChildrenNames().size());
return size;
}
case INTERNAL_ID:
Expand Down
12 changes: 9 additions & 3 deletions src/common/types/value.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "common/types/value.h"

#include "common/null_bytes.h"
#include "common/null_buffer.h"
#include "common/string_utils.h"

namespace kuzu {
Expand Down Expand Up @@ -378,11 +378,17 @@ std::vector<std::unique_ptr<Value>> Value::convertKUStructToVector(const uint8_t
std::vector<std::unique_ptr<Value>> structVal;
auto childrenTypes = structTypeInfo->getChildrenTypes();
auto numFields = childrenTypes.size();
auto structNullValues = kuStruct;
auto structValues = structNullValues + NullBuffer::getNumBytesForNullValues(numFields);
for (auto i = 0; i < numFields; i++) {
auto childValue = std::make_unique<Value>(Value::createDefaultValue(*childrenTypes[i]));
childValue->copyValueFrom(kuStruct);
if (NullBuffer::isNull(structNullValues, i)) {
childValue->setNull(true);
} else {
childValue->copyValueFrom(structValues);
}
structVal.emplace_back(std::move(childValue));
kuStruct += Types::getDataTypeSize(*childrenTypes[i]);
structValues += Types::getDataTypeSize(*childrenTypes[i]);
}
return structVal;
}
Expand Down
44 changes: 35 additions & 9 deletions src/common/vector/value_vector_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "common/vector/value_vector_utils.h"

#include "common/in_mem_overflow_buffer_utils.h"
#include "common/null_bytes.h"
#include "common/null_buffer.h"

using namespace kuzu;
using namespace common;
Expand All @@ -10,9 +10,18 @@ void ValueVectorUtils::copyNonNullDataWithSameTypeIntoPos(
ValueVector& resultVector, uint64_t pos, const uint8_t* srcData) {
switch (resultVector.dataType.typeID) {
case STRUCT: {
for (auto& childVector : StructVector::getChildrenVectors(&resultVector)) {
copyNonNullDataWithSameTypeIntoPos(*childVector, pos, srcData);
srcData += Types::getDataTypeSize(childVector->dataType);
auto structFields = StructVector::getChildrenVectors(&resultVector);
auto structNullBytes = srcData;
auto structValues =
structNullBytes + NullBuffer::getNumBytesForNullValues(structFields.size());
for (auto i = 0u; i < structFields.size(); i++) {
auto structField = structFields[i];
if (NullBuffer::isNull(structNullBytes, i)) {
structField->setNull(pos, true /* isNull */);
} else {
copyNonNullDataWithSameTypeIntoPos(*structField, pos, structValues);
}
structValues += Types::getDataTypeSize(structField->dataType);
}
} break;
case VAR_LIST: {
Expand Down Expand Up @@ -45,9 +54,22 @@ void ValueVectorUtils::copyNonNullDataWithSameTypeOutFromPos(const ValueVector&
uint64_t pos, uint8_t* dstData, InMemOverflowBuffer& dstOverflowBuffer) {
switch (srcVector.dataType.typeID) {
case STRUCT: {
for (auto& childVector : StructVector::getChildrenVectors(&srcVector)) {
copyNonNullDataWithSameTypeOutFromPos(*childVector, pos, dstData, dstOverflowBuffer);
dstData += Types::getDataTypeSize(childVector->dataType);
// The storage structure of STRUCT type in factorizedTable is:
// [NULLBYTES, FIELD1, FIELD2, ...]
auto structFields = StructVector::getChildrenVectors(&srcVector);
NullBuffer::initNullBytes(dstData, structFields.size());
auto structNullBytes = dstData;
auto structValues =
structNullBytes + NullBuffer::getNumBytesForNullValues(structFields.size());
for (auto i = 0u; i < structFields.size(); i++) {
auto structField = structFields[i];
if (structField->isNull(pos)) {
NullBuffer::setNull(structNullBytes, i);
} else {
copyNonNullDataWithSameTypeOutFromPos(
*structField, pos, structValues, dstOverflowBuffer);
}
structValues += Types::getDataTypeSize(structField->dataType);
}
} break;
case VAR_LIST: {
Expand Down Expand Up @@ -111,8 +133,12 @@ void ValueVectorUtils::copyValue(uint8_t* dstValue, common::ValueVector& dstVect
for (auto i = 0u; i < srcFields.size(); i++) {
auto srcField = srcFields[i];
auto dstField = dstFields[i];
copyValue(dstField->getData() + dstField->getNumBytesPerValue() * dstPos, *dstField,
srcField->getData() + srcField->getNumBytesPerValue() * srcPos, *srcField);
if (srcField->isNull(srcPos)) {
dstField->setNull(dstPos, true /* isNull */);
} else {
copyValue(dstField->getData() + dstField->getNumBytesPerValue() * dstPos, *dstField,
srcField->getData() + srcField->getNumBytesPerValue() * srcPos, *srcField);
}
}
} break;
case STRING: {
Expand Down
51 changes: 51 additions & 0 deletions src/function/vector_struct_operations.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "function/struct/vector_struct_operations.h"

#include "binder/expression/literal_expression.h"
#include "binder/expression_binder.h"
#include "function/function_definition.h"

namespace kuzu {
Expand All @@ -19,13 +20,63 @@ std::unique_ptr<FunctionBindData> StructPackVectorOperations::bindFunc(
const binder::expression_vector& arguments, kuzu::function::FunctionDefinition* definition) {
std::vector<std::unique_ptr<common::StructField>> fields;
for (auto& argument : arguments) {
if (argument->getDataType().typeID == common::ANY) {
binder::ExpressionBinder::resolveAnyDataType(
*argument, common::DataType{common::INT64});
}
fields.emplace_back(std::make_unique<common::StructField>(
argument->getAlias(), argument->getDataType().copy()));
}
auto resultType = common::DataType(std::move(fields));
return std::make_unique<FunctionBindData>(resultType);
}

// Fills the struct `result` from `parameters`, where the i-th parameter corresponds to the i-th
// child vector of `result`. A parameter that shares its state with `result` is skipped; any other
// parameter has its value copied into the matching child vector.
void StructPackVectorOperations::execFunc(
    const std::vector<std::shared_ptr<common::ValueVector>>& parameters,
    common::ValueVector& result) {
    const auto numParameters = parameters.size();
    for (auto fieldIdx = 0u; fieldIdx < numParameters; ++fieldIdx) {
        const auto* parameterVector = parameters[fieldIdx].get();
        const bool sharesResultState = parameterVector->state == result.state;
        if (sharesResultState) {
            continue;
        }
        // The parameter's state is inconsistent with the result's state, so its value has to be
        // copied into the corresponding child vector of the struct.
        copyParameterValueToStructFieldVector(
            parameterVector, common::StructVector::getChildVector(&result, fieldIdx).get());
    }
}

// Broadcasts the single selected value of a flat `parameter` into `structField`.
//
// The value (or its null-ness) at the parameter's first selected position is written to every
// selected position of `structField`: only the first selected position when `structField` is
// flat, otherwise all `selectedSize` positions of its selection vector.
void StructPackVectorOperations::copyParameterValueToStructFieldVector(
    const common::ValueVector* parameter, common::ValueVector* structField) {
    // An unFlat parameter must have a state consistent with the result's state, in which case no
    // copy into the struct field vector is needed; hence only flat parameters reach this point.
    assert(parameter->state->isFlat());
    auto paramPos = parameter->state->selVector->selectedPositions[0];
    auto paramValue = parameter->getData() + parameter->getNumBytesPerValue() * paramPos;
    bool paramIsNull = parameter->isNull(paramPos);

    // Writes the parameter's value, or a null marker, at one position of the struct field.
    auto writeFieldAt = [&](auto fieldPos) {
        if (paramIsNull) {
            structField->setNull(fieldPos, true /* isNull */);
            return;
        }
        common::ValueVectorUtils::copyValue(
            structField->getData() + structField->getNumBytesPerValue() * fieldPos, *structField,
            paramValue, *parameter);
    };

    if (structField->state->isFlat()) {
        writeFieldAt(structField->state->selVector->selectedPositions[0]);
        return;
    }
    for (auto j = 0u; j < structField->state->selVector->selectedSize; j++) {
        writeFieldAt(structField->state->selVector->selectedPositions[j]);
    }
}

std::vector<std::unique_ptr<VectorOperationDefinition>>
StructExtractVectorOperations::getDefinitions() {
std::vector<std::unique_ptr<VectorOperationDefinition>> definitions;
Expand Down
File renamed without changes.
35 changes: 2 additions & 33 deletions src/include/function/struct/vector_struct_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,9 @@ struct StructPackVectorOperations : public VectorOperations {
static std::unique_ptr<FunctionBindData> bindFunc(
const binder::expression_vector& arguments, FunctionDefinition* definition);
static void execFunc(const std::vector<std::shared_ptr<common::ValueVector>>& parameters,
common::ValueVector& result) {
for (auto i = 0u; i < parameters.size(); i++) {
auto& parameter = parameters[i];
if (parameter->state == result.state) {
continue;
}
// If the parameter's state is inconsistent with the result's state, we need to copy the
// parameter's value to the corresponding child vector.
copyParameterValueToStructFieldVector(
parameter.get(), common::StructVector::getChildVector(&result, i).get());
}
}
common::ValueVector& result);
static void copyParameterValueToStructFieldVector(
const common::ValueVector* parameter, common::ValueVector* structField) {
// If the parameter is unFlat, then its state must be consistent with the result's state.
// Thus, we don't need to copy values to structFieldVector.
assert(parameter->state->isFlat());
auto srcValue =
parameter->getData() +
parameter->getNumBytesPerValue() * parameter->state->selVector->selectedPositions[0];
if (structField->state->isFlat()) {
common::ValueVectorUtils::copyValue(
structField->getData() + structField->getNumBytesPerValue() *
structField->state->selVector->selectedPositions[0],
*structField, srcValue, *parameter);
} else {
for (auto j = 0u; j < structField->state->selVector->selectedSize; j++) {
auto pos = structField->state->selVector->selectedPositions[j];
common::ValueVectorUtils::copyValue(
structField->getData() + structField->getNumBytesPerValue() * pos, *structField,
srcValue, *parameter);
}
}
}
const common::ValueVector* parameter, common::ValueVector* structField);
};

struct StructExtractBindData : public FunctionBindData {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
namespace kuzu {
namespace storage {

class InMemNodeColumn;

class InMemColumnChunk {
public:
InMemColumnChunk() = default;
Expand Down Expand Up @@ -91,7 +93,7 @@ class InMemColumnChunk {

virtual void copyStructValueToFields(arrow::Array& array, uint64_t posInArray,
const common::CopyDescription& copyDescription, common::offset_t nodeOffset,
uint64_t numValues) {
uint64_t numValues, InMemNodeColumn* column) {
assert(false);
}

Expand All @@ -111,7 +113,7 @@ class InMemStructColumnChunk : public InMemColumnChunk {

void copyStructValueToFields(arrow::Array& array, uint64_t posInArray,
const common::CopyDescription& copyDescription, common::offset_t nodeOffset,
uint64_t numValues) override;
uint64_t numValues, InMemNodeColumn* column) override;

std::vector<InMemColumnChunk*> getInMemColumnChunksForFields();

Expand Down Expand Up @@ -157,8 +159,9 @@ void InMemColumnChunk::templateCopyValuesToPage<std::string, InMemOverflowFile*,
PageByteCursor& overflowCursor, common::CopyDescription& copyDesc);
template<>
void InMemColumnChunk::templateCopyValuesToPage<std::string, common::CopyDescription&,
common::offset_t>(const PageElementCursor& pageCursor, arrow::Array& array, uint64_t posInArray,
uint64_t numValues, common::CopyDescription& copyDesc, common::offset_t nodeOffset);
common::offset_t, InMemNodeColumn*>(const PageElementCursor& pageCursor, arrow::Array& array,
uint64_t posInArray, uint64_t numValues, common::CopyDescription& copyDesc,
common::offset_t nodeOffset, InMemNodeColumn* column);

template<>
void InMemColumnChunk::setValueFromString<bool>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class NodeInMemStructColumn : public InMemNodeColumn {
void flushChunk(
InMemColumnChunk* chunk, common::offset_t startOffset, common::offset_t endOffset) override;

// Returns a non-owning pointer to the in-memory column of the struct field at `fieldIdx`.
// `fieldIdx` is used to index `fields` directly, without a bounds check.
inline InMemNodeColumn* getField(uint64_t fieldIdx) {
    auto& field = fields[fieldIdx];
    return field.get();
}

private:
std::vector<std::unique_ptr<InMemNodeColumn>> fields;
};
Expand Down
2 changes: 1 addition & 1 deletion src/processor/result/factorized_table.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "processor/result/factorized_table.h"

#include "common/exception.h"
#include "common/null_bytes.h"
#include "common/null_buffer.h"
#include "common/vector/value_vector_utils.h"

using namespace kuzu::common;
Expand Down
6 changes: 3 additions & 3 deletions src/storage/copier/node_copier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ void NodeCopier::copyArrayIntoColumnChunk(InMemColumnChunk* columnChunk,
auto numValuesToCopy =
std::min(numValuesLeftToCopy, reinterpret_cast<InMemStructColumnChunk*>(columnChunk)
->getMinNumValuesLeftOnPage(offset));
columnChunk->templateCopyValuesToPage<std::string, CopyDescription&, common::offset_t>(
PageElementCursor{}, arrowArray, posInArray, numValuesToCopy, copyDescription,
offset);
columnChunk->templateCopyValuesToPage<std::string, CopyDescription&, common::offset_t,
InMemNodeColumn*>(PageElementCursor{}, arrowArray, posInArray, numValuesToCopy,
copyDescription, offset, column);
numValuesLeftToCopy -= numValuesToCopy;
continue;
}
Expand Down
3 changes: 2 additions & 1 deletion src/storage/copier/rel_copy_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -775,12 +775,13 @@ void RelCopyExecutor::calculateListHeadersTask(offset_t numNodes, atomic_uint64_
auto numNodesInChunk =
std::min((offset_t)ListsMetadataConstants::LISTS_CHUNK_SIZE, numNodes - nodeOffset);
csr_offset_t csrOffset = (*listSizes)[chunkNodeOffset].load(std::memory_order_relaxed);
for (auto i = 1u; i <= numNodesInChunk; i++) {
for (auto i = 1u; i < numNodesInChunk; i++) {
auto currNodeOffset = chunkNodeOffset + i;
auto numElementsInList = (*listSizes)[currNodeOffset].load(std::memory_order_relaxed);
listHeadersBuilder->setCSROffset(currNodeOffset, csrOffset);
csrOffset += numElementsInList;
}
listHeadersBuilder->setCSROffset(chunkNodeOffset + numNodesInChunk, csrOffset);
nodeOffset += numNodesInChunk;
}
logger->trace("End: adjListHeadersBuilder={0:p}", (void*)listHeadersBuilder);
Expand Down
24 changes: 18 additions & 6 deletions src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <regex>

#include "common/types/types.h"
#include "storage/in_mem_storage_structure/in_mem_node_column.h"

namespace kuzu {
namespace storage {
Expand Down Expand Up @@ -78,8 +79,9 @@ void InMemColumnChunk::templateCopyValuesToPage<std::string, InMemOverflowFile*,
template<>
void InMemColumnChunk::templateCopyValuesToPage<std::string, common::CopyDescription&,
common::offset_t>(const PageElementCursor& pageCursor, arrow::Array& array, uint64_t posInArray,
uint64_t numValues, common::CopyDescription& copyDesc, common::offset_t nodeOffset) {
copyStructValueToFields(array, posInArray, copyDesc, nodeOffset, numValues);
uint64_t numValues, common::CopyDescription& copyDesc, common::offset_t nodeOffset,
InMemNodeColumn* column) {
copyStructValueToFields(array, posInArray, copyDesc, nodeOffset, numValues, column);
}

template<>
Expand Down Expand Up @@ -161,8 +163,7 @@ InMemStructColumnChunk::InMemStructColumnChunk(

void InMemStructColumnChunk::copyStructValueToFields(arrow::Array& array, uint64_t posInArray,
const common::CopyDescription& copyDescription, common::offset_t startOffset,
uint64_t numValues) {
// TODO(Ziyi): support null values in struct.
uint64_t numValues, InMemNodeColumn* column) {
for (auto i = 0u; i < numValues; i++) {
auto& stringArray = (arrow::StringArray&)array;
auto structView = stringArray.GetView(i + posInArray);
Expand All @@ -181,8 +182,19 @@ void InMemStructColumnChunk::copyStructValueToFields(arrow::Array& array, uint64
std::regex_replace(structField.substr(0, delimPos), whiteSpacePattern, "");
auto structFieldIdx = getStructFieldIdx(structFieldNames, structFieldName);
auto structFieldValue = structField.substr(delimPos + 1);
copyValueToStructColumnField(i + startOffset, structFieldIdx, structFieldValue,
*structFieldTypes[structFieldIdx]);
auto offset = i + startOffset;
auto capitalizedStructFiledValue =
std::regex_replace(structFieldValue, whiteSpacePattern, "");
common::StringUtils::toUpper(capitalizedStructFiledValue);
if (capitalizedStructFiledValue == "NULL") {
continue;
} else {
reinterpret_cast<NodeInMemStructColumn*>(column)
->getField(structFieldIdx)
->setNull(offset, false /* isNull */);
copyValueToStructColumnField(
offset, structFieldIdx, structFieldValue, *structFieldTypes[structFieldIdx]);
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/storage/storage_structure/disk_overflow_file.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "storage/storage_structure/disk_overflow_file.h"

#include "common/in_mem_overflow_buffer_utils.h"
#include "common/null_bytes.h"
#include "common/null_buffer.h"
#include "common/string_utils.h"
#include "common/type_utils.h"

Expand Down
Loading

0 comments on commit 0628564

Please sign in to comment.