Skip to content

Commit

Permalink
Pass nullMask to setValuesFromUncompressed so that we know which valu…
Browse files Browse the repository at this point in the history
…es don't need to be checked (#3247)

Also fixed NullMask::resize, which was truncating the existing data
  • Loading branch information
benjaminwinger committed Apr 17, 2024
1 parent 521e925 commit 5f53253
Show file tree
Hide file tree
Showing 20 changed files with 2,259 additions and 142 deletions.
2 changes: 2 additions & 0 deletions src/catalog/catalog_entry/table_catalog_entry.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "catalog/catalog_entry/table_catalog_entry.h"

#include <algorithm>

#include "catalog/catalog_entry/node_table_catalog_entry.h"
#include "catalog/catalog_entry/rdf_graph_catalog_entry.h"
#include "catalog/catalog_entry/rel_group_catalog_entry.h"
Expand Down
38 changes: 14 additions & 24 deletions src/common/arrow/arrow_null_mask_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,12 @@ bool ArrowNullMaskTree::copyFromBuffer(const void* buffer, uint64_t srcOffset, u
return true;
}

bool ArrowNullMaskTree::applyParentBitmap(const NullMask* parent, uint64_t count) {
bool ArrowNullMaskTree::applyParentBitmap(const NullMask* parent) {
if (parent == nullptr) {
return false;
}
const uint64_t* buffer = parent->data;
if (buffer != nullptr) {
for (uint64_t i = 0; i < (count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2); i++) {
mask->buffer[i] |= buffer[i];
}
if (parent->getData() != nullptr) {
*mask |= *parent;
return true;
}
return false;
Expand All @@ -50,8 +47,7 @@ void ArrowNullMaskTree::scanListPushDown(const ArrowSchema* schema, const ArrowA
uint64_t srcOffset, uint64_t count) {
const offsetsT* offsets = ((const offsetsT*)array->buffers[1]) + srcOffset;
offsetsT auxiliaryLength = offsets[count] - offsets[0];
NullMask pushDownMask((auxiliaryLength + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2);
NullMask pushDownMask(auxiliaryLength);
for (uint64_t i = 0; i < count; i++) {
pushDownMask.setNullFromRange(offsets[i] - offsets[0], offsets[i + 1] - offsets[i],
isNull(i));
Expand All @@ -70,13 +66,11 @@ void ArrowNullMaskTree::scanStructPushDown(const ArrowSchema* schema, const Arro

ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray* array,
uint64_t srcOffset, uint64_t count, const NullMask* parentBitmap)
: offset{0},
mask{std::make_shared<common::NullMask>((count + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2)},
: offset{0}, mask{std::make_shared<common::NullMask>(count)},
children(std::make_shared<std::vector<ArrowNullMaskTree>>()) {
if (schema->dictionary != nullptr) {
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
dictionary = std::make_shared<ArrowNullMaskTree>(schema->dictionary, array->dictionary,
array->dictionary->offset, array->dictionary->length);
return;
Expand Down Expand Up @@ -108,34 +102,34 @@ ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray
case 'w':
case 't':
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
break;
case '+':
switch (arrowType[1]) {
case 'l':
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
scanListPushDown<int32_t>(schema, array, srcOffset, count);
break;
case 'L':
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
scanListPushDown<int64_t>(schema, array, srcOffset, count);
break;
case 'w':
// TODO manh: array null resolution
KU_UNREACHABLE;
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
break;
case 's':
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
scanStructPushDown(schema, array, srcOffset, count);
break;
case 'm':
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
applyParentBitmap(parentBitmap);
scanListPushDown<int32_t>(schema, array, srcOffset, count);
break;
case 'u': {
Expand Down Expand Up @@ -176,9 +170,7 @@ ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray
}
}
if (parentBitmap != nullptr) {
for (uint64_t i = 0; i < count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2; i++) {
mask->buffer[i] |= parentBitmap->buffer[i];
}
*mask |= *parentBitmap;
}
} break;
case 'v':
Expand All @@ -192,9 +184,7 @@ ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray
true);
}
if (parentBitmap != nullptr) {
for (uint64_t i = 0; i < count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2; i++) {
mask->buffer[i] |= parentBitmap->buffer[i];
}
*mask |= *parentBitmap;
}
break;
case 'r':
Expand Down
58 changes: 50 additions & 8 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#include "common/null_mask.h"

#include <algorithm>
#include <cstring>

#include "common/assert.h"

namespace kuzu {
namespace common {

Expand All @@ -17,10 +20,41 @@ void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {

bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
auto [srcNullEntryPos, srcNullBitPos] = getNullEntryAndBitPos(srcOffset);
auto [dstNullEntryPos, dstNullBitPos] = getNullEntryAndBitPos(dstOffset);
// If both offsets are aligned relative to each other then copy up to the first byte using the
// non-aligned method, then copy aligned, then copy the end unaligned again.
if (!invert && (srcOffset % 8 == dstOffset % 8) && numBitsToCopy >= 8 &&
numBitsToCopy - (srcOffset % 8) >= 8) {
auto numBitsInFirstByte = 8 - (srcOffset % 8);
bool hasNullInSrcNullMask = false;
if (copyUnaligned(srcNullEntries, srcOffset, dstNullEntries, dstOffset, numBitsInFirstByte,
false)) {
hasNullInSrcNullMask = true;
}
auto* src =
reinterpret_cast<const uint8_t*>(srcNullEntries) + (srcOffset + numBitsInFirstByte) / 8;
auto* dst =
reinterpret_cast<uint8_t*>(dstNullEntries) + (dstOffset + numBitsInFirstByte) / 8;
auto numBytesForAlignedCopy = (numBitsToCopy - numBitsInFirstByte) / 8;
memcpy(dst, src, numBytesForAlignedCopy);
if (std::any_of(src, src + numBytesForAlignedCopy, [&](uint8_t val) { return val != 0; })) {
hasNullInSrcNullMask = true;
}
auto lastByteStart = numBitsInFirstByte + numBytesForAlignedCopy * 8;
auto numBitsInLastByte = numBitsToCopy - numBitsInFirstByte - numBytesForAlignedCopy * 8;
return copyUnaligned(srcNullEntries, srcOffset + lastByteStart, dstNullEntries,
dstOffset + lastByteStart, numBitsInLastByte, false) ||
hasNullInSrcNullMask;
} else {
return copyUnaligned(srcNullEntries, srcOffset, dstNullEntries, dstOffset, numBitsToCopy,
invert);
}
}
bool NullMask::copyUnaligned(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
uint64_t bitPos = 0;
bool hasNullInSrcNullMask = false;
auto [srcNullEntryPos, srcNullBitPos] = getNullEntryAndBitPos(srcOffset + bitPos);
auto [dstNullEntryPos, dstNullBitPos] = getNullEntryAndBitPos(dstOffset + bitPos);
while (bitPos < numBitsToCopy) {
auto curDstNullEntryPos = dstNullEntryPos;
auto curDstNullBitPos = dstNullBitPos;
Expand Down Expand Up @@ -80,16 +114,17 @@ bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
}

void NullMask::resize(uint64_t capacity) {
auto resizedBuffer = std::make_unique<uint64_t[]>(capacity);
memcpy(resizedBuffer.get(), buffer.get(), numNullEntries);
auto numNullEntries = (capacity + NUM_BITS_PER_NULL_ENTRY - 1) / NUM_BITS_PER_NULL_ENTRY;
auto resizedBuffer = std::make_unique<uint64_t[]>(numNullEntries);
memcpy(resizedBuffer.get(), data.data(), data.size_bytes());
buffer = std::move(resizedBuffer);
data = buffer.get();
numNullEntries = capacity;
data = std::span(buffer.get(), numNullEntries);
}

bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
if (copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy, invert)) {
if (copyNullMask(srcNullEntries, srcOffset, this->data.data(), dstOffset, numBitsToCopy,
invert)) {
this->mayContainNulls = true;
return true;
}
Expand All @@ -100,7 +135,7 @@ void NullMask::setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isN
if (isNull) {
this->mayContainNulls = true;
}
setNullRange(data, offset, numBitsToSet, isNull);
setNullRange(data.data(), offset, numBitsToSet, isNull);
}

void NullMask::setNullRange(uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet,
Expand Down Expand Up @@ -144,5 +179,12 @@ void NullMask::setNullRange(uint64_t* nullEntries, uint64_t offset, uint64_t num
}
}

void NullMask::operator|=(const NullMask& other) {
KU_ASSERT(other.data.size() == data.size());
for (size_t i = 0; i < data.size(); i++) {
data[i] |= other.getData()[i];
}
}

} // namespace common
} // namespace kuzu
3 changes: 1 addition & 2 deletions src/common/vector/auxiliary_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ void ListAuxiliaryBuffer::resizeDataVector(ValueVector* dataVector) {
auto buffer = std::make_unique<uint8_t[]>(capacity * dataVector->getNumBytesPerValue());
memcpy(buffer.get(), dataVector->valueBuffer.get(), size * dataVector->getNumBytesPerValue());
dataVector->valueBuffer = std::move(buffer);
dataVector->nullMask->resize((capacity + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2);
dataVector->nullMask.resize(capacity);
// If the dataVector is a struct vector, we need to resize its field vectors.
if (dataVector->dataType.getPhysicalType() == PhysicalTypeID::STRUCT) {
resizeStructDataVector(dataVector);
Expand Down
8 changes: 4 additions & 4 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common/vector/value_vector.h"

#include "common/constants.h"
#include "common/exception/runtime.h"
#include "common/null_buffer.h"
#include "common/types/blob.h"
Expand All @@ -11,7 +12,7 @@ namespace kuzu {
namespace common {

ValueVector::ValueVector(LogicalType dataType, storage::MemoryManager* memoryManager)
: dataType{std::move(dataType)} {
: dataType{std::move(dataType)}, nullMask{DEFAULT_VECTOR_CAPACITY} {
if (this->dataType.getLogicalTypeID() == LogicalTypeID::ANY) {
// LCOV_EXCL_START
// Alternatively we can assign a default type here but I don't think it's a good practice.
Expand All @@ -21,7 +22,6 @@ ValueVector::ValueVector(LogicalType dataType, storage::MemoryManager* memoryMan
}
numBytesPerValue = getDataTypeSize(this->dataType);
initializeValueBuffer();
nullMask = std::make_unique<NullMask>();
auxiliaryBuffer = AuxiliaryBufferFactory::getAuxiliaryBuffer(this->dataType, memoryManager);
}

Expand Down Expand Up @@ -61,7 +61,7 @@ bool ValueVector::discardNull(ValueVector& vector) {

bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
return nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy, invert);
return nullMask.copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy, invert);
}

template<typename T>
Expand Down Expand Up @@ -383,7 +383,7 @@ void ValueVector::setValue(uint32_t pos, std::string_view val) {
}

void ValueVector::setNull(uint32_t pos, bool isNull) {
nullMask->setNull(pos, isNull);
nullMask.setNull(pos, isNull);
}

void StringVector::addString(ValueVector* vector, uint32_t vectorPos, ku_string_t& srcStr) {
Expand Down
4 changes: 2 additions & 2 deletions src/function/vector_cast_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ static void resolveNestedVector(std::shared_ptr<ValueVector> inputVector, ValueV
// copy data and nullmask from input
memcpy(resultVector->getData(), inputVector->getData(),
numOfEntries * resultVector->getNumBytesPerValue());
resultVector->setNullFromBits(inputVector->getNullMaskData(), 0, 0, numOfEntries);
resultVector->setNullFromBits(inputVector->getNullMask().getData(), 0, 0, numOfEntries);

numOfEntries = ListVector::getDataVectorSize(inputVector.get());
ListVector::resizeDataVector(resultVector, numOfEntries);
Expand Down Expand Up @@ -56,7 +56,7 @@ static void resolveNestedVector(std::shared_ptr<ValueVector> inputVector, ValueV
// copy data and nullmask from input
memcpy(resultVector->getData(), inputVector->getData(),
numOfEntries * resultVector->getNumBytesPerValue());
resultVector->setNullFromBits(inputVector->getNullMaskData(), 0, 0, numOfEntries);
resultVector->setNullFromBits(inputVector->getNullMask().getData(), 0, 0, numOfEntries);

auto inputFieldVectors = StructVector::getFieldVectors(inputVector.get());
auto resultFieldVectors = StructVector::getFieldVectors(resultVector);
Expand Down
2 changes: 1 addition & 1 deletion src/include/common/arrow/arrow_nullmask_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class ArrowNullMaskTree {

private:
bool copyFromBuffer(const void* buffer, uint64_t srcOffset, uint64_t count);
bool applyParentBitmap(const NullMask* buffer, uint64_t count);
bool applyParentBitmap(const NullMask* buffer);

template<typename offsetsT>
void scanListPushDown(const ArrowSchema* schema, const ArrowArray* array, uint64_t srcOffset,
Expand Down
Loading

0 comments on commit 5f53253

Please sign in to comment.