Skip to content

Commit

Permalink
Pass nullMask to setValuesFromUncompressed so that we know which valu…
Browse files Browse the repository at this point in the history
…es don't need to be checked

Also fixed NullMask::resize, which was truncating the existing data
  • Loading branch information
benjaminwinger committed Apr 12, 2024
1 parent 65a080f commit 126aff4
Show file tree
Hide file tree
Showing 19 changed files with 2,229 additions and 133 deletions.
2 changes: 2 additions & 0 deletions src/catalog/catalog_entry/table_catalog_entry.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "catalog/catalog_entry/table_catalog_entry.h"

#include <algorithm>

#include "catalog/catalog_entry/node_table_catalog_entry.h"
#include "catalog/catalog_entry/rdf_graph_catalog_entry.h"
#include "catalog/catalog_entry/rel_group_catalog_entry.h"
Expand Down
22 changes: 6 additions & 16 deletions src/common/arrow/arrow_null_mask_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,8 @@ bool ArrowNullMaskTree::applyParentBitmap(const NullMask* parent, uint64_t count
if (parent == nullptr) {
return false;
}
const uint64_t* buffer = parent->data;
if (buffer != nullptr) {
for (uint64_t i = 0; i < (count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2); i++) {
mask->buffer[i] |= buffer[i];
}
if (parent->getData() != nullptr) {
mask->copyFromNullBits(parent->getData(), 0, 0, count);
return true;
}
return false;
Expand All @@ -50,8 +47,7 @@ void ArrowNullMaskTree::scanListPushDown(const ArrowSchema* schema, const ArrowA
uint64_t srcOffset, uint64_t count) {
const offsetsT* offsets = ((const offsetsT*)array->buffers[1]) + srcOffset;
offsetsT auxiliaryLength = offsets[count] - offsets[0];
NullMask pushDownMask((auxiliaryLength + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2);
NullMask pushDownMask(auxiliaryLength);
for (uint64_t i = 0; i < count; i++) {
pushDownMask.setNullFromRange(offsets[i] - offsets[0], offsets[i + 1] - offsets[i],
isNull(i));
Expand All @@ -70,9 +66,7 @@ void ArrowNullMaskTree::scanStructPushDown(const ArrowSchema* schema, const Arro

ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray* array,
uint64_t srcOffset, uint64_t count, const NullMask* parentBitmap)
: offset{0},
mask{std::make_shared<common::NullMask>((count + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2)},
: offset{0}, mask{std::make_shared<common::NullMask>(count)},
children(std::make_shared<std::vector<ArrowNullMaskTree>>()) {
if (schema->dictionary != nullptr) {
copyFromBuffer(array->buffers[0], srcOffset, count);
Expand Down Expand Up @@ -176,9 +170,7 @@ ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray
}
}
if (parentBitmap != nullptr) {
for (uint64_t i = 0; i < count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2; i++) {
mask->buffer[i] |= parentBitmap->buffer[i];
}
mask->copyFromNullBits(parentBitmap->getData(), 0, 0, count);
}
} break;
case 'v':
Expand All @@ -192,9 +184,7 @@ ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray
true);
}
if (parentBitmap != nullptr) {
for (uint64_t i = 0; i < count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2; i++) {
mask->buffer[i] |= parentBitmap->buffer[i];
}
mask->copyFromNullBits(parentBitmap->getData(), 0, 0, count);
}
break;
case 'r':
Expand Down
49 changes: 41 additions & 8 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common/null_mask.h"

#include <algorithm>
#include <cstring>

namespace kuzu {
Expand All @@ -17,10 +18,41 @@ void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {

bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
auto [srcNullEntryPos, srcNullBitPos] = getNullEntryAndBitPos(srcOffset);
auto [dstNullEntryPos, dstNullBitPos] = getNullEntryAndBitPos(dstOffset);
// If both offsets are aligned relative to each other then copy up to the first byte using the
// non-aligned method, then copy aligned, then copy the end unaligned again.
if (!invert && (srcOffset % 8 == dstOffset % 8) && numBitsToCopy >= 8 &&
numBitsToCopy - (srcOffset % 8) >= 8) {
auto numBitsInFirstByte = 8 - (srcOffset % 8);
bool hasNullInSrcNullMask = false;
if (copyUnaligned(srcNullEntries, srcOffset, dstNullEntries, dstOffset, numBitsInFirstByte,
false)) {
hasNullInSrcNullMask = true;
}
auto* src =
reinterpret_cast<const uint8_t*>(srcNullEntries) + (srcOffset + numBitsInFirstByte) / 8;
auto* dst =
reinterpret_cast<uint8_t*>(dstNullEntries) + (dstOffset + numBitsInFirstByte) / 8;
auto numBytesForAlignedCopy = (numBitsToCopy - numBitsInFirstByte) / 8;
memcpy(dst, src, numBytesForAlignedCopy);
if (std::any_of(src, src + numBytesForAlignedCopy, [&](uint8_t val) { return val != 0; })) {
hasNullInSrcNullMask = true;
}
auto lastByteStart = numBitsInFirstByte + numBytesForAlignedCopy * 8;
auto numBitsInLastByte = numBitsToCopy - numBitsInFirstByte - numBytesForAlignedCopy * 8;
return copyUnaligned(srcNullEntries, srcOffset + lastByteStart, dstNullEntries,
dstOffset + lastByteStart, numBitsInLastByte, false) ||
hasNullInSrcNullMask;
} else {
return copyUnaligned(srcNullEntries, srcOffset, dstNullEntries, dstOffset, numBitsToCopy,
invert);
}
}
bool NullMask::copyUnaligned(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
uint64_t bitPos = 0;
bool hasNullInSrcNullMask = false;
auto [srcNullEntryPos, srcNullBitPos] = getNullEntryAndBitPos(srcOffset + bitPos);
auto [dstNullEntryPos, dstNullBitPos] = getNullEntryAndBitPos(dstOffset + bitPos);
while (bitPos < numBitsToCopy) {
auto curDstNullEntryPos = dstNullEntryPos;
auto curDstNullBitPos = dstNullBitPos;
Expand Down Expand Up @@ -80,16 +112,17 @@ bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
}

void NullMask::resize(uint64_t capacity) {
auto resizedBuffer = std::make_unique<uint64_t[]>(capacity);
memcpy(resizedBuffer.get(), buffer.get(), numNullEntries);
auto numNullEntries = (capacity + NUM_BITS_PER_NULL_ENTRY - 1) / NUM_BITS_PER_NULL_ENTRY;
auto resizedBuffer = std::make_unique<uint64_t[]>(numNullEntries);
memcpy(resizedBuffer.get(), data.data(), data.size_bytes());
buffer = std::move(resizedBuffer);
data = buffer.get();
numNullEntries = capacity;
data = std::span(buffer.get(), numNullEntries);
}

bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
if (copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy, invert)) {
if (copyNullMask(srcNullEntries, srcOffset, this->data.data(), dstOffset, numBitsToCopy,
invert)) {
this->mayContainNulls = true;
return true;
}
Expand All @@ -100,7 +133,7 @@ void NullMask::setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isN
if (isNull) {
this->mayContainNulls = true;
}
setNullRange(data, offset, numBitsToSet, isNull);
setNullRange(data.data(), offset, numBitsToSet, isNull);
}

void NullMask::setNullRange(uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet,
Expand Down
3 changes: 1 addition & 2 deletions src/common/vector/auxiliary_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ void ListAuxiliaryBuffer::resizeDataVector(ValueVector* dataVector) {
auto buffer = std::make_unique<uint8_t[]>(capacity * dataVector->getNumBytesPerValue());
memcpy(buffer.get(), dataVector->valueBuffer.get(), size * dataVector->getNumBytesPerValue());
dataVector->valueBuffer = std::move(buffer);
dataVector->nullMask->resize((capacity + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2);
dataVector->nullMask.resize(capacity);
// If the dataVector is a struct vector, we need to resize its field vectors.
if (dataVector->dataType.getPhysicalType() == PhysicalTypeID::STRUCT) {
resizeStructDataVector(dataVector);
Expand Down
8 changes: 4 additions & 4 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common/vector/value_vector.h"

#include "common/constants.h"
#include "common/exception/runtime.h"
#include "common/null_buffer.h"
#include "common/types/blob.h"
Expand All @@ -11,7 +12,7 @@ namespace kuzu {
namespace common {

ValueVector::ValueVector(LogicalType dataType, storage::MemoryManager* memoryManager)
: dataType{std::move(dataType)} {
: dataType{std::move(dataType)}, nullMask{DEFAULT_VECTOR_CAPACITY} {
if (this->dataType.getLogicalTypeID() == LogicalTypeID::ANY) {
// LCOV_EXCL_START
// Alternatively we can assign a default type here but I don't think it's a good practice.
Expand All @@ -21,7 +22,6 @@ ValueVector::ValueVector(LogicalType dataType, storage::MemoryManager* memoryMan
}
numBytesPerValue = getDataTypeSize(this->dataType);
initializeValueBuffer();
nullMask = std::make_unique<NullMask>();
auxiliaryBuffer = AuxiliaryBufferFactory::getAuxiliaryBuffer(this->dataType, memoryManager);
}

Expand Down Expand Up @@ -61,7 +61,7 @@ bool ValueVector::discardNull(ValueVector& vector) {

bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
return nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy, invert);
return nullMask.copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy, invert);
}

template<typename T>
Expand Down Expand Up @@ -383,7 +383,7 @@ void ValueVector::setValue(uint32_t pos, std::string_view val) {
}

void ValueVector::setNull(uint32_t pos, bool isNull) {
nullMask->setNull(pos, isNull);
nullMask.setNull(pos, isNull);
}

void StringVector::addString(ValueVector* vector, uint32_t vectorPos, ku_string_t& srcStr) {
Expand Down
4 changes: 2 additions & 2 deletions src/function/vector_cast_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ static void resolveNestedVector(std::shared_ptr<ValueVector> inputVector, ValueV
// copy data and nullmask from input
memcpy(resultVector->getData(), inputVector->getData(),
numOfEntries * resultVector->getNumBytesPerValue());
resultVector->setNullFromBits(inputVector->getNullMaskData(), 0, 0, numOfEntries);
resultVector->setNullFromBits(inputVector->getNullMask().getData(), 0, 0, numOfEntries);

numOfEntries = ListVector::getDataVectorSize(inputVector.get());
ListVector::resizeDataVector(resultVector, numOfEntries);
Expand Down Expand Up @@ -56,7 +56,7 @@ static void resolveNestedVector(std::shared_ptr<ValueVector> inputVector, ValueV
// copy data and nullmask from input
memcpy(resultVector->getData(), inputVector->getData(),
numOfEntries * resultVector->getNumBytesPerValue());
resultVector->setNullFromBits(inputVector->getNullMaskData(), 0, 0, numOfEntries);
resultVector->setNullFromBits(inputVector->getNullMask().getData(), 0, 0, numOfEntries);

auto inputFieldVectors = StructVector::getFieldVectors(inputVector.get());
auto resultFieldVectors = StructVector::getFieldVectors(resultVector);
Expand Down
37 changes: 18 additions & 19 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
#pragma once

#include <algorithm>
#include <memory>
#include <utility>

#include "common/constants.h"
#include <span>

namespace kuzu {
namespace common {
Expand Down Expand Up @@ -68,43 +67,41 @@ const uint64_t NULL_HIGH_MASKS[65] = {0x0, 0x8000000000000000, 0xc00000000000000
0xfffffffffffffffe, 0xffffffffffffffff};

class NullMask {
friend class ArrowNullMaskTree;

public:
static constexpr uint64_t NO_NULL_ENTRY = 0;
static constexpr uint64_t ALL_NULL_ENTRY = ~uint64_t(NO_NULL_ENTRY);
static constexpr uint64_t NUM_BITS_PER_NULL_ENTRY_LOG2 = 6;
static constexpr uint64_t NUM_BITS_PER_NULL_ENTRY = (uint64_t)1 << NUM_BITS_PER_NULL_ENTRY_LOG2;
static constexpr uint64_t NUM_BYTES_PER_NULL_ENTRY = NUM_BITS_PER_NULL_ENTRY >> 3;
static constexpr uint64_t DEFAULT_NUM_NULL_ENTRIES =
DEFAULT_VECTOR_CAPACITY >> NUM_BITS_PER_NULL_ENTRY_LOG2;

NullMask() : NullMask{DEFAULT_NUM_NULL_ENTRIES} {}

explicit NullMask(uint64_t numNullEntries)
: mayContainNulls{false}, numNullEntries{numNullEntries} {
// For creating a managed null mask
explicit NullMask(uint64_t capacity) : mayContainNulls{false} {
auto numNullEntries = (capacity + NUM_BITS_PER_NULL_ENTRY - 1) / NUM_BITS_PER_NULL_ENTRY;
buffer = std::make_unique<uint64_t[]>(numNullEntries);
data = buffer.get();
std::fill(data, data + numNullEntries, NO_NULL_ENTRY);
data = std::span(buffer.get(), numNullEntries);
std::fill(data.begin(), data.end(), NO_NULL_ENTRY);
}

// For creating a null mask using existing data
explicit NullMask(std::span<uint64_t> nullData) : data{nullData}, mayContainNulls{true} {}

inline void setAllNonNull() {
if (!mayContainNulls) {
return;
}
std::fill(data, data + numNullEntries, NO_NULL_ENTRY);
std::fill(data.begin(), data.end(), NO_NULL_ENTRY);
mayContainNulls = false;
}
inline void setAllNull() {
std::fill(data, data + numNullEntries, ALL_NULL_ENTRY);
std::fill(data.begin(), data.end(), ALL_NULL_ENTRY);
mayContainNulls = true;
}

inline bool hasNoNullsGuarantee() const { return !mayContainNulls; }

static void setNull(uint64_t* nullEntries, uint32_t pos, bool isNull);
inline void setNull(uint32_t pos, bool isNull) {
setNull(data, pos, isNull);
setNull(data.data(), pos, isNull);
if (isNull) {
mayContainNulls = true;
}
Expand All @@ -115,12 +112,12 @@ class NullMask {
return nullEntries[entryPos] & NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
}

inline bool isNull(uint32_t pos) const { return isNull(data, pos); }
inline bool isNull(uint32_t pos) const { return isNull(data.data(), pos); }

// const because updates to the data must set mayContainNulls if any value
// becomes non-null
// Modifying the underlying data should be done with setNull or copyFromNullData
inline const uint64_t* getData() { return data; }
inline const uint64_t* getData() const { return data.data(); }

static inline uint64_t getNumNullEntries(uint64_t numNullBits) {
return (numNullBits >> NUM_BITS_PER_NULL_ENTRY_LOG2) +
Expand Down Expand Up @@ -154,11 +151,13 @@ class NullMask {
pos - (nullEntryPos << NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2));
}

static bool copyUnaligned(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert = false);

private:
uint64_t* data;
std::span<uint64_t> data;
std::unique_ptr<uint64_t[]> buffer;
bool mayContainNulls;
uint64_t numNullEntries;
};

} // namespace common
Expand Down
14 changes: 7 additions & 7 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@ class KUZU_API ValueVector {

void setState(const std::shared_ptr<DataChunkState>& state_);

void setAllNull() { nullMask->setAllNull(); }
void setAllNonNull() { nullMask->setAllNonNull(); }
void setAllNull() { nullMask.setAllNull(); }
void setAllNonNull() { nullMask.setAllNonNull(); }
// On return true, there are no null. On return false, there may or may not be nulls.
bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
bool hasNoNullsGuarantee() const { return nullMask.hasNoNullsGuarantee(); }
void setNullRange(uint32_t startPos, uint32_t len, bool value) {
nullMask->setNullFromRange(startPos, len, value);
nullMask.setNullFromRange(startPos, len, value);
}
const uint64_t* getNullMaskData() { return nullMask->getData(); }
const NullMask& getNullMask() const { return nullMask; }
void setNull(uint32_t pos, bool isNull);
uint8_t isNull(uint32_t pos) const { return nullMask->isNull(pos); }
uint8_t isNull(uint32_t pos) const { return nullMask.isNull(pos); }
void setAsSingleNullEntry() {
state->selVector->selectedSize = 1;
setNull(state->selVector->selectedPositions[0], true);
Expand Down Expand Up @@ -106,7 +106,7 @@ class KUZU_API ValueVector {
private:
bool _isSequential = false;
std::unique_ptr<uint8_t[]> valueBuffer;
std::unique_ptr<NullMask> nullMask;
NullMask nullMask;
uint32_t numBytesPerValue;
std::unique_ptr<AuxiliaryBuffer> auxiliaryBuffer;
};
Expand Down
Loading

0 comments on commit 126aff4

Please sign in to comment.