Skip to content

Commit

Permalink
Pass nullMask to setValuesFromUncompressed so that we know which valu…
Browse files Browse the repository at this point in the history
…es don't need to be checked

Also fixed NullMask::resize, which was truncating the existing data
  • Loading branch information
benjaminwinger committed Apr 10, 2024
1 parent d59070f commit 70d0b3d
Show file tree
Hide file tree
Showing 18 changed files with 2,184 additions and 114 deletions.
4 changes: 2 additions & 2 deletions src/common/arrow/arrow_null_mask_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ bool ArrowNullMaskTree::applyParentBitmap(const NullMask* parent, uint64_t count
if (parent == nullptr) {
return false;
}
const uint64_t* buffer = parent->data;
if (buffer != nullptr) {
auto buffer = parent->data;
if (buffer.data() != nullptr) {
for (uint64_t i = 0; i < (count >> NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2); i++) {
mask->buffer[i] |= buffer[i];
}
Expand Down
13 changes: 7 additions & 6 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,17 @@ bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
}

void NullMask::resize(uint64_t capacity) {
auto resizedBuffer = std::make_unique<uint64_t[]>(capacity);
memcpy(resizedBuffer.get(), buffer.get(), numNullEntries);
auto numNullEntries = (capacity + NUM_BITS_PER_NULL_ENTRY - 1) / NUM_BITS_PER_NULL_ENTRY;
auto resizedBuffer = std::make_unique<uint64_t[]>(numNullEntries);
memcpy(resizedBuffer.get(), data.data(), data.size_bytes());
buffer = std::move(resizedBuffer);
data = buffer.get();
numNullEntries = capacity;
data = std::span(buffer.get(), numNullEntries);
}

bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
if (copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy, invert)) {
if (copyNullMask(srcNullEntries, srcOffset, this->data.data(), dstOffset, numBitsToCopy,
invert)) {
this->mayContainNulls = true;
return true;
}
Expand All @@ -100,7 +101,7 @@ void NullMask::setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isN
if (isNull) {
this->mayContainNulls = true;
}
setNullRange(data, offset, numBitsToSet, isNull);
setNullRange(data.data(), offset, numBitsToSet, isNull);
}

void NullMask::setNullRange(uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet,
Expand Down
3 changes: 1 addition & 2 deletions src/common/vector/auxiliary_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ void ListAuxiliaryBuffer::resizeDataVector(ValueVector* dataVector) {
auto buffer = std::make_unique<uint8_t[]>(capacity * dataVector->getNumBytesPerValue());
memcpy(buffer.get(), dataVector->valueBuffer.get(), size * dataVector->getNumBytesPerValue());
dataVector->valueBuffer = std::move(buffer);
dataVector->nullMask->resize((capacity + NullMask::NUM_BITS_PER_NULL_ENTRY - 1) >>
NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2);
dataVector->nullMask.resize(capacity);
// If the dataVector is a struct vector, we need to resize its field vectors.
if (dataVector->dataType.getPhysicalType() == PhysicalTypeID::STRUCT) {
resizeStructDataVector(dataVector);
Expand Down
8 changes: 4 additions & 4 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common/vector/value_vector.h"

#include "common/constants.h"
#include "common/exception/runtime.h"
#include "common/null_buffer.h"
#include "common/types/blob.h"
Expand All @@ -11,7 +12,7 @@ namespace kuzu {
namespace common {

ValueVector::ValueVector(LogicalType dataType, storage::MemoryManager* memoryManager)
: dataType{std::move(dataType)} {
: dataType{std::move(dataType)}, nullMask{DEFAULT_VECTOR_CAPACITY} {
if (this->dataType.getLogicalTypeID() == LogicalTypeID::ANY) {
// LCOV_EXCL_START
// Alternatively we can assign a default type here but I don't think it's a good practice.
Expand All @@ -21,7 +22,6 @@ ValueVector::ValueVector(LogicalType dataType, storage::MemoryManager* memoryMan
}
numBytesPerValue = getDataTypeSize(this->dataType);
initializeValueBuffer();
nullMask = std::make_unique<NullMask>();
auxiliaryBuffer = AuxiliaryBufferFactory::getAuxiliaryBuffer(this->dataType, memoryManager);
}

Expand Down Expand Up @@ -61,7 +61,7 @@ bool ValueVector::discardNull(ValueVector& vector) {

bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
return nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy, invert);
return nullMask.copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy, invert);
}

template<typename T>
Expand Down Expand Up @@ -383,7 +383,7 @@ void ValueVector::setValue(uint32_t pos, std::string_view val) {
}

void ValueVector::setNull(uint32_t pos, bool isNull) {
nullMask->setNull(pos, isNull);
nullMask.setNull(pos, isNull);
}

void StringVector::addString(ValueVector* vector, uint32_t vectorPos, ku_string_t& srcStr) {
Expand Down
4 changes: 2 additions & 2 deletions src/function/vector_cast_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ static void resolveNestedVector(std::shared_ptr<ValueVector> inputVector, ValueV
// copy data and nullmask from input
memcpy(resultVector->getData(), inputVector->getData(),
numOfEntries * resultVector->getNumBytesPerValue());
resultVector->setNullFromBits(inputVector->getNullMaskData(), 0, 0, numOfEntries);
resultVector->setNullFromBits(inputVector->getNullMask().getData(), 0, 0, numOfEntries);

numOfEntries = ListVector::getDataVectorSize(inputVector.get());
ListVector::resizeDataVector(resultVector, numOfEntries);
Expand Down Expand Up @@ -56,7 +56,7 @@ static void resolveNestedVector(std::shared_ptr<ValueVector> inputVector, ValueV
// copy data and nullmask from input
memcpy(resultVector->getData(), inputVector->getData(),
numOfEntries * resultVector->getNumBytesPerValue());
resultVector->setNullFromBits(inputVector->getNullMaskData(), 0, 0, numOfEntries);
resultVector->setNullFromBits(inputVector->getNullMask().getData(), 0, 0, numOfEntries);

auto inputFieldVectors = StructVector::getFieldVectors(inputVector.get());
auto resultFieldVectors = StructVector::getFieldVectors(resultVector);
Expand Down
31 changes: 15 additions & 16 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#pragma once

#include <algorithm>
#include <memory>
#include <utility>

#include "common/constants.h"
#include <span>

namespace kuzu {
namespace common {
Expand Down Expand Up @@ -76,35 +76,35 @@ class NullMask {
static constexpr uint64_t NUM_BITS_PER_NULL_ENTRY_LOG2 = 6;
static constexpr uint64_t NUM_BITS_PER_NULL_ENTRY = (uint64_t)1 << NUM_BITS_PER_NULL_ENTRY_LOG2;
static constexpr uint64_t NUM_BYTES_PER_NULL_ENTRY = NUM_BITS_PER_NULL_ENTRY >> 3;
static constexpr uint64_t DEFAULT_NUM_NULL_ENTRIES =
DEFAULT_VECTOR_CAPACITY >> NUM_BITS_PER_NULL_ENTRY_LOG2;

NullMask() : NullMask{DEFAULT_NUM_NULL_ENTRIES} {}

explicit NullMask(uint64_t numNullEntries)
: mayContainNulls{false}, numNullEntries{numNullEntries} {
// For creating a managed null mask
explicit NullMask(uint64_t capacity) : mayContainNulls{false} {
auto numNullEntries = (capacity + 63) / 64;
buffer = std::make_unique<uint64_t[]>(numNullEntries);
data = buffer.get();
std::fill(data, data + numNullEntries, NO_NULL_ENTRY);
data = std::span(buffer.get(), numNullEntries);
std::fill(data.begin(), data.end(), NO_NULL_ENTRY);
}

// For creating a null mask using existing data
explicit NullMask(std::span<uint64_t> nullData) : data{nullData}, mayContainNulls{true} {}

inline void setAllNonNull() {
if (!mayContainNulls) {
return;
}
std::fill(data, data + numNullEntries, NO_NULL_ENTRY);
std::fill(data.begin(), data.end(), NO_NULL_ENTRY);
mayContainNulls = false;
}
inline void setAllNull() {
std::fill(data, data + numNullEntries, ALL_NULL_ENTRY);
std::fill(data.begin(), data.end(), ALL_NULL_ENTRY);
mayContainNulls = true;
}

inline bool hasNoNullsGuarantee() const { return !mayContainNulls; }

static void setNull(uint64_t* nullEntries, uint32_t pos, bool isNull);
inline void setNull(uint32_t pos, bool isNull) {
setNull(data, pos, isNull);
setNull(data.data(), pos, isNull);
if (isNull) {
mayContainNulls = true;
}
Expand All @@ -115,12 +115,12 @@ class NullMask {
return nullEntries[entryPos] & NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
}

inline bool isNull(uint32_t pos) const { return isNull(data, pos); }
inline bool isNull(uint32_t pos) const { return isNull(data.data(), pos); }

// const because updates to the data must set mayContainNulls if any value
// becomes non-null
// Modifying the underlying data should be done with setNull or copyFromNullData
inline const uint64_t* getData() { return data; }
inline const uint64_t* getData() const { return data.data(); }

static inline uint64_t getNumNullEntries(uint64_t numNullBits) {
return (numNullBits >> NUM_BITS_PER_NULL_ENTRY_LOG2) +
Expand Down Expand Up @@ -155,10 +155,9 @@ class NullMask {
}

private:
uint64_t* data;
std::span<uint64_t> data;
std::unique_ptr<uint64_t[]> buffer;
bool mayContainNulls;
uint64_t numNullEntries;
};

} // namespace common
Expand Down
14 changes: 7 additions & 7 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@ class KUZU_API ValueVector {

void setState(const std::shared_ptr<DataChunkState>& state_);

void setAllNull() { nullMask->setAllNull(); }
void setAllNonNull() { nullMask->setAllNonNull(); }
void setAllNull() { nullMask.setAllNull(); }
void setAllNonNull() { nullMask.setAllNonNull(); }
// On return true, there are no null. On return false, there may or may not be nulls.
bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
bool hasNoNullsGuarantee() const { return nullMask.hasNoNullsGuarantee(); }
void setNullRange(uint32_t startPos, uint32_t len, bool value) {
nullMask->setNullFromRange(startPos, len, value);
nullMask.setNullFromRange(startPos, len, value);
}
const uint64_t* getNullMaskData() { return nullMask->getData(); }
const NullMask& getNullMask() const { return nullMask; }
void setNull(uint32_t pos, bool isNull);
uint8_t isNull(uint32_t pos) const { return nullMask->isNull(pos); }
uint8_t isNull(uint32_t pos) const { return nullMask.isNull(pos); }
void setAsSingleNullEntry() {
state->selVector->selectedSize = 1;
setNull(state->selVector->selectedPositions[0], true);
Expand Down Expand Up @@ -106,7 +106,7 @@ class KUZU_API ValueVector {
private:
bool _isSequential = false;
std::unique_ptr<uint8_t[]> valueBuffer;
std::unique_ptr<NullMask> nullMask;
NullMask nullMask;
uint32_t numBytesPerValue;
std::unique_ptr<AuxiliaryBuffer> auxiliaryBuffer;
};
Expand Down
20 changes: 13 additions & 7 deletions src/include/storage/compression/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
namespace kuzu {
namespace common {
class ValueVector;
}
class NullMask;
} // namespace common

namespace storage {
class ColumnChunk;
Expand Down Expand Up @@ -58,9 +59,13 @@ class CompressionAlg {

// Takes a single uncompressed value from the srcBuffer and compresses it into the dstBuffer
// Offsets refer to value offsets, not byte offsets
//
// nullMask may be null if no mask is available (all values are non-null)
// Storage of null values is handled by the implementation and decompression of null values
// does not have to produce the original value passed to this function.
virtual void setValuesFromUncompressed(const uint8_t* srcBuffer, common::offset_t srcOffset,
uint8_t* dstBuffer, common::offset_t dstOffset, common::offset_t numValues,
const CompressionMetadata& metadata) const = 0;
const CompressionMetadata& metadata, const common::NullMask* nullMask) const = 0;

// Returns compression metadata, including any relevant parameters specific to this dataset
// which will need to be passed to compressNextPage. Since this may need to scan the entire
Expand Down Expand Up @@ -128,7 +133,8 @@ class ConstantCompression final : public CompressionAlg {

// Nothing to do; constant compressed data is only updated if the update is to the same value
void setValuesFromUncompressed(const uint8_t*, common::offset_t, uint8_t*, common::offset_t,
common::offset_t, const CompressionMetadata&) const override {};
common::offset_t, const CompressionMetadata&,
const common::NullMask* /*nullMask*/) const override {};

private:
uint8_t numBytesPerValue;
Expand All @@ -146,7 +152,7 @@ class Uncompressed : public CompressionAlg {

inline void setValuesFromUncompressed(const uint8_t* srcBuffer, common::offset_t srcOffset,
uint8_t* dstBuffer, common::offset_t dstOffset, common::offset_t numValues,
const CompressionMetadata& /*metadata*/) const final {
const CompressionMetadata& /*metadata*/, const common::NullMask* /*nullMask*/) const final {
memcpy(dstBuffer + dstOffset * numBytesPerValue, srcBuffer + srcOffset * numBytesPerValue,
numBytesPerValue * numValues);
}
Expand Down Expand Up @@ -222,7 +228,7 @@ class IntegerBitpacking : public CompressionAlg {

void setValuesFromUncompressed(const uint8_t* srcBuffer, common::offset_t srcOffset,
uint8_t* dstBuffer, common::offset_t dstOffset, common::offset_t numValues,
const CompressionMetadata& metadata) const final;
const CompressionMetadata& metadata, const common::NullMask* nullMask) const final;

BitpackHeader getBitWidth(const uint8_t* srcBuffer, uint64_t numValues) const;

Expand Down Expand Up @@ -278,7 +284,7 @@ class BooleanBitpacking : public CompressionAlg {

void setValuesFromUncompressed(const uint8_t* srcBuffer, common::offset_t srcOffset,
uint8_t* dstBuffer, common::offset_t dstOffset, common::offset_t numValues,
const CompressionMetadata& metadata) const final;
const CompressionMetadata& metadata, const common::NullMask* nullMask) const final;

static inline uint64_t numValues(uint64_t dataSize) { return dataSize * 8; }

Expand Down Expand Up @@ -339,7 +345,7 @@ class WriteCompressedValuesToPage : public CompressedFunctor {

void operator()(uint8_t* frame, uint16_t posInFrame, const uint8_t* data,
common::offset_t dataOffset, common::offset_t numValues,
const CompressionMetadata& metadata);
const CompressionMetadata& metadata, const common::NullMask* nullMask = nullptr);

void operator()(uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector,
uint32_t posInVector, const CompressionMetadata& metadata);
Expand Down
8 changes: 5 additions & 3 deletions src/include/storage/store/column.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "catalog/catalog.h"
#include "common/null_mask.h"
#include "storage/stats/metadata_dah_info.h"
#include "storage/stats/property_statistics.h"
#include "storage/storage_structure/disk_array.h"
Expand All @@ -18,7 +19,7 @@ using write_values_from_vector_func_t = std::function<void(uint8_t* frame, uint1
common::ValueVector* vector, uint32_t posInVector, const CompressionMetadata& metadata)>;
using write_values_func_t = std::function<void(uint8_t* frame, uint16_t posInFrame,
const uint8_t* data, common::offset_t dataOffset, common::offset_t numValues,
const CompressionMetadata& metadata)>;
const CompressionMetadata& metadata, const common::NullMask*)>;

using read_values_to_page_func_t =
std::function<void(uint8_t* frame, PageCursor& pageCursor, uint8_t* result,
Expand Down Expand Up @@ -110,7 +111,7 @@ class Column {

// Append values to the end of the node group, resizing it if necessary
common::offset_t appendValues(common::node_group_idx_t nodeGroupIdx, const uint8_t* data,
common::offset_t numValues);
const common::NullMask* nullChunkData, common::offset_t numValues);

ReadState getReadState(transaction::TransactionType transactionType,
common::node_group_idx_t nodeGroupIdx) const;
Expand Down Expand Up @@ -140,7 +141,8 @@ class Column {
common::node_group_idx_t nodeGroupIdx, common::offset_t offsetInChunk,
common::ValueVector* vectorToWriteFrom, uint32_t posInVectorToWriteFrom);
virtual void writeValues(ReadState& state, common::offset_t offsetInChunk, const uint8_t* data,
common::offset_t dataOffset = 0, common::offset_t numValues = 1);
const common::NullMask* nullChunkData, common::offset_t dataOffset = 0,
common::offset_t numValues = 1);

// Produces a page cursor for the offset relative to the given node group
PageCursor getPageCursorForOffsetInGroup(common::offset_t offsetInChunk,
Expand Down
5 changes: 5 additions & 0 deletions src/include/storage/store/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "common/constants.h"
#include "common/data_chunk/sel_vector.h"
#include "common/enums/rel_multiplicity.h"
#include "common/null_mask.h"
#include "common/types/types.h"
#include "common/vector/value_vector.h"
#include "storage/buffer_manager/bm_file_handle.h"
Expand Down Expand Up @@ -216,6 +217,10 @@ class NullColumnChunk final : public BoolColumnChunk {
void write(ColumnChunk* srcChunk, common::offset_t srcOffsetInChunk,
common::offset_t dstOffsetInChunk, common::offset_t numValuesToCopy) override;

common::NullMask getNullMask() const {
return common::NullMask(std::span(reinterpret_cast<uint64_t*>(getData()), capacity / 64));
}

protected:
bool mayHaveNullValue;
};
Expand Down
Loading

0 comments on commit 70d0b3d

Please sign in to comment.