Skip to content

Commit

Permalink
Store nulls as densely packed bitfields
Browse files Browse the repository at this point in the history
instead of one value per byte
  • Loading branch information
benjaminwinger committed Aug 8, 2023
1 parent 1485114 commit 985f5e2
Show file tree
Hide file tree
Showing 15 changed files with 210 additions and 55 deletions.
52 changes: 48 additions & 4 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
namespace kuzu {
namespace common {

void NullMask::setNull(uint32_t pos, bool isNull) {
void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {
auto entryPos = pos >> NUM_BITS_PER_NULL_ENTRY_LOG2;
auto bitPosInEntry = pos - (entryPos << NUM_BITS_PER_NULL_ENTRY_LOG2);
if (isNull) {
data[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
mayContainNulls = true;
nullEntries[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
} else {
data[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry];
nullEntries[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry];
}
}

Expand Down Expand Up @@ -87,5 +86,50 @@ void NullMask::resize(uint64_t capacity) {
numNullEntries = capacity;
}

// Copies numBitsToCopy packed null bits from srcNullEntries (starting at bit srcOffset) into this
// mask's own data (starting at bit dstOffset).
// Returns whatever copyNullMask reports; when that is true, this mask is also flagged as possibly
// containing nulls.
bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy) {
    const bool copiedNull =
        copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy);
    if (copiedNull) {
        this->mayContainNulls = true;
    }
    return copiedNull;
}

// Sets numBitsToSet consecutive null bits, starting at bit `offset`, to null (isNull == true) or
// non-null (isNull == false). Bits outside [offset, offset + numBitsToSet) are left untouched.
//
// The mask expressions below treat NULL_LOWER_MASKS[n] as "lowest n bits set" and
// NULL_HIGH_MASKS[n] as "highest n bits set"; all four branches rely on that same convention.
void NullMask::setNullRange(uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull) {
    if (numBitsToSet == 0) {
        // Nothing to do; returning early also avoids touching the entry at `offset`, which the
        // caller is not required to own for an empty range.
        return;
    }

    auto [firstEntryPos, firstBitPos] = getNullEntryAndBitPos(offset);
    // Position one past the last bit to set (exclusive end of the range)
    auto [lastEntryPos, lastBitPos] = getNullEntryAndBitPos(offset + numBitsToSet);

    // If the range spans multiple entries, set the entries in the middle to the appropriate value
    // with std::fill
    if (lastEntryPos > firstEntryPos + 1) {
        std::fill(nullEntries + firstEntryPos + 1, nullEntries + lastEntryPos,
            isNull ? ALL_NULL_ENTRY : NO_NULL_ENTRY);
    }

    if (firstEntryPos == lastEntryPos) {
        if (isNull) {
            // Set bits in [firstBitPos, lastBitPos) to true
            nullEntries[firstEntryPos] |= (~NULL_LOWER_MASKS[firstBitPos] &
                                           ~NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]);
        } else {
            // Set bits in [firstBitPos, lastBitPos) to false
            nullEntries[firstEntryPos] &= (NULL_LOWER_MASKS[firstBitPos] |
                                           NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]);
        }
    } else {
        if (isNull) {
            // Set bits including and after the first bit pos to true.
            // (Previously used ~NULL_HIGH_MASKS[firstBitPos], which set the low bits instead.)
            nullEntries[firstEntryPos] |= ~NULL_LOWER_MASKS[firstBitPos];
            // When lastBitPos == 0 the range ends exactly on an entry boundary, so the last entry
            // holds no bits of the range and must not be touched.
            if (lastBitPos > 0) {
                // Set bits before the last bit pos to true.
                // (Previously used ~NULL_LOWER_MASKS[...], which set the high bits instead.)
                nullEntries[lastEntryPos] |= ~NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos];
            }
        } else {
            // Set bits including and after the first bit pos to false
            nullEntries[firstEntryPos] &= NULL_LOWER_MASKS[firstBitPos];
            if (lastBitPos > 0) {
                // Set bits before the last bit pos to false
                nullEntries[lastEntryPos] &= NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos];
            }
        }
    }
}

} // namespace common
} // namespace kuzu
3 changes: 3 additions & 0 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ void LogicalType::setPhysicalType() {
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::NULL_: {
physicalType = PhysicalTypeID::NULL_;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
Expand Down
5 changes: 5 additions & 0 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ bool NodeIDVector::discardNull(ValueVector& vector) {
}
}

// Forwards to the vector's null mask: copies numBitsToCopy packed null bits from srcNullEntries
// (starting at bit srcOffset) into the mask (starting at bit dstOffset) and returns the mask's
// result.
bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy) {
    const bool copiedNull =
        nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy);
    return copiedNull;
}

template<typename T>
void ValueVector::setValue(uint32_t pos, T val) {
((T*)valueBuffer.get())[pos] = val;
Expand Down
27 changes: 22 additions & 5 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,15 @@ class NullMask {
mayContainNulls = true;
}

inline void setMayContainNulls() { mayContainNulls = true; }
inline bool hasNoNullsGuarantee() const { return !mayContainNulls; }

void setNull(uint32_t pos, bool isNull);
static void setNull(uint64_t* nullEntries, uint32_t pos, bool isNull);
// Sets the null bit at pos in this mask's own data.
inline void setNull(uint32_t pos, bool isNull) {
    setNull(data, pos, isNull);
    // The flag is only ever raised here: marking a value as null sets it, marking a value as
    // non-null leaves it as it was.
    mayContainNulls = mayContainNulls || isNull;
}

static inline bool isNull(const uint64_t* nullEntries, uint32_t pos) {
auto [entryPos, bitPosInEntry] = getNullEntryAndBitPos(pos);
Expand All @@ -108,18 +113,30 @@ class NullMask {

inline bool isNull(uint32_t pos) const { return isNull(data, pos); }

inline uint64_t* getData() { return data; }
// const because updates to the data must set mayContainNulls if any value
// becomes null
// Modifying the underlying data should be done with setNull or copyFromNullBits
inline const uint64_t* getData() { return data; }

// Returns the number of uint64_t null entries needed to hold numNullBits bits, rounding up to a
// whole entry.
static inline uint64_t getNumNullEntries(uint64_t numNullBits) {
    const uint64_t numFullEntries = numNullBits >> NUM_BITS_PER_NULL_ENTRY_LOG2;
    // Bits left over after the full entries. The previous code subtracted
    // (numNullBits << LOG2) here, which is non-zero for every non-zero input and therefore always
    // added an extra entry, even for exact multiples of the entry size.
    const uint64_t numRemainingBits =
        numNullBits - (numFullEntries << NUM_BITS_PER_NULL_ENTRY_LOG2);
    return numFullEntries + (numRemainingBits == 0 ? 0 : 1);
}

// This function returns true if we have copied a nullBit with value 1 (indicate a null
// value) to dstNullEntries.
// Copies bitpacked null flags from one buffer to another, starting at an arbitrary bit
// offset and preserving adjacent bits.
//
// returns true if we have copied a nullBit with value 1 (indicates a null value) to
// dstNullEntries.
static bool copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy);

bool copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);

// Sets the given number of bits to null (if isNull is true) or non-null (if isNull is false), starting at the offset
static void setNullRange(uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull);

void resize(uint64_t capacity);

private:
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ KUZU_API enum class LogicalTypeID : uint8_t {
SERIAL = 13,

// fixed size types
NULL_ = 21,
BOOL = 22,
INT64 = 23,
INT32 = 24,
Expand Down Expand Up @@ -110,6 +111,7 @@ enum class PhysicalTypeID : uint8_t {
INTERVAL = 7,
INTERNAL_ID = 9,
ARROW_COLUMN = 10,
NULL_ = 11,

// Variable size types.
STRING = 20,
Expand Down
6 changes: 4 additions & 2 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,24 @@ class ValueVector {

inline void setAllNull() { nullMask->setAllNull(); }
inline void setAllNonNull() { nullMask->setAllNonNull(); }
inline void setMayContainNulls() { nullMask->setMayContainNulls(); }
// On return true, there are no nulls. On return false, there may or may not be nulls.
inline bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
// Marks the len positions starting at startPos as non-null, one position at a time.
inline void setRangeNonNull(uint32_t startPos, uint32_t len) {
    uint32_t numDone = 0;
    while (numDone < len) {
        setNull(startPos + numDone, false);
        ++numDone;
    }
}
inline uint64_t* getNullMaskData() { return nullMask->getData(); }
inline const uint64_t* getNullMaskData() { return nullMask->getData(); }
inline void setNull(uint32_t pos, bool isNull) { nullMask->setNull(pos, isNull); }
inline uint8_t isNull(uint32_t pos) const { return nullMask->isNull(pos); }
// Shrinks the selection to a single position and marks that position as null.
inline void setAsSingleNullEntry() {
    auto& selection = *state->selVector;
    selection.selectedSize = 1;
    const auto firstSelectedPos = selection.selectedPositions[0];
    setNull(firstSelectedPos, true);
}

bool setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);

inline uint32_t getNumBytesPerValue() const { return numBytesPerValue; }

template<typename T>
Expand Down
43 changes: 35 additions & 8 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ class NullColumnChunk;
// Currently, `InMemColumnChunk` is used to populate rel columns. Eventually, we will merge them.
class ColumnChunk {
public:
friend class ColumnChunkFactory;

// ColumnChunks must be initialized after construction, so this constructor should only be used
// through the ColumnChunkFactory
explicit ColumnChunk(common::LogicalType dataType, common::CopyDescription* copyDescription,
bool hasNullChunk = true);
virtual ~ColumnChunk() = default;
Expand Down Expand Up @@ -51,6 +55,7 @@ class ColumnChunk {

virtual common::page_idx_t flushBuffer(BMFileHandle* dataFH, common::page_idx_t startPageIdx);

// Returns the size of the data type in bytes
static uint32_t getDataTypeSizeInChunk(common::LogicalType& dataType);

template<typename T>
Expand All @@ -68,11 +73,13 @@ class ColumnChunk {

virtual void write(const common::Value& val, uint64_t posToWrite);

// numValues must be greater than the number of values the ColumnChunk was first initialized
// with
virtual void resize(uint64_t numValues);

protected:
ColumnChunk(common::LogicalType dataType, common::offset_t numValues,
common::CopyDescription* copyDescription, bool hasNullChunk);
// Initializes the data buffer. Is (and should be) only called in constructor.
virtual void initialize(common::offset_t numValues);

template<typename T>
void templateCopyArrowArray(
Expand Down Expand Up @@ -106,19 +113,39 @@ class ColumnChunk {
class NullColumnChunk : public ColumnChunk {
public:
NullColumnChunk()
: ColumnChunk(common::LogicalType(common::LogicalTypeID::BOOL),
nullptr /* copyDescription */, false /* hasNullChunk */) {
resetNullBuffer();
}
: ColumnChunk(common::LogicalType(common::LogicalTypeID::NULL_),
nullptr /* copyDescription */, false /* hasNullChunk */) {}

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }

inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
inline void setNull(common::offset_t pos, bool isNull) { ((bool*)buffer.get())[pos] = isNull; }
// Returns whether the null bit at pos is set in this chunk's bit-packed buffer.
inline bool isNull(common::offset_t pos) const {
    // The buffer size is rounded up to a multiple of 8 bytes, so viewing it as uint64_t entries
    // is safe
    const auto* nullEntries = reinterpret_cast<const uint64_t*>(buffer.get());
    return common::NullMask::isNull(nullEntries, pos);
}
// Sets or clears the null bit at pos in this chunk's bit-packed buffer.
inline void setNull(common::offset_t pos, bool isNull) {
    // The buffer size is rounded up to a multiple of 8 bytes, so viewing it as uint64_t entries
    // is safe
    auto* nullEntries = reinterpret_cast<uint64_t*>(buffer.get());
    common::NullMask::setNull(nullEntries, pos, isNull);
}

void append(NullColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend);

void resize(uint64_t numValues) final;

void setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet);

protected:
// Returns the buffer size in bytes needed to store one null bit per value, rounded up to a
// multiple of 8 bytes so the buffer can be addressed as whole uint64_t entries.
uint64_t numBytesForValues(common::offset_t numValues) const {
    // 8 values per byte and 8 bytes per entry => 64 values per entry. Integer ceil-division keeps
    // the result exact for every input (the previous double-based ceil() loses precision for very
    // large counts) and does not depend on <cmath>.
    constexpr uint64_t numValuesPerEntry = 64;
    const uint64_t numEntries =
        numValues / numValuesPerEntry + (numValues % numValuesPerEntry == 0 ? 0 : 1);
    return numEntries * sizeof(uint64_t);
}
// Allocates the bit-packed null buffer for numValues values.
void initialize(common::offset_t numValues) final {
// NOTE(review): presumably zeroed because a whole-byte per-value size is not meaningful for a
// bit-packed buffer -- confirm no inherited code path still multiplies by it.
numBytesPerValue = 0;
// Size comes from numBytesForValues (one bit per value, padded to a multiple of 8 bytes)
numBytes = numBytesForValues(numValues);
// Each byte defaults to 0, indicating everything is non-null
buffer = std::make_unique<uint8_t[]>(numBytes);
}
};

class FixedListColumnChunk : public ColumnChunk {
Expand Down
2 changes: 1 addition & 1 deletion src/include/storage/storage_structure/lists/lists.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ struct InMemList {

inline uint8_t* getListData() const { return listData.get(); }
inline bool hasNullBuffer() const { return nullMask != nullptr; }
inline uint64_t* getNullMask() const { return nullMask->getData(); }
inline common::NullMask* getNullMask() const { return nullMask.get(); }

uint64_t numElements;
std::unique_ptr<uint8_t[]> listData;
Expand Down
56 changes: 41 additions & 15 deletions src/storage/copier/column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,19 @@ namespace kuzu {
namespace storage {

ColumnChunk::ColumnChunk(LogicalType dataType, CopyDescription* copyDescription, bool hasNullChunk)
: ColumnChunk{
std::move(dataType), StorageConstants::NODE_GROUP_SIZE, copyDescription, hasNullChunk} {}

ColumnChunk::ColumnChunk(
LogicalType dataType, offset_t numValues, CopyDescription* copyDescription, bool hasNullChunk)
: dataType{std::move(dataType)}, numBytesPerValue{getDataTypeSizeInChunk(this->dataType)},
numBytes{numBytesPerValue * numValues}, copyDescription{copyDescription} {
buffer = std::make_unique<uint8_t[]>(numBytes);
copyDescription{copyDescription} {
if (hasNullChunk) {
nullChunk = std::make_unique<NullColumnChunk>();
}
}

// Allocates the data buffer for numValues values and, if this chunk has a null chunk,
// initializes that null chunk for the same number of values.
void ColumnChunk::initialize(offset_t numValues) {
    numBytes = numBytesPerValue * numValues;
    buffer = std::make_unique<uint8_t[]>(numBytes);
    // nullChunk is only created when the chunk was constructed with hasNullChunk == true, so it
    // must be checked before use (resetToEmpty applies the same guard).
    if (nullChunk) {
        // Called through the base-class pointer; presumably because the override in
        // NullColumnChunk is protected and would not be accessible from here otherwise.
        static_cast<ColumnChunk*>(nullChunk.get())->initialize(numValues);
    }
}

void ColumnChunk::resetToEmpty() {
if (nullChunk) {
nullChunk->resetNullBuffer();
Expand Down Expand Up @@ -283,12 +283,30 @@ uint32_t ColumnChunk::getDataTypeSizeInChunk(common::LogicalType& dataType) {
case LogicalTypeID::INTERNAL_ID: {
return sizeof(offset_t);
}
// This should never be used for Nulls,
// which use a different way of calculating the buffer size
// FIXME(bmwinger): Setting this to 0 breaks everything.
// It's being used in NullNodeColumn, and maybe there are some functions
// relying on it despite the value being meaningless for a null bitfield.
case LogicalTypeID::NULL_: {
return 1;
}
default: {
return StorageUtils::getDataTypeSize(dataType);
}
}
}

// TODO(bmwinger): Eventually, to support bitpacked bools, all these functions will need to be
// updated to support values sizes of less than one byte.
// But for the moment, this is the only generic ColumnChunk function which is needed by
// NullColumnChunk, and it's invoked directly on the nullColumn, so we don't need dynamic dispatch
//
// Copies numValuesToAppend null bits from `other` (starting at bit startPosInOtherChunk) into this
// chunk (starting at bit startPosInChunk). The result of copyNullMask is not used.
void NullColumnChunk::append(NullColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) {
    const auto* srcNullEntries = reinterpret_cast<const uint64_t*>(other->buffer.get());
    auto* dstNullEntries = reinterpret_cast<uint64_t*>(buffer.get());
    NullMask::copyNullMask(
        srcNullEntries, startPosInOtherChunk, dstNullEntries, startPosInChunk, numValuesToAppend);
}

void FixedListColumnChunk::append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) {
auto otherChunk = (FixedListColumnChunk*)other;
Expand Down Expand Up @@ -342,6 +360,7 @@ void FixedListColumnChunk::write(const common::Value& fixedListVal, uint64_t pos

std::unique_ptr<ColumnChunk> ColumnChunkFactory::createColumnChunk(
const LogicalType& dataType, CopyDescription* copyDescription) {
std::unique_ptr<ColumnChunk> chunk;
switch (dataType.getPhysicalType()) {
case PhysicalTypeID::BOOL:
case PhysicalTypeID::INT64:
Expand All @@ -350,21 +369,28 @@ std::unique_ptr<ColumnChunk> ColumnChunkFactory::createColumnChunk(
case PhysicalTypeID::DOUBLE:
case PhysicalTypeID::FLOAT:
case PhysicalTypeID::INTERVAL:
return std::make_unique<ColumnChunk>(dataType, copyDescription);
chunk = std::make_unique<ColumnChunk>(dataType, copyDescription);
break;
case PhysicalTypeID::FIXED_LIST:
return std::make_unique<FixedListColumnChunk>(dataType, copyDescription);
chunk = std::make_unique<FixedListColumnChunk>(dataType, copyDescription);
break;
case PhysicalTypeID::STRING:
return std::make_unique<StringColumnChunk>(dataType, copyDescription);
chunk = std::make_unique<StringColumnChunk>(dataType, copyDescription);
break;
case PhysicalTypeID::VAR_LIST:
return std::make_unique<VarListColumnChunk>(dataType, copyDescription);
chunk = std::make_unique<VarListColumnChunk>(dataType, copyDescription);
break;
case PhysicalTypeID::STRUCT:
return std::make_unique<StructColumnChunk>(dataType, copyDescription);
chunk = std::make_unique<StructColumnChunk>(dataType, copyDescription);
break;
default: {
throw NotImplementedException("ColumnChunkFactory::createColumnChunk for data type " +
LogicalTypeUtils::dataTypeToString(dataType) +
" is not supported.");
}
}
chunk->initialize(StorageConstants::NODE_GROUP_SIZE);
return chunk;
}

// Bool
Expand Down Expand Up @@ -417,7 +443,8 @@ common::offset_t ColumnChunk::getOffsetInBuffer(common::offset_t pos) const {
}

void NullColumnChunk::resize(uint64_t numValues) {
auto numBytesAfterResize = numValues * numBytesPerValue;
auto numBytesAfterResize = numBytesForValues(numValues);
assert(numBytesAfterResize > numBytes);
auto reservedBuffer = std::make_unique<uint8_t[]>(numBytesAfterResize);
memset(reservedBuffer.get(), 0 /* non null */, numBytesAfterResize);
memcpy(reservedBuffer.get(), buffer.get(), numBytes);
Expand All @@ -426,8 +453,7 @@ void NullColumnChunk::resize(uint64_t numValues) {
}

void NullColumnChunk::setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet) {
memset(buffer.get() + startPosInChunk * numBytesPerValue, 0 /* non null */,
numValuesToSet * numBytesPerValue);
NullMask::setNullRange((uint64_t*)buffer.get(), startPosInChunk, numValuesToSet, false);
}

} // namespace storage
Expand Down
Loading

0 comments on commit 985f5e2

Please sign in to comment.