Skip to content

Commit

Permalink
Merge pull request #1862 from kuzudb/bitpacking
Browse files Browse the repository at this point in the history
Store nulls as densely packed bitfields
  • Loading branch information
ray6080 committed Aug 9, 2023
2 parents 25dfe2c + 2414fe3 commit 95d3ba3
Show file tree
Hide file tree
Showing 15 changed files with 219 additions and 55 deletions.
58 changes: 54 additions & 4 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
namespace kuzu {
namespace common {

void NullMask::setNull(uint32_t pos, bool isNull) {
void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {
auto entryPos = pos >> NUM_BITS_PER_NULL_ENTRY_LOG2;
auto bitPosInEntry = pos - (entryPos << NUM_BITS_PER_NULL_ENTRY_LOG2);
if (isNull) {
data[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
mayContainNulls = true;
nullEntries[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
} else {
data[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry];
nullEntries[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry];
}
}

Expand Down Expand Up @@ -87,5 +86,56 @@ void NullMask::resize(uint64_t capacity) {
numNullEntries = capacity;
}

bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t dstOffset, uint64_t numBitsToCopy) {
if (copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy)) {
this->mayContainNulls = true;
return true;
}
return false;
}

// Sets numBitsToSet consecutive bits of the packed buffer nullEntries, starting at bit
// `offset`, to null (isNull == true) or non-null (isNull == false). Bits outside the half-open
// range [offset, offset + numBitsToSet) are left untouched.
//
// The masking below relies on NULL_LOWER_MASKS[n] selecting the n lowest bits of an entry and
// NULL_HIGH_MASKS[n] selecting the n highest bits.
void NullMask::setNullRange(
    uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull) {
    if (numBitsToSet == 0) {
        // Nothing to do. Returning early also avoids touching the entry at `offset`, which may
        // be one past the end of the buffer when an empty range starts exactly at its end.
        return;
    }
    auto [firstEntryPos, firstBitPos] = getNullEntryAndBitPos(offset);
    // Position one past the last bit of the range (exclusive end).
    auto [lastEntryPos, lastBitPos] = getNullEntryAndBitPos(offset + numBitsToSet);

    // If the range spans multiple entries, set the entries in the middle to the appropriate value
    // with std::fill
    if (lastEntryPos > firstEntryPos + 1) {
        std::fill(nullEntries + firstEntryPos + 1, nullEntries + lastEntryPos,
            isNull ? ALL_NULL_ENTRY : NO_NULL_ENTRY);
    }

    if (firstEntryPos == lastEntryPos) {
        // The whole range lives in one entry: touch only bits [firstBitPos, lastBitPos).
        if (isNull) {
            // Set bits between the first and the last bit pos to true
            nullEntries[firstEntryPos] |= (~NULL_LOWER_MASKS[firstBitPos] &
                                           ~NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]);
        } else {
            // Set bits between the first and the last bit pos to false
            nullEntries[firstEntryPos] &= (NULL_LOWER_MASKS[firstBitPos] |
                                           NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]);
        }
    } else {
        // The range starts in one entry and ends in a later one. The masks used for setting are
        // the exact complements of the masks used for clearing, so both branches affect the same
        // bits.
        if (isNull) {
            // Set bits including and after the first bit pos to true
            nullEntries[firstEntryPos] |= ~NULL_LOWER_MASKS[firstBitPos];
            // When the range ends on an entry boundary the last entry holds no bit of the range
            // (and may not exist), so it must not be touched.
            if (lastBitPos > 0) {
                // Set bits before the last bit pos to true
                nullEntries[lastEntryPos] |=
                    ~NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos];
            }
        } else {
            // Set bits including and after the first bit pos to false
            nullEntries[firstEntryPos] &= NULL_LOWER_MASKS[firstBitPos];
            if (lastBitPos > 0) {
                // Set bits before the last bit pos to false
                nullEntries[lastEntryPos] &= NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos];
            }
        }
    }
}

} // namespace common
} // namespace kuzu
3 changes: 3 additions & 0 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ void LogicalType::setPhysicalType() {
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::NULL_: {
physicalType = PhysicalTypeID::NULL_;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
Expand Down
5 changes: 5 additions & 0 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ bool NodeIDVector::discardNull(ValueVector& vector) {
}
}

// Copies numBitsToCopy packed null bits from srcNullEntries (starting at bit srcOffset) into this
// vector's null mask (starting at bit dstOffset).
// Forwards to NullMask::copyFromNullBits and returns its result unchanged.
bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
    uint64_t dstOffset, uint64_t numBitsToCopy) {
    const bool copiedNull =
        nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy);
    return copiedNull;
}

template<typename T>
void ValueVector::setValue(uint32_t pos, T val) {
((T*)valueBuffer.get())[pos] = val;
Expand Down
29 changes: 24 additions & 5 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,15 @@ class NullMask {
mayContainNulls = true;
}

inline void setMayContainNulls() { mayContainNulls = true; }
inline bool hasNoNullsGuarantee() const { return !mayContainNulls; }

void setNull(uint32_t pos, bool isNull);
static void setNull(uint64_t* nullEntries, uint32_t pos, bool isNull);
// Marks the value at pos in this mask as null or non-null.
// The bit itself is written through the static overload operating on this mask's data.
inline void setNull(uint32_t pos, bool isNull) {
    setNull(data, pos, isNull);
    // The flag is only ever raised here: writing a null turns it on, writing a non-null leaves
    // it as it was.
    mayContainNulls = mayContainNulls || isNull;
}

static inline bool isNull(const uint64_t* nullEntries, uint32_t pos) {
auto [entryPos, bitPosInEntry] = getNullEntryAndBitPos(pos);
Expand All @@ -108,18 +113,32 @@ class NullMask {

inline bool isNull(uint32_t pos) const { return isNull(data, pos); }

inline uint64_t* getData() { return data; }
// Returns a read-only view of the packed null entries.
// The pointer is const because any update that turns a value into null must also set
// mayContainNulls, which a write through a raw pointer would bypass.
// Modifying the underlying data should be done with setNull or copyFromNullBits.
inline const uint64_t* getData() { return data; }

// Returns the number of null entries needed to hold numNullBits bits, i.e. numNullBits divided by
// the number of bits per entry (1 << NUM_BITS_PER_NULL_ENTRY_LOG2), rounded up.
static inline uint64_t getNumNullEntries(uint64_t numNullBits) {
    const uint64_t numFullEntries = numNullBits >> NUM_BITS_PER_NULL_ENTRY_LOG2;
    // Bits that do not fill a whole entry; they need one extra, partially used entry.
    const uint64_t numTrailingBits =
        numNullBits - (numFullEntries << NUM_BITS_PER_NULL_ENTRY_LOG2);
    return numFullEntries + (numTrailingBits == 0 ? 0 : 1);
}

// This function returns true if we have copied a nullBit with value 1 (indicate a null
// value) to dstNullEntries.
// Copies bitpacked null flags from one buffer to another, starting at an arbitrary bit
// offset and preserving adjacent bits.
//
// returns true if we have copied a nullBit with value 1 (indicates a null value) to
// dstNullEntries.
static bool copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy);

bool copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);

// Sets the given number of bits to null (if isNull is true) or non-null (if isNull is false),
// starting at the offset
static void setNullRange(
uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull);

void resize(uint64_t capacity);

private:
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ KUZU_API enum class LogicalTypeID : uint8_t {
SERIAL = 13,

// fixed size types
NULL_ = 21,
BOOL = 22,
INT64 = 23,
INT32 = 24,
Expand Down Expand Up @@ -110,6 +111,7 @@ enum class PhysicalTypeID : uint8_t {
INTERVAL = 7,
INTERNAL_ID = 9,
ARROW_COLUMN = 10,
NULL_ = 11,

// Variable size types.
STRING = 20,
Expand Down
6 changes: 4 additions & 2 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,24 @@ class ValueVector {

inline void setAllNull() { nullMask->setAllNull(); }
inline void setAllNonNull() { nullMask->setAllNonNull(); }
inline void setMayContainNulls() { nullMask->setMayContainNulls(); }
// Returns true when the null mask guarantees that the vector holds no nulls.
// A return value of false gives no guarantee: there may or may not be nulls.
inline bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
// Marks the len consecutive positions starting at startPos as non-null, one position at a time.
inline void setRangeNonNull(uint32_t startPos, uint32_t len) {
    uint32_t pos = startPos;
    for (uint32_t remaining = len; remaining > 0; --remaining) {
        setNull(pos++, false);
    }
}
inline uint64_t* getNullMaskData() { return nullMask->getData(); }
inline const uint64_t* getNullMaskData() { return nullMask->getData(); }
inline void setNull(uint32_t pos, bool isNull) { nullMask->setNull(pos, isNull); }
inline uint8_t isNull(uint32_t pos) const { return nullMask->isNull(pos); }
// Shrinks the selection to a single position and marks that position as null.
inline void setAsSingleNullEntry() {
    auto& selection = *state->selVector;
    selection.selectedSize = 1;
    // The first selected position is the one that remains selected.
    setNull(selection.selectedPositions[0], true);
}

bool setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);

inline uint32_t getNumBytesPerValue() const { return numBytesPerValue; }

template<typename T>
Expand Down
43 changes: 35 additions & 8 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ class NullColumnChunk;
// Currently, `InMemColumnChunk` is used to populate rel columns. Eventually, we will merge them.
class ColumnChunk {
public:
friend class ColumnChunkFactory;

// ColumnChunks must be initialized after construction, so this constructor should only be used
// through the ColumnChunkFactory
explicit ColumnChunk(common::LogicalType dataType, common::CopyDescription* copyDescription,
bool hasNullChunk = true);
virtual ~ColumnChunk() = default;
Expand Down Expand Up @@ -51,6 +55,7 @@ class ColumnChunk {

virtual common::page_idx_t flushBuffer(BMFileHandle* dataFH, common::page_idx_t startPageIdx);

// Returns the size of the data type in bytes
static uint32_t getDataTypeSizeInChunk(common::LogicalType& dataType);

template<typename T>
Expand All @@ -68,6 +73,8 @@ class ColumnChunk {

virtual void write(const common::Value& val, uint64_t posToWrite);

// numValues must be at least the number of values the ColumnChunk was first initialized
// with
virtual void resize(uint64_t numValues);

template<typename T>
Expand All @@ -76,8 +83,8 @@ class ColumnChunk {
}

protected:
ColumnChunk(common::LogicalType dataType, common::offset_t numValues,
common::CopyDescription* copyDescription, bool hasNullChunk);
// Initializes the data buffer. Is (and should be) only called in constructor.
virtual void initialize(common::offset_t numValues);

template<typename T>
void templateCopyArrowArray(
Expand Down Expand Up @@ -106,19 +113,39 @@ class ColumnChunk {
class NullColumnChunk : public ColumnChunk {
public:
NullColumnChunk()
: ColumnChunk(common::LogicalType(common::LogicalTypeID::BOOL),
nullptr /* copyDescription */, false /* hasNullChunk */) {
resetNullBuffer();
}
: ColumnChunk(common::LogicalType(common::LogicalTypeID::NULL_),
nullptr /* copyDescription */, false /* hasNullChunk */) {}

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }

inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
inline void setNull(common::offset_t pos, bool isNull) { ((bool*)buffer.get())[pos] = isNull; }
// Returns whether the value at pos is flagged as null in the bit-packed buffer.
inline bool isNull(common::offset_t pos) const {
    // The buffer size is rounded up to a multiple of 8 bytes, so it can safely be viewed as an
    // array of 64-bit null entries.
    auto* nullEntries = reinterpret_cast<uint64_t*>(buffer.get());
    return common::NullMask::isNull(nullEntries, pos);
}
// Flags the value at pos as null (isNull == true) or non-null (isNull == false) in the
// bit-packed buffer.
inline void setNull(common::offset_t pos, bool isNull) {
    // The buffer size is rounded up to a multiple of 8 bytes, so it can safely be viewed as an
    // array of 64-bit null entries.
    auto* nullEntries = reinterpret_cast<uint64_t*>(buffer.get());
    common::NullMask::setNull(nullEntries, pos, isNull);
}

void append(NullColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend);

void resize(uint64_t numValues) final;

void setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet);

protected:
// Returns the buffer size in bytes needed to store one null bit for each of numValues values.
// The size is rounded up to a multiple of 8 bytes so that the buffer can be addressed as whole
// 64-bit entries.
uint64_t numBytesForValues(common::offset_t numValues) const {
    constexpr uint64_t numBitsPerEntry = 64;
    constexpr uint64_t numBytesPerEntry = 8;
    // Integer ceiling division: exact for every input, with no round trip through double.
    const uint64_t numEntries =
        numValues / numBitsPerEntry + (numValues % numBitsPerEntry == 0 ? 0 : 1);
    return numEntries * numBytesPerEntry;
}
// Allocates the bit-packed null buffer for numValues values.
void initialize(common::offset_t numValues) final {
    numBytes = numBytesForValues(numValues);
    // std::make_unique<T[]> value-initializes the array, so every byte starts at 0, which marks
    // every value as non-null.
    buffer = std::make_unique<uint8_t[]>(numBytes);
    // Values are stored as single bits, so no whole number of bytes per value applies.
    numBytesPerValue = 0;
}
};

class FixedListColumnChunk : public ColumnChunk {
Expand Down
2 changes: 1 addition & 1 deletion src/include/storage/storage_structure/lists/lists.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ struct InMemList {

inline uint8_t* getListData() const { return listData.get(); }
inline bool hasNullBuffer() const { return nullMask != nullptr; }
inline uint64_t* getNullMask() const { return nullMask->getData(); }
inline common::NullMask* getNullMask() const { return nullMask.get(); }

uint64_t numElements;
std::unique_ptr<uint8_t[]> listData;
Expand Down
Loading

0 comments on commit 95d3ba3

Please sign in to comment.