Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store nulls as densely packed bitfields #1862

Merged
merged 1 commit into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 54 additions & 4 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
namespace kuzu {
namespace common {

// Sets (isNull == true) or clears (isNull == false) the null bit for position `pos` in a
// bit-packed buffer of null entries.
// NOTE(review): this span appears to be a rendered diff, so pre-change and post-change lines are
// interleaved: two signatures are shown, and each `data[...]` statement is followed by its
// `nullEntries[...]` counterpart. Confirm against the real file before editing.
void NullMask::setNull(uint32_t pos, bool isNull) {
void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {
// Index of the entry that holds the bit for `pos`.
auto entryPos = pos >> NUM_BITS_PER_NULL_ENTRY_LOG2;
// Position of that bit inside its entry (pos minus the first bit position of the entry).
auto bitPosInEntry = pos - (entryPos << NUM_BITS_PER_NULL_ENTRY_LOG2);
if (isNull) {
data[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
mayContainNulls = true;
// OR with the per-bit mask (presumably only bit `bitPosInEntry` set) to mark the value null.
nullEntries[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry];
} else {
data[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry];
// AND with the per-bit mask (presumably only bit `bitPosInEntry` clear) to mark it non-null.
nullEntries[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry];
}
}

Expand Down Expand Up @@ -87,5 +86,56 @@
numNullEntries = capacity;
}

// Copies `numBitsToCopy` null bits from `srcNullEntries` (starting at bit `srcOffset`) into this
// mask's own buffer (starting at bit `dstOffset`).
//
// Returns the result of copyNullMask; when that result is true, this mask is also flagged as
// possibly containing nulls.
bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
    uint64_t dstOffset, uint64_t numBitsToCopy) {
    const bool copiedNull =
        copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy);
    if (!copiedNull) {
        return false;
    }
    this->mayContainNulls = true;
    return true;
}

// Sets `numBitsToSet` consecutive bits of the bit-packed buffer `nullEntries`, starting at bit
// `offset`, to null (isNull == true) or non-null (isNull == false).
//
// Bits outside [offset, offset + numBitsToSet) are preserved. An empty range is a no-op and does
// not touch the buffer at all.
void NullMask::setNullRange(
    uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull) {
    if (numBitsToSet == 0) {
        return;
    }

    auto [firstEntryPos, firstBitPos] = getNullEntryAndBitPos(offset);
    // One-past-the-end position of the range: bits strictly before lastBitPos in lastEntryPos
    // belong to the range.
    auto [lastEntryPos, lastBitPos] = getNullEntryAndBitPos(offset + numBitsToSet);

    // Updates a single entry: bits selected by `keepMask` are preserved, every other bit is
    // forced to the requested value. Using one helper guarantees that the null and non-null
    // paths always operate on exactly complementary bit sets.
    const auto updateEntry = [isNull](uint64_t& entry, uint64_t keepMask) {
        if (isNull) {
            entry |= ~keepMask;
        } else {
            entry &= keepMask;
        }
    };

    // If the range spans multiple entries, set the entries in the middle to the appropriate value
    // with std::fill
    if (lastEntryPos > firstEntryPos + 1) {
        std::fill(nullEntries + firstEntryPos + 1, nullEntries + lastEntryPos,
            isNull ? ALL_NULL_ENTRY : NO_NULL_ENTRY);
    }

    if (firstEntryPos == lastEntryPos) {
        // Range lies inside one entry: keep the bits before firstBitPos and the bits at or after
        // lastBitPos.
        updateEntry(nullEntries[firstEntryPos],
            NULL_LOWER_MASKS[firstBitPos] | NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]);
    } else {
        // First entry: keep only the bits before firstBitPos.
        updateEntry(nullEntries[firstEntryPos], NULL_LOWER_MASKS[firstBitPos]);
        // Last entry: keep only the bits at or after lastBitPos. When lastBitPos is 0 the range
        // ends exactly on an entry boundary and the last entry must not be touched (it may lie
        // past the end of the buffer).
        if (lastBitPos > 0) {
            updateEntry(nullEntries[lastEntryPos],
                NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]);
        }
    }
}

} // namespace common
} // namespace kuzu
3 changes: 3 additions & 0 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ void LogicalType::setPhysicalType() {
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::NULL_: {
physicalType = PhysicalTypeID::NULL_;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
Expand Down
5 changes: 5 additions & 0 deletions src/common/vector/value_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ bool NodeIDVector::discardNull(ValueVector& vector) {
}
}

// Copies `numBitsToCopy` bit-packed null flags from `srcNullEntries` (starting at bit
// `srcOffset`) into this vector's null mask (starting at bit `dstOffset`).
// Returns whatever NullMask::copyFromNullBits reports for the copy.
bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset,
    uint64_t dstOffset, uint64_t numBitsToCopy) {
    const bool copiedNull =
        nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy);
    return copiedNull;
}

template<typename T>
void ValueVector::setValue(uint32_t pos, T val) {
((T*)valueBuffer.get())[pos] = val;
Expand Down
29 changes: 24 additions & 5 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,15 @@ class NullMask {
mayContainNulls = true;
}

inline void setMayContainNulls() { mayContainNulls = true; }
inline bool hasNoNullsGuarantee() const { return !mayContainNulls; }

void setNull(uint32_t pos, bool isNull);
static void setNull(uint64_t* nullEntries, uint32_t pos, bool isNull);
// Marks position `pos` of this mask as null or non-null. Writing a null also raises the
// mayContainNulls flag; writing a non-null never lowers it.
inline void setNull(uint32_t pos, bool isNull) {
    setNull(data, pos, isNull);
    if (!isNull) {
        return;
    }
    mayContainNulls = true;
}

static inline bool isNull(const uint64_t* nullEntries, uint32_t pos) {
auto [entryPos, bitPosInEntry] = getNullEntryAndBitPos(pos);
Expand All @@ -108,18 +113,32 @@ class NullMask {

inline bool isNull(uint32_t pos) const { return isNull(data, pos); }

inline uint64_t* getData() { return data; }
// Returns a read-only pointer to the underlying null entries.
// The pointer is const because any update that makes a value null must also set
// mayContainNulls. Modifying the underlying data should be done with setNull or
// copyFromNullBits.
inline const uint64_t* getData() { return data; }

// Returns the number of null entries needed to hold `numNullBits` bits, i.e. numNullBits divided
// by the number of bits per entry, rounded up.
//
// The remainder is computed against the number of bits covered by the full entries. The previous
// expression subtracted `numNullBits << LOG2` instead, which is non-zero for every non-zero
// input and therefore reported one entry too many for exact multiples of the entry width.
static inline uint64_t getNumNullEntries(uint64_t numNullBits) {
    const uint64_t numFullEntries = numNullBits >> NUM_BITS_PER_NULL_ENTRY_LOG2;
    const uint64_t numTrailingBits =
        numNullBits - (numFullEntries << NUM_BITS_PER_NULL_ENTRY_LOG2);
    return numFullEntries + (numTrailingBits == 0 ? 0 : 1);
}

// This function returns true if we have copied a nullBit with value 1 (indicate a null
// value) to dstNullEntries.
// Copies bitpacked null flags from one buffer to another, starting at an arbitrary bit
// offset and preserving adjacent bits.
//
// returns true if we have copied a nullBit with value 1 (indicates a null value) to
// dstNullEntries.
static bool copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy);

bool copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);

// Sets the given number of bits to null (if isNull is true) or non-null (if isNull is false),
// starting at the offset
static void setNullRange(
uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull);

void resize(uint64_t capacity);

private:
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ KUZU_API enum class LogicalTypeID : uint8_t {
SERIAL = 13,

// fixed size types
NULL_ = 21,
BOOL = 22,
INT64 = 23,
INT32 = 24,
Expand Down Expand Up @@ -110,6 +111,7 @@ enum class PhysicalTypeID : uint8_t {
INTERVAL = 7,
INTERNAL_ID = 9,
ARROW_COLUMN = 10,
NULL_ = 11,

// Variable size types.
STRING = 20,
Expand Down
6 changes: 4 additions & 2 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,24 @@ class ValueVector {

inline void setAllNull() { nullMask->setAllNull(); }
inline void setAllNonNull() { nullMask->setAllNonNull(); }
inline void setMayContainNulls() { nullMask->setMayContainNulls(); }
// On return true, there are no nulls. On return false, there may or may not be nulls.
inline bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
// Marks the `len` positions starting at `startPos` as non-null, one position at a time.
inline void setRangeNonNull(uint32_t startPos, uint32_t len) {
    auto pos = startPos;
    for (auto remaining = len; remaining > 0; --remaining) {
        setNull(pos, false);
        ++pos;
    }
}
inline uint64_t* getNullMaskData() { return nullMask->getData(); }
inline const uint64_t* getNullMaskData() { return nullMask->getData(); }
inline void setNull(uint32_t pos, bool isNull) { nullMask->setNull(pos, isNull); }
inline uint8_t isNull(uint32_t pos) const { return nullMask->isNull(pos); }
// Shrinks the selection to a single position and marks that position as null.
inline void setAsSingleNullEntry() {
    auto& selVector = state->selVector;
    selVector->selectedSize = 1;
    // The first selected position is read after the size has been updated, as before.
    const auto firstSelectedPos = selVector->selectedPositions[0];
    setNull(firstSelectedPos, true);
}

bool setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);

inline uint32_t getNumBytesPerValue() const { return numBytesPerValue; }

template<typename T>
Expand Down
43 changes: 35 additions & 8 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ class NullColumnChunk;
// Currently, `InMemColumnChunk` is used to populate rel columns. Eventually, we will merge them.
class ColumnChunk {
public:
friend class ColumnChunkFactory;

// ColumnChunks must be initialized after construction, so this constructor should only be used
// through the ColumnChunkFactory
explicit ColumnChunk(common::LogicalType dataType, common::CopyDescription* copyDescription,
bool hasNullChunk = true);
virtual ~ColumnChunk() = default;
Expand Down Expand Up @@ -51,6 +55,7 @@ class ColumnChunk {

virtual common::page_idx_t flushBuffer(BMFileHandle* dataFH, common::page_idx_t startPageIdx);

// Returns the size of the data type in bytes
static uint32_t getDataTypeSizeInChunk(common::LogicalType& dataType);

template<typename T>
Expand All @@ -68,11 +73,13 @@ class ColumnChunk {

virtual void write(const common::Value& val, uint64_t posToWrite);

// numValues must be at least the number of values the ColumnChunk was first initialized
// with
virtual void resize(uint64_t numValues);

protected:
ColumnChunk(common::LogicalType dataType, common::offset_t numValues,
common::CopyDescription* copyDescription, bool hasNullChunk);
// Initializes the data buffer. Is (and should be) only called in constructor.
virtual void initialize(common::offset_t numValues);

template<typename T>
void templateCopyArrowArray(
Expand Down Expand Up @@ -106,19 +113,39 @@ class ColumnChunk {
class NullColumnChunk : public ColumnChunk {
public:
NullColumnChunk()
: ColumnChunk(common::LogicalType(common::LogicalTypeID::BOOL),
nullptr /* copyDescription */, false /* hasNullChunk */) {
resetNullBuffer();
ray6080 marked this conversation as resolved.
Show resolved Hide resolved
}
: ColumnChunk(common::LogicalType(common::LogicalTypeID::NULL_),
nullptr /* copyDescription */, false /* hasNullChunk */) {}

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }

inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
inline void setNull(common::offset_t pos, bool isNull) { ((bool*)buffer.get())[pos] = isNull; }
// Returns whether the value at `pos` is flagged as null in the bit-packed buffer.
inline bool isNull(common::offset_t pos) const {
    // Buffer is rounded up to the nearest 8 bytes so that viewing it as 64-bit entries is safe
    const auto* nullEntries = reinterpret_cast<const uint64_t*>(buffer.get());
    return common::NullMask::isNull(nullEntries, pos);
}
// Flags the value at `pos` as null (isNull == true) or non-null (isNull == false) in the
// bit-packed buffer.
inline void setNull(common::offset_t pos, bool isNull) {
    // Buffer is rounded up to the nearest 8 bytes so that viewing it as 64-bit entries is safe
    auto* nullEntries = reinterpret_cast<uint64_t*>(buffer.get());
    common::NullMask::setNull(nullEntries, pos, isNull);
}

void append(NullColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend);

void resize(uint64_t numValues) final;

void setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet);

protected:
// Returns the buffer size, in bytes, needed to store one null bit per value.
// 8 values fit in a byte, and the size is rounded up to a multiple of 8 bytes so the buffer can
// be addressed as whole 64-bit entries.
//
// Computed with integer arithmetic only: the result is exact for every input and does not rely
// on a floating-point round trip.
uint64_t numBytesForValues(common::offset_t numValues) const {
    constexpr uint64_t numValuesPerEntry = 64; // 8 bytes * 8 bits
    constexpr uint64_t numBytesPerEntry = 8;
    const uint64_t numEntries =
        numValues / numValuesPerEntry + (numValues % numValuesPerEntry == 0 ? 0 : 1);
    return numEntries * numBytesPerEntry;
}
// Allocates the bit-packed null buffer for `numValues` values.
void initialize(common::offset_t numValues) final {
    const auto requiredBytes = numBytesForValues(numValues);
    // Values are stored as single bits, so there is no whole-byte size per value.
    numBytesPerValue = 0;
    numBytes = requiredBytes;
    // make_unique value-initializes the array, so every byte starts at 0 (everything non-null)
    buffer = std::make_unique<uint8_t[]>(numBytes);
}
};

class FixedListColumnChunk : public ColumnChunk {
Expand Down
2 changes: 1 addition & 1 deletion src/include/storage/storage_structure/lists/lists.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ struct InMemList {

inline uint8_t* getListData() const { return listData.get(); }
inline bool hasNullBuffer() const { return nullMask != nullptr; }
inline uint64_t* getNullMask() const { return nullMask->getData(); }
inline common::NullMask* getNullMask() const { return nullMask.get(); }

uint64_t numElements;
std::unique_ptr<uint8_t[]> listData;
Expand Down
Loading
Loading