Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Boolean bitpacking #1884

Merged
merged 1 commit into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {
}

bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy) {
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
auto [srcNullEntryPos, srcNullBitPos] = getNullEntryAndBitPos(srcOffset);
auto [dstNullEntryPos, dstNullBitPos] = getNullEntryAndBitPos(dstOffset);
uint64_t bitPos = 0;
Expand All @@ -25,7 +25,8 @@ bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
auto curDstNullEntryPos = dstNullEntryPos;
auto curDstNullBitPos = dstNullBitPos;
uint64_t numBitsToReadInCurrentEntry;
uint64_t srcNullMaskEntry = srcNullEntries[srcNullEntryPos];
uint64_t srcNullMaskEntry =
invert ? ~srcNullEntries[srcNullEntryPos] : srcNullEntries[srcNullEntryPos];
if (dstNullBitPos < srcNullBitPos) {
numBitsToReadInCurrentEntry =
std::min(NullMask::NUM_BITS_PER_NULL_ENTRY - srcNullBitPos, numBitsToCopy - bitPos);
Expand Down
3 changes: 0 additions & 3 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,6 @@ void LogicalType::setPhysicalType() {
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::NULL_: {
physicalType = PhysicalTypeID::NULL_;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
Expand Down
2 changes: 1 addition & 1 deletion src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class NullMask {
// returns true if we have copied a nullBit with value 1 (indicates a null value) to
// dstNullEntries.
static bool copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy);
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert = false);

bool copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);
Expand Down
2 changes: 0 additions & 2 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ KUZU_API enum class LogicalTypeID : uint8_t {
SERIAL = 13,

// fixed size types
NULL_ = 21,
BOOL = 22,
INT64 = 23,
INT32 = 24,
Expand Down Expand Up @@ -111,7 +110,6 @@ enum class PhysicalTypeID : uint8_t {
INTERVAL = 7,
INTERNAL_ID = 9,
ARROW_COLUMN = 10,
NULL_ = 11,

// Variable size types.
STRING = 20,
Expand Down
47 changes: 28 additions & 19 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,31 +110,41 @@ class ColumnChunk {
const common::CopyDescription* copyDescription;
};

class NullColumnChunk : public ColumnChunk {
public:
NullColumnChunk()
: ColumnChunk(common::LogicalType(common::LogicalTypeID::NULL_),
nullptr /* copyDescription */, false /* hasNullChunk */) {}
template<>
inline void ColumnChunk::setValue(bool val, common::offset_t pos) {
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
common::NullMask::setNull((uint64_t*)buffer.get(), pos, val);
}

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }
template<>
inline bool ColumnChunk::getValue(common::offset_t pos) const {
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
return common::NullMask::isNull((uint64_t*)buffer.get(), pos);
}

inline bool isNull(common::offset_t pos) const {
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
return common::NullMask::isNull((uint64_t*)buffer.get(), pos);
}
inline void setNull(common::offset_t pos, bool isNull) {
common::NullMask::setNull(
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
(uint64_t*)buffer.get(), pos, isNull);
}
class BoolColumnChunk : public ColumnChunk {
public:
BoolColumnChunk(common::CopyDescription* copyDescription, bool hasNullChunk = true)
: ColumnChunk(
common::LogicalType(common::LogicalTypeID::BOOL), copyDescription, hasNullChunk) {}

void append(NullColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend);
void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;
};

class NullColumnChunk : public BoolColumnChunk {
public:
NullColumnChunk() : BoolColumnChunk(nullptr /*copyDescription*/, false /*hasNullChunk*/) {}
// Maybe this should be combined with BoolColumnChunk if the only difference is these functions?
inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
inline void setNull(common::offset_t pos, bool isNull) { setValue(isNull, pos); }

void resize(uint64_t numValues) final;

void setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet);

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }

protected:
uint64_t numBytesForValues(common::offset_t numValues) const {
// 8 values per byte, and we need a buffer size which is a multiple of 8 bytes
Expand Down Expand Up @@ -172,8 +182,7 @@ void ColumnChunk::templateCopyArrowArray<uint8_t*>(
arrow::Array* array, common::offset_t startPosInSegment, uint32_t numValuesToAppend);
// BOOL
template<>
void ColumnChunk::setValueFromString<bool>(
const char* value, uint64_t length, common::offset_t pos);
void ColumnChunk::setValueFromString<bool>(const char* value, uint64_t length, uint64_t pos);
// FIXED_LIST
template<>
void ColumnChunk::setValueFromString<uint8_t*>(const char* value, uint64_t length, uint64_t pos);
Expand Down
15 changes: 14 additions & 1 deletion src/include/storage/storage_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,20 @@ struct PageElementCursor {
};

struct PageUtils {
static uint32_t getNumElementsInAPage(uint32_t elementSize, bool hasNull);
static constexpr uint32_t getNumElementsInAPage(uint32_t elementSize, bool hasNull) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why move from cpp to header file? If you want to keep it inside the header file, mark this function as inline.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd moved it because so it can be constexpr, and constexpr (according to cppreference) implies inline (for cases where it's evaluated at runtime presumably).

Frankly, the static_assert I'd used it for is probably not necessary, but I thought it at least made more sense to make it a single static assert than multiple runtime asserts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Yeah, this makes sense.

assert(elementSize > 0);
auto numBytesPerNullEntry = common::NullMask::NUM_BITS_PER_NULL_ENTRY >> 3;
auto numNullEntries =
hasNull ?
(uint32_t)ceil((double)common::BufferPoolConstants::PAGE_4KB_SIZE /
(double)(((uint64_t)elementSize
<< common::NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2) +
numBytesPerNullEntry)) :
0;
return (common::BufferPoolConstants::PAGE_4KB_SIZE -
(numNullEntries * numBytesPerNullEntry)) /
elementSize;
}

// This function returns the page pageIdx of the page where element will be found and the pos of
// the element in the page as the offset.
Expand Down
18 changes: 17 additions & 1 deletion src/include/storage/store/node_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ struct NullNodeColumnFunc {
uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector);
};

struct BoolNodeColumnFunc {
static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor,
common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead);
static void writeValueToPage(
uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector);
};

class NullNodeColumn;
// TODO(Guodong): This is intentionally duplicated with `Column`, as for now, we don't change rel
// tables. `Column` is used for rel tables only. Eventually, we should remove `Column`.
Expand All @@ -50,7 +57,7 @@ class NodeColumn {
virtual ~NodeColumn() = default;

// Expose for feature store
void batchLookup(const common::offset_t* nodeOffsets, size_t size, uint8_t* result);
virtual void batchLookup(const common::offset_t* nodeOffsets, size_t size, uint8_t* result);

virtual void scan(transaction::Transaction* transaction, common::ValueVector* nodeIDVector,
common::ValueVector* resultVector);
Expand Down Expand Up @@ -129,6 +136,15 @@ class NodeColumn {
write_node_column_func_t writeNodeColumnFunc;
};

class BoolNodeColumn : public NodeColumn {
public:
BoolNodeColumn(const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
bool requireNullColumn = true);

void batchLookup(const common::offset_t* nodeOffsets, size_t size, uint8_t* result) final;
};

class NullNodeColumn : public NodeColumn {
public:
NullNodeColumn(common::page_idx_t metaDAHPageIdx, BMFileHandle* dataFH,
Expand Down
59 changes: 31 additions & 28 deletions src/storage/copier/column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
void ColumnChunk::initialize(offset_t numValues) {
numBytes = numBytesPerValue * numValues;
buffer = std::make_unique<uint8_t[]>(numBytes);
static_cast<ColumnChunk*>(nullChunk.get())->initialize(numValues);
if (nullChunk) {
static_cast<ColumnChunk*>(nullChunk.get())->initialize(numValues);
}
}

void ColumnChunk::resetToEmpty() {
Expand Down Expand Up @@ -184,21 +186,22 @@
arrow::Array* array, offset_t startPosInChunk, uint32_t numValuesToAppend) {
auto* boolArray = (arrow::BooleanArray*)array;
auto data = boolArray->data();
auto valuesInChunk = (bool*)(buffer.get());

auto arrowBuffer = boolArray->values()->data();
// Might read off the end with the cast, but copyNullMask should ignore the extra data
//
// The arrow BooleanArray offset should be the offset in bits
// Unfortunately this is not documented.
NullMask::copyNullMask((uint64_t*)arrowBuffer, boolArray->offset(), (uint64_t*)buffer.get(),
startPosInChunk, numValuesToAppend);

if (data->MayHaveNulls()) {
for (auto i = 0u; i < numValuesToAppend; i++) {
auto posInChunk = startPosInChunk + i;
if (data->IsNull(i)) {
nullChunk->setNull(posInChunk, true);
continue;
}
valuesInChunk[posInChunk] = boolArray->Value(i);
}
} else {
for (auto i = 0u; i < numValuesToAppend; i++) {
auto posInChunk = startPosInChunk + i;
valuesInChunk[posInChunk] = boolArray->Value(i);
}
auto arrowNullBitMap = boolArray->null_bitmap_data();

// Offset should apply to both bool data and nulls
NullMask::copyNullMask((uint64_t*)arrowNullBitMap, boolArray->offset(),

Check warning on line 202 in src/storage/copier/column_chunk.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/copier/column_chunk.cpp#L202

Added line #L202 was not covered by tests
(uint64_t*)nullChunk->buffer.get(), startPosInChunk, numValuesToAppend,
true /*invert*/);
}
}

Expand Down Expand Up @@ -283,12 +286,9 @@
case LogicalTypeID::INTERNAL_ID: {
return sizeof(offset_t);
}
// This should never be used for Nulls,
// which use a different way of calculating the buffer size
// FIXME(bmwinger): Setting this to 0 breaks everything.
// It's being used in NullNodeColumn, and maybe there are some functions
// relying on it despite the value being meaningless for a null bitfield.
case LogicalTypeID::NULL_: {
// Used by NodeColumnn to represent the de-compressed size of booleans in-memory
// Does not reflect the size of booleans on-disk in BoolColumnChunk
case LogicalTypeID::BOOL: {
return 1;
}
default: {
Expand All @@ -297,14 +297,15 @@
}
}

// TODO(bmwinger): Eventually, to support bitpacked bools, all these functions will need to be
// updated to support values sizes of less than one byte.
// But for the moment, this is the only generic ColumnChunk function which is needed by
// NullColumnChunk, and it's invoked directly on the nullColumn, so we don't need dynamic dispatch
void NullColumnChunk::append(NullColumnChunk* other, offset_t startPosInOtherChunk,
void BoolColumnChunk::append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) {
NullMask::copyNullMask((uint64_t*)other->buffer.get(), startPosInOtherChunk,
(uint64_t*)buffer.get(), startPosInChunk, numValuesToAppend);
NullMask::copyNullMask((uint64_t*)static_cast<BoolColumnChunk*>(other)->buffer.get(),
startPosInOtherChunk, (uint64_t*)buffer.get(), startPosInChunk, numValuesToAppend);

if (nullChunk) {
nullChunk->append(

Check warning on line 306 in src/storage/copier/column_chunk.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/copier/column_chunk.cpp#L306

Added line #L306 was not covered by tests
other->getNullChunk(), startPosInOtherChunk, startPosInChunk, numValuesToAppend);
}
}

void FixedListColumnChunk::append(ColumnChunk* other, offset_t startPosInOtherChunk,
Expand Down Expand Up @@ -363,6 +364,8 @@
std::unique_ptr<ColumnChunk> chunk;
switch (dataType.getPhysicalType()) {
case PhysicalTypeID::BOOL:
chunk = std::make_unique<BoolColumnChunk>(copyDescription);
break;
case PhysicalTypeID::INT64:
case PhysicalTypeID::INT32:
case PhysicalTypeID::INT16:
Expand Down
13 changes: 0 additions & 13 deletions src/storage/storage_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,19 +266,6 @@ std::string StorageUtils::appendSuffixOrInsertBeforeWALSuffix(
}
}

uint32_t PageUtils::getNumElementsInAPage(uint32_t elementSize, bool hasNull) {
assert(elementSize > 0);
auto numBytesPerNullEntry = NullMask::NUM_BITS_PER_NULL_ENTRY >> 3;
auto numNullEntries =
hasNull ? (uint32_t)ceil(
(double)BufferPoolConstants::PAGE_4KB_SIZE /
(double)(((uint64_t)elementSize << NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2) +
numBytesPerNullEntry)) :
0;
return (BufferPoolConstants::PAGE_4KB_SIZE - (numNullEntries * numBytesPerNullEntry)) /
elementSize;
}

void StorageUtils::initializeListsHeaders(table_id_t relTableID, uint64_t numNodesInTable,
const std::string& directory, RelDataDirection relDirection) {
auto listHeadersBuilder = make_unique<ListHeadersBuilder>(
Expand Down
Loading
Loading