Skip to content

Commit

Permalink
Re-merge NULL_ and BOOL and implement bool bitpacking for ColumnChunk
Browse files Browse the repository at this point in the history
  • Loading branch information
benjaminwinger committed Aug 9, 2023
1 parent 765d6f3 commit 694395a
Show file tree
Hide file tree
Showing 11 changed files with 213 additions and 78 deletions.
5 changes: 3 additions & 2 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) {
}

bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy) {
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert) {
auto [srcNullEntryPos, srcNullBitPos] = getNullEntryAndBitPos(srcOffset);
auto [dstNullEntryPos, dstNullBitPos] = getNullEntryAndBitPos(dstOffset);
uint64_t bitPos = 0;
Expand All @@ -25,7 +25,8 @@ bool NullMask::copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
auto curDstNullEntryPos = dstNullEntryPos;
auto curDstNullBitPos = dstNullBitPos;
uint64_t numBitsToReadInCurrentEntry;
uint64_t srcNullMaskEntry = srcNullEntries[srcNullEntryPos];
uint64_t srcNullMaskEntry =
invert ? ~srcNullEntries[srcNullEntryPos] : srcNullEntries[srcNullEntryPos];
if (dstNullBitPos < srcNullBitPos) {
numBitsToReadInCurrentEntry =
std::min(NullMask::NUM_BITS_PER_NULL_ENTRY - srcNullBitPos, numBitsToCopy - bitPos);
Expand Down
3 changes: 0 additions & 3 deletions src/common/types/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,6 @@ void LogicalType::setPhysicalType() {
case LogicalTypeID::BOOL: {
physicalType = PhysicalTypeID::BOOL;
} break;
case LogicalTypeID::NULL_: {
physicalType = PhysicalTypeID::NULL_;
} break;
case LogicalTypeID::TIMESTAMP:
case LogicalTypeID::SERIAL:
case LogicalTypeID::INT64: {
Expand Down
2 changes: 1 addition & 1 deletion src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class NullMask {
// returns true if we have copied a nullBit with value 1 (indicates a null value) to
// dstNullEntries.
static bool copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset,
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy);
uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy, bool invert = false);

bool copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBitsToCopy);
Expand Down
2 changes: 0 additions & 2 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ KUZU_API enum class LogicalTypeID : uint8_t {
SERIAL = 13,

// fixed size types
NULL_ = 21,
BOOL = 22,
INT64 = 23,
INT32 = 24,
Expand Down Expand Up @@ -111,7 +110,6 @@ enum class PhysicalTypeID : uint8_t {
INTERVAL = 7,
INTERNAL_ID = 9,
ARROW_COLUMN = 10,
NULL_ = 11,

// Variable size types.
STRING = 20,
Expand Down
47 changes: 28 additions & 19 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,31 +110,41 @@ class ColumnChunk {
const common::CopyDescription* copyDescription;
};

class NullColumnChunk : public ColumnChunk {
public:
NullColumnChunk()
: ColumnChunk(common::LogicalType(common::LogicalTypeID::NULL_),
nullptr /* copyDescription */, false /* hasNullChunk */) {}
template<>
inline void ColumnChunk::setValue(bool val, common::offset_t pos) {
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
common::NullMask::setNull((uint64_t*)buffer.get(), pos, val);
}

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }
template<>
inline bool ColumnChunk::getValue(common::offset_t pos) const {
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
return common::NullMask::isNull((uint64_t*)buffer.get(), pos);
}

inline bool isNull(common::offset_t pos) const {
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
return common::NullMask::isNull((uint64_t*)buffer.get(), pos);
}
inline void setNull(common::offset_t pos, bool isNull) {
common::NullMask::setNull(
// Buffer is rounded up to the nearest 8 bytes so that this cast is safe
(uint64_t*)buffer.get(), pos, isNull);
}
class BoolColumnChunk : public ColumnChunk {
public:
BoolColumnChunk(common::CopyDescription* copyDescription, bool hasNullChunk = true)
: ColumnChunk(
common::LogicalType(common::LogicalTypeID::BOOL), copyDescription, hasNullChunk) {}

void append(NullColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend);
void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;
};

class NullColumnChunk : public BoolColumnChunk {
public:
NullColumnChunk() : BoolColumnChunk(nullptr /*copyDescription*/, false /*hasNullChunk*/) {}
// Maybe this should be combined with BoolColumnChunk if the only difference is these functions?
inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
inline void setNull(common::offset_t pos, bool isNull) { setValue(isNull, pos); }

void resize(uint64_t numValues) final;

void setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet);

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); }

protected:
uint64_t numBytesForValues(common::offset_t numValues) const {
// 8 values per byte, and we need a buffer size which is a multiple of 8 bytes
Expand Down Expand Up @@ -172,8 +182,7 @@ void ColumnChunk::templateCopyArrowArray<uint8_t*>(
arrow::Array* array, common::offset_t startPosInSegment, uint32_t numValuesToAppend);
// BOOL
template<>
void ColumnChunk::setValueFromString<bool>(
const char* value, uint64_t length, common::offset_t pos);
void ColumnChunk::setValueFromString<bool>(const char* value, uint64_t length, uint64_t pos);
// FIXED_LIST
template<>
void ColumnChunk::setValueFromString<uint8_t*>(const char* value, uint64_t length, uint64_t pos);
Expand Down
15 changes: 14 additions & 1 deletion src/include/storage/storage_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,20 @@ struct PageElementCursor {
};

struct PageUtils {
static uint32_t getNumElementsInAPage(uint32_t elementSize, bool hasNull);
static constexpr uint32_t getNumElementsInAPage(uint32_t elementSize, bool hasNull) {
assert(elementSize > 0);
auto numBytesPerNullEntry = common::NullMask::NUM_BITS_PER_NULL_ENTRY >> 3;
auto numNullEntries =
hasNull ?
(uint32_t)ceil((double)common::BufferPoolConstants::PAGE_4KB_SIZE /
(double)(((uint64_t)elementSize
<< common::NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2) +
numBytesPerNullEntry)) :
0;
return (common::BufferPoolConstants::PAGE_4KB_SIZE -
(numNullEntries * numBytesPerNullEntry)) /
elementSize;
}

// This function returns the page pageIdx of the page where element will be found and the pos of
// the element in the page as the offset.
Expand Down
18 changes: 17 additions & 1 deletion src/include/storage/store/node_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ struct NullNodeColumnFunc {
uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector);
};

struct BoolNodeColumnFunc {
static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor,
common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead);
static void writeValueToPage(
uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector);
};

class NullNodeColumn;
// TODO(Guodong): This is intentionally duplicated with `Column`, as for now, we don't change rel
// tables. `Column` is used for rel tables only. Eventually, we should remove `Column`.
Expand All @@ -50,7 +57,7 @@ class NodeColumn {
virtual ~NodeColumn() = default;

// Expose for feature store
void batchLookup(const common::offset_t* nodeOffsets, size_t size, uint8_t* result);
virtual void batchLookup(const common::offset_t* nodeOffsets, size_t size, uint8_t* result);

virtual void scan(transaction::Transaction* transaction, common::ValueVector* nodeIDVector,
common::ValueVector* resultVector);
Expand Down Expand Up @@ -129,6 +136,15 @@ class NodeColumn {
write_node_column_func_t writeNodeColumnFunc;
};

class BoolNodeColumn : public NodeColumn {
public:
BoolNodeColumn(const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
bool requireNullColumn = true);

void batchLookup(const common::offset_t* nodeOffsets, size_t size, uint8_t* result) final;
};

class NullNodeColumn : public NodeColumn {
public:
NullNodeColumn(common::page_idx_t metaDAHPageIdx, BMFileHandle* dataFH,
Expand Down
59 changes: 31 additions & 28 deletions src/storage/copier/column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ ColumnChunk::ColumnChunk(LogicalType dataType, CopyDescription* copyDescription,
void ColumnChunk::initialize(offset_t numValues) {
numBytes = numBytesPerValue * numValues;
buffer = std::make_unique<uint8_t[]>(numBytes);
static_cast<ColumnChunk*>(nullChunk.get())->initialize(numValues);
if (nullChunk) {
static_cast<ColumnChunk*>(nullChunk.get())->initialize(numValues);
}
}

void ColumnChunk::resetToEmpty() {
Expand Down Expand Up @@ -184,21 +186,22 @@ void ColumnChunk::templateCopyArrowArray<bool>(
arrow::Array* array, offset_t startPosInChunk, uint32_t numValuesToAppend) {
auto* boolArray = (arrow::BooleanArray*)array;
auto data = boolArray->data();
auto valuesInChunk = (bool*)(buffer.get());

auto arrowBuffer = boolArray->values()->data();
// Might read off the end with the cast, but copyNullMask should ignore the extra data
//
// The arrow BooleanArray offset should be the offset in bits
// Unfortunately this is not documented.
NullMask::copyNullMask((uint64_t*)arrowBuffer, boolArray->offset(), (uint64_t*)buffer.get(),
startPosInChunk, numValuesToAppend);

if (data->MayHaveNulls()) {
for (auto i = 0u; i < numValuesToAppend; i++) {
auto posInChunk = startPosInChunk + i;
if (data->IsNull(i)) {
nullChunk->setNull(posInChunk, true);
continue;
}
valuesInChunk[posInChunk] = boolArray->Value(i);
}
} else {
for (auto i = 0u; i < numValuesToAppend; i++) {
auto posInChunk = startPosInChunk + i;
valuesInChunk[posInChunk] = boolArray->Value(i);
}
auto arrowNullBitMap = boolArray->null_bitmap_data();

// Offset should apply to both bool data and nulls
NullMask::copyNullMask((uint64_t*)arrowNullBitMap, boolArray->offset(),

Check warning on line 202 in src/storage/copier/column_chunk.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/copier/column_chunk.cpp#L202

Added line #L202 was not covered by tests
(uint64_t*)nullChunk->buffer.get(), startPosInChunk, numValuesToAppend,
true /*invert*/);
}
}

Expand Down Expand Up @@ -283,12 +286,9 @@ uint32_t ColumnChunk::getDataTypeSizeInChunk(LogicalType& dataType) {
case LogicalTypeID::INTERNAL_ID: {
return sizeof(offset_t);
}
// This should never be used for Nulls,
// which use a different way of calculating the buffer size
// FIXME(bmwinger): Setting this to 0 breaks everything.
// It's being used in NullNodeColumn, and maybe there are some functions
// relying on it despite the value being meaningless for a null bitfield.
case LogicalTypeID::NULL_: {
// Used by NodeColumnn to represent the de-compressed size of booleans in-memory
// Does not reflect the size of booleans on-disk in BoolColumnChunk
case LogicalTypeID::BOOL: {
return 1;
}
default: {
Expand All @@ -297,14 +297,15 @@ uint32_t ColumnChunk::getDataTypeSizeInChunk(LogicalType& dataType) {
}
}

// TODO(bmwinger): Eventually, to support bitpacked bools, all these functions will need to be
// updated to support values sizes of less than one byte.
// But for the moment, this is the only generic ColumnChunk function which is needed by
// NullColumnChunk, and it's invoked directly on the nullColumn, so we don't need dynamic dispatch
void NullColumnChunk::append(NullColumnChunk* other, offset_t startPosInOtherChunk,
void BoolColumnChunk::append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) {
NullMask::copyNullMask((uint64_t*)other->buffer.get(), startPosInOtherChunk,
(uint64_t*)buffer.get(), startPosInChunk, numValuesToAppend);
NullMask::copyNullMask((uint64_t*)static_cast<BoolColumnChunk*>(other)->buffer.get(),
startPosInOtherChunk, (uint64_t*)buffer.get(), startPosInChunk, numValuesToAppend);

if (nullChunk) {
nullChunk->append(

Check warning on line 306 in src/storage/copier/column_chunk.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/copier/column_chunk.cpp#L306

Added line #L306 was not covered by tests
other->getNullChunk(), startPosInOtherChunk, startPosInChunk, numValuesToAppend);
}
}

void FixedListColumnChunk::append(ColumnChunk* other, offset_t startPosInOtherChunk,
Expand Down Expand Up @@ -363,6 +364,8 @@ std::unique_ptr<ColumnChunk> ColumnChunkFactory::createColumnChunk(
std::unique_ptr<ColumnChunk> chunk;
switch (dataType.getPhysicalType()) {
case PhysicalTypeID::BOOL:
chunk = std::make_unique<BoolColumnChunk>(copyDescription);
break;
case PhysicalTypeID::INT64:
case PhysicalTypeID::INT32:
case PhysicalTypeID::INT16:
Expand Down
13 changes: 0 additions & 13 deletions src/storage/storage_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,19 +266,6 @@ std::string StorageUtils::appendSuffixOrInsertBeforeWALSuffix(
}
}

uint32_t PageUtils::getNumElementsInAPage(uint32_t elementSize, bool hasNull) {
assert(elementSize > 0);
auto numBytesPerNullEntry = NullMask::NUM_BITS_PER_NULL_ENTRY >> 3;
auto numNullEntries =
hasNull ? (uint32_t)ceil(
(double)BufferPoolConstants::PAGE_4KB_SIZE /
(double)(((uint64_t)elementSize << NullMask::NUM_BITS_PER_NULL_ENTRY_LOG2) +
numBytesPerNullEntry)) :
0;
return (BufferPoolConstants::PAGE_4KB_SIZE - (numNullEntries * numBytesPerNullEntry)) /
elementSize;
}

void StorageUtils::initializeListsHeaders(table_id_t relTableID, uint64_t numNodesInTable,
const std::string& directory, RelDataDirection relDirection) {
auto listHeadersBuilder = make_unique<ListHeadersBuilder>(
Expand Down
Loading

0 comments on commit 694395a

Please sign in to comment.