From 2414fe3d5ba10a3fa3827aa56b86eb5674feeedc Mon Sep 17 00:00:00 2001 From: Benjamin Winger Date: Mon, 24 Jul 2023 16:39:37 -0400 Subject: [PATCH] Store nulls as densely packed bitfields instead of one value per byte --- src/common/null_mask.cpp | 58 +++++++++++++++++-- src/common/types/types.cpp | 3 + src/common/vector/value_vector.cpp | 5 ++ src/include/common/null_mask.h | 29 ++++++++-- src/include/common/types/types.h | 2 + src/include/common/vector/value_vector.h | 6 +- src/include/storage/copier/column_chunk.h | 43 +++++++++++--- .../storage/storage_structure/lists/lists.h | 2 +- src/storage/copier/column_chunk.cpp | 56 +++++++++++++----- src/storage/storage_structure/lists/lists.cpp | 7 +-- .../lists/lists_update_iterator.cpp | 2 +- .../storage_structure/storage_structure.cpp | 7 +-- src/storage/store/node_column.cpp | 29 ++++++---- test/common/CMakeLists.txt | 1 + test/common/null_mask_test.cpp | 24 ++++++++ 15 files changed, 219 insertions(+), 55 deletions(-) create mode 100644 test/common/null_mask_test.cpp diff --git a/src/common/null_mask.cpp b/src/common/null_mask.cpp index 43428a959f..3a4d9b7e57 100644 --- a/src/common/null_mask.cpp +++ b/src/common/null_mask.cpp @@ -5,14 +5,13 @@ namespace kuzu { namespace common { -void NullMask::setNull(uint32_t pos, bool isNull) { +void NullMask::setNull(uint64_t* nullEntries, uint32_t pos, bool isNull) { auto entryPos = pos >> NUM_BITS_PER_NULL_ENTRY_LOG2; auto bitPosInEntry = pos - (entryPos << NUM_BITS_PER_NULL_ENTRY_LOG2); if (isNull) { - data[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry]; - mayContainNulls = true; + nullEntries[entryPos] |= NULL_BITMASKS_WITH_SINGLE_ONE[bitPosInEntry]; } else { - data[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry]; + nullEntries[entryPos] &= NULL_BITMASKS_WITH_SINGLE_ZERO[bitPosInEntry]; } } @@ -87,5 +86,56 @@ void NullMask::resize(uint64_t capacity) { numNullEntries = capacity; } +bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, + uint64_t dstOffset, uint64_t numBitsToCopy) { + if (copyNullMask(srcNullEntries, srcOffset, this->data, dstOffset, numBitsToCopy)) { + this->mayContainNulls = true; + return true; + } + return false; +} + +void NullMask::setNullRange( + uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull) { + auto [firstEntryPos, firstBitPos] = getNullEntryAndBitPos(offset); + auto [lastEntryPos, lastBitPos] = getNullEntryAndBitPos(offset + numBitsToSet); + + // If the range spans multiple entries, set the entries in the middle to the appropriate value + // with std::fill + if (lastEntryPos > firstEntryPos + 1) { + std::fill(nullEntries + firstEntryPos + 1, nullEntries + lastEntryPos, + isNull ? ALL_NULL_ENTRY : NO_NULL_ENTRY); + } + + if (firstEntryPos == lastEntryPos) { + if (isNull) { + // Set bits between the first and the last bit pos to true + nullEntries[firstEntryPos] |= (~NULL_LOWER_MASKS[firstBitPos] & + ~NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]); + } else { + // Set bits between the first and the last bit pos to false + nullEntries[firstEntryPos] &= (NULL_LOWER_MASKS[firstBitPos] | + NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]); + } + } else { + if (isNull) { + // Set bits including and after the first bit pos to true + nullEntries[firstEntryPos] |= ~NULL_HIGH_MASKS[firstBitPos]; + if (lastBitPos > 0) { + // Set bits before the last bit pos to true + nullEntries[lastEntryPos] |= + ~NULL_LOWER_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]; + } + } else { + // Set bits including and after the first bit pos to false + nullEntries[firstEntryPos] &= NULL_LOWER_MASKS[firstBitPos]; + if (lastBitPos > 0) { + // Set bits before the last bit pos to false + nullEntries[lastEntryPos] &= NULL_HIGH_MASKS[NUM_BITS_PER_NULL_ENTRY - lastBitPos]; + } + } + } +} + } // namespace common } // namespace kuzu diff --git a/src/common/types/types.cpp b/src/common/types/types.cpp index 413b496c89..13ef9e6ca7 100644 --- a/src/common/types/types.cpp +++ b/src/common/types/types.cpp @@ -312,6 +312,9 @@ void LogicalType::setPhysicalType() { case LogicalTypeID::BOOL: { physicalType = PhysicalTypeID::BOOL; } break; + case LogicalTypeID::NULL_: { + physicalType = PhysicalTypeID::NULL_; + } break; case LogicalTypeID::TIMESTAMP: case LogicalTypeID::SERIAL: case LogicalTypeID::INT64: { diff --git a/src/common/vector/value_vector.cpp b/src/common/vector/value_vector.cpp index 2cc2b8ad9d..f79ad2f9bb 100644 --- a/src/common/vector/value_vector.cpp +++ b/src/common/vector/value_vector.cpp @@ -48,6 +48,11 @@ bool NodeIDVector::discardNull(ValueVector& vector) { } } +bool ValueVector::setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset, + uint64_t dstOffset, uint64_t numBitsToCopy) { + return nullMask->copyFromNullBits(srcNullEntries, srcOffset, dstOffset, numBitsToCopy); +} + template void ValueVector::setValue(uint32_t pos, T val) { ((T*)valueBuffer.get())[pos] = val; diff --git a/src/include/common/null_mask.h b/src/include/common/null_mask.h index a59ebb1a1b..181605892f 100644 --- a/src/include/common/null_mask.h +++ b/src/include/common/null_mask.h @@ -96,10 +96,15 @@ class NullMask { mayContainNulls = true; } - inline void setMayContainNulls() { mayContainNulls = true; } inline bool hasNoNullsGuarantee() const { return !mayContainNulls; } - void setNull(uint32_t pos, bool isNull); + static void setNull(uint64_t* nullEntries, uint32_t pos, bool isNull); + inline void setNull(uint32_t pos, bool isNull) { + setNull(data, pos, isNull); + if (isNull) { + mayContainNulls = true; + } + } static inline bool isNull(const uint64_t* nullEntries, uint32_t pos) { auto [entryPos, bitPosInEntry] = getNullEntryAndBitPos(pos); @@ -108,18 +113,32 @@ class NullMask { inline bool isNull(uint32_t pos) const { return isNull(data, pos); } - inline uint64_t* getData() { return data; } + // const because updates to the data must set mayContainNulls if any value + // becomes non-null + // Modifying the underlying data shuld be done with setNull or copyFromNullData + inline const uint64_t* getData() { return data; } static inline uint64_t getNumNullEntries(uint64_t numNullBits) { return (numNullBits >> NUM_BITS_PER_NULL_ENTRY_LOG2) + ((numNullBits - (numNullBits << NUM_BITS_PER_NULL_ENTRY_LOG2)) == 0 ? 0 : 1); } - // This function returns true if we have copied a nullBit with value 1 (indicate a null - // value) to dstNullEntries. + // Copies bitpacked null flags from one buffer to another, starting at an arbitrary bit + // offset and preserving adjacent bits. + // + // returns true if we have copied a nullBit with value 1 (indicates a null value) to + // dstNullEntries. static bool copyNullMask(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t* dstNullEntries, uint64_t dstOffset, uint64_t numBitsToCopy); + bool copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset, + uint64_t numBitsToCopy); + + // Sets the given number of bits to null (if isNull is true) or non-null (if isNull is false), + // starting at the offset + static void setNullRange( + uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull); + void resize(uint64_t capacity); private: diff --git a/src/include/common/types/types.h b/src/include/common/types/types.h index d60a469eea..15eb8a73c1 100644 --- a/src/include/common/types/types.h +++ b/src/include/common/types/types.h @@ -74,6 +74,7 @@ KUZU_API enum class LogicalTypeID : uint8_t { SERIAL = 13, // fixed size types + NULL_ = 21, BOOL = 22, INT64 = 23, INT32 = 24, @@ -110,6 +111,7 @@ enum class PhysicalTypeID : uint8_t { INTERVAL = 7, INTERNAL_ID = 9, ARROW_COLUMN = 10, + NULL_ = 11, // Variable size types. STRING = 20, diff --git a/src/include/common/vector/value_vector.h b/src/include/common/vector/value_vector.h index ab7065b543..efd33faaeb 100644 --- a/src/include/common/vector/value_vector.h +++ b/src/include/common/vector/value_vector.h @@ -33,7 +33,6 @@ class ValueVector { inline void setAllNull() { nullMask->setAllNull(); } inline void setAllNonNull() { nullMask->setAllNonNull(); } - inline void setMayContainNulls() { nullMask->setMayContainNulls(); } // On return true, there are no null. On return false, there may or may not be nulls. inline bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); } inline void setRangeNonNull(uint32_t startPos, uint32_t len) { @@ -41,7 +40,7 @@ class ValueVector { setNull(startPos + i, false); } } - inline uint64_t* getNullMaskData() { return nullMask->getData(); } + inline const uint64_t* getNullMaskData() { return nullMask->getData(); } inline void setNull(uint32_t pos, bool isNull) { nullMask->setNull(pos, isNull); } inline uint8_t isNull(uint32_t pos) const { return nullMask->isNull(pos); } inline void setAsSingleNullEntry() { @@ -49,6 +48,9 @@ class ValueVector { setNull(state->selVector->selectedPositions[0], true); } + bool setNullFromBits(const uint64_t* srcNullEntries, uint64_t srcOffset, uint64_t dstOffset, + uint64_t numBitsToCopy); + inline uint32_t getNumBytesPerValue() const { return numBytesPerValue; } template diff --git a/src/include/storage/copier/column_chunk.h b/src/include/storage/copier/column_chunk.h index 6d5ced28f0..6946edfacc 100644 --- a/src/include/storage/copier/column_chunk.h +++ b/src/include/storage/copier/column_chunk.h @@ -17,6 +17,10 @@ class NullColumnChunk; // Currently, `InMemColumnChunk` is used to populate rel columns. Eventually, we will merge them. class ColumnChunk { public: + friend class ColumnChunkFactory; + + // ColumnChunks must be initialized after construction, so this constructor should only be used + // through the ColumnChunkFactory explicit ColumnChunk(common::LogicalType dataType, common::CopyDescription* copyDescription, bool hasNullChunk = true); virtual ~ColumnChunk() = default; @@ -51,6 +55,7 @@ class ColumnChunk { virtual common::page_idx_t flushBuffer(BMFileHandle* dataFH, common::page_idx_t startPageIdx); + // Returns the size of the data type in bytes static uint32_t getDataTypeSizeInChunk(common::LogicalType& dataType); template @@ -68,11 +73,13 @@ class ColumnChunk { virtual void write(const common::Value& val, uint64_t posToWrite); + // numValues must be at least the number of values the ColumnChunk was first initialized + // with virtual void resize(uint64_t numValues); protected: - ColumnChunk(common::LogicalType dataType, common::offset_t numValues, - common::CopyDescription* copyDescription, bool hasNullChunk); + // Initializes the data buffer. Is (and should be) only called in constructor. + virtual void initialize(common::offset_t numValues); template void templateCopyArrowArray( @@ -106,19 +113,39 @@ class ColumnChunk { class NullColumnChunk : public ColumnChunk { public: NullColumnChunk() - : ColumnChunk(common::LogicalType(common::LogicalTypeID::BOOL), - nullptr /* copyDescription */, false /* hasNullChunk */) { - resetNullBuffer(); - } + : ColumnChunk(common::LogicalType(common::LogicalTypeID::NULL_), + nullptr /* copyDescription */, false /* hasNullChunk */) {} inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, numBytes); } - inline bool isNull(common::offset_t pos) const { return getValue(pos); } - inline void setNull(common::offset_t pos, bool isNull) { ((bool*)buffer.get())[pos] = isNull; } + inline bool isNull(common::offset_t pos) const { + // Buffer is rounded up to the nearest 8 bytes so that this cast is safe + return common::NullMask::isNull((uint64_t*)buffer.get(), pos); + } + inline void setNull(common::offset_t pos, bool isNull) { + common::NullMask::setNull( + // Buffer is rounded up to the nearest 8 bytes so that this cast is safe + (uint64_t*)buffer.get(), pos, isNull); + } + + void append(NullColumnChunk* other, common::offset_t startPosInOtherChunk, + common::offset_t startPosInChunk, uint32_t numValuesToAppend); void resize(uint64_t numValues) final; void setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet); + +protected: + uint64_t numBytesForValues(common::offset_t numValues) const { + // 8 values per byte, and we need a buffer size which is a multiple of 8 bytes + return ceil(numValues / 8.0 / 8.0) * 8; + } + void initialize(common::offset_t numValues) final { + numBytesPerValue = 0; + numBytes = numBytesForValues(numValues); + // Each byte defaults to 0, indicating everything is non-null + buffer = std::make_unique(numBytes); + } }; class FixedListColumnChunk : public ColumnChunk { diff --git a/src/include/storage/storage_structure/lists/lists.h b/src/include/storage/storage_structure/lists/lists.h index 430e8160ca..4ca09015bc 100644 --- a/src/include/storage/storage_structure/lists/lists.h +++ b/src/include/storage/storage_structure/lists/lists.h @@ -20,7 +20,7 @@ struct InMemList { inline uint8_t* getListData() const { return listData.get(); } inline bool hasNullBuffer() const { return nullMask != nullptr; } - inline uint64_t* getNullMask() const { return nullMask->getData(); } + inline common::NullMask* getNullMask() const { return nullMask.get(); } uint64_t numElements; std::unique_ptr listData; diff --git a/src/storage/copier/column_chunk.cpp b/src/storage/copier/column_chunk.cpp index 02641c2910..0eb8bd22b5 100644 --- a/src/storage/copier/column_chunk.cpp +++ b/src/storage/copier/column_chunk.cpp @@ -13,19 +13,19 @@ namespace kuzu { namespace storage { ColumnChunk::ColumnChunk(LogicalType dataType, CopyDescription* copyDescription, bool hasNullChunk) - : ColumnChunk{ - std::move(dataType), StorageConstants::NODE_GROUP_SIZE, copyDescription, hasNullChunk} {} - -ColumnChunk::ColumnChunk( - LogicalType dataType, offset_t numValues, CopyDescription* copyDescription, bool hasNullChunk) : dataType{std::move(dataType)}, numBytesPerValue{getDataTypeSizeInChunk(this->dataType)}, - numBytes{numBytesPerValue * numValues}, copyDescription{copyDescription} { - buffer = std::make_unique(numBytes); + copyDescription{copyDescription} { if (hasNullChunk) { nullChunk = std::make_unique(); } } +void ColumnChunk::initialize(offset_t numValues) { + numBytes = numBytesPerValue * numValues; + buffer = std::make_unique(numBytes); + static_cast(nullChunk.get())->initialize(numValues); +} + void ColumnChunk::resetToEmpty() { if (nullChunk) { nullChunk->resetNullBuffer(); @@ -283,12 +283,30 @@ uint32_t ColumnChunk::getDataTypeSizeInChunk(common::LogicalType& dataType) { case LogicalTypeID::INTERNAL_ID: { return sizeof(offset_t); } + // This should never be used for Nulls, + // which use a different way of calculating the buffer size + // FIXME(bmwinger): Setting this to 0 breaks everything. + // It's being used in NullNodeColumn, and maybe there are some functions + // relying on it despite the value being meaningless for a null bitfield. + case LogicalTypeID::NULL_: { + return 1; + } default: { return StorageUtils::getDataTypeSize(dataType); } } } +// TODO(bmwinger): Eventually, to support bitpacked bools, all these functions will need to be +// updated to support values sizes of less than one byte. +// But for the moment, this is the only generic ColumnChunk function which is needed by +// NullColumnChunk, and it's invoked directly on the nullColumn, so we don't need dynamic dispatch +void NullColumnChunk::append(NullColumnChunk* other, common::offset_t startPosInOtherChunk, + common::offset_t startPosInChunk, uint32_t numValuesToAppend) { + NullMask::copyNullMask((uint64_t*)other->buffer.get(), startPosInOtherChunk, + (uint64_t*)buffer.get(), startPosInChunk, numValuesToAppend); +} + void FixedListColumnChunk::append(ColumnChunk* other, common::offset_t startPosInOtherChunk, common::offset_t startPosInChunk, uint32_t numValuesToAppend) { auto otherChunk = (FixedListColumnChunk*)other; @@ -342,6 +360,7 @@ void FixedListColumnChunk::write(const common::Value& fixedListVal, uint64_t pos std::unique_ptr ColumnChunkFactory::createColumnChunk( const LogicalType& dataType, CopyDescription* copyDescription) { + std::unique_ptr chunk; switch (dataType.getPhysicalType()) { case PhysicalTypeID::BOOL: case PhysicalTypeID::INT64: @@ -350,21 +369,28 @@ std::unique_ptr ColumnChunkFactory::createColumnChunk( case PhysicalTypeID::DOUBLE: case PhysicalTypeID::FLOAT: case PhysicalTypeID::INTERVAL: - return std::make_unique(dataType, copyDescription); + chunk = std::make_unique(dataType, copyDescription); + break; case PhysicalTypeID::FIXED_LIST: - return std::make_unique(dataType, copyDescription); + chunk = std::make_unique(dataType, copyDescription); + break; case PhysicalTypeID::STRING: - return std::make_unique(dataType, copyDescription); + chunk = std::make_unique(dataType, copyDescription); + break; case PhysicalTypeID::VAR_LIST: - return std::make_unique(dataType, copyDescription); + chunk = std::make_unique(dataType, copyDescription); + break; case PhysicalTypeID::STRUCT: - return std::make_unique(dataType, copyDescription); + chunk = std::make_unique(dataType, copyDescription); + break; default: { throw NotImplementedException("ColumnChunkFactory::createColumnChunk for data type " + LogicalTypeUtils::dataTypeToString(dataType) + " is not supported."); } } + chunk->initialize(StorageConstants::NODE_GROUP_SIZE); + return chunk; } // Bool @@ -417,7 +443,8 @@ common::offset_t ColumnChunk::getOffsetInBuffer(common::offset_t pos) const { } void NullColumnChunk::resize(uint64_t numValues) { - auto numBytesAfterResize = numValues * numBytesPerValue; + auto numBytesAfterResize = numBytesForValues(numValues); + assert(numBytesAfterResize > numBytes); auto reservedBuffer = std::make_unique(numBytesAfterResize); memset(reservedBuffer.get(), 0 /* non null */, numBytesAfterResize); memcpy(reservedBuffer.get(), buffer.get(), numBytes); @@ -426,8 +453,7 @@ void NullColumnChunk::resize(uint64_t numValues) { } void NullColumnChunk::setRangeNoNull(common::offset_t startPosInChunk, uint32_t numValuesToSet) { - memset(buffer.get() + startPosInChunk * numBytesPerValue, 0 /* non null */, - numValuesToSet * numBytesPerValue); + NullMask::setNullRange((uint64_t*)buffer.get(), startPosInChunk, numValuesToSet, false); } } // namespace storage diff --git a/src/storage/storage_structure/lists/lists.cpp b/src/storage/storage_structure/lists/lists.cpp index 3faf76fb8b..b6582b8349 100644 --- a/src/storage/storage_structure/lists/lists.cpp +++ b/src/storage/storage_structure/lists/lists.cpp @@ -140,7 +140,7 @@ void Lists::fillInMemListsFromFrame(InMemList& inMemList, const uint8_t* frame, if (deletedRelOffsetsInList.empty()) { memcpy(listData, frameData, numElementsToReadInCurPage * elementSize); if (inMemList.hasNullBuffer()) { - NullMask::copyNullMask(nullBufferInPage, elemPosInPage, inMemList.getNullMask(), + inMemList.getNullMask()->copyFromNullBits(nullBufferInPage, elemPosInPage, nextPosToWriteToInMemList, numElementsToReadInCurPage); } readPropertyUpdatesToInMemListIfExists(inMemList, updatedPersistentListOffsets, @@ -164,9 +164,8 @@ void Lists::fillInMemListsFromFrame(InMemList& inMemList, const uint8_t* frame, // Otherwise, we can directly read from persistentStore. memcpy(listData, frameData, elementSize); if (inMemList.hasNullBuffer()) { - NullMask::copyNullMask(nullBufferInPage, elemPosInPage, - inMemList.getNullMask(), nextPosToWriteToInMemList, - 1 /* numBitsToCopy */); + inMemList.getNullMask()->copyFromNullBits(nullBufferInPage, elemPosInPage, + nextPosToWriteToInMemList, 1 /* numBitsToCopy */); } } listData += elementSize; diff --git a/src/storage/storage_structure/lists/lists_update_iterator.cpp b/src/storage/storage_structure/lists/lists_update_iterator.cpp index c59cdcf843..72e772dcda 100644 --- a/src/storage/storage_structure/lists/lists_update_iterator.cpp +++ b/src/storage/storage_structure/lists/lists_update_iterator.cpp @@ -247,7 +247,7 @@ void ListsUpdateIterator::writeAtOffset(InMemList& inMemList, page_idx_t pageLis memcpy(frame + lists->getElemByteOffset(elementOffsetInListPage), dataToFill, numElementsToWriteToCurrentPage * elementSize); if (inMemList.hasNullBuffer()) { - NullMask::copyNullMask(inMemList.getNullMask(), numUpdatedElements, + NullMask::copyNullMask(inMemList.getNullMask()->getData(), numUpdatedElements, (uint64_t*)lists->getNullBufferInPage(frame), elementOffsetInListPage, numElementsToWriteToCurrentPage); } diff --git a/src/storage/storage_structure/storage_structure.cpp b/src/storage/storage_structure/storage_structure.cpp index f47b4cda9c..5996a526d2 100644 --- a/src/storage/storage_structure/storage_structure.cpp +++ b/src/storage/storage_structure/storage_structure.cpp @@ -102,11 +102,8 @@ void BaseColumnOrList::readInternalIDsFromAPageBySequentialCopy(Transaction* tra void BaseColumnOrList::readNullBitsFromAPage(ValueVector* valueVector, const uint8_t* frame, uint64_t posInPage, uint64_t posInVector, uint64_t numBitsToRead) const { - auto hasNullInSrcNullMask = NullMask::copyNullMask((uint64_t*)getNullBufferInPage(frame), - posInPage, valueVector->getNullMaskData(), posInVector, numBitsToRead); - if (hasNullInSrcNullMask) { - valueVector->setMayContainNulls(); - } + valueVector->setNullFromBits( + (uint64_t*)getNullBufferInPage(frame), posInPage, posInVector, numBitsToRead); } void BaseColumnOrList::readAPageBySequentialCopy(Transaction* transaction, ValueVector* vector, diff --git a/src/storage/store/node_column.cpp b/src/storage/store/node_column.cpp index fd786e09ed..4902e3a5f7 100644 --- a/src/storage/store/node_column.cpp +++ b/src/storage/store/node_column.cpp @@ -44,15 +44,19 @@ void FixedSizedNodeColumnFunc::writeInternalIDValueToPage( void NullNodeColumnFunc::readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead) { - for (auto i = 0u; i < numValuesToRead; i++) { - bool isNull = *(frame + pageCursor.elemPosInPage + i); - resultVector->setNull(posInVector + i, isNull); - } + // Read bit-packed null flags from the frame into the result vector + // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. + // Otherwise it could read off the end of the page. + resultVector->setNullFromBits( + (uint64_t*)frame, pageCursor.elemPosInPage, posInVector, numValuesToRead); } void NullNodeColumnFunc::writeValueToPage( uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector) { - *(frame + posInFrame) = vector->isNull(posInVector); + // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. + // Otherwise it could read off the end of the page. + NullMask::setNull( + (uint64_t*)frame, posInFrame, NullMask::isNull(vector->getNullMaskData(), posInVector)); } NodeColumn::NodeColumn(const Property& property, BMFileHandle* dataFH, BMFileHandle* metadataFH, @@ -348,12 +352,17 @@ void NodeColumn::scan(transaction::Transaction* transaction, common::node_group_ scanUnfiltered(transaction, pageCursor, numValuesToScan, resultVector, offsetInVector); } -NullNodeColumn::NullNodeColumn(page_idx_t metaDAHeaderPageIdx, BMFileHandle* nodeGroupsDataFH, - BMFileHandle* nodeGroupsMetaFH, BufferManager* bufferManager, WAL* wal) - : NodeColumn{LogicalType(LogicalTypeID::BOOL), MetadataDAHInfo{metaDAHeaderPageIdx}, - nodeGroupsDataFH, nodeGroupsMetaFH, bufferManager, wal, false /* requireNullColumn */} { +NullNodeColumn::NullNodeColumn(page_idx_t metaDAHPageIdx, BMFileHandle* dataFH, + BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal) + : NodeColumn{LogicalType(LogicalTypeID::NULL_), MetadataDAHInfo{metaDAHPageIdx}, dataFH, + metadataFH, bufferManager, wal, false /* requireNullColumn */} { readNodeColumnFunc = NullNodeColumnFunc::readValuesFromPage; writeNodeColumnFunc = NullNodeColumnFunc::writeValueToPage; + // 8 values per byte + numValuesPerPage = PageUtils::getNumElementsInAPage(1, false) * 8; + // Page size must be aligned to 8 byte chunks for the 64-bit NullMask algorithms to work + // without the possibility of memory errors from reading/writing off the end of a page. + assert(PageUtils::getNumElementsInAPage(1, false) % 8 == 0); } void NullNodeColumn::scan( @@ -377,7 +386,7 @@ page_idx_t NullNodeColumn::append( void NullNodeColumn::setNull(common::offset_t nodeOffset) { auto walPageInfo = createWALVersionOfPageForValue(nodeOffset); try { - *(walPageInfo.frame + walPageInfo.posInPage) = true; + NullMask::setNull((uint64_t*)walPageInfo.frame, walPageInfo.posInPage, true); } catch (Exception& e) { bufferManager->unpin(*wal->fileHandle, walPageInfo.pageIdxInWAL); dataFH->releaseWALPageIdxLock(walPageInfo.originalPageIdx); diff --git a/test/common/CMakeLists.txt b/test/common/CMakeLists.txt index 79b7b54639..1b8371dd44 100644 --- a/test/common/CMakeLists.txt +++ b/test/common/CMakeLists.txt @@ -1,6 +1,7 @@ add_kuzu_test(types_test date_test.cpp interval_test.cpp + null_mask_test.cpp string_test.cpp time_test.cpp timestamp_test.cpp diff --git a/test/common/null_mask_test.cpp b/test/common/null_mask_test.cpp new file mode 100644 index 0000000000..747b6d20af --- /dev/null +++ b/test/common/null_mask_test.cpp @@ -0,0 +1,24 @@ +#include "common/null_mask.h" +#include "gtest/gtest.h" + +using namespace kuzu::common; + +TEST(NullMaskTests, TestRangeSingleEntry) { + std::vector data{NullMask::NO_NULL_ENTRY}; + NullMask::setNullRange(data.data(), 5, 5, true); + ASSERT_EQ(data[0], 0b1111100000); + + data[0] = NullMask::ALL_NULL_ENTRY; + NullMask::setNullRange(data.data(), 5, 5, false); + ASSERT_EQ(data[0], ~0b1111100000); +} + +TEST(NullMaskTests, TestRangeMultipleEntries) { + std::vector data{ + NullMask::ALL_NULL_ENTRY, NullMask::ALL_NULL_ENTRY, NullMask::ALL_NULL_ENTRY}; + NullMask::setNullRange(data.data(), 5, 150, false); + ASSERT_EQ(data[0], 0b11111); + ASSERT_EQ(data[1], 0); + ASSERT_FALSE(NullMask::isNull(data.data(), 154)); + ASSERT_TRUE(NullMask::isNull(data.data(), 155)); +}