Skip to content

Commit

Permalink
Merge pull request #1945 from kuzudb/null-read-opt
Browse files Browse the repository at this point in the history
Skip reading null data if the property is known to have no nulls
  • Loading branch information
benjaminwinger committed Aug 31, 2023
2 parents a260a35 + 0991de8 commit 9aaff49
Show file tree
Hide file tree
Showing 27 changed files with 482 additions and 118 deletions.
7 changes: 7 additions & 0 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ bool NullMask::copyFromNullBits(const uint64_t* srcNullEntries, uint64_t srcOffs
return false;
}

void NullMask::setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isNull) {
if (isNull) {
this->mayContainNulls = true;
}
setNullRange(data, offset, numBitsToSet, isNull);
}

void NullMask::setNullRange(
uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull) {
auto [firstEntryPos, firstBitPos] = getNullEntryAndBitPos(offset);
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ class NullMask {
static void setNullRange(
uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull);

void setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isNull);

void resize(uint64_t capacity);

private:
Expand Down
4 changes: 1 addition & 3 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ class ValueVector {
// If this returns true, there are no nulls. If it returns false, there may or may not be nulls.
inline bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
// Marks the `len` positions starting at `startPos` as non-null.
// The null mask is updated with one ranged call; a per-position setNull(pos, false) loop
// over the same range would only repeat that work.
inline void setRangeNonNull(uint32_t startPos, uint32_t len) {
    nullMask->setNullFromRange(startPos, len, false /* isNull */);
}
inline const uint64_t* getNullMaskData() { return nullMask->getData(); }
inline void setNull(uint32_t pos, bool isNull) { nullMask->setNull(pos, isNull); }
Expand Down
34 changes: 30 additions & 4 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class BoolColumnChunk : public ColumnChunk {
arrow::Array* array, common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;

void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;
common::offset_t startPosInChunk, uint32_t numValuesToAppend) override;

void resize(uint64_t capacity) final;

Expand All @@ -200,12 +200,38 @@ class BoolColumnChunk : public ColumnChunk {

// Boolean column chunk whose values are interpreted as null flags: a stored `true` means
// "null" and a zeroed buffer means "non-null". Alongside the flags it keeps a conservative
// `mayHaveNullValue` marker that is raised whenever a null may have been written and is only
// lowered when the whole buffer is reset.
class NullColumnChunk : public BoolColumnChunk {
public:
    NullColumnChunk()
        : BoolColumnChunk(nullptr /*copyDescription*/, false /*hasNullChunk*/),
          mayHaveNullValue{false} {}

    // Maybe this should be combined with BoolColumnChunk if the only difference is these functions?
    inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }

    // Stores the null flag for `pos`. Setting a null raises the marker; clearing one does not
    // lower it, because other positions may still be null.
    inline void setNull(common::offset_t pos, bool isNull) {
        setValue(isNull, pos);
        if (isNull) {
            mayHaveNullValue = true;
        }
    }

    // Returns false only if no null has been recorded since construction or the last reset.
    inline bool mayHaveNull() const { return mayHaveNullValue; }

    // Clears every flag to non-null, which also makes the chunk provably null-free again.
    inline void resetNullBuffer() {
        memset(buffer.get(), 0 /* non null */, bufferSize);
        mayHaveNullValue = false;
    }

    // Copies `numBits` null bits from `srcBuffer` (starting at bit `srcOffset`) into this
    // chunk's buffer at bit `dstOffset`, optionally inverted. The marker is raised when
    // copyNullMask returns true.
    // NOTE(review): presumably that return value signals that a null bit was copied --
    // confirm against NullMask::copyNullMask.
    inline void copyFromBuffer(uint64_t* srcBuffer, uint64_t srcOffset, uint64_t dstOffset,
        uint64_t numBits, bool invert = false) {
        if (common::NullMask::copyNullMask(srcBuffer, srcOffset,
                reinterpret_cast<uint64_t*>(buffer.get()), dstOffset, numBits, invert)) {
            mayHaveNullValue = true;
        }
    }

    void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
        common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;

protected:
    // Conservative marker: false guarantees the chunk holds no nulls.
    bool mayHaveNullValue;
};

class FixedListColumnChunk : public ColumnChunk {
Expand Down
23 changes: 16 additions & 7 deletions src/include/storage/store/node_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "storage/copier/column_chunk.h"
#include "storage/storage_structure/disk_array.h"
#include "storage/storage_structure/storage_structure.h"
#include "storage/store/property_statistics.h"

namespace kuzu {
namespace transaction {
Expand Down Expand Up @@ -57,10 +58,11 @@ class NodeColumn {
public:
NodeColumn(const catalog::Property& property, BMFileHandle* dataFH, BMFileHandle* metadataFH,
BufferManager* bufferManager, WAL* wal, transaction::Transaction* transaction,
bool requireNullColumn = true);
RWPropertyStats propertyStatistics, bool requireNullColumn = true);
NodeColumn(common::LogicalType dataType, const catalog::MetadataDAHInfo& metaDAHeaderInfo,
BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction, bool requireNullColumn);
transaction::Transaction* transaction, RWPropertyStats PropertyStatistics,
bool requireNullColumn);
virtual ~NodeColumn() = default;

// Expose for feature store
Expand Down Expand Up @@ -140,13 +142,15 @@ class NodeColumn {
std::vector<std::unique_ptr<NodeColumn>> childrenColumns;
read_node_column_func_t readNodeColumnFunc;
write_node_column_func_t writeNodeColumnFunc;
RWPropertyStats propertyStatistics;
};

class BoolNodeColumn : public NodeColumn {
public:
BoolNodeColumn(const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction, bool requireNullColumn = true);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics,
bool requireNullColumn = true);

void batchLookup(transaction::Transaction* transaction, const common::offset_t* nodeOffsets,
size_t size, uint8_t* result) final;
Expand All @@ -158,10 +162,15 @@ class NullNodeColumn : public NodeColumn {
public:
NullNodeColumn(common::page_idx_t metaDAHPageIdx, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics);

void scan(transaction::Transaction* transaction, common::ValueVector* nodeIDVector,
common::ValueVector* resultVector) final;
void scan(transaction::Transaction* transaction, common::node_group_idx_t nodeGroupIdx,
common::offset_t startOffsetInGroup, common::offset_t endOffsetInGroup,
common::ValueVector* resultVector, uint64_t offsetInVector = 0) final;
void scan(common::node_group_idx_t nodeGroupIdx, ColumnChunk* columnChunk) final;

void lookup(transaction::Transaction* transaction, common::ValueVector* nodeIDVector,
common::ValueVector* resultVector) final;
common::page_idx_t append(
Expand Down Expand Up @@ -190,14 +199,14 @@ class SerialNodeColumn : public NodeColumn {
struct NodeColumnFactory {
static inline std::unique_ptr<NodeColumn> createNodeColumn(const catalog::Property& property,
BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction) {
transaction::Transaction* transaction, RWPropertyStats propertyStatistics) {
return createNodeColumn(*property.getDataType(), *property.getMetadataDAHInfo(), dataFH,
metadataFH, bufferManager, wal, transaction);
metadataFH, bufferManager, wal, transaction, propertyStatistics);
}
static std::unique_ptr<NodeColumn> createNodeColumn(const common::LogicalType& dataType,
const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics);
};

} // namespace storage
Expand Down
65 changes: 32 additions & 33 deletions src/include/storage/store/nodes_statistics_and_deleted_ids.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@ namespace storage {
class NodeStatisticsAndDeletedIDs : public TableStatistics {

public:
NodeStatisticsAndDeletedIDs(common::table_id_t tableID, common::offset_t maxNodeOffset)
explicit NodeStatisticsAndDeletedIDs(const catalog::TableSchema& schema)
: TableStatistics{schema}, tableID{schema.tableID} {}

NodeStatisticsAndDeletedIDs(common::table_id_t tableID, common::offset_t maxNodeOffset,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStatistics)
: NodeStatisticsAndDeletedIDs(tableID, maxNodeOffset,
std::vector<
common::offset_t>() /* no deleted node offsets during initial loading */) {}
std::vector<common::offset_t>() /* no deleted node offsets during initial loading */,
std::move(propertyStatistics)) {}

NodeStatisticsAndDeletedIDs(common::table_id_t tableID, common::offset_t maxNodeOffset,
const std::vector<common::offset_t>& deletedNodeOffsets);
const std::vector<common::offset_t>& deletedNodeOffsets,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStatistics);

NodeStatisticsAndDeletedIDs(const NodeStatisticsAndDeletedIDs& other)
: TableStatistics{other.getNumTuples()}, tableID{other.tableID},
adjListsAndColumns{other.adjListsAndColumns},
hasDeletedNodesPerMorsel{other.hasDeletedNodesPerMorsel},
deletedNodeOffsetsPerMorsel{other.deletedNodeOffsetsPerMorsel} {}
NodeStatisticsAndDeletedIDs(const NodeStatisticsAndDeletedIDs& other) = default;

inline common::offset_t getMaxNodeOffset() {
return getMaxNodeOffsetFromNumTuples(getNumTuples());
Expand All @@ -48,7 +51,7 @@ class NodeStatisticsAndDeletedIDs : public TableStatistics {

std::vector<common::offset_t> getDeletedNodeOffsets();

static inline uint64_t geNumTuplesFromMaxNodeOffset(common::offset_t maxNodeOffset) {
static inline uint64_t getNumTuplesFromMaxNodeOffset(common::offset_t maxNodeOffset) {
return (maxNodeOffset == UINT64_MAX) ? 0ull : maxNodeOffset + 1ull;
}

Expand Down Expand Up @@ -86,7 +89,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
logger->info("Initialized {}.", "NodesStatisticsAndDeletedIDs");
}

// Should be used ony by tests;
// Should be used only by tests;
explicit NodesStatisticsAndDeletedIDs(
std::unordered_map<common::table_id_t, std::unique_ptr<NodeStatisticsAndDeletedIDs>>&
nodesStatisticsAndDeletedIDs);
Expand All @@ -109,7 +112,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
}

inline void setNumTuplesForTable(common::table_id_t tableID, uint64_t numTuples) override {
initTableStatisticPerTableForWriteTrxIfNecessary();
initTableStatisticsForWriteTrx();
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
Expand All @@ -118,21 +121,18 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {

inline common::offset_t getMaxNodeOffset(
transaction::Transaction* transaction, common::table_id_t tableID) {
return getMaxNodeOffset(transaction == nullptr || transaction->isReadOnly() ?
transaction::TransactionType::READ_ONLY :
transaction::TransactionType::WRITE,
tableID);
}

inline common::offset_t getMaxNodeOffset(
transaction::TransactionType transactionType, common::table_id_t tableID) {
return (transactionType == transaction::TransactionType::READ_ONLY ||
tablesStatisticsContentForWriteTrx == nullptr) ?
getNodeStatisticsAndDeletedIDs(tableID)->getMaxNodeOffset() :
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
->getMaxNodeOffset();
assert(transaction);
if (transaction->getType() == transaction::TransactionType::READ_ONLY) {
return getNodeStatisticsAndDeletedIDs(tableID)->getMaxNodeOffset();
} else {
std::unique_lock xLck{mtx};
return tablesStatisticsContentForWriteTrx == nullptr ?
getNodeStatisticsAndDeletedIDs(tableID)->getMaxNodeOffset() :
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
->getMaxNodeOffset();
}
}

// This function is only used for testing purpose.
Expand All @@ -146,7 +146,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
// keep the interface simple and no transaction is passed.
common::offset_t addNode(common::table_id_t tableID) {
lock_t lck{mtx};
initTableStatisticPerTableForWriteTrxIfNecessary();
initTableStatisticsForWriteTrxNoLock();
return ((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
Expand All @@ -156,7 +156,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
// Refer to the comments for addNode.
void deleteNode(common::table_id_t tableID, common::offset_t nodeOffset) {
lock_t lck{mtx};
initTableStatisticPerTableForWriteTrxIfNecessary();
initTableStatisticsForWriteTrxNoLock();
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
Expand All @@ -179,10 +179,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {

inline std::unique_ptr<TableStatistics> constructTableStatistic(
catalog::TableSchema* tableSchema) override {
// We use UINT64_MAX to represent an empty nodeTable which doesn't contain
// any nodes.
return std::make_unique<NodeStatisticsAndDeletedIDs>(
tableSchema->tableID, UINT64_MAX /* maxNodeOffset */);
return std::make_unique<NodeStatisticsAndDeletedIDs>(*tableSchema);
}

inline std::unique_ptr<TableStatistics> constructTableStatistic(
Expand All @@ -197,6 +194,8 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
}

std::unique_ptr<TableStatistics> deserializeTableStatistics(uint64_t numTuples,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStats,
uint64_t& offset, common::FileInfo* fileInfo, uint64_t tableID) override;

void serializeTableStatistics(
Expand Down
52 changes: 52 additions & 0 deletions src/include/storage/store/property_statistics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include "common/types/types.h"
#include "transaction/transaction.h"

namespace kuzu {
namespace storage {

// Statistics kept for a single property. The only statistic tracked here is whether the
// property may contain null values.
class PropertyStatistics {
public:
    PropertyStatistics() = default;
    explicit PropertyStatistics(bool mayHaveNullValue) : mayHaveNullValue{mayHaveNullValue} {}
    // The class holds a single bool, so the compiler-generated memberwise copy is exactly
    // what is needed. A const-reference copy constructor also binds to non-const sources,
    // so one declaration covers every copy.
    PropertyStatistics(const PropertyStatistics& other) = default;

    // Returns false only when the property is guaranteed to contain no nulls.
    inline bool mayHaveNull() const { return mayHaveNullValue; }

    // Writes these statistics to `fileInfo`. `offset` is taken by reference.
    // NOTE(review): presumably it is advanced past the written bytes -- the definition is
    // not visible here, confirm there.
    void serialize(common::FileInfo* fileInfo, uint64_t& offset);
    // Counterpart of serialize(): builds a PropertyStatistics from `fileInfo`, with the same
    // by-reference `offset` convention.
    static std::unique_ptr<PropertyStatistics> deserialize(
        common::FileInfo* fileInfo, uint64_t& offset);

    // Records that the property may now contain a null. There is deliberately no way to
    // clear the flag through this interface.
    inline void setHasNull() { mayHaveNullValue = true; }

private:
    // Stores whether or not the property is known to have contained a null value.
    // If false, the property is guaranteed to not contain any nulls.
    bool mayHaveNullValue = false;
};

class TablesStatistics;

// Accessor used by NodeColumn, so that it doesn't need to handle the TableStatistics directly.
// It stores a TablesStatistics pointer together with the table ID and property ID it was
// constructed with; the member functions are only declared here, so how those are used to
// locate the statistics is defined elsewhere.
class RWPropertyStats {
public:
// Binds the accessor to `tablesStatistics` and the given table/property IDs.
RWPropertyStats(TablesStatistics* tablesStatistics, common::table_id_t tableID,
common::property_id_t propertyID);

// Builds an accessor with a null TablesStatistics pointer and zero table/property IDs.
// NOTE(review): presumably meant for columns without backing statistics -- confirm how
// mayHaveNull()/setHasNull() behave when the pointer is null.
static RWPropertyStats empty() { return RWPropertyStats(nullptr, 0, 0); }

// NOTE(review): presumably these query and raise the property's may-have-null flag
// (see PropertyStatistics) in the context of `transaction` -- confirm in the definition.
bool mayHaveNull(const transaction::Transaction& transaction);
void setHasNull(const transaction::Transaction& transaction);

private:
// Non-owning raw pointer; RWPropertyStats::empty() sets it to nullptr.
TablesStatistics* tablesStatistics;
common::table_id_t tableID;
common::property_id_t propertyID;
};

} // namespace storage
} // namespace kuzu
14 changes: 10 additions & 4 deletions src/include/storage/store/rels_statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,13 @@ class RelStatistics : public TableStatistics {
friend class RelsStatistics;

public:
RelStatistics() : TableStatistics{0 /* numTuples */}, nextRelOffset{0} {}
RelStatistics(uint64_t numRels, common::offset_t nextRelOffset)
: TableStatistics{numRels}, nextRelOffset{nextRelOffset} {}
RelStatistics(const catalog::TableSchema& tableSchema)
: TableStatistics{tableSchema}, nextRelOffset{0} {}
RelStatistics(uint64_t numRels,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStats,
common::offset_t nextRelOffset)
: TableStatistics{numRels, std::move(propertyStats)}, nextRelOffset{nextRelOffset} {}

inline common::offset_t getNextRelOffset() const { return nextRelOffset; }

Expand Down Expand Up @@ -64,7 +68,7 @@ class RelsStatistics : public TablesStatistics {

inline std::unique_ptr<TableStatistics> constructTableStatistic(
catalog::TableSchema* tableSchema) override {
return std::make_unique<RelStatistics>();
return std::make_unique<RelStatistics>(*tableSchema);
}

inline std::unique_ptr<TableStatistics> constructTableStatistic(
Expand All @@ -84,6 +88,8 @@ class RelsStatistics : public TablesStatistics {
}

std::unique_ptr<TableStatistics> deserializeTableStatistics(uint64_t numTuples,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStats,
uint64_t& offset, common::FileInfo* fileInfo, uint64_t tableID) override;

void serializeTableStatistics(
Expand Down
3 changes: 2 additions & 1 deletion src/include/storage/store/string_node_column.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "node_column.h"
#include "storage/store/table_statistics.h"

namespace kuzu {
namespace storage {
Expand All @@ -14,7 +15,7 @@ class StringNodeColumn : public NodeColumn {
public:
StringNodeColumn(common::LogicalType dataType, const catalog::MetadataDAHInfo& metaDAHeaderInfo,
BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics);

void scan(transaction::Transaction* transaction, common::node_group_idx_t nodeGroupIdx,
common::offset_t startOffsetInGroup, common::offset_t endOffsetInGroup,
Expand Down
Loading

0 comments on commit 9aaff49

Please sign in to comment.