Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip reading null data if the property is known to have no nulls #1945

Merged
merged 3 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/common/null_mask.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@
return false;
}

// Member-level wrapper around the static setNullRange(): forwards (offset, numBitsToSet, isNull)
// to setNullRange() operating on this mask's own `data` buffer.
// When isNull is true, mayContainNulls is raised before the bits are written; when isNull is
// false the flag is left as it was (this function never lowers it).
void NullMask::setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isNull) {
if (isNull) {
this->mayContainNulls = true;

Check warning on line 101 in src/common/null_mask.cpp

View check run for this annotation

Codecov / codecov/patch

src/common/null_mask.cpp#L101

Added line #L101 was not covered by tests
}
// The actual bit manipulation is delegated to the static helper.
setNullRange(data, offset, numBitsToSet, isNull);
}

void NullMask::setNullRange(
uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull) {
auto [firstEntryPos, firstBitPos] = getNullEntryAndBitPos(offset);
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/null_mask.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ class NullMask {
static void setNullRange(
uint64_t* nullEntries, uint64_t offset, uint64_t numBitsToSet, bool isNull);

void setNullFromRange(uint64_t offset, uint64_t numBitsToSet, bool isNull);

void resize(uint64_t capacity);

private:
Expand Down
4 changes: 1 addition & 3 deletions src/include/common/vector/value_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ class ValueVector {
// On return true, there are no nulls. On return false, there may or may not be nulls.
inline bool hasNoNullsGuarantee() const { return nullMask->hasNoNullsGuarantee(); }
inline void setRangeNonNull(uint32_t startPos, uint32_t len) {
for (auto i = 0u; i < len; ++i) {
setNull(startPos + i, false);
}
nullMask->setNullFromRange(startPos, len, false);
}
inline const uint64_t* getNullMaskData() { return nullMask->getData(); }
inline void setNull(uint32_t pos, bool isNull) { nullMask->setNull(pos, isNull); }
Expand Down
34 changes: 30 additions & 4 deletions src/include/storage/copier/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class BoolColumnChunk : public ColumnChunk {
arrow::Array* array, common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;

void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;
common::offset_t startPosInChunk, uint32_t numValuesToAppend) override;

void resize(uint64_t capacity) final;

Expand All @@ -200,12 +200,38 @@ class BoolColumnChunk : public ColumnChunk {

class NullColumnChunk : public BoolColumnChunk {
public:
NullColumnChunk() : BoolColumnChunk(nullptr /*copyDescription*/, false /*hasNullChunk*/) {}
NullColumnChunk()
: BoolColumnChunk(nullptr /*copyDescription*/, false /*hasNullChunk*/), mayHaveNullValue{
false} {}
// Maybe this should be combined with BoolColumnChunk if the only difference is these functions?
inline bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
inline void setNull(common::offset_t pos, bool isNull) { setValue(isNull, pos); }
inline void setNull(common::offset_t pos, bool isNull) {
setValue(isNull, pos);
if (isNull) {
mayHaveNullValue = true;
}
}

inline bool mayHaveNull() const { return mayHaveNullValue; }

inline void resetNullBuffer() { memset(buffer.get(), 0 /* non null */, bufferSize); }
inline void resetNullBuffer() {
memset(buffer.get(), 0 /* non null */, bufferSize);
mayHaveNullValue = false;
}

inline void copyFromBuffer(uint64_t* srcBuffer, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBits, bool invert = false) {
if (common::NullMask::copyNullMask(
srcBuffer, srcOffset, (uint64_t*)buffer.get(), dstOffset, numBits, invert)) {
mayHaveNullValue = true;
}
}

void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
common::offset_t startPosInChunk, uint32_t numValuesToAppend) final;

protected:
bool mayHaveNullValue;
};

class FixedListColumnChunk : public ColumnChunk {
Expand Down
23 changes: 16 additions & 7 deletions src/include/storage/store/node_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "storage/copier/column_chunk.h"
#include "storage/storage_structure/disk_array.h"
#include "storage/storage_structure/storage_structure.h"
#include "storage/store/property_statistics.h"

namespace kuzu {
namespace transaction {
Expand Down Expand Up @@ -57,10 +58,11 @@ class NodeColumn {
public:
NodeColumn(const catalog::Property& property, BMFileHandle* dataFH, BMFileHandle* metadataFH,
BufferManager* bufferManager, WAL* wal, transaction::Transaction* transaction,
bool requireNullColumn = true);
RWPropertyStats propertyStatistics, bool requireNullColumn = true);
NodeColumn(common::LogicalType dataType, const catalog::MetadataDAHInfo& metaDAHeaderInfo,
BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction, bool requireNullColumn);
transaction::Transaction* transaction, RWPropertyStats PropertyStatistics,
bool requireNullColumn);
virtual ~NodeColumn() = default;

// Expose for feature store
Expand Down Expand Up @@ -140,13 +142,15 @@ class NodeColumn {
std::vector<std::unique_ptr<NodeColumn>> childrenColumns;
read_node_column_func_t readNodeColumnFunc;
write_node_column_func_t writeNodeColumnFunc;
RWPropertyStats propertyStatistics;
};

class BoolNodeColumn : public NodeColumn {
public:
BoolNodeColumn(const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction, bool requireNullColumn = true);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics,
bool requireNullColumn = true);

void batchLookup(transaction::Transaction* transaction, const common::offset_t* nodeOffsets,
size_t size, uint8_t* result) final;
Expand All @@ -158,10 +162,15 @@ class NullNodeColumn : public NodeColumn {
public:
NullNodeColumn(common::page_idx_t metaDAHPageIdx, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics);

void scan(transaction::Transaction* transaction, common::ValueVector* nodeIDVector,
common::ValueVector* resultVector) final;
void scan(transaction::Transaction* transaction, common::node_group_idx_t nodeGroupIdx,
common::offset_t startOffsetInGroup, common::offset_t endOffsetInGroup,
common::ValueVector* resultVector, uint64_t offsetInVector = 0) final;
void scan(common::node_group_idx_t nodeGroupIdx, ColumnChunk* columnChunk) final;

void lookup(transaction::Transaction* transaction, common::ValueVector* nodeIDVector,
common::ValueVector* resultVector) final;
common::page_idx_t append(
Expand Down Expand Up @@ -190,14 +199,14 @@ class SerialNodeColumn : public NodeColumn {
struct NodeColumnFactory {
static inline std::unique_ptr<NodeColumn> createNodeColumn(const catalog::Property& property,
BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction) {
transaction::Transaction* transaction, RWPropertyStats propertyStatistics) {
return createNodeColumn(*property.getDataType(), *property.getMetadataDAHInfo(), dataFH,
metadataFH, bufferManager, wal, transaction);
metadataFH, bufferManager, wal, transaction, propertyStatistics);
}
static std::unique_ptr<NodeColumn> createNodeColumn(const common::LogicalType& dataType,
const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH,
BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics);
};

} // namespace storage
Expand Down
65 changes: 32 additions & 33 deletions src/include/storage/store/nodes_statistics_and_deleted_ids.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@ namespace storage {
class NodeStatisticsAndDeletedIDs : public TableStatistics {

public:
NodeStatisticsAndDeletedIDs(common::table_id_t tableID, common::offset_t maxNodeOffset)
explicit NodeStatisticsAndDeletedIDs(const catalog::TableSchema& schema)
: TableStatistics{schema}, tableID{schema.tableID} {}

NodeStatisticsAndDeletedIDs(common::table_id_t tableID, common::offset_t maxNodeOffset,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStatistics)
: NodeStatisticsAndDeletedIDs(tableID, maxNodeOffset,
std::vector<
common::offset_t>() /* no deleted node offsets during initial loading */) {}
std::vector<common::offset_t>() /* no deleted node offsets during initial loading */,
std::move(propertyStatistics)) {}

NodeStatisticsAndDeletedIDs(common::table_id_t tableID, common::offset_t maxNodeOffset,
const std::vector<common::offset_t>& deletedNodeOffsets);
const std::vector<common::offset_t>& deletedNodeOffsets,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStatistics);

NodeStatisticsAndDeletedIDs(const NodeStatisticsAndDeletedIDs& other)
: TableStatistics{other.getNumTuples()}, tableID{other.tableID},
adjListsAndColumns{other.adjListsAndColumns},
hasDeletedNodesPerMorsel{other.hasDeletedNodesPerMorsel},
deletedNodeOffsetsPerMorsel{other.deletedNodeOffsetsPerMorsel} {}
NodeStatisticsAndDeletedIDs(const NodeStatisticsAndDeletedIDs& other) = default;

inline common::offset_t getMaxNodeOffset() {
return getMaxNodeOffsetFromNumTuples(getNumTuples());
Expand All @@ -48,7 +51,7 @@ class NodeStatisticsAndDeletedIDs : public TableStatistics {

std::vector<common::offset_t> getDeletedNodeOffsets();

static inline uint64_t geNumTuplesFromMaxNodeOffset(common::offset_t maxNodeOffset) {
static inline uint64_t getNumTuplesFromMaxNodeOffset(common::offset_t maxNodeOffset) {
return (maxNodeOffset == UINT64_MAX) ? 0ull : maxNodeOffset + 1ull;
}

Expand Down Expand Up @@ -86,7 +89,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
logger->info("Initialized {}.", "NodesStatisticsAndDeletedIDs");
}

// Should be used ony by tests;
// Should be used only by tests;
explicit NodesStatisticsAndDeletedIDs(
std::unordered_map<common::table_id_t, std::unique_ptr<NodeStatisticsAndDeletedIDs>>&
nodesStatisticsAndDeletedIDs);
Expand All @@ -109,7 +112,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
}

inline void setNumTuplesForTable(common::table_id_t tableID, uint64_t numTuples) override {
initTableStatisticPerTableForWriteTrxIfNecessary();
initTableStatisticsForWriteTrx();
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
Expand All @@ -118,21 +121,18 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {

inline common::offset_t getMaxNodeOffset(
transaction::Transaction* transaction, common::table_id_t tableID) {
return getMaxNodeOffset(transaction == nullptr || transaction->isReadOnly() ?
transaction::TransactionType::READ_ONLY :
transaction::TransactionType::WRITE,
tableID);
}

inline common::offset_t getMaxNodeOffset(
transaction::TransactionType transactionType, common::table_id_t tableID) {
return (transactionType == transaction::TransactionType::READ_ONLY ||
tablesStatisticsContentForWriteTrx == nullptr) ?
getNodeStatisticsAndDeletedIDs(tableID)->getMaxNodeOffset() :
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
->getMaxNodeOffset();
assert(transaction);
if (transaction->getType() == transaction::TransactionType::READ_ONLY) {
return getNodeStatisticsAndDeletedIDs(tableID)->getMaxNodeOffset();
} else {
std::unique_lock xLck{mtx};
return tablesStatisticsContentForWriteTrx == nullptr ?
getNodeStatisticsAndDeletedIDs(tableID)->getMaxNodeOffset() :
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
->getMaxNodeOffset();
}
}

// This function is only used for testing purpose.
Expand All @@ -146,7 +146,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
// keep the interface simple and no transaction is passed.
common::offset_t addNode(common::table_id_t tableID) {
lock_t lck{mtx};
initTableStatisticPerTableForWriteTrxIfNecessary();
initTableStatisticsForWriteTrxNoLock();
return ((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
Expand All @@ -156,7 +156,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
// Refer to the comments for addNode.
void deleteNode(common::table_id_t tableID, common::offset_t nodeOffset) {
lock_t lck{mtx};
initTableStatisticPerTableForWriteTrxIfNecessary();
initTableStatisticsForWriteTrxNoLock();
((NodeStatisticsAndDeletedIDs*)tablesStatisticsContentForWriteTrx
->tableStatisticPerTable[tableID]
.get())
Expand All @@ -179,10 +179,7 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {

inline std::unique_ptr<TableStatistics> constructTableStatistic(
catalog::TableSchema* tableSchema) override {
// We use UINT64_MAX to represent an empty nodeTable which doesn't contain
// any nodes.
return std::make_unique<NodeStatisticsAndDeletedIDs>(
tableSchema->tableID, UINT64_MAX /* maxNodeOffset */);
return std::make_unique<NodeStatisticsAndDeletedIDs>(*tableSchema);
}

inline std::unique_ptr<TableStatistics> constructTableStatistic(
Expand All @@ -197,6 +194,8 @@ class NodesStatisticsAndDeletedIDs : public TablesStatistics {
}

std::unique_ptr<TableStatistics> deserializeTableStatistics(uint64_t numTuples,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStats,
uint64_t& offset, common::FileInfo* fileInfo, uint64_t tableID) override;

void serializeTableStatistics(
Expand Down
52 changes: 52 additions & 0 deletions src/include/storage/store/property_statistics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include "common/types/types.h"
#include "transaction/transaction.h"

namespace kuzu {
namespace storage {

// Per-property statistics. Currently tracks a single fact: whether the property may contain a
// null value.
class PropertyStatistics {
public:
    // Default state: mayHaveNullValue is false (see the in-class initializer below).
    PropertyStatistics() = default;
    // Constructs statistics with an explicit "may have null" flag.
    explicit PropertyStatistics(bool mayHaveNullValue) : mayHaveNullValue{mayHaveNullValue} {}
    // Member-wise copy. A single defaulted const& copy constructor also serves copies from
    // non-const lvalues, so no separate non-const overload is needed.
    PropertyStatistics(const PropertyStatistics& other) = default;

    // Returns the current value of the "may have null" flag.
    inline bool mayHaveNull() const { return mayHaveNullValue; }

    // Serialization hooks; both take the file and a running offset by reference.
    // NOTE(review): defined out of line — presumably `offset` is advanced past the bytes
    // written/read; confirm against the implementation.
    void serialize(common::FileInfo* fileInfo, uint64_t& offset);
    static std::unique_ptr<PropertyStatistics> deserialize(
        common::FileInfo* fileInfo, uint64_t& offset);

    // Raises the flag. There is no way to lower it again through this class's interface.
    inline void setHasNull() { mayHaveNullValue = true; }

private:
    // Stores whether or not the property is known to have contained a null value.
    // If false, the property is guaranteed to not contain any nulls.
    bool mayHaveNullValue = false;
};

class TablesStatistics;

// Accessor used by NodeColumn, so that it doesn't need to handle the TableStatistics directly.
// Holds a TablesStatistics pointer together with the (tableID, propertyID) pair it refers to.
class RWPropertyStats {
public:
// Binds the accessor to a statistics container and a specific table/property.
RWPropertyStats(TablesStatistics* tablesStatistics, common::table_id_t tableID,
common::property_id_t propertyID);

// Builds an accessor with a null TablesStatistics pointer and table/property IDs of 0.
// NOTE(review): how mayHaveNull()/setHasNull() behave on such an accessor is decided by their
// out-of-line definitions, which are not visible here — confirm before relying on it.
static RWPropertyStats empty() { return RWPropertyStats(nullptr, 0, 0); }

// Query / raise the "may have null" state on behalf of the given transaction.
// NOTE(review): presumably these consult the PropertyStatistics for (tableID, propertyID)
// inside tablesStatistics, choosing a read or write view based on the transaction — verify
// against the implementation.
bool mayHaveNull(const transaction::Transaction& transaction);
void setHasNull(const transaction::Transaction& transaction);

private:
// Raw pointer; this class declares no destructor, so it does not free the pointee.
TablesStatistics* tablesStatistics;
common::table_id_t tableID;
common::property_id_t propertyID;
};

} // namespace storage
} // namespace kuzu
14 changes: 10 additions & 4 deletions src/include/storage/store/rels_statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,13 @@ class RelStatistics : public TableStatistics {
friend class RelsStatistics;

public:
RelStatistics() : TableStatistics{0 /* numTuples */}, nextRelOffset{0} {}
RelStatistics(uint64_t numRels, common::offset_t nextRelOffset)
: TableStatistics{numRels}, nextRelOffset{nextRelOffset} {}
RelStatistics(const catalog::TableSchema& tableSchema)
: TableStatistics{tableSchema}, nextRelOffset{0} {}
RelStatistics(uint64_t numRels,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStats,
common::offset_t nextRelOffset)
: TableStatistics{numRels, std::move(propertyStats)}, nextRelOffset{nextRelOffset} {}

inline common::offset_t getNextRelOffset() const { return nextRelOffset; }

Expand Down Expand Up @@ -64,7 +68,7 @@ class RelsStatistics : public TablesStatistics {

inline std::unique_ptr<TableStatistics> constructTableStatistic(
catalog::TableSchema* tableSchema) override {
return std::make_unique<RelStatistics>();
return std::make_unique<RelStatistics>(*tableSchema);
}

inline std::unique_ptr<TableStatistics> constructTableStatistic(
Expand All @@ -84,6 +88,8 @@ class RelsStatistics : public TablesStatistics {
}

std::unique_ptr<TableStatistics> deserializeTableStatistics(uint64_t numTuples,
std::unordered_map<common::property_id_t, std::unique_ptr<PropertyStatistics>>&&
propertyStats,
uint64_t& offset, common::FileInfo* fileInfo, uint64_t tableID) override;

void serializeTableStatistics(
Expand Down
3 changes: 2 additions & 1 deletion src/include/storage/store/string_node_column.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "node_column.h"
#include "storage/store/table_statistics.h"

namespace kuzu {
namespace storage {
Expand All @@ -14,7 +15,7 @@ class StringNodeColumn : public NodeColumn {
public:
StringNodeColumn(common::LogicalType dataType, const catalog::MetadataDAHInfo& metaDAHeaderInfo,
BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal,
transaction::Transaction* transaction);
transaction::Transaction* transaction, RWPropertyStats propertyStatistics);

void scan(transaction::Transaction* transaction, common::node_group_idx_t nodeGroupIdx,
common::offset_t startOffsetInGroup, common::offset_t endOffsetInGroup,
Expand Down
Loading
Loading