Skip to content

Commit

Permalink
remove index column
Browse files Browse the repository at this point in the history
performance improve
  • Loading branch information
Kuzu CI committed Mar 14, 2024
1 parent efcce5c commit f7523d7
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 109 deletions.
1 change: 1 addition & 0 deletions src/include/storage/store/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class ColumnChunk {

template<typename T>
void setValue(T val, common::offset_t pos) {
KU_ASSERT(pos < capacity);
((T*)buffer.get())[pos] = val;
if (pos >= numValues) {
numValues = pos + 1;
Expand Down
4 changes: 4 additions & 0 deletions src/include/storage/store/var_list_column.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "column.h"
#include "var_list_column_chunk.h"

// List is a nested data type which is stored as two chunks:
// 1. Offset column (type: INT64). Using offset to partition the data column into multiple lists.
Expand Down Expand Up @@ -38,6 +39,8 @@ struct ListOffsetInfoInStorage {
uint64_t getListLength(uint64_t nodePos) const;
common::offset_t getListEndOffset(uint64_t nodePos) const;
common::offset_t getListStartOffset(uint64_t nodePos) const;

bool checkOffsetOder(uint64_t startPos, uint64_t endPos) const;
};

class VarListColumn : public Column {
Expand Down Expand Up @@ -107,6 +110,7 @@ class VarListColumn : public Column {
private:
std::unique_ptr<Column> sizeColumn;
std::unique_ptr<Column> dataColumn;
std::unique_ptr<VarListDataColumnChunk> tmpDataColumnChunk;
};

} // namespace storage
Expand Down
13 changes: 0 additions & 13 deletions src/include/storage/store/var_list_column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,6 @@ class VarListColumnChunk : public ColumnChunk {
void append(ColumnChunk* other, common::offset_t startPosInOtherChunk,
uint32_t numValuesToAppend) final;

// Rebuilds `indicesColumnChunk` as an uncompressed INT64 chunk (created with
// `capacity` as its third factory argument) holding the identity mapping:
// slot p stores the value p. Every slot is first marked null, then each of the
// first `numValues` slots takes the null flag of the same slot in this chunk's
// `nullChunk`.
inline void initializeIndices() {
    indicesColumnChunk = ColumnChunkFactory::createColumnChunk(
        *common::LogicalType::INT64(), false /*enableCompression*/, capacity);
    // Start from an all-null state so slots past `numValues` stay null.
    indicesColumnChunk->getNullChunk()->resetToAllNull();
    auto pos = 0u;
    while (pos < numValues) {
        const bool slotIsNull = nullChunk->isNull(pos);
        indicesColumnChunk->setValue<common::offset_t>(pos, pos);
        indicesColumnChunk->getNullChunk()->setNull(pos, slotIsNull);
        pos++;
    }
    indicesColumnChunk->setNumValues(numValues);
}

void resetFromOtherChunk(VarListColumnChunk* other);
void appendNullList();

protected:
Expand All @@ -114,7 +102,6 @@ class VarListColumnChunk : public ColumnChunk {
// `needFinalize` is set to true whenever `write` is called.
// During `finalize`, the whole column chunk will be re-written according to indices.
bool needFinalize;
std::unique_ptr<ColumnChunk> indicesColumnChunk;
};

} // namespace storage
Expand Down
1 change: 1 addition & 0 deletions src/storage/store/column_chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ void ColumnChunk::append(
KU_ASSERT(nullChunk->getNumValues() == getNumValues());
nullChunk->append(other->nullChunk.get(), startPosInOtherChunk, numValuesToAppend);
}
KU_ASSERT(numValues + numValuesToAppend <= capacity);
memcpy(buffer.get() + numValues * numBytesPerValue,
other->buffer.get() + startPosInOtherChunk * numBytesPerValue,
numValuesToAppend * numBytesPerValue);
Expand Down
109 changes: 91 additions & 18 deletions src/storage/store/var_list_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ offset_t ListOffsetInfoInStorage::getListStartOffset(uint64_t nodePos) const {
}

// Returns the end offset stored for the list at `nodePos`.
// End offsets are spread across `offsetVectors`, each addressed in slices of
// DEFAULT_VECTOR_CAPACITY positions; `nodePos` must fall inside the vectors
// currently held.
offset_t ListOffsetInfoInStorage::getListEndOffset(uint64_t nodePos) const {
    KU_ASSERT(nodePos < offsetVectors.size() * common::DEFAULT_VECTOR_CAPACITY);
    const auto vectorIdx = (nodePos) / DEFAULT_VECTOR_CAPACITY;
    const auto posInVector = (nodePos) % DEFAULT_VECTOR_CAPACITY;
    return offsetVectors[vectorIdx]->getValue<offset_t>(posInVector);
}
Expand All @@ -30,6 +31,19 @@ uint64_t ListOffsetInfoInStorage::getListLength(uint64_t nodePos) const {
return sizeVector->getValue<uint64_t>(nodePos % common::DEFAULT_VECTOR_CAPACITY);
}

bool ListOffsetInfoInStorage::checkOffsetOder(uint64_t startPos, uint64_t endPos) const {
offset_t prevStartOffset = getListStartOffset(startPos);
for (auto i = startPos; i < endPos; i++) {
offset_t currentOffset = getListStartOffset(i);
auto length = getListLength(i);
if (currentOffset != prevStartOffset) {
return false;
}
prevStartOffset += length;
}
return true;
}

VarListColumn::VarListColumn(std::string name, LogicalType dataType,
const MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH, BMFileHandle* metadataFH,
BufferManager* bufferManager, WAL* wal, Transaction* transaction,
Expand All @@ -44,6 +58,9 @@ VarListColumn::VarListColumn(std::string name, LogicalType dataType,
dataColumn = ColumnFactory::createColumn(dataColName,
*VarListType::getChildType(&this->dataType)->copy(), *metaDAHeaderInfo.childrenInfos[1],
dataFH, metadataFH, bufferManager, wal, transaction, propertyStatistics, enableCompression);
tmpDataColumnChunk =
std::make_unique<VarListDataColumnChunk>(ColumnChunkFactory::createColumnChunk(
*VarListType::getChildType(&this->dataType)->copy(), enableCompression, 0));
}

void VarListColumn::scan(Transaction* transaction, node_group_idx_t nodeGroupIdx,
Expand All @@ -61,34 +78,77 @@ void VarListColumn::scan(Transaction* transaction, node_group_idx_t nodeGroupIdx
resultVector->getValue<list_entry_t>(offsetInVector - 1).size;
auto offsetToWriteListData = listOffsetInVector;
auto numValues = endOffsetInGroup - startOffsetInGroup;
KU_ASSERT(numValues >= 0);
for (auto i = 0u; i < numValues; i++) {
auto length = listOffsetInfoInStorage.getListLength(i);
resultVector->setValue(i + offsetInVector, list_entry_t{listOffsetInVector, length});
listOffsetInVector += length;
}
ListVector::resizeDataVector(resultVector, listOffsetInVector);
auto dataVector = ListVector::getDataVector(resultVector);
dataColumn->scan(transaction, nodeGroupIdx, listOffsetInfoInStorage.getListStartOffset(0),
listOffsetInfoInStorage.getListStartOffset(numValues), dataVector, offsetToWriteListData);
bool checkOffsetOrder = listOffsetInfoInStorage.checkOffsetOder(0, numValues);
if (checkOffsetOrder) {
dataColumn->scan(transaction, nodeGroupIdx, listOffsetInfoInStorage.getListStartOffset(0),
listOffsetInfoInStorage.getListStartOffset(numValues), dataVector,
offsetToWriteListData);
} else {
for (auto i = 0u; i < numValues; i++) {
offset_t startOffset = listOffsetInfoInStorage.getListStartOffset(i);
offset_t appendLen = listOffsetInfoInStorage.getListLength(i);
KU_ASSERT(appendLen >= 0);
dataColumn->scan(transaction, nodeGroupIdx, startOffset, startOffset + appendLen,
dataVector, offsetToWriteListData);
offsetToWriteListData += appendLen;
}
}
}

// Scans the rows [startOffset, endOffset) of node group `nodeGroupIdx` into
// `columnChunk`: first the base column, then the size column, and finally the
// list element data from `dataColumn`.
// NOTE(review): this listing is a diff rendering without +/- markers, so old
// and new versions of some statements appear side by side (e.g. the repeated
// declarations of `sizeColumnChunk` and `varListColumnChunk`). Confirm against
// the actual file which lines are live.
void VarListColumn::scan(Transaction* transaction, node_group_idx_t nodeGroupIdx,
kuzu::storage::ColumnChunk* columnChunk, offset_t startOffset, offset_t endOffset) {
// A node group index at or beyond the metadata's element count yields an empty chunk.
if (nodeGroupIdx >= metadataDA->getNumElements(transaction->getType())) {
columnChunk->setNumValues(0);
} else {
auto varListColumnChunk = ku_dynamic_cast<ColumnChunk*, VarListColumnChunk*>(columnChunk);
Column::scan(transaction, nodeGroupIdx, columnChunk, startOffset, endOffset);
// NOTE(review): the next two statements both declare `sizeColumnChunk`;
// presumably the first is the pre-change form and the second its replacement.
auto sizeColumnChunk =
ku_dynamic_cast<ColumnChunk*, VarListColumnChunk*>(columnChunk)->getSizeColumnChunk();
auto sizeColumnChunk = varListColumnChunk->getSizeColumnChunk();
sizeColumn->scan(transaction, nodeGroupIdx, sizeColumnChunk, startOffset, endOffset);
// NOTE(review): the block below (through the first resetOffset() call) repeats
// the `varListColumnChunk` declaration and scans the data as one contiguous
// range; it looks like the pre-change implementation — confirm.
auto varListColumnChunk = ku_dynamic_cast<ColumnChunk*, VarListColumnChunk*>(columnChunk);
auto startVarListOffset = varListColumnChunk->getListStartOffset(0);
auto endVarListOffset = varListColumnChunk->getListStartOffset(columnChunk->getNumValues());
auto numElements = endVarListOffset - startVarListOffset;
varListColumnChunk->resizeDataColumnChunk(std::bit_ceil(numElements));
dataColumn->scan(transaction, nodeGroupIdx, varListColumnChunk->getDataColumnChunk(),
startVarListOffset, endVarListOffset);
varListColumnChunk->resetOffset();
// Total element count to size the data chunk for: starts from the data chunk's
// current value count and adds the length of every scanned list.
uint64_t resizeNumValues = varListColumnChunk->getDataColumnChunk()->getNumValues();
// Stays true only if every list starts exactly where the previous one ended.
bool checkOffsetOrder = true;
offset_t prevOffset = varListColumnChunk->getListStartOffset(0);
for (auto i = 0u; i < columnChunk->getNumValues(); i++) {
auto startVarListOffset = varListColumnChunk->getListStartOffset(i);
auto appendLen = varListColumnChunk->getListLen(i);
resizeNumValues += appendLen;
if (startVarListOffset != prevOffset) {
checkOffsetOrder = false;
}
prevOffset += appendLen;
}
if (checkOffsetOrder) {
// Lists are back-to-back in storage: read all element data with a single range scan.
varListColumnChunk->resizeDataColumnChunk(std::bit_ceil(resizeNumValues));
offset_t startVarListOffset = varListColumnChunk->getListStartOffset(0);
offset_t endVarListOffset =
varListColumnChunk->getListStartOffset(columnChunk->getNumValues());
dataColumn->scan(transaction, nodeGroupIdx, varListColumnChunk->getDataColumnChunk(),
startVarListOffset, endVarListOffset);
varListColumnChunk->resetOffset();
} else {
// Lists are not back-to-back: scan each list's range into the temporary
// chunk and append it to the output data chunk, one list at a time.
varListColumnChunk->resizeDataColumnChunk(std::bit_ceil(resizeNumValues));
tmpDataColumnChunk->resizeBuffer(std::bit_ceil(resizeNumValues));
auto dataVarListColumnChunk = varListColumnChunk->getDataColumnChunk();
for (auto i = 0u; i < columnChunk->getNumValues(); i++) {
offset_t startVarListOffset = varListColumnChunk->getListStartOffset(i);
offset_t endVarListOffset = varListColumnChunk->getListEndOffset(i);
dataColumn->scan(transaction, nodeGroupIdx,
tmpDataColumnChunk->dataColumnChunk.get(), startVarListOffset,
endVarListOffset);
// The temporary chunk is expected to hold exactly this one list's elements.
KU_ASSERT(endVarListOffset - startVarListOffset ==
tmpDataColumnChunk->dataColumnChunk->getNumValues());
dataVarListColumnChunk->append(tmpDataColumnChunk->dataColumnChunk.get(), 0,
tmpDataColumnChunk->dataColumnChunk->getNumValues());
}
varListColumnChunk->resetOffset();
}
}
}

Expand Down Expand Up @@ -145,11 +205,24 @@ void VarListColumn::scanUnfiltered(Transaction* transaction, node_group_idx_t no
offsetInVector += listLen;
}
ListVector::resizeDataVector(resultVector, offsetInVector);
auto startListOffsetInStorage = listOffsetInfoInStorage.getListStartOffset(0);
auto endListOffsetInStorage = listOffsetInfoInStorage.getListStartOffset(numValuesToScan);
auto dataVector = ListVector::getDataVector(resultVector);
dataColumn->scan(transaction, nodeGroupIdx, startListOffsetInStorage, endListOffsetInStorage,
dataVector, 0 /* offsetInVector */);
offsetInVector = 0;
bool checkOffsetOrder = listOffsetInfoInStorage.checkOffsetOder(0, numValuesToScan);
if (checkOffsetOrder) {
auto startListOffsetInStorage = listOffsetInfoInStorage.getListStartOffset(0);
numValuesToScan = numValuesToScan == 0 ? 0 : numValuesToScan - 1;
auto endListOffsetInStorage = listOffsetInfoInStorage.getListEndOffset(numValuesToScan);
dataColumn->scan(transaction, nodeGroupIdx, startListOffsetInStorage,
endListOffsetInStorage, dataVector, 0 /* offsetInVector */);
} else {
for (auto i = 0u; i < numValuesToScan; i++) {
auto startListOffsetInStorage = listOffsetInfoInStorage.getListStartOffset(i);
auto appendLen = listOffsetInfoInStorage.getListLength(i);
dataColumn->scan(transaction, nodeGroupIdx, startListOffsetInStorage,

Check warning on line 221 in src/storage/store/var_list_column.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/store/var_list_column.cpp#L218-L221

Added lines #L218 - L221 were not covered by tests
startListOffsetInStorage + appendLen, dataVector, offsetInVector);
offsetInVector += appendLen;

Check warning on line 223 in src/storage/store/var_list_column.cpp

View check run for this annotation

Codecov / codecov/patch

src/storage/store/var_list_column.cpp#L223

Added line #L223 was not covered by tests
}
}
}

void VarListColumn::scanFiltered(Transaction* transaction, node_group_idx_t nodeGroupIdx,
Expand All @@ -166,10 +239,10 @@ void VarListColumn::scanFiltered(Transaction* transaction, node_group_idx_t node
for (auto i = 0u; i < resultVector->state->selVector->selectedSize; i++) {
auto pos = resultVector->state->selVector->selectedPositions[i];
auto startOffsetInStorageToScan = listOffsetInfoInStorage.getListStartOffset(pos);
auto endOffsetInStorageToScan = listOffsetInfoInStorage.getListStartOffset(pos + 1);
auto appendLen = listOffsetInfoInStorage.getListLength(pos);
auto dataVector = ListVector::getDataVector(resultVector);
dataColumn->scan(transaction, nodeGroupIdx, startOffsetInStorageToScan,
endOffsetInStorageToScan, dataVector, listOffset);
startOffsetInStorageToScan + appendLen, dataVector, listOffset);
listOffset += resultVector->getValue<list_entry_t>(pos).size;
}
}
Expand Down
Loading

0 comments on commit f7523d7

Please sign in to comment.