From 27da653dff89307ba16291d064d0459dc5f019b2 Mon Sep 17 00:00:00 2001 From: ziyi chen Date: Thu, 24 Aug 2023 14:36:41 -0400 Subject: [PATCH] Implement top-k optimization --- src/include/common/constants.h | 6 + .../operator/order_by/key_block_merger.h | 33 +- .../processor/operator/order_by/order_by.h | 118 +-- .../operator/order_by/order_by_key_encoder.h | 33 +- .../operator/order_by/order_by_merge.h | 6 +- .../operator/order_by/order_by_scan.h | 48 +- .../processor/operator/order_by/sort_state.h | 95 ++ .../processor/operator/order_by/top_k.h | 198 +++++ .../operator/order_by/top_k_scanner.h | 44 + .../processor/operator/physical_operator.h | 2 + src/include/processor/result/result_set.h | 2 +- src/optimizer/optimizer.cpp | 4 +- src/optimizer/top_k_optimizer.cpp | 2 +- src/processor/map/map_order_by.cpp | 37 +- .../operator/order_by/CMakeLists.txt | 5 +- .../operator/order_by/key_block_merger.cpp | 27 +- src/processor/operator/order_by/order_by.cpp | 81 +- .../order_by/order_by_key_encoder.cpp | 35 +- .../operator/order_by/order_by_merge.cpp | 9 +- .../operator/order_by/order_by_scan.cpp | 100 +-- .../operator/order_by/sort_state.cpp | 178 ++++ src/processor/operator/order_by/top_k.cpp | 301 +++++++ .../operator/order_by/top_k_scanner.cpp | 27 + src/processor/operator/physical_operator.cpp | 6 + src/processor/processor.cpp | 1 + src/processor/result/factorized_table.cpp | 10 +- test/CMakeLists.txt | 1 - test/processor/CMakeLists.txt | 1 - test/processor/order_by/CMakeLists.txt | 4 - .../order_by/key_block_merger_test.cpp | 552 ------------ .../order_by/order_by_key_encoder_test.cpp | 837 ------------------ test/processor/order_by/radix_sort_test.cpp | 468 ---------- test/test_files/copy/copy_node_csv.test | 32 + .../tinysnb/order_by/single_label.test | 13 + 34 files changed, 1081 insertions(+), 2235 deletions(-) create mode 100644 src/include/processor/operator/order_by/sort_state.h create mode 100644 src/include/processor/operator/order_by/top_k.h create mode 100644 src/include/processor/operator/order_by/top_k_scanner.h create mode 100644 src/processor/operator/order_by/sort_state.cpp create mode 100644 src/processor/operator/order_by/top_k.cpp create mode 100644 src/processor/operator/order_by/top_k_scanner.cpp delete mode 100644 test/processor/CMakeLists.txt delete mode 100644 test/processor/order_by/CMakeLists.txt delete mode 100644 test/processor/order_by/key_block_merger_test.cpp delete mode 100644 test/processor/order_by/order_by_key_encoder_test.cpp delete mode 100644 test/processor/order_by/radix_sort_test.cpp diff --git a/src/include/common/constants.h b/src/include/common/constants.h index 1704d3b8b0..555e7417d4 100644 --- a/src/include/common/constants.h +++ b/src/include/common/constants.h @@ -161,5 +161,11 @@ struct ClientContextConstants { static constexpr uint64_t TIMEOUT_IN_MS = 0; }; +struct OrderByConstants { + static constexpr uint64_t NUM_BYTES_FOR_PAYLOAD_IDX = 8; + static constexpr uint64_t MIN_SIZE_TO_REDUCE = common::DEFAULT_VECTOR_CAPACITY * 5; + static constexpr uint64_t MIN_LIMIT_RATIO_TO_REDUCE = 2; +}; + } // namespace common } // namespace kuzu diff --git a/src/include/processor/operator/order_by/key_block_merger.h b/src/include/processor/operator/order_by/key_block_merger.h index e6b6787f26..53a2b81d02 100644 --- a/src/include/processor/operator/order_by/key_block_merger.h +++ b/src/include/processor/operator/order_by/key_block_merger.h @@ -63,22 +63,7 @@ class MergedKeyBlocks { }; struct BlockPtrInfo { - inline BlockPtrInfo( - uint64_t startTupleIdx, uint64_t endTupleIdx, std::shared_ptr& keyBlocks) - : keyBlocks{keyBlocks}, curBlockIdx{startTupleIdx / keyBlocks->getNumTuplesPerBlock()}, - endBlockIdx{endTupleIdx == 0 ? 0 : (endTupleIdx - 1) / keyBlocks->getNumTuplesPerBlock()}, - endTupleIdx{endTupleIdx} { - if (startTupleIdx == endTupleIdx) { - curTuplePtr = nullptr; - endTuplePtr = nullptr; - curBlockEndTuplePtr = nullptr; - } else { - curTuplePtr = keyBlocks->getTuple(startTupleIdx); - endTuplePtr = keyBlocks->getBlockEndTuplePtr(endBlockIdx, endTupleIdx, endBlockIdx); - curBlockEndTuplePtr = - keyBlocks->getBlockEndTuplePtr(curBlockIdx, endTupleIdx, endBlockIdx); - } - } + BlockPtrInfo(uint64_t startTupleIdx, uint64_t endTupleIdx, MergedKeyBlocks* keyBlocks); inline bool hasMoreTuplesToRead() const { return curTuplePtr != endTuplePtr; } @@ -90,7 +75,7 @@ struct BlockPtrInfo { void updateTuplePtrIfNecessary(); - std::shared_ptr& keyBlocks; + MergedKeyBlocks* keyBlocks; uint8_t* curTuplePtr; uint64_t curBlockIdx; uint64_t endBlockIdx; @@ -101,9 +86,9 @@ struct BlockPtrInfo { class KeyBlockMerger { public: - explicit KeyBlockMerger(std::vector>& factorizedTables, + explicit KeyBlockMerger(std::vector factorizedTables, std::vector& strKeyColsInfo, uint32_t numBytesPerTuple) - : factorizedTables{factorizedTables}, strKeyColsInfo{strKeyColsInfo}, + : factorizedTables{std::move(factorizedTables)}, strKeyColsInfo{strKeyColsInfo}, numBytesPerTuple{numBytesPerTuple}, numBytesToCompare{numBytesPerTuple - 8}, hasStringCol{!strKeyColsInfo.empty()} {} @@ -123,7 +108,7 @@ class KeyBlockMerger { // FactorizedTables[i] stores all order_by columns encoded and sorted by the ith thread. // MergeSort uses factorizedTable to access the full contents of the string key columns // when resolving ties. - std::vector>& factorizedTables; + std::vector factorizedTables; // We also store the colIdxInFactorizedTable, colOffsetInEncodedKeyBlock, isAscOrder, isStrCol // for each string column. So, we don't need to compute them again during merge sort. std::vector& strKeyColsInfo; @@ -200,15 +185,15 @@ class KeyBlockMergeTaskDispatcher { // This function is used to initialize the columns of keyBlockMergeTaskDispatcher based on // sharedFactorizedTablesAndSortedKeyBlocks. void init(storage::MemoryManager* memoryManager, - std::shared_ptr>> sortedKeyBlocks, - std::vector>& factorizedTables, - std::vector& strKeyColsInfo, uint64_t numBytesPerTuple); + std::queue>* sortedKeyBlocks, + std::vector factorizedTables, std::vector& strKeyColsInfo, + uint64_t numBytesPerTuple); private: std::mutex mtx; storage::MemoryManager* memoryManager; - std::shared_ptr>> sortedKeyBlocks; + std::queue>* sortedKeyBlocks; std::vector> activeKeyBlockMergeTasks; std::unique_ptr keyBlockMerger; }; diff --git a/src/include/processor/operator/order_by/order_by.h b/src/include/processor/operator/order_by/order_by.h index 5017b68dcf..d26aa9038c 100644 --- a/src/include/processor/operator/order_by/order_by.h +++ b/src/include/processor/operator/order_by/order_by.h @@ -1,110 +1,26 @@ #pragma once -#include - #include "common/data_chunk/data_chunk_state.h" #include "common/in_mem_overflow_buffer.h" -#include "processor/operator/order_by/radix_sort.h" #include "processor/operator/sink.h" -#include "processor/result/factorized_table.h" #include "processor/result/result_set.h" +#include "sort_state.h" namespace kuzu { namespace processor { -// This class contains factorizedTables, nextFactorizedTableIdx, strKeyColsInfo, -// sortedKeyBlocks and the size of each tuple in keyBlocks. The class is shared between the -// order_by, orderByMerge, orderByScan operators. All functions are guaranteed to be thread-safe, so -// caller doesn't need to acquire a lock before calling these functions. -class SharedFactorizedTablesAndSortedKeyBlocks { -public: - explicit SharedFactorizedTablesAndSortedKeyBlocks() - : nextFactorizedTableIdx{0}, - sortedKeyBlocks{std::make_shared>>()} {} - - uint8_t getNextFactorizedTableIdx() { - std::unique_lock lck{mtx}; - return nextFactorizedTableIdx++; - } - - void appendFactorizedTable( - uint8_t factorizedTableIdx, std::shared_ptr factorizedTable) { - std::unique_lock lck{mtx}; - // If the factorizedTables is full, resize the factorizedTables and - // insert the factorizedTable to the set. - if (factorizedTableIdx >= factorizedTables.size()) { - factorizedTables.resize(factorizedTableIdx + 1); - } - factorizedTables[factorizedTableIdx] = std::move(factorizedTable); - } - - void appendSortedKeyBlock(std::shared_ptr mergedDataBlocks) { - std::unique_lock lck{mtx}; - sortedKeyBlocks->emplace(mergedDataBlocks); - } - - void setNumBytesPerTuple(uint32_t _numBytesPerTuple) { - assert(numBytesPerTuple == UINT32_MAX); - numBytesPerTuple = _numBytesPerTuple; - } - - void combineFTHasNoNullGuarantee() { - for (auto i = 1u; i < factorizedTables.size(); i++) { - factorizedTables[0]->mergeMayContainNulls(*factorizedTables[i]); - } - } - - void setStrKeyColInfo(std::vector _strKeyColsInfo) { - assert(strKeyColsInfo.empty()); - strKeyColsInfo = std::move(_strKeyColsInfo); - } - -private: - std::mutex mtx; - -public: - std::vector> factorizedTables; - uint8_t nextFactorizedTableIdx; - std::shared_ptr>> sortedKeyBlocks; - - uint32_t numBytesPerTuple = UINT32_MAX; // encoding size - std::vector strKeyColsInfo; -}; - -struct OrderByDataInfo { -public: - OrderByDataInfo(std::vector> keysPosAndType, - std::vector> payloadsPosAndType, - std::vector isPayloadFlat, std::vector isAscOrder, bool mayContainUnflatKey) - : keysPosAndType{std::move(keysPosAndType)}, payloadsPosAndType{std::move( - payloadsPosAndType)}, - isPayloadFlat{std::move(isPayloadFlat)}, isAscOrder{std::move(isAscOrder)}, - mayContainUnflatKey{mayContainUnflatKey} {} - - OrderByDataInfo(const OrderByDataInfo& other) - : OrderByDataInfo{other.keysPosAndType, other.payloadsPosAndType, other.isPayloadFlat, - other.isAscOrder, other.mayContainUnflatKey} {} - -public: - std::vector> keysPosAndType; - std::vector> payloadsPosAndType; - std::vector isPayloadFlat; - std::vector isAscOrder; - // TODO(Ziyi): We should figure out unflat keys in a more general way. - bool mayContainUnflatKey; -}; - class OrderBy : public Sink { public: OrderBy(std::unique_ptr resultSetDescriptor, - const OrderByDataInfo& orderByDataInfo, - std::shared_ptr sharedState, - std::unique_ptr child, uint32_t id, const std::string& paramsString) + const OrderByDataInfo& orderByDataInfo, std::unique_ptr localState, + std::shared_ptr sharedState, std::unique_ptr child, + uint32_t id, const std::string& paramsString) : Sink{std::move(resultSetDescriptor), PhysicalOperatorType::ORDER_BY, std::move(child), id, paramsString}, - orderByDataInfo{orderByDataInfo}, sharedState{std::move(sharedState)} {} + orderByDataInfo{orderByDataInfo}, localState{std::move(localState)}, + sharedState{std::move(sharedState)} {} - void initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) override; + void initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) final; void executeInternal(ExecutionContext* context) override; @@ -117,24 +33,20 @@ class OrderBy : public Sink { } std::unique_ptr clone() override { - return std::make_unique(resultSetDescriptor->copy(), orderByDataInfo, sharedState, - children[0]->clone(), id, paramsString); + return std::make_unique(resultSetDescriptor->copy(), orderByDataInfo, + std::make_unique(), sharedState, children[0]->clone(), id, + paramsString); } private: - std::unique_ptr populateTableSchema(); - - void initGlobalStateInternal(ExecutionContext* context) override; + void initGlobalStateInternal(ExecutionContext* context) final; private: - uint8_t factorizedTableIdx; OrderByDataInfo orderByDataInfo; - std::unique_ptr orderByKeyEncoder; - std::unique_ptr radixSorter; - std::vector keyVectors; - std::vector vectorsToAppend; - std::shared_ptr sharedState; - std::shared_ptr localFactorizedTable; + std::unique_ptr localState; + std::shared_ptr sharedState; + std::vector orderByVectors; + std::vector payloadVectors; }; } // namespace processor diff --git a/src/include/processor/operator/order_by/order_by_key_encoder.h b/src/include/processor/operator/order_by/order_by_key_encoder.h index cd29fcb034..491aa8f36c 100644 --- a/src/include/processor/operator/order_by/order_by_key_encoder.h +++ b/src/include/processor/operator/order_by/order_by_key_encoder.h @@ -28,6 +28,29 @@ namespace processor { #define BSWAP16(x) ((uint16_t)((((uint16_t)(x)&0xff00) >> 8) | (((uint16_t)(x)&0x00ff) << 8))) +struct OrderByDataInfo { +public: + OrderByDataInfo(std::vector> keysPosAndType, + std::vector> payloadsPosAndType, + std::vector isPayloadFlat, std::vector isAscOrder, bool mayContainUnflatKey) + : keysPosAndType{std::move(keysPosAndType)}, payloadsPosAndType{std::move( + payloadsPosAndType)}, + isPayloadFlat{std::move(isPayloadFlat)}, isAscOrder{std::move(isAscOrder)}, + mayContainUnflatKey{mayContainUnflatKey} {} + + OrderByDataInfo(const OrderByDataInfo& other) + : OrderByDataInfo{other.keysPosAndType, other.payloadsPosAndType, other.isPayloadFlat, + other.isAscOrder, other.mayContainUnflatKey} {} + +public: + std::vector> keysPosAndType; + std::vector> payloadsPosAndType; + std::vector isPayloadFlat; + std::vector isAscOrder; + // TODO(Ziyi): We should figure out unflat keys in a more general way. + bool mayContainUnflatKey; +}; + // The OrderByKeyEncoder encodes all columns in the ORDER BY clause into a single binary sequence // that, when compared using memcmp will yield the correct overall sorting order. On little-endian // hardware, the least-significant byte is stored at the smallest address. To encode the sorting @@ -45,9 +68,8 @@ using encode_function_t = std::function; class OrderByKeyEncoder { public: - OrderByKeyEncoder(std::vector& orderByVectors, - std::vector& isAscOrder, storage::MemoryManager* memoryManager, uint8_t ftIdx, - uint32_t numTuplesPerBlockInFT, uint32_t numBytesPerTuple); + OrderByKeyEncoder(const OrderByDataInfo& orderByDataInfo, storage::MemoryManager* memoryManager, + uint8_t ftIdx, uint32_t numTuplesPerBlockInFT, uint32_t numBytesPerTuple); inline std::vector>& getKeyBlocks() { return keyBlocks; } @@ -86,7 +108,9 @@ class OrderByKeyEncoder { static uint32_t getEncodingSize(const common::LogicalType& dataType); - void encodeKeys(); + void encodeKeys(std::vector orderByKeys); + + inline void clear() { keyBlocks.clear(); } private: template @@ -121,7 +145,6 @@ class OrderByKeyEncoder { private: storage::MemoryManager* memoryManager; std::vector> keyBlocks; - std::vector& orderByVectors; std::vector isAscOrder; uint32_t numBytesPerTuple; uint32_t maxNumTuplesPerBlock; diff --git a/src/include/processor/operator/order_by/order_by_merge.h b/src/include/processor/operator/order_by/order_by_merge.h index 0c7ace198c..4d61148268 100644 --- a/src/include/processor/operator/order_by/order_by_merge.h +++ b/src/include/processor/operator/order_by/order_by_merge.h @@ -13,7 +13,7 @@ class OrderByMerge : public Sink { public: // This constructor will only be called by the mapper when constructing the orderByMerge // operator, because the mapper doesn't know the existence of keyBlockMergeTaskDispatcher - OrderByMerge(std::shared_ptr sharedState, + OrderByMerge(std::shared_ptr sharedState, std::shared_ptr sharedDispatcher, std::unique_ptr child, uint32_t id, const std::string& paramsString) : Sink{nullptr /* resultSetDescriptor */, PhysicalOperatorType::ORDER_BY_MERGE, @@ -21,7 +21,7 @@ class OrderByMerge : public Sink { sharedState{std::move(sharedState)}, sharedDispatcher{std::move(sharedDispatcher)} {} // This constructor is used for cloning only. - OrderByMerge(std::shared_ptr sharedState, + OrderByMerge(std::shared_ptr sharedState, std::shared_ptr sharedDispatcher, uint32_t id, const std::string& paramsString) : Sink{nullptr /* resultSetDescriptor */, PhysicalOperatorType::ORDER_BY_MERGE, id, @@ -42,7 +42,7 @@ class OrderByMerge : public Sink { void initGlobalStateInternal(ExecutionContext* context) override; private: - std::shared_ptr sharedState; + std::shared_ptr sharedState; std::unique_ptr localMerger; std::shared_ptr sharedDispatcher; }; diff --git a/src/include/processor/operator/order_by/order_by_scan.h b/src/include/processor/operator/order_by/order_by_scan.h index c8dcf21267..4ccf60ad2e 100644 --- a/src/include/processor/operator/order_by/order_by_scan.h +++ b/src/include/processor/operator/order_by/order_by_scan.h @@ -6,51 +6,49 @@ namespace kuzu { namespace processor { -struct MergedKeyBlockScanState { - bool scanSingleTuple; - uint32_t nextTupleIdxToReadInMergedKeyBlock; - std::shared_ptr mergedKeyBlock; - uint32_t tupleIdxAndFactorizedTableIdxOffset; - std::vector colsToScan; - std::unique_ptr tuplesToRead; - std::unique_ptr blockPtrInfo; +struct OrderByScanLocalState { + std::vector vectorsToRead; + std::unique_ptr payloadScanner; + + void init( + std::vector& outVectorPos, SortSharedState& sharedState, ResultSet& resultSet); + + inline uint64_t scan() { return payloadScanner->scan(vectorsToRead); } }; // To preserve the ordering of tuples, the orderByScan operator will only // be executed in single-thread mode. class OrderByScan : public PhysicalOperator { public: - OrderByScan(std::vector outVectorPos, - std::shared_ptr sharedState, + OrderByScan(std::vector outVectorPos, std::shared_ptr sharedState, std::unique_ptr child, uint32_t id, const std::string& paramsString) : PhysicalOperator{PhysicalOperatorType::ORDER_BY_SCAN, std::move(child), id, paramsString}, - outVectorPos{std::move(outVectorPos)}, sharedState{std::move(sharedState)} {} + outVectorPos{std::move(outVectorPos)}, + localState{std::make_unique()}, sharedState{ + std::move(sharedState)} {} // This constructor is used for cloning only. - OrderByScan(std::vector outVectorPos, - std::shared_ptr sharedState, uint32_t id, - const std::string& paramsString) + OrderByScan(std::vector outVectorPos, std::shared_ptr sharedState, + uint32_t id, const std::string& paramsString) : PhysicalOperator{PhysicalOperatorType::ORDER_BY_SCAN, id, paramsString}, - outVectorPos{std::move(outVectorPos)}, sharedState{std::move(sharedState)} {} + outVectorPos{std::move(outVectorPos)}, + localState{std::make_unique()}, sharedState{ + std::move(sharedState)} {} - inline bool isSource() const override { return true; } + inline bool isSource() const final { return true; } - void initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) override; + bool getNextTuplesInternal(ExecutionContext* context) final; - bool getNextTuplesInternal(ExecutionContext* context) override; + void initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) final; - std::unique_ptr clone() override { + std::unique_ptr clone() final { return std::make_unique(outVectorPos, sharedState, id, paramsString); } -private: - void initMergedKeyBlockScanState(); - private: std::vector outVectorPos; - std::shared_ptr sharedState; - std::vector vectorsToRead; - std::unique_ptr mergedKeyBlockScanState; + std::unique_ptr localState; + std::shared_ptr sharedState; }; } // namespace processor diff --git a/src/include/processor/operator/order_by/sort_state.h b/src/include/processor/operator/order_by/sort_state.h new file mode 100644 index 0000000000..3694d3c94e --- /dev/null +++ b/src/include/processor/operator/order_by/sort_state.h @@ -0,0 +1,95 @@ +#pragma once + +#include + +#include "processor/operator/order_by/radix_sort.h" +#include "processor/result/factorized_table.h" + +namespace kuzu { +namespace processor { + +struct LocalPayloadTableInfo { + uint64_t globalIdx; + FactorizedTable* payloadTable; +}; + +class SortSharedState { +public: + SortSharedState() + : nextFactorizedTableIdx{0}, + sortedKeyBlocks{std::make_shared>>()} {}; + + inline uint64_t getNumBytesPerTuple() const { return numBytesPerTuple; } + + inline std::vector& getStrKeyColInfo() { return strKeyColsInfo; } + + inline std::queue>* getSortedKeyBlocks() { + return sortedKeyBlocks.get(); + } + + void init(const OrderByDataInfo& orderByDataInfo); + + LocalPayloadTableInfo getLocalPayloadTable(storage::MemoryManager& memoryManager); + + void appendLocalSortedKeyBlock(std::shared_ptr mergedDataBlocks); + + void combineFTHasNoNullGuarantee(); + + std::vector getPayloadTables() const; + + inline MergedKeyBlocks* getMergedKeyBlock() const { + return sortedKeyBlocks->empty() ? nullptr : sortedKeyBlocks->front().get(); + } + +private: + void calculatePayloadSchema(const kuzu::processor::OrderByDataInfo& orderByDataInfo); + +private: + std::mutex mtx; + std::vector> payloadTables; + uint8_t nextFactorizedTableIdx; + std::shared_ptr>> sortedKeyBlocks; + uint32_t numBytesPerTuple; + std::vector strKeyColsInfo; + +private: + std::unique_ptr payloadSchema; +}; + +class SortLocalState { +public: + void init(const OrderByDataInfo& orderByDataInfo, SortSharedState& sharedState, + storage::MemoryManager* memoryManager); + + void append(std::vector keyVectors, + std::vector payloadVectors); + + void finalize(SortSharedState& sharedState); + +private: + std::unique_ptr orderByKeyEncoder; + std::unique_ptr radixSorter; + LocalPayloadTableInfo localPayloadTableInfo; +}; + +class PayloadScanner { +public: + PayloadScanner(MergedKeyBlocks* keyBlockToScan, std::vector payloadTables, + uint64_t skipNumber = UINT64_MAX, uint64_t limitNumber = UINT64_MAX); + + uint64_t scan(std::vector vectorsToRead); + +private: + bool scanSingleTuple; + uint32_t payloadIdxOffset; + std::vector colsToScan; + std::unique_ptr tuplesToRead; + std::unique_ptr blockPtrInfo; + MergedKeyBlocks* keyBlockToScan; + uint32_t nextTupleIdxToReadInMergedKeyBlock; + uint64_t endTuplesIdxToReadInMergedKeyBlock; + std::vector payloadTables; +}; + +} // namespace processor +} // namespace kuzu diff --git a/src/include/processor/operator/order_by/top_k.h b/src/include/processor/operator/order_by/top_k.h new file mode 100644 index 0000000000..98263d9b8d --- /dev/null +++ b/src/include/processor/operator/order_by/top_k.h @@ -0,0 +1,198 @@ +#pragma once + +#include + +#include "function/binary_function_executor.h" +#include "processor/operator/sink.h" +#include "sort_state.h" + +namespace kuzu { +namespace processor { + +struct TopKScanState { + // TODO(Xiyang): Move the initialization of payloadScanner to mapper. + inline void init(MergedKeyBlocks* keyBlockToScan, std::vector payloadTables, + uint64_t skipNum, uint64_t limitNum) { + payloadScanner = std::make_unique( + keyBlockToScan, std::move(payloadTables), skipNum, limitNum); + } + + std::unique_ptr payloadScanner; +}; + +class TopKSortState { + +public: + TopKSortState(); + + void init(const OrderByDataInfo& orderByDataInfo, storage::MemoryManager* memoryManager); + + void append(std::vector keyVectors, + std::vector payloadVectors); + + void finalize(); + + inline uint64_t getNumTuples() { return numTuples; } + + inline SortSharedState* getSharedState() { return orderBySharedState.get(); } + + inline void initScan(TopKScanState& scanState, uint64_t skip, uint64_t limit) { + scanState.init(orderBySharedState->getMergedKeyBlock(), + orderBySharedState->getPayloadTables(), skip, limit); + } + +private: + std::unique_ptr orderByLocalState; + std::unique_ptr orderBySharedState; + + uint64_t numTuples; + storage::MemoryManager* memoryManager; +}; + +class TopKBuffer { + using vector_select_comparison_func = + std::function; + +public: + TopKBuffer() { sortState = std::make_unique(); } + + void init(const OrderByDataInfo& orderByDataInfo, storage::MemoryManager* memoryManager, + uint64_t skipNumber, uint64_t limitNumber); + + void append(std::vector keyVectors, + std::vector payloadVectors); + + void reduce(); + + inline void finalize() { sortState->finalize(); } + + void merge(TopKBuffer* other); + + inline void initScan(TopKScanState& scanState) { sortState->initScan(scanState, skip, limit); } + +private: + void initVectors(); + + uint64_t findKeyVectorPosInPayload(const DataPos& keyPos); + + template + void getSelectComparisonFunction( + common::PhysicalTypeID typeID, vector_select_comparison_func& selectFunc); + + void initCompareFuncs(); + + void setBoundaryValue(); + + bool compareBoundaryValue(std::vector& keyVectors); + + bool compareFlatKeys( + common::vector_idx_t vectorIdxToCompare, std::vector keyVectors); + + void compareUnflatKeys( + common::vector_idx_t vectorIdxToCompare, std::vector keyVectors); + + static void appendSelState( + common::SelectionVector* selVector, common::SelectionVector* selVectorToAppend); + +public: + std::unique_ptr sortState; + uint64_t skip; + uint64_t limit; + const OrderByDataInfo* orderByDataInfo; + storage::MemoryManager* memoryManager; + std::vector compareFuncs; + std::vector equalsFuncs; + bool hasBoundaryValue = false; + +private: + // Holds the ownership of all temp vectors. + std::vector> tmpVectors; + std::vector> boundaryVecs; + + std::vector payloadVecsToScan; + std::vector keyVecsToScan; + std::vector lastPayloadVecsToScan; + std::vector lastKeyVecsToScan; +}; + +class TopKLocalState { +public: + TopKLocalState() { buffer = std::make_unique(); } + + void init(const OrderByDataInfo& orderByDataInfo, storage::MemoryManager* memoryManager, + ResultSet& resultSet, uint64_t skipNumber, uint64_t limitNumber); + + void append(); + + inline void finalize() { buffer->finalize(); } + + std::unique_ptr buffer; + +private: + std::vector orderByVectors; + std::vector payloadVectors; +}; + +class TopKSharedState { +public: + TopKSharedState() { buffer = std::make_unique(); } + + void init(const OrderByDataInfo& orderByDataInfo, storage::MemoryManager* memoryManager, + uint64_t skipNumber, uint64_t limitNumber) { + buffer->init(orderByDataInfo, memoryManager, skipNumber, limitNumber); + } + + void mergeLocalState(TopKLocalState* localState) { + std::unique_lock lck{mtx}; + buffer->merge(localState->buffer.get()); + } + + void finalize() { buffer->finalize(); } + + std::unique_ptr buffer; + +private: + std::mutex mtx; +}; + +class TopK : public Sink { +public: + TopK(std::unique_ptr resultSetDescriptor, + std::unique_ptr localState, std::shared_ptr sharedState, + OrderByDataInfo orderByDataInfo, uint64_t skipNumber, uint64_t limitNumber, + std::unique_ptr child, uint32_t id, const std::string& paramsString) + : Sink{std::move(resultSetDescriptor), PhysicalOperatorType::TOP_K, std::move(child), id, + paramsString}, + localState{std::move(localState)}, sharedState{std::move(sharedState)}, + orderByDataInfo{std::move(orderByDataInfo)}, skipNumber{skipNumber}, limitNumber{ + limitNumber} {} + + inline void initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) final { + localState->init( + orderByDataInfo, context->memoryManager, *resultSet, skipNumber, limitNumber); + } + + inline void initGlobalStateInternal(ExecutionContext* context) final { + sharedState->init(orderByDataInfo, context->memoryManager, skipNumber, limitNumber); + } + + void executeInternal(ExecutionContext* context) final; + + void finalize(ExecutionContext* context) final { sharedState->finalize(); } + + std::unique_ptr clone() final { + return std::make_unique(resultSetDescriptor->copy(), + std::make_unique(), sharedState, orderByDataInfo, skipNumber, + limitNumber, children[0]->clone(), id, paramsString); + } + +private: + std::unique_ptr localState; + std::shared_ptr sharedState; + OrderByDataInfo orderByDataInfo; + uint64_t skipNumber; + uint64_t limitNumber; +}; + +} // namespace processor +} // namespace kuzu diff --git a/src/include/processor/operator/order_by/top_k_scanner.h b/src/include/processor/operator/order_by/top_k_scanner.h new file mode 100644 index 0000000000..e6b0840dd1 --- /dev/null +++ b/src/include/processor/operator/order_by/top_k_scanner.h @@ -0,0 +1,44 @@ +#pragma once + +#include "top_k.h" + +namespace kuzu { +namespace processor { + +struct TopKLocalScanState { + std::vector vectorsToScan; + std::unique_ptr scanState; + + void init( + std::vector& outVectorPos, TopKSharedState& sharedState, ResultSet& resultSet); + + inline uint64_t scan() { return scanState->payloadScanner->scan(vectorsToScan); } +}; + +class TopKScan : public PhysicalOperator { +public: + TopKScan(std::vector outVectorPos, std::shared_ptr sharedState, + std::unique_ptr child, uint32_t id, const std::string& paramsString) + : PhysicalOperator{PhysicalOperatorType::TOP_K_SCAN, std::move(child), id, paramsString}, + outVectorPos{std::move(outVectorPos)}, localState{std::make_unique()}, + sharedState{std::move(sharedState)} {} + + inline bool isSource() const final { return true; } + + void initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) final; + + bool getNextTuplesInternal(ExecutionContext* context) final; + + std::unique_ptr clone() final { + return std::make_unique( + outVectorPos, sharedState, children[0]->clone(), id, paramsString); + } + +private: + std::vector outVectorPos; + std::unique_ptr localState; + std::shared_ptr sharedState; +}; + +} // namespace processor +} // namespace kuzu diff --git a/src/include/processor/operator/physical_operator.h b/src/include/processor/operator/physical_operator.h index bffabe9a54..2aa8b572d4 100644 --- a/src/include/processor/operator/physical_operator.h +++ b/src/include/processor/operator/physical_operator.h @@ -58,6 +58,8 @@ enum class PhysicalOperatorType : uint8_t { SET_REL_PROPERTY, SIMPLE_RECURSIVE_JOIN, SKIP, + TOP_K, + TOP_K_SCAN, ORDER_BY, ORDER_BY_MERGE, ORDER_BY_SCAN, diff --git a/src/include/processor/result/result_set.h b/src/include/processor/result/result_set.h index 5761cc824b..9298d07eee 100644 --- a/src/include/processor/result/result_set.h +++ b/src/include/processor/result/result_set.h @@ -22,7 +22,7 @@ class ResultSet { inline std::shared_ptr getDataChunk(data_chunk_pos_t dataChunkPos) { return dataChunks[dataChunkPos]; } - inline std::shared_ptr getValueVector(const DataPos& dataPos) { + inline std::shared_ptr getValueVector(const DataPos& dataPos) const { return dataChunks[dataPos.dataChunkPos]->valueVectors[dataPos.valueVectorPos]; } diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 93301cc9d1..e3b6d9c4e3 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -34,8 +34,8 @@ void Optimizer::optimize(planner::LogicalPlan* plan) { auto hashJoinSIPOptimizer = HashJoinSIPOptimizer(); hashJoinSIPOptimizer.rewrite(plan); - // auto topKOptimizer = TopKOptimizer(); - // topKOptimizer.rewrite(plan); + auto topKOptimizer = TopKOptimizer(); + topKOptimizer.rewrite(plan); auto factorizationRewriter = FactorizationRewriter(); factorizationRewriter.rewrite(plan); diff --git a/src/optimizer/top_k_optimizer.cpp b/src/optimizer/top_k_optimizer.cpp index d4a853e2e2..92c8ff0055 100644 --- a/src/optimizer/top_k_optimizer.cpp +++ b/src/optimizer/top_k_optimizer.cpp @@ -9,7 +9,7 @@ namespace kuzu { namespace optimizer { void TopKOptimizer::rewrite(planner::LogicalPlan* plan) { - visitOperator(plan->getLastOperator()); + plan->setLastOperator(visitOperator(plan->getLastOperator())); } std::shared_ptr TopKOptimizer::visitOperator(std::shared_ptr op) { diff --git a/src/processor/map/map_order_by.cpp b/src/processor/map/map_order_by.cpp index b81ddc63d3..58cc5f04a0 100644 --- a/src/processor/map/map_order_by.cpp +++ b/src/processor/map/map_order_by.cpp @@ -2,6 +2,8 @@ #include "processor/operator/order_by/order_by.h" #include "processor/operator/order_by/order_by_merge.h" #include "processor/operator/order_by/order_by_scan.h" +#include "processor/operator/order_by/top_k.h" +#include "processor/operator/order_by/top_k_scanner.h" #include "processor/plan_mapper.h" using namespace kuzu::common; @@ -12,10 +14,6 @@ namespace processor { std::unique_ptr PlanMapper::mapOrderBy(LogicalOperator* logicalOperator) { auto logicalOrderBy = (LogicalOrderBy*)logicalOperator; - if (logicalOrderBy->isTopK()) { - // TODO(Ziyi): fill - assert(false); - } auto outSchema = logicalOrderBy->getSchema(); auto inSchema = logicalOrderBy->getChild(0)->getSchema(); auto prevOperator = mapOperator(logicalOrderBy->getChild(0).get()); @@ -38,17 +36,28 @@ std::unique_ptr PlanMapper::mapOrderBy(LogicalOperator* logica auto mayContainUnflatKey = inSchema->getNumGroups() == 1; auto orderByDataInfo = OrderByDataInfo(keysPosAndType, payloadsPosAndType, isPayloadFlat, logicalOrderBy->getIsAscOrders(), mayContainUnflatKey); - auto orderBySharedState = std::make_shared(); - auto orderBy = - make_unique(std::make_unique(inSchema), orderByDataInfo, - orderBySharedState, std::move(prevOperator), getOperatorID(), paramsString); - auto dispatcher = std::make_shared(); - auto orderByMerge = make_unique(orderBySharedState, std::move(dispatcher), - std::move(orderBy), getOperatorID(), paramsString); - auto orderByScan = make_unique( - outVectorPos, orderBySharedState, std::move(orderByMerge), getOperatorID(), paramsString); - return orderByScan; + if (logicalOrderBy->isTopK()) { + auto topKSharedState = std::make_shared(); + auto topK = make_unique(std::make_unique(inSchema), + std::make_unique(), topKSharedState, orderByDataInfo, + logicalOrderBy->getSkipNum(), logicalOrderBy->getLimitNum(), std::move(prevOperator), + getOperatorID(), paramsString); + auto topKScan = make_unique( + outVectorPos, topKSharedState, std::move(topK), getOperatorID(), paramsString); + return topKScan; + } else { + auto orderBySharedState = std::make_shared(); + auto orderBy = make_unique(std::make_unique(inSchema), + orderByDataInfo, std::make_unique(), orderBySharedState, + std::move(prevOperator), getOperatorID(), paramsString); + auto dispatcher = std::make_shared(); + auto orderByMerge = make_unique(orderBySharedState, std::move(dispatcher), + std::move(orderBy), getOperatorID(), paramsString); + auto orderByScan = make_unique(outVectorPos, orderBySharedState, + std::move(orderByMerge), getOperatorID(), paramsString); + return orderByScan; + } } } // namespace processor diff --git a/src/processor/operator/order_by/CMakeLists.txt b/src/processor/operator/order_by/CMakeLists.txt index 9acdb5d6bd..c61555999e 100644 --- a/src/processor/operator/order_by/CMakeLists.txt +++ b/src/processor/operator/order_by/CMakeLists.txt @@ -5,7 +5,10 @@ add_library(kuzu_processor_operator_order_by order_by_key_encoder.cpp order_by_merge.cpp order_by_scan.cpp - radix_sort.cpp) + radix_sort.cpp + sort_state.cpp + top_k.cpp + top_k_scanner.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ diff --git a/src/processor/operator/order_by/key_block_merger.cpp b/src/processor/operator/order_by/key_block_merger.cpp index 470e60c2c4..006ee4bd46 100644 --- a/src/processor/operator/order_by/key_block_merger.cpp +++ b/src/processor/operator/order_by/key_block_merger.cpp @@ -38,6 +38,21 @@ uint8_t* MergedKeyBlocks::getBlockEndTuplePtr( getKeyBlockBuffer(blockIdx) + endTupleOffset; } +BlockPtrInfo::BlockPtrInfo(uint64_t startTupleIdx, uint64_t endTupleIdx, MergedKeyBlocks* keyBlocks) + : keyBlocks{keyBlocks}, curBlockIdx{startTupleIdx / keyBlocks->getNumTuplesPerBlock()}, + endBlockIdx{endTupleIdx == 0 ? 0 : (endTupleIdx - 1) / keyBlocks->getNumTuplesPerBlock()}, + endTupleIdx{endTupleIdx} { + if (startTupleIdx == endTupleIdx) { + curTuplePtr = nullptr; + endTuplePtr = nullptr; + curBlockEndTuplePtr = nullptr; + } else { + curTuplePtr = keyBlocks->getTuple(startTupleIdx); + endTuplePtr = keyBlocks->getBlockEndTuplePtr(endBlockIdx, endTupleIdx, endBlockIdx); + curBlockEndTuplePtr = keyBlocks->getBlockEndTuplePtr(curBlockIdx, endTupleIdx, endBlockIdx); + } +} + void BlockPtrInfo::updateTuplePtrIfNecessary() { if (curTuplePtr == curBlockEndTuplePtr) { curBlockIdx++; @@ -126,16 +141,16 @@ void KeyBlockMerger::mergeKeyBlocks(KeyBlockMergeMorsel& keyBlockMergeMorsel) co auto leftBlockPtrInfo = BlockPtrInfo(keyBlockMergeMorsel.leftKeyBlockStartIdx, keyBlockMergeMorsel.leftKeyBlockEndIdx, - keyBlockMergeMorsel.keyBlockMergeTask->leftKeyBlock); + keyBlockMergeMorsel.keyBlockMergeTask->leftKeyBlock.get()); auto rightBlockPtrInfo = BlockPtrInfo(keyBlockMergeMorsel.rightKeyBlockStartIdx, keyBlockMergeMorsel.rightKeyBlockEndIdx, - keyBlockMergeMorsel.keyBlockMergeTask->rightKeyBlock); + keyBlockMergeMorsel.keyBlockMergeTask->rightKeyBlock.get()); auto resultBlockPtrInfo = BlockPtrInfo( keyBlockMergeMorsel.leftKeyBlockStartIdx + keyBlockMergeMorsel.rightKeyBlockStartIdx, keyBlockMergeMorsel.leftKeyBlockEndIdx + keyBlockMergeMorsel.rightKeyBlockEndIdx, - keyBlockMergeMorsel.keyBlockMergeTask->resultKeyBlock); + keyBlockMergeMorsel.keyBlockMergeTask->resultKeyBlock.get()); while (leftBlockPtrInfo.hasMoreTuplesToRead() && rightBlockPtrInfo.hasMoreTuplesToRead()) { uint64_t nextNumBytesToMerge = @@ -296,9 +311,9 @@ void KeyBlockMergeTaskDispatcher::doneMorsel(std::unique_ptr>> sortedKeyBlocks, - std::vector>& factorizedTables, - std::vector& strKeyColsInfo, uint64_t numBytesPerTuple) { + std::queue>* sortedKeyBlocks, + std::vector factorizedTables, std::vector& strKeyColsInfo, + uint64_t numBytesPerTuple) { assert(this->keyBlockMerger == nullptr); this->memoryManager = memoryManager; this->sortedKeyBlocks = sortedKeyBlocks; diff --git a/src/processor/operator/order_by/order_by.cpp b/src/processor/operator/order_by/order_by.cpp index c06e6555e6..573633f748 100644 --- a/src/processor/operator/order_by/order_by.cpp +++ b/src/processor/operator/order_by/order_by.cpp @@ -6,94 +6,27 @@ namespace kuzu { namespace processor { void OrderBy::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) { + localState->init(orderByDataInfo, *sharedState, context->memoryManager); for (auto [dataPos, _] : orderByDataInfo.payloadsPosAndType) { - auto vector = resultSet->getValueVector(dataPos); - vectorsToAppend.push_back(vector.get()); + payloadVectors.push_back(resultSet->getValueVector(dataPos).get()); } - // TODO(Ziyi): this is implemented differently from other sink operators. Normally we append - // local table to global at the end of the execution. But here your encoder seem to need encode - // tableIdx which closely associated with the execution order of thread. We prefer a unified - // design pattern for sink operators. - localFactorizedTable = - make_shared(context->memoryManager, populateTableSchema()); - factorizedTableIdx = sharedState->getNextFactorizedTableIdx(); - sharedState->appendFactorizedTable(factorizedTableIdx, localFactorizedTable); for (auto [dataPos, _] : orderByDataInfo.keysPosAndType) { - keyVectors.push_back(resultSet->getValueVector(dataPos).get()); + orderByVectors.push_back(resultSet->getValueVector(dataPos).get()); } - orderByKeyEncoder = std::make_unique(keyVectors, orderByDataInfo.isAscOrder, - context->memoryManager, factorizedTableIdx, localFactorizedTable->getNumTuplesPerBlock(), - sharedState->numBytesPerTuple); - radixSorter = std::make_unique(context->memoryManager, *localFactorizedTable, - *orderByKeyEncoder, sharedState->strKeyColsInfo); } -std::unique_ptr OrderBy::populateTableSchema() { - std::unique_ptr tableSchema = std::make_unique(); - // The orderByKeyEncoder requires that the orderByKey columns are flat in the - // factorizedTable. If there is only one unflat dataChunk, we need to flatten the payload - // columns in factorizedTable because the payload and key columns are in the same - // dataChunk. - for (auto i = 0u; i < orderByDataInfo.payloadsPosAndType.size(); ++i) { - auto [dataPos, dataType] = orderByDataInfo.payloadsPosAndType[i]; - bool isUnflat = !orderByDataInfo.isPayloadFlat[i] && !orderByDataInfo.mayContainUnflatKey; - tableSchema->appendColumn(std::make_unique(isUnflat, dataPos.dataChunkPos, - isUnflat ? (uint32_t)sizeof(overflow_value_t) : - LogicalTypeUtils::getRowLayoutSize(dataType))); - } - return tableSchema; -} - -void OrderBy::initGlobalStateInternal(kuzu::processor::ExecutionContext* context) { - std::vector strKeyColInfo; - auto encodedKeyBlockColOffset = 0ul; - auto tableSchema = populateTableSchema(); - for (auto i = 0u; i < orderByDataInfo.keysPosAndType.size(); ++i) { - auto [dataPos, dataType] = orderByDataInfo.keysPosAndType[i]; - if (PhysicalTypeID::STRING == dataType.getPhysicalType()) { - // If this is a string column, we need to find the factorizedTable offset for this - // column. - auto factorizedTableColIdx = 0ul; - for (auto j = 0u; j < orderByDataInfo.payloadsPosAndType.size(); j++) { - auto [payloadDataPos, _] = orderByDataInfo.payloadsPosAndType[j]; - if (payloadDataPos == dataPos) { - factorizedTableColIdx = j; - } - } - strKeyColInfo.emplace_back( - StrKeyColInfo(tableSchema->getColOffset(factorizedTableColIdx), - encodedKeyBlockColOffset, orderByDataInfo.isAscOrder[i])); - } - encodedKeyBlockColOffset += OrderByKeyEncoder::getEncodingSize(dataType); - } - sharedState->setStrKeyColInfo(strKeyColInfo); - // TODO(Ziyi): comment about +8 - auto numBytesPerTuple = encodedKeyBlockColOffset + 8; - sharedState->setNumBytesPerTuple(numBytesPerTuple); +void OrderBy::initGlobalStateInternal(ExecutionContext* context) { + sharedState->init(orderByDataInfo); } void OrderBy::executeInternal(ExecutionContext* context) { // Append thread-local tuples. while (children[0]->getNextTuple(context)) { for (auto i = 0u; i < resultSet->multiplicity; i++) { - orderByKeyEncoder->encodeKeys(); - // The orderByKeyEncoder requires that the orderByKey columns are flat in the - // factorizedTable. If there is a single dataChunk (unflat or not), it means all the key - // columns and payload columns are in the same datachunk. Since we need to flatten key - // columns, we flatten all columns in the factorized table. If there are multiple - // datachunks, then the datachunks that the keys belong to are guaranteed by the - // frontend to be flattened (see ProjectionEnumerator), so a column is flat in - // factorized table if and only if its corresponding vector is flat. - localFactorizedTable->append(vectorsToAppend); - } - } - for (auto& keyBlock : orderByKeyEncoder->getKeyBlocks()) { - if (keyBlock->numTuples > 0) { - radixSorter->sortSingleKeyBlock(*keyBlock); - sharedState->appendSortedKeyBlock( - make_shared(orderByKeyEncoder->getNumBytesPerTuple(), keyBlock)); + localState->append(orderByVectors, payloadVectors); } } + localState->finalize(*sharedState); } } // namespace processor diff --git a/src/processor/operator/order_by/order_by_key_encoder.cpp b/src/processor/operator/order_by/order_by_key_encoder.cpp index 8bab23ccd1..6b41ebf864 100644 --- a/src/processor/operator/order_by/order_by_key_encoder.cpp +++ b/src/processor/operator/order_by/order_by_key_encoder.cpp @@ -1,8 +1,7 @@ #include "processor/operator/order_by/order_by_key_encoder.h" -#include - #include +#include #include "common/string_utils.h" @@ -12,10 +11,10 @@ using namespace kuzu::storage; namespace kuzu { namespace processor { -OrderByKeyEncoder::OrderByKeyEncoder(std::vector& orderByVectors, - std::vector& isAscOrder, MemoryManager* memoryManager, uint8_t ftIdx, - uint32_t numTuplesPerBlockInFT, uint32_t numBytesPerTuple) - : memoryManager{memoryManager}, orderByVectors{orderByVectors}, isAscOrder{isAscOrder}, +OrderByKeyEncoder::OrderByKeyEncoder(const OrderByDataInfo& orderByDataInfo, + MemoryManager* memoryManager, uint8_t ftIdx, uint32_t numTuplesPerBlockInFT, + uint32_t numBytesPerTuple) + : memoryManager{memoryManager}, isAscOrder{orderByDataInfo.isAscOrder}, numBytesPerTuple{numBytesPerTuple}, ftIdx{ftIdx}, numTuplesPerBlockInFT{numTuplesPerBlockInFT}, swapBytes{isLittleEndian()} { if (numTuplesPerBlockInFT > MAX_FT_BLOCK_OFFSET) { @@ -30,14 +29,16 @@ OrderByKeyEncoder::OrderByKeyEncoder(std::vector& orderByVectors, "TupleSize({} bytes) is larger than the LARGE_PAGE_SIZE({} bytes)", numBytesPerTuple, BufferPoolConstants::PAGE_256KB_SIZE)); } - encodeFunctions.resize(orderByVectors.size()); - for (auto i = 0u; i < orderByVectors.size(); i++) { - getEncodingFunction(orderByVectors[i]->dataType.getPhysicalType(), encodeFunctions[i]); + encodeFunctions.reserve(orderByDataInfo.keysPosAndType.size()); + for (auto& [_, type] : orderByDataInfo.keysPosAndType) { + encode_function_t encodeFunction; + getEncodingFunction(type.getPhysicalType(), encodeFunction); + encodeFunctions.push_back(std::move(encodeFunction)); } } -void OrderByKeyEncoder::encodeKeys() { - uint32_t numEntries = orderByVectors[0]->state->selVector->selectedSize; +void OrderByKeyEncoder::encodeKeys(std::vector orderByKeys) { + uint32_t numEntries = orderByKeys[0]->state->selVector->selectedSize; uint32_t encodedTuples = 0; while (numEntries > 0) { allocateMemoryIfFull(); @@ -46,10 +47,10 @@ void OrderByKeyEncoder::encodeKeys() { auto tuplePtr = keyBlocks.back()->getData() + keyBlocks.back()->numTuples * numBytesPerTuple; uint32_t tuplePtrOffset = 0; - for (auto keyColIdx = 0u; keyColIdx < orderByVectors.size(); keyColIdx++) { - encodeVector(orderByVectors[keyColIdx], tuplePtr + tuplePtrOffset, encodedTuples, + for (auto keyColIdx = 0u; keyColIdx < orderByKeys.size(); keyColIdx++) { + encodeVector(orderByKeys[keyColIdx], tuplePtr + tuplePtrOffset, encodedTuples, numEntriesToEncode, keyColIdx); - tuplePtrOffset += getEncodingSize(orderByVectors[keyColIdx]->dataType); + tuplePtrOffset += getEncodingSize(orderByKeys[keyColIdx]->dataType); } encodeFTIdx(numEntriesToEncode, tuplePtr + tuplePtrOffset); encodedTuples += numEntriesToEncode; @@ -231,10 +232,8 @@ void OrderByKeyEncoder::getEncodingFunction(PhysicalTypeID physicalType, encode_ func = encodeTemplate; return; } - default: { - throw RuntimeException("Cannot encode data with physical type: " + - PhysicalTypeUtils::physicalTypeToString(physicalType)); - } + default: + throw NotImplementedException{"OrderByKeyEncoder::getEncodingFunction"}; } } diff --git a/src/processor/operator/order_by/order_by_merge.cpp b/src/processor/operator/order_by/order_by_merge.cpp index 813d837ec6..d4682cc015 100644 --- a/src/processor/operator/order_by/order_by_merge.cpp +++ b/src/processor/operator/order_by/order_by_merge.cpp @@ -10,8 +10,8 @@ namespace processor { void OrderByMerge::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) { // OrderByMerge is the only sink operator in a pipeline and only modifies the // sharedState by merging sortedKeyBlocks, So we don't need to initialize the resultSet. - localMerger = make_unique( - sharedState->factorizedTables, sharedState->strKeyColsInfo, sharedState->numBytesPerTuple); + localMerger = make_unique(sharedState->getPayloadTables(), + sharedState->getStrKeyColInfo(), sharedState->getNumBytesPerTuple()); } void OrderByMerge::executeInternal(ExecutionContext* context) { @@ -29,8 +29,9 @@ void OrderByMerge::executeInternal(ExecutionContext* context) { void OrderByMerge::initGlobalStateInternal(ExecutionContext* context) { // TODO(Ziyi): directly feed sharedState to merger and dispatcher. - sharedDispatcher->init(context->memoryManager, sharedState->sortedKeyBlocks, - sharedState->factorizedTables, sharedState->strKeyColsInfo, sharedState->numBytesPerTuple); + sharedDispatcher->init(context->memoryManager, sharedState->getSortedKeyBlocks(), + sharedState->getPayloadTables(), sharedState->getStrKeyColInfo(), + sharedState->getNumBytesPerTuple()); } } // namespace processor diff --git a/src/processor/operator/order_by/order_by_scan.cpp b/src/processor/operator/order_by/order_by_scan.cpp index 5aace112ee..63b6f9c9d8 100644 --- a/src/processor/operator/order_by/order_by_scan.cpp +++ b/src/processor/operator/order_by/order_by_scan.cpp @@ -5,98 +5,24 @@ using namespace kuzu::common; namespace kuzu { namespace processor { -void OrderByScan::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) { - for (auto dataPos : outVectorPos) { - auto valueVector = resultSet->getValueVector(dataPos); - vectorsToRead.push_back(valueVector.get()); +void OrderByScanLocalState::init( + std::vector& outVectorPos, SortSharedState& sharedState, ResultSet& resultSet) { + for (auto& dataPos : outVectorPos) { + vectorsToRead.push_back(resultSet.getValueVector(dataPos).get()); } - initMergedKeyBlockScanState(); + payloadScanner = std::make_unique( + sharedState.getMergedKeyBlock(), sharedState.getPayloadTables()); } -bool OrderByScan::getNextTuplesInternal(ExecutionContext* context) { - // If there is no more tuples to read, just return false. - if (mergedKeyBlockScanState == nullptr || - mergedKeyBlockScanState->nextTupleIdxToReadInMergedKeyBlock >= - mergedKeyBlockScanState->mergedKeyBlock->getNumTuples()) { - return false; - } else { - // If there is an unflat col in factorizedTable, we can only read one - // tuple at a time. Otherwise, we can read min(DEFAULT_VECTOR_CAPACITY, - // numTuplesRemainingInMemBlock) tuples. - if (mergedKeyBlockScanState->scanSingleTuple) { - auto tupleInfoBuffer = mergedKeyBlockScanState->blockPtrInfo->curTuplePtr + - mergedKeyBlockScanState->tupleIdxAndFactorizedTableIdxOffset; - auto blockIdx = OrderByKeyEncoder::getEncodedFTBlockIdx(tupleInfoBuffer); - auto blockOffset = OrderByKeyEncoder::getEncodedFTBlockOffset(tupleInfoBuffer); - auto ft = - sharedState->factorizedTables[OrderByKeyEncoder::getEncodedFTIdx(tupleInfoBuffer)]; - ft->scan(vectorsToRead, blockIdx * ft->getNumTuplesPerBlock() + blockOffset, - 1 /* numTuples */); - mergedKeyBlockScanState->blockPtrInfo->curTuplePtr += - mergedKeyBlockScanState->mergedKeyBlock->getNumBytesPerTuple(); - mergedKeyBlockScanState->blockPtrInfo->updateTuplePtrIfNecessary(); - mergedKeyBlockScanState->nextTupleIdxToReadInMergedKeyBlock++; - metrics->numOutputTuple.increase(1); - } else { - auto numTuplesToRead = std::min(DEFAULT_VECTOR_CAPACITY, - mergedKeyBlockScanState->mergedKeyBlock->getNumTuples() - - mergedKeyBlockScanState->nextTupleIdxToReadInMergedKeyBlock); - auto numTuplesRead = 0; - while (numTuplesRead < numTuplesToRead) { - auto numTuplesToReadInCurBlock = std::min(numTuplesToRead - numTuplesRead, - mergedKeyBlockScanState->blockPtrInfo->getNumTuplesLeftInCurBlock()); - - for (auto i = 0u; i < numTuplesToReadInCurBlock; i++) { - auto tupleInfoBuffer = - mergedKeyBlockScanState->blockPtrInfo->curTuplePtr + - mergedKeyBlockScanState->tupleIdxAndFactorizedTableIdxOffset; - auto blockIdx = OrderByKeyEncoder::getEncodedFTBlockIdx(tupleInfoBuffer); - auto blockOffset = OrderByKeyEncoder::getEncodedFTBlockOffset(tupleInfoBuffer); - auto ft = - sharedState - ->factorizedTables[OrderByKeyEncoder::getEncodedFTIdx(tupleInfoBuffer)]; - mergedKeyBlockScanState->tuplesToRead[numTuplesRead + i] = - ft->getTuple(blockIdx * ft->getNumTuplesPerBlock() + blockOffset); - mergedKeyBlockScanState->blockPtrInfo->curTuplePtr += - mergedKeyBlockScanState->mergedKeyBlock->getNumBytesPerTuple(); - } - mergedKeyBlockScanState->blockPtrInfo->updateTuplePtrIfNecessary(); - numTuplesRead += numTuplesToReadInCurBlock; - } - // TODO(Ziyi): This is a hacky way of using factorizedTable::lookup function, - // since the tuples in tuplesToRead may not belong to factorizedTable0. The - // lookup function doesn't perform a check on whether it holds all the tuples in - // tuplesToRead. We should optimize this lookup function in the orderByScan - // optimization PR. - sharedState->factorizedTables[0]->lookup(vectorsToRead, - mergedKeyBlockScanState->colsToScan, mergedKeyBlockScanState->tuplesToRead.get(), 0, - numTuplesToRead); - metrics->numOutputTuple.increase(numTuplesToRead); - mergedKeyBlockScanState->nextTupleIdxToReadInMergedKeyBlock += numTuplesToRead; - } - return true; - } +void OrderByScan::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) { + localState->init(outVectorPos, *sharedState, *resultSet); } -void OrderByScan::initMergedKeyBlockScanState() { - if (sharedState->sortedKeyBlocks->empty()) { - return; - } - mergedKeyBlockScanState = std::make_unique(); - mergedKeyBlockScanState->nextTupleIdxToReadInMergedKeyBlock = 0; - mergedKeyBlockScanState->mergedKeyBlock = sharedState->sortedKeyBlocks->front(); - mergedKeyBlockScanState->tupleIdxAndFactorizedTableIdxOffset = - mergedKeyBlockScanState->mergedKeyBlock->getNumBytesPerTuple() - 8; - mergedKeyBlockScanState->colsToScan = std::vector(vectorsToRead.size()); - iota(mergedKeyBlockScanState->colsToScan.begin(), mergedKeyBlockScanState->colsToScan.end(), 0); - mergedKeyBlockScanState->scanSingleTuple = sharedState->factorizedTables[0]->hasUnflatCol(); - if (!mergedKeyBlockScanState->scanSingleTuple) { - mergedKeyBlockScanState->tuplesToRead = - std::make_unique(DEFAULT_VECTOR_CAPACITY); - } - mergedKeyBlockScanState->blockPtrInfo = make_unique(0 /* startTupleIdx */, - mergedKeyBlockScanState->mergedKeyBlock->getNumTuples(), - mergedKeyBlockScanState->mergedKeyBlock); +bool OrderByScan::getNextTuplesInternal(ExecutionContext* context) { + // If there is no more tuples to read, just return false. + auto numTuplesRead = localState->scan(); + metrics->numOutputTuple.increase(numTuplesRead); + return numTuplesRead != 0; } } // namespace processor diff --git a/src/processor/operator/order_by/sort_state.cpp b/src/processor/operator/order_by/sort_state.cpp new file mode 100644 index 0000000000..5e1999cae9 --- /dev/null +++ b/src/processor/operator/order_by/sort_state.cpp @@ -0,0 +1,178 @@ +#include "processor/operator/order_by/sort_state.h" + +using namespace kuzu::common; + +namespace kuzu { +namespace processor { + +void SortSharedState::init(const OrderByDataInfo& orderByDataInfo) { + calculatePayloadSchema(orderByDataInfo); + auto encodedKeyBlockColOffset = 0ul; + for (auto i = 0u; i < orderByDataInfo.keysPosAndType.size(); ++i) { + auto& [dataPos, dataType] = orderByDataInfo.keysPosAndType[i]; + if (PhysicalTypeID::STRING == dataType.getPhysicalType()) { + // If this is a string column, we need to find the factorizedTable offset for this + // column. + auto ftColIdx = 0ul; + for (auto j = 0u; j < orderByDataInfo.payloadsPosAndType.size(); j++) { + auto [payloadDataPos, _] = orderByDataInfo.payloadsPosAndType[j]; + if (payloadDataPos == dataPos) { + ftColIdx = j; + } + } + strKeyColsInfo.emplace_back(payloadSchema->getColOffset(ftColIdx), + encodedKeyBlockColOffset, orderByDataInfo.isAscOrder[i]); + } + encodedKeyBlockColOffset += OrderByKeyEncoder::getEncodingSize(dataType); + } + numBytesPerTuple = encodedKeyBlockColOffset + OrderByConstants::NUM_BYTES_FOR_PAYLOAD_IDX; +} + +LocalPayloadTableInfo SortSharedState::getLocalPayloadTable(storage::MemoryManager& memoryManager) { + std::unique_lock lck{mtx}; + auto payloadTable = std::make_unique(&memoryManager, payloadSchema->copy()); + auto payloadTableInfo = LocalPayloadTableInfo{nextFactorizedTableIdx++, payloadTable.get()}; + payloadTables.push_back(std::move(payloadTable)); + return payloadTableInfo; +} + +void SortSharedState::appendLocalSortedKeyBlock(std::shared_ptr mergedDataBlocks) { + std::unique_lock lck{mtx}; + sortedKeyBlocks->emplace(mergedDataBlocks); +} + +void SortSharedState::combineFTHasNoNullGuarantee() { + for (auto i = 1u; i < payloadTables.size(); i++) { + payloadTables[0]->mergeMayContainNulls(*payloadTables[i]); + } +} + +std::vector SortSharedState::getPayloadTables() const { + std::vector payloadTablesToReturn; + payloadTablesToReturn.reserve(payloadTables.size()); + for (auto& payloadTable : payloadTables) { + payloadTablesToReturn.push_back(payloadTable.get()); + } + return payloadTablesToReturn; +} + +void SortSharedState::calculatePayloadSchema( + const kuzu::processor::OrderByDataInfo& orderByDataInfo) { + // The orderByKeyEncoder requires that the orderByKey columns are flat in the + // factorizedTable. If there is only one unflat dataChunk, we need to flatten the payload + // columns in factorizedTable because the payload and key columns are in the same + // dataChunk. + payloadSchema = std::make_unique(); + for (auto i = 0u; i < orderByDataInfo.payloadsPosAndType.size(); ++i) { + auto [dataPos, dataType] = orderByDataInfo.payloadsPosAndType[i]; + bool isUnflat = !orderByDataInfo.isPayloadFlat[i] && !orderByDataInfo.mayContainUnflatKey; + payloadSchema->appendColumn(std::make_unique(isUnflat, dataPos.dataChunkPos, + isUnflat ? (uint32_t)sizeof(overflow_value_t) : + LogicalTypeUtils::getRowLayoutSize(dataType))); + } +} + +void SortLocalState::init(const OrderByDataInfo& orderByDataInfo, SortSharedState& sharedState, + storage::MemoryManager* memoryManager) { + localPayloadTableInfo = sharedState.getLocalPayloadTable(*memoryManager); + orderByKeyEncoder = std::make_unique(orderByDataInfo, memoryManager, + localPayloadTableInfo.globalIdx, localPayloadTableInfo.payloadTable->getNumTuplesPerBlock(), + sharedState.getNumBytesPerTuple()); + radixSorter = std::make_unique(memoryManager, *localPayloadTableInfo.payloadTable, + *orderByKeyEncoder, sharedState.getStrKeyColInfo()); +} + +void SortLocalState::append(std::vector keyVectors, + std::vector payloadVectors) { + orderByKeyEncoder->encodeKeys(std::move(keyVectors)); + localPayloadTableInfo.payloadTable->append(std::move(payloadVectors)); +} + +void SortLocalState::finalize(kuzu::processor::SortSharedState& sharedState) { + for (auto& keyBlock : orderByKeyEncoder->getKeyBlocks()) { + if (keyBlock->numTuples > 0) { + radixSorter->sortSingleKeyBlock(*keyBlock); + sharedState.appendLocalSortedKeyBlock( + make_shared(orderByKeyEncoder->getNumBytesPerTuple(), keyBlock)); + } + } + orderByKeyEncoder->clear(); +} + +PayloadScanner::PayloadScanner(MergedKeyBlocks* keyBlockToScan, + std::vector payloadTables, uint64_t skipNumber, uint64_t limitNumber) + : keyBlockToScan{std::move(keyBlockToScan)}, payloadTables{std::move(payloadTables)} { + if (this->keyBlockToScan == nullptr || this->keyBlockToScan->getNumTuples() == 0) { + nextTupleIdxToReadInMergedKeyBlock = 0; + endTuplesIdxToReadInMergedKeyBlock = 0; + return; + } + payloadIdxOffset = + this->keyBlockToScan->getNumBytesPerTuple() - OrderByConstants::NUM_BYTES_FOR_PAYLOAD_IDX; + colsToScan = std::vector(this->payloadTables[0]->getTableSchema()->getNumColumns()); + iota(colsToScan.begin(), colsToScan.end(), 0); + scanSingleTuple = this->payloadTables[0]->hasUnflatCol(); + if (!scanSingleTuple) { + tuplesToRead = std::make_unique(DEFAULT_VECTOR_CAPACITY); + } + nextTupleIdxToReadInMergedKeyBlock = skipNumber == UINT64_MAX ? 0 : skipNumber; + endTuplesIdxToReadInMergedKeyBlock = + limitNumber == UINT64_MAX ? this->keyBlockToScan->getNumTuples() : + std::min(nextTupleIdxToReadInMergedKeyBlock + limitNumber, + this->keyBlockToScan->getNumTuples()); + blockPtrInfo = std::make_unique(nextTupleIdxToReadInMergedKeyBlock, + endTuplesIdxToReadInMergedKeyBlock, this->keyBlockToScan); +} + +uint64_t PayloadScanner::scan(std::vector vectorsToRead) { + if (nextTupleIdxToReadInMergedKeyBlock >= endTuplesIdxToReadInMergedKeyBlock) { + return 0; + } else { + // If there is an unflat col in factorizedTable, we can only read one + // tuple at a time. Otherwise, we can read min(DEFAULT_VECTOR_CAPACITY, + // numTuplesRemainingInMemBlock) tuples. + if (scanSingleTuple) { + auto payloadInfo = blockPtrInfo->curTuplePtr + payloadIdxOffset; + auto blockIdx = OrderByKeyEncoder::getEncodedFTBlockIdx(payloadInfo); + auto blockOffset = OrderByKeyEncoder::getEncodedFTBlockOffset(payloadInfo); + auto payloadTable = payloadTables[OrderByKeyEncoder::getEncodedFTIdx(payloadInfo)]; + payloadTable->scan(vectorsToRead, + blockIdx * payloadTable->getNumTuplesPerBlock() + blockOffset, 1 /* numTuples */); + blockPtrInfo->curTuplePtr += keyBlockToScan->getNumBytesPerTuple(); + blockPtrInfo->updateTuplePtrIfNecessary(); + nextTupleIdxToReadInMergedKeyBlock++; + return 1; + } else { + auto numTuplesToRead = std::min(DEFAULT_VECTOR_CAPACITY, + endTuplesIdxToReadInMergedKeyBlock - nextTupleIdxToReadInMergedKeyBlock); + auto numTuplesRead = 0; + while (numTuplesRead < numTuplesToRead) { + auto numTuplesToReadInCurBlock = std::min( + numTuplesToRead - numTuplesRead, blockPtrInfo->getNumTuplesLeftInCurBlock()); + for (auto i = 0u; i < numTuplesToReadInCurBlock; i++) { + auto payloadInfo = blockPtrInfo->curTuplePtr + payloadIdxOffset; + auto blockIdx = OrderByKeyEncoder::getEncodedFTBlockIdx(payloadInfo); + auto blockOffset = OrderByKeyEncoder::getEncodedFTBlockOffset(payloadInfo); + auto ft = payloadTables[OrderByKeyEncoder::getEncodedFTIdx(payloadInfo)]; + tuplesToRead[numTuplesRead + i] = + ft->getTuple(blockIdx * ft->getNumTuplesPerBlock() + blockOffset); + blockPtrInfo->curTuplePtr += keyBlockToScan->getNumBytesPerTuple(); + } + blockPtrInfo->updateTuplePtrIfNecessary(); + numTuplesRead += numTuplesToReadInCurBlock; + } + // TODO(Ziyi): This is a hacky way of using factorizedTable::lookup function, + // since the tuples in tuplesToRead may not belong to factorizedTable0. The + // lookup function doesn't perform a check on whether it holds all the tuples in + // tuplesToRead. We should optimize this lookup function in the orderByScan + // optimization PR. + payloadTables[0]->lookup( + vectorsToRead, colsToScan, tuplesToRead.get(), 0, numTuplesToRead); + nextTupleIdxToReadInMergedKeyBlock += numTuplesToRead; + return numTuplesRead; + } + } +} + +} // namespace processor +} // namespace kuzu diff --git a/src/processor/operator/order_by/top_k.cpp b/src/processor/operator/order_by/top_k.cpp new file mode 100644 index 0000000000..756406cab2 --- /dev/null +++ b/src/processor/operator/order_by/top_k.cpp @@ -0,0 +1,301 @@ +#include "processor/operator/order_by/top_k.h" + +using namespace kuzu::common; + +namespace kuzu { +namespace processor { + +TopKSortState::TopKSortState() { + orderByLocalState = std::make_unique(); + orderBySharedState = std::make_unique(); +} + +void TopKSortState::init( + const OrderByDataInfo& orderByDataInfo, storage::MemoryManager* memoryManager) { + this->memoryManager = memoryManager; + orderBySharedState->init(orderByDataInfo); + orderByLocalState->init(orderByDataInfo, *orderBySharedState, memoryManager); + numTuples = 0; +} + +void TopKSortState::append(std::vector keyVectors, + std::vector payloadVectors) { + numTuples += keyVectors[0]->state->selVector->selectedSize; + orderByLocalState->append(std::move(keyVectors), std::move(payloadVectors)); +} + +void TopKSortState::finalize() { + orderByLocalState->finalize(*orderBySharedState); + auto merger = std::make_unique(orderBySharedState->getPayloadTables(), + orderBySharedState->getStrKeyColInfo(), orderBySharedState->getNumBytesPerTuple()); + auto dispatcher = std::make_unique(); + dispatcher->init(memoryManager, orderBySharedState->getSortedKeyBlocks(), + orderBySharedState->getPayloadTables(), orderBySharedState->getStrKeyColInfo(), + orderBySharedState->getNumBytesPerTuple()); + while (!dispatcher->isDoneMerge()) { + auto keyBlockMergeMorsel = dispatcher->getMorsel(); + merger->mergeKeyBlocks(*keyBlockMergeMorsel); + dispatcher->doneMorsel(std::move(keyBlockMergeMorsel)); + } +} + +void TopKBuffer::init(const kuzu::processor::OrderByDataInfo& orderByDataInfo, + storage::MemoryManager* memoryManager, uint64_t skipNumber, uint64_t limitNumber) { + this->orderByDataInfo = &orderByDataInfo; + this->memoryManager = memoryManager; + sortState->init(orderByDataInfo, memoryManager); + this->skip = skipNumber; + this->limit = limitNumber; + initVectors(); + initCompareFuncs(); +} + +void TopKBuffer::append(std::vector keyVectors, + std::vector payloadVectors) { + auto originalSelState = keyVectors[0]->state->selVector; + if (hasBoundaryValue && !compareBoundaryValue(keyVectors)) { + keyVectors[0]->state->selVector = std::move(originalSelState); + return; + } + sortState->append(keyVectors, payloadVectors); + keyVectors[0]->state->selVector = std::move(originalSelState); +} + +void TopKBuffer::reduce() { + auto reduceThreshold = std::max(OrderByConstants::MIN_SIZE_TO_REDUCE, + OrderByConstants::MIN_LIMIT_RATIO_TO_REDUCE * (limit + skip)); + if (sortState->getNumTuples() < reduceThreshold) { + return; + } + sortState->finalize(); + auto newSortState = std::make_unique(); + newSortState->init(*orderByDataInfo, memoryManager); + TopKScanState scanState; + sortState->initScan(scanState, 0, skip + limit); + while (true) { + auto numTuplesScanned = scanState.payloadScanner->scan(payloadVecsToScan); + if (numTuplesScanned == 0) { + setBoundaryValue(); + break; + } + newSortState->append(keyVecsToScan, payloadVecsToScan); + std::swap(payloadVecsToScan, lastPayloadVecsToScan); + std::swap(keyVecsToScan, lastKeyVecsToScan); + } + sortState = std::move(newSortState); +} + +void TopKBuffer::merge(TopKBuffer* other) { + other->finalize(); + if (other->sortState->getSharedState()->getSortedKeyBlocks()->empty()) { + return; + } + TopKScanState scanState; + other->sortState->initScan(scanState, 0, skip + limit); + while (scanState.payloadScanner->scan(payloadVecsToScan) > 0) { + sortState->append(keyVecsToScan, payloadVecsToScan); + } + reduce(); +} + +void TopKBuffer::initVectors() { + auto payloadState = std::make_shared(); + auto lastPayloadState = std::make_shared(); + for (auto& [pos, type] : orderByDataInfo->payloadsPosAndType) { + auto payloadVec = std::make_unique(type, memoryManager); + auto lastPayloadVec = std::make_unique(type, memoryManager); + payloadVec->setState(payloadState); + lastPayloadVec->setState(lastPayloadState); + payloadVecsToScan.push_back(payloadVec.get()); + lastPayloadVecsToScan.push_back(lastPayloadVec.get()); + tmpVectors.push_back(std::move(payloadVec)); + tmpVectors.push_back(std::move(lastPayloadVec)); + } + auto boundaryState = common::DataChunkState::getSingleValueDataChunkState(); + for (auto& [pos, type] : orderByDataInfo->keysPosAndType) { + auto boundaryVec = std::make_unique(type, memoryManager); + boundaryVec->setState(boundaryState); + boundaryVecs.push_back(std::move(boundaryVec)); + auto posInPayload = findKeyVectorPosInPayload(pos); + if (posInPayload == UINT64_MAX) { + // If the key is not present in the payload, create a new vector. + auto keyVec = std::make_unique(type, memoryManager); + auto lastKeyVec = std::make_unique(type, memoryManager); + keyVecsToScan.push_back(keyVec.get()); + lastKeyVecsToScan.push_back(lastKeyVec.get()); + tmpVectors.push_back(std::move(keyVec)); + tmpVectors.push_back(std::move(lastKeyVec)); + } else { + // Otherwise grab the vector from the payload. + keyVecsToScan.push_back(payloadVecsToScan[posInPayload]); + lastKeyVecsToScan.push_back(lastPayloadVecsToScan[posInPayload]); + } + } +} + +uint64_t TopKBuffer::findKeyVectorPosInPayload(const DataPos& keyPos) { + // TODO(Xiyang): this information should be passed by front end. (e.g. The key vector pos in the + // payload vector) + for (auto i = 0u; i < orderByDataInfo->payloadsPosAndType.size(); i++) { + if (keyPos == orderByDataInfo->payloadsPosAndType[i].first) { + return i; + } + } + return UINT64_MAX; +} + +template +void TopKBuffer::getSelectComparisonFunction( + common::PhysicalTypeID typeID, vector_select_comparison_func& selectFunc) { + switch (typeID) { + case common::PhysicalTypeID::INT64: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::INT32: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::INT16: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::DOUBLE: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::FLOAT: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::BOOL: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::STRING: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + case common::PhysicalTypeID::INTERVAL: { + selectFunc = function::BinaryFunctionExecutor::selectComparison; + } break; + default: + throw common::NotImplementedException{"TopKBuffer::getSelectComparisonFunction"}; + } +} + +void TopKBuffer::initCompareFuncs() { + compareFuncs.reserve(orderByDataInfo->isAscOrder.size()); + equalsFuncs.reserve(orderByDataInfo->isAscOrder.size()); + vector_select_comparison_func compareFunc; + vector_select_comparison_func equalsFunc; + for (auto i = 0u; i < orderByDataInfo->isAscOrder.size(); i++) { + auto physicalType = orderByDataInfo->keysPosAndType[i].second.getPhysicalType(); + if (orderByDataInfo->isAscOrder[i]) { + getSelectComparisonFunction(physicalType, compareFunc); + } else { + getSelectComparisonFunction(physicalType, compareFunc); + } + getSelectComparisonFunction(physicalType, equalsFunc); + compareFuncs.push_back(compareFunc); + equalsFuncs.push_back(equalsFunc); + } +} + +void TopKBuffer::setBoundaryValue() { + for (auto i = 0u; i < boundaryVecs.size(); i++) { + auto boundaryVec = boundaryVecs[i].get(); + auto dstData = + boundaryVec->getData() + boundaryVec->getNumBytesPerValue() * + boundaryVec->state->selVector->selectedPositions[0]; + auto srcVector = lastKeyVecsToScan[i]; + auto srcData = srcVector->getData() + + srcVector->getNumBytesPerValue() * + srcVector->state->selVector + ->selectedPositions[srcVector->state->selVector->selectedSize - 1]; + boundaryVec->copyFromVectorData(dstData, srcVector, srcData); + hasBoundaryValue = true; + } +} + +bool TopKBuffer::compareBoundaryValue(std::vector& keyVectors) { + if (keyVectors[0]->state->isFlat()) { + return compareFlatKeys(0 /* startKeyVectorIdxToCompare */, keyVectors); + } else { + compareUnflatKeys(0 /* startKeyVectorIdxToCompare */, keyVectors); + return keyVectors[0]->state->selVector->selectedSize > 0; + } +} + +bool TopKBuffer::compareFlatKeys( + vector_idx_t vectorIdxToCompare, std::vector keyVectors) { + std::shared_ptr selVector = + std::make_shared(common::DEFAULT_VECTOR_CAPACITY); + selVector->resetSelectorToValuePosBuffer(); + auto compareResult = compareFuncs[vectorIdxToCompare]( + *keyVectors[vectorIdxToCompare], *boundaryVecs[vectorIdxToCompare], *selVector); + if (vectorIdxToCompare == keyVectors.size() - 1) { + return compareResult; + } else if (equalsFuncs[vectorIdxToCompare](*keyVectors[vectorIdxToCompare], + *boundaryVecs[vectorIdxToCompare], *selVector)) { + return compareFlatKeys(vectorIdxToCompare + 1, keyVectors); + } else { + return false; + } +} + +void TopKBuffer::compareUnflatKeys( + vector_idx_t vectorIdxToCompare, std::vector keyVectors) { + auto compareSelVector = + std::make_shared(common::DEFAULT_VECTOR_CAPACITY); + compareSelVector->resetSelectorToValuePosBuffer(); + compareFuncs[vectorIdxToCompare]( + *keyVectors[vectorIdxToCompare], *boundaryVecs[vectorIdxToCompare], *compareSelVector); + if (vectorIdxToCompare != keyVectors.size() - 1) { + auto equalsSelVector = + std::make_shared(common::DEFAULT_VECTOR_CAPACITY); + equalsSelVector->resetSelectorToValuePosBuffer(); + if (equalsFuncs[vectorIdxToCompare](*keyVectors[vectorIdxToCompare], + *boundaryVecs[vectorIdxToCompare], *equalsSelVector)) { + keyVectors[vectorIdxToCompare]->state->selVector = equalsSelVector; + compareUnflatKeys(vectorIdxToCompare + 1, keyVectors); + appendSelState(compareSelVector.get(), equalsSelVector.get()); + } + } + keyVectors[vectorIdxToCompare]->state->selVector = std::move(compareSelVector); +} + +void TopKBuffer::appendSelState( + common::SelectionVector* selVector, common::SelectionVector* selVectorToAppend) { + for (auto i = 0u; i < selVectorToAppend->selectedSize; i++) { + selVector->selectedPositions[selVector->selectedSize + i] = + selVectorToAppend->selectedPositions[i]; + } + selVector->selectedSize += selVectorToAppend->selectedSize; +} + +void TopKLocalState::init(const OrderByDataInfo& orderByDataInfo, + storage::MemoryManager* memoryManager, ResultSet& resultSet, uint64_t skipNumber, + uint64_t limitNumber) { + buffer->init(orderByDataInfo, memoryManager, skipNumber, limitNumber); + for (auto [dataPos, _] : orderByDataInfo.payloadsPosAndType) { + payloadVectors.push_back(resultSet.getValueVector(dataPos).get()); + } + for (auto [dataPos, _] : orderByDataInfo.keysPosAndType) { + orderByVectors.push_back(resultSet.getValueVector(dataPos).get()); + } +} + +void TopKLocalState::append() { + buffer->append(orderByVectors, payloadVectors); + buffer->reduce(); +} + +void TopK::executeInternal(ExecutionContext* context) { + // Append thread-local tuples. + while (children[0]->getNextTuple(context)) { + for (auto i = 0u; i < resultSet->multiplicity; i++) { + localState->append(); + } + } + localState->finalize(); + sharedState->mergeLocalState(localState.get()); +} + +} // namespace processor +} // namespace kuzu diff --git a/src/processor/operator/order_by/top_k_scanner.cpp b/src/processor/operator/order_by/top_k_scanner.cpp new file mode 100644 index 0000000000..1d036daa80 --- /dev/null +++ b/src/processor/operator/order_by/top_k_scanner.cpp @@ -0,0 +1,27 @@ +#include "processor/operator/order_by/top_k_scanner.h" + +namespace kuzu { +namespace processor { + +void TopKLocalScanState::init( + std::vector& outVectorPos, TopKSharedState& sharedState, ResultSet& resultSet) { + scanState = std::make_unique(); + sharedState.buffer->initScan(*scanState); + for (auto& pos : outVectorPos) { + vectorsToScan.push_back(resultSet.getValueVector(pos).get()); + } +} + +void TopKScan::initLocalStateInternal( + kuzu::processor::ResultSet* resultSet, kuzu::processor::ExecutionContext* context) { + localState->init(outVectorPos, *sharedState, *resultSet); +} + +bool TopKScan::getNextTuplesInternal(ExecutionContext* context) { + auto numTuplesRead = localState->scan(); + metrics->numOutputTuple.increase(numTuplesRead); + return numTuplesRead != 0; +} + +} // namespace processor +} // namespace kuzu diff --git a/src/processor/operator/physical_operator.cpp b/src/processor/operator/physical_operator.cpp index 60601ab416..4ede5c121a 100644 --- a/src/processor/operator/physical_operator.cpp +++ b/src/processor/operator/physical_operator.cpp @@ -152,6 +152,12 @@ std::string PhysicalOperatorUtils::operatorTypeToString(PhysicalOperatorType ope case PhysicalOperatorType::SKIP: { return "SKIP"; } + case PhysicalOperatorType::TOP_K: { + return "TOP_K"; + } + case PhysicalOperatorType::TOP_K_SCAN: { + return "TOP_K_SCAN"; + } case PhysicalOperatorType::ORDER_BY: { return "ORDER_BY"; } diff --git a/src/processor/processor.cpp b/src/processor/processor.cpp index 04bf4f6ba5..3aabf985b0 100644 --- a/src/processor/processor.cpp +++ b/src/processor/processor.cpp @@ -63,6 +63,7 @@ void QueryProcessor::decomposePlanIntoTasks( switch (op->getOperatorType()) { // Ordered table should be scanned in single-thread mode. case PhysicalOperatorType::ORDER_BY_MERGE: + case PhysicalOperatorType::TOP_K: // DDL should be executed exactly once. case PhysicalOperatorType::CREATE_NODE_TABLE: case PhysicalOperatorType::CREATE_REL_TABLE: diff --git a/src/processor/result/factorized_table.cpp b/src/processor/result/factorized_table.cpp index 2b16944f55..402e6b8c0f 100644 --- a/src/processor/result/factorized_table.cpp +++ b/src/processor/result/factorized_table.cpp @@ -362,11 +362,13 @@ uint64_t FactorizedTable::computeNumTuplesToAppend( if (tableSchema->getColumn(i)->isFlat() && !vectorsToAppend[i]->state->isFlat()) { // The caller is not allowed to append multiple unflat columns from different // datachunks to multiple flat columns in the factorizedTable. - if (unflatDataChunkPos != -1 && - tableSchema->getColumn(i)->getDataChunkPos() != unflatDataChunkPos) { - assert(false); + if (!tableSchema->getColumn(i)->isFlat()) { + if (unflatDataChunkPos != -1 && + tableSchema->getColumn(i)->getDataChunkPos() != unflatDataChunkPos) { + assert(false); + } + unflatDataChunkPos = tableSchema->getColumn(i)->getDataChunkPos(); } - unflatDataChunkPos = tableSchema->getColumn(i)->getDataChunkPos(); numTuplesToAppend = vectorsToAppend[i]->state->selVector->selectedSize; } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c4adbe169e..94f0e2ab16 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,7 +20,6 @@ add_subdirectory(common) add_subdirectory(copy) add_subdirectory(main) add_subdirectory(optimizer) -add_subdirectory(processor) add_subdirectory(runner) add_subdirectory(storage) add_subdirectory(transaction) diff --git a/test/processor/CMakeLists.txt b/test/processor/CMakeLists.txt deleted file mode 100644 index 0057d5c7b9..0000000000 --- a/test/processor/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(order_by) diff --git a/test/processor/order_by/CMakeLists.txt b/test/processor/order_by/CMakeLists.txt deleted file mode 100644 index 39805726f8..0000000000 --- a/test/processor/order_by/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_kuzu_test(order_by_test - key_block_merger_test.cpp - order_by_key_encoder_test.cpp - radix_sort_test.cpp) diff --git a/test/processor/order_by/key_block_merger_test.cpp b/test/processor/order_by/key_block_merger_test.cpp deleted file mode 100644 index adcae99453..0000000000 --- a/test/processor/order_by/key_block_merger_test.cpp +++ /dev/null @@ -1,552 +0,0 @@ -#include -#include -#include - -#include "common/assert.h" -#include "common/constants.h" -#include "common/data_chunk/data_chunk.h" -#include "gtest/gtest.h" -#include "processor/operator/order_by/key_block_merger.h" -#include "processor/operator/order_by/order_by_key_encoder.h" - -using ::testing::Test; -using namespace kuzu::common; -using namespace kuzu::processor; -using namespace kuzu::storage; - -class KeyBlockMergerTest : public Test { - -public: - void SetUp() override { - LoggerUtils::createLogger(LoggerConstants::LoggerEnum::BUFFER_MANAGER); - LoggerUtils::createLogger(LoggerConstants::LoggerEnum::STORAGE); - bufferManager = std::make_unique( - BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING); - memoryManager = std::make_unique(bufferManager.get()); - } - - void TearDown() override { - LoggerUtils::dropLogger(LoggerConstants::LoggerEnum::BUFFER_MANAGER); - LoggerUtils::dropLogger(LoggerConstants::LoggerEnum::STORAGE); - } - -public: - std::unique_ptr bufferManager; - std::unique_ptr memoryManager; - uint32_t numTuplesPerBlockInFT = BufferPoolConstants::PAGE_256KB_SIZE / 8; - - static void checkTupleIdxesAndFactorizedTableIdxes(uint8_t* keyBlockPtr, - const uint64_t keyBlockEntrySizeInBytes, - const std::vector& expectedBlockOffsetOrder, - const std::vector& expectedFactorizedTableIdxOrder) { - assert(expectedBlockOffsetOrder.size() == expectedFactorizedTableIdxOrder.size()); - for (auto i = 0u; i < expectedBlockOffsetOrder.size(); i++) { - auto tupleInfoPtr = keyBlockPtr + keyBlockEntrySizeInBytes - sizeof(uint64_t); - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTBlockIdx(tupleInfoPtr), 0); - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTBlockOffset(tupleInfoPtr), - expectedBlockOffsetOrder[i]); - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTIdx(tupleInfoPtr), - expectedFactorizedTableIdxOrder[i]); - keyBlockPtr += keyBlockEntrySizeInBytes; - } - } - - template - OrderByKeyEncoder prepareSingleOrderByColEncoder(const std::vector& sortingData, - const std::vector& nullMasks, LogicalTypeID dataTypeID, bool isAsc, - uint16_t factorizedTableIdx, bool hasPayLoadCol, - std::vector>& factorizedTables, - std::shared_ptr& dataChunk) { - KU_ASSERT(sortingData.size() == nullMasks.size()); - dataChunk->state->selVector->selectedSize = sortingData.size(); - auto valueVector = std::make_shared(dataTypeID, memoryManager.get()); - for (auto i = 0u; i < dataChunk->state->selVector->selectedSize; i++) { - if (nullMasks[i]) { - valueVector->setNull(i, true); - } else { - valueVector->setValue(i, sortingData[i]); - } - } - dataChunk->insert(0, valueVector); - - std::vector orderByVectors{ - valueVector.get()}; // only contains order_by columns - std::vector allVectors{ - valueVector.get()}; // all columns including order_by and payload columns - - std::unique_ptr tableSchema = - std::make_unique(); - tableSchema->appendColumn(std::make_unique(false /* isUnflat */, - 0 /* dataChunkPos */, LogicalTypeUtils::getRowLayoutSize(LogicalType{dataTypeID}))); - - if (hasPayLoadCol) { - auto payloadValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - for (auto i = 0u; i < dataChunk->state->selVector->selectedSize; i++) { - payloadValueVector->setValue(i, std::to_string(i)); - } - dataChunk->insert(1, payloadValueVector); - // To test whether the orderByCol -> factorizedTableColIdx works properly, we put the - // payload column at index 0, and the orderByCol at index 1. - allVectors.insert(allVectors.begin(), payloadValueVector.get()); - tableSchema->appendColumn(std::make_unique(false, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{dataTypeID}))); - } - - auto factorizedTable = - std::make_unique(memoryManager.get(), std::move(tableSchema)); - factorizedTable->append(allVectors); - - std::vector isAscOrder = {isAsc}; - auto orderByKeyEncoder = - OrderByKeyEncoder(orderByVectors, isAscOrder, memoryManager.get(), factorizedTableIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(orderByVectors)); - orderByKeyEncoder.encodeKeys(); - - factorizedTables.emplace_back(std::move(factorizedTable)); - return orderByKeyEncoder; - } - - template - void singleOrderByColMergeTest(const std::vector& leftSortingData, - const std::vector& leftNullMasks, const std::vector& rightSortingData, - const std::vector& rightNullMasks, - const std::vector& expectedBlockOffsetOrder, - const std::vector& expectedFactorizedTableIdxOrder, - const LogicalTypeID dataTypeID, const bool isAsc, bool hasPayLoadCol) { - std::vector> factorizedTables; - auto dataChunk0 = std::make_shared(hasPayLoadCol ? 2 : 1); - auto dataChunk1 = std::make_shared(hasPayLoadCol ? 2 : 1); - auto orderByKeyEncoder1 = prepareSingleOrderByColEncoder(leftSortingData, leftNullMasks, - dataTypeID, isAsc, 0 /* ftIdx */, hasPayLoadCol, factorizedTables, dataChunk0); - auto orderByKeyEncoder2 = prepareSingleOrderByColEncoder(rightSortingData, rightNullMasks, - dataTypeID, isAsc, 1 /* ftIdx */, hasPayLoadCol, factorizedTables, dataChunk1); - - std::vector strKeyColsInfo; - if (hasPayLoadCol) { - strKeyColsInfo.emplace_back( - StrKeyColInfo(8 /* colOffsetInFT */, 0 /* colOffsetInEncodedKeyBlock */, isAsc)); - } else if constexpr (std::is_same::value) { - strKeyColsInfo.emplace_back( - StrKeyColInfo(0 /* colOffsetInFT */, 0 /* colOffsetInEncodedKeyBlock */, isAsc)); - } - - KeyBlockMerger keyBlockMerger = KeyBlockMerger( - factorizedTables, strKeyColsInfo, orderByKeyEncoder1.getNumBytesPerTuple()); - - auto numBytesPerEntry = orderByKeyEncoder1.getNumBytesPerTuple(); - auto resultKeyBlock = std::make_shared(numBytesPerEntry, - leftSortingData.size() + rightSortingData.size(), memoryManager.get()); - auto keyBlockMergeTask = - std::make_shared(std::make_shared(numBytesPerEntry, - orderByKeyEncoder1.getKeyBlocks()[0]), - std::make_shared( - numBytesPerEntry, orderByKeyEncoder2.getKeyBlocks()[0]), - resultKeyBlock, keyBlockMerger); - KeyBlockMergeMorsel keyBlockMergeMorsel( - 0, leftSortingData.size(), 0, rightSortingData.size()); - keyBlockMergeMorsel.keyBlockMergeTask = keyBlockMergeTask; - - keyBlockMerger.mergeKeyBlocks(keyBlockMergeMorsel); - - checkTupleIdxesAndFactorizedTableIdxes(resultKeyBlock->getTuple(0), - orderByKeyEncoder1.getNumBytesPerTuple(), expectedBlockOffsetOrder, - expectedFactorizedTableIdxOrder); - } - - OrderByKeyEncoder prepareMultipleOrderByColsEncoder(uint16_t factorizedTableIdx, - std::vector>& factorizedTables, - std::shared_ptr& dataChunk, std::unique_ptr tableSchema) { - std::vector orderByVectors; - for (auto i = 0u; i < dataChunk->getNumValueVectors(); i++) { - orderByVectors.emplace_back(dataChunk->getValueVector(i).get()); - } - - std::vector isAscOrder(orderByVectors.size(), true); - auto orderByKeyEncoder = - OrderByKeyEncoder(orderByVectors, isAscOrder, memoryManager.get(), factorizedTableIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(orderByVectors)); - - auto factorizedTable = - std::make_unique(memoryManager.get(), std::move(tableSchema)); - // Manually flatten the data chunk. - dataChunk->state->selVector->resetSelectorToValuePosBuffer(); - dataChunk->state->selVector->selectedSize = 1; - for (auto i = 0u; i < dataChunk->state->originalSize; i++) { - dataChunk->state->selVector->selectedPositions[0] = i; - factorizedTable->append(orderByVectors); - orderByKeyEncoder.encodeKeys(); - dataChunk->state->currIdx++; - } - dataChunk->state->selVector->resetSelectorToUnselected(); - - factorizedTables.emplace_back(std::move(factorizedTable)); - return orderByKeyEncoder; - } - - void prepareMultipleOrderByColsValueVector(std::vector& int64Values, - std::vector& doubleValues, std::vector& timestampValues, - std::shared_ptr& dataChunk) { - assert(int64Values.size() == doubleValues.size()); - assert(doubleValues.size() == timestampValues.size()); - dataChunk->state->initOriginalAndSelectedSize(int64Values.size()); - dataChunk->state->currIdx = 0; - - auto int64ValueVector = - std::make_shared(LogicalTypeID::INT64, memoryManager.get()); - auto doubleValueVector = - std::make_shared(LogicalTypeID::DOUBLE, memoryManager.get()); - auto timestampValueVector = - std::make_shared(LogicalTypeID::TIMESTAMP, memoryManager.get()); - - dataChunk->insert(0, int64ValueVector); - dataChunk->insert(1, doubleValueVector); - dataChunk->insert(2, timestampValueVector); - - for (auto i = 0u; i < int64Values.size(); i++) { - int64ValueVector->setValue(i, int64Values[i]); - doubleValueVector->setValue(i, doubleValues[i]); - timestampValueVector->setValue(i, timestampValues[i]); - } - } - - void multipleOrderByColTest(bool hasStrCol) { - std::vector int64Values1 = {INT64_MIN, -78, 23}; - std::vector doubleValues1 = {3.28, -0.0001, 4.621}; - std::vector timestampValues1 = { - Timestamp::fromCString("2035-07-01 11:14:33", strlen("2035-07-01 11:14:33")), - Timestamp::fromCString("1962-04-07 11:12:35.123", strlen("1962-04-07 11:12:35.123")), - Timestamp::fromCString("1962-04-07 11:12:35.123", strlen("1962-04-07 11:12:35.123"))}; - auto dataChunk1 = std::make_shared(3 + (hasStrCol ? 1 : 0)); - prepareMultipleOrderByColsValueVector( - int64Values1, doubleValues1, timestampValues1, dataChunk1); - - std::vector int64Values2 = {INT64_MIN, -78, 23, INT64_MAX}; - std::vector doubleValues2 = {0.58, -0.0001, 4.621, 4.621}; - std::vector timestampValues2 = { - Timestamp::fromCString("2036-07-01 11:14:33", strlen("2036-07-01 11:14:33")), - Timestamp::fromCString("1962-04-07 11:12:35.123", strlen("1962-04-07 11:12:35.123")), - Timestamp::fromCString("1962-04-07 11:12:35.123", strlen("1962-04-07 11:12:35.123")), - Timestamp::fromCString("2035-07-01 11:14:33", strlen("2035-07-01 11:14:33"))}; - auto dataChunk2 = std::make_shared(3 + (hasStrCol ? 1 : 0)); - prepareMultipleOrderByColsValueVector( - int64Values2, doubleValues2, timestampValues2, dataChunk2); - - std::unique_ptr tableSchema = - std::make_unique(); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::INT64}))); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::DOUBLE}))); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::TIMESTAMP}))); - - if (hasStrCol) { - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::STRING}))); - auto stringValueVector1 = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - auto stringValueVector2 = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - dataChunk1->insert(3, stringValueVector1); - dataChunk2->insert(3, stringValueVector2); - - stringValueVector1->setValue(0, "same prefix 123"); - stringValueVector1->setValue(1, "same prefix 128"); - stringValueVector1->setValue(2, "same prefix 123"); - - stringValueVector2->setValue(0, "same prefix 127"); - stringValueVector2->setValue(1, "same prefix 123"); - stringValueVector2->setValue(2, "same prefix 121"); - stringValueVector2->setValue(3, "same prefix 126"); - } - - std::vector> factorizedTables; - for (auto i = 0; i < 4; i++) { - factorizedTables.emplace_back(std::make_unique( - memoryManager.get(), std::make_unique(*tableSchema))); - } - auto orderByKeyEncoder2 = prepareMultipleOrderByColsEncoder(4 /* ftIdx */, factorizedTables, - dataChunk2, std::make_unique(*tableSchema)); - auto orderByKeyEncoder1 = prepareMultipleOrderByColsEncoder(5 /* ftIdx */, factorizedTables, - dataChunk1, std::make_unique(*tableSchema)); - - std::vector expectedBlockOffsetOrder = {0, 0, 1, 1, 2, 2, 3}; - std::vector expectedFactorizedTableIdxOrder = {4, 5, 5, 4, 5, 4, 4}; - - std::vector strKeyColsInfo; - if (hasStrCol) { - strKeyColsInfo.emplace_back(StrKeyColInfo( - tableSchema->getColOffset(3 /* colIdx */) /* colOffsetInFT */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::INT64}) + - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::DOUBLE}) + - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::TIMESTAMP}) + 3, - true /* isAscOrder */)); - expectedBlockOffsetOrder = {0, 0, 1, 1, 2, 2, 3}; - expectedFactorizedTableIdxOrder = {4, 5, 4, 5, 4, 5, 4}; - } - - auto numBytesPerEntry = orderByKeyEncoder1.getNumBytesPerTuple(); - KeyBlockMerger keyBlockMerger = KeyBlockMerger( - factorizedTables, strKeyColsInfo, orderByKeyEncoder1.getNumBytesPerTuple()); - auto resultKeyBlock = - std::make_shared(numBytesPerEntry, 7ul, memoryManager.get()); - auto keyBlockMergeTask = - std::make_shared(std::make_shared(numBytesPerEntry, - orderByKeyEncoder1.getKeyBlocks()[0]), - std::make_shared( - numBytesPerEntry, orderByKeyEncoder2.getKeyBlocks()[0]), - resultKeyBlock, keyBlockMerger); - KeyBlockMergeMorsel keyBlockMergeMorsel(0, 3, 0, 4); - keyBlockMergeMorsel.keyBlockMergeTask = keyBlockMergeTask; - - keyBlockMerger.mergeKeyBlocks(keyBlockMergeMorsel); - - checkTupleIdxesAndFactorizedTableIdxes(resultKeyBlock->getTuple(0), - orderByKeyEncoder1.getNumBytesPerTuple(), expectedBlockOffsetOrder, - expectedFactorizedTableIdxOrder); - } - - OrderByKeyEncoder prepareMultipleStrKeyColsEncoder(std::shared_ptr& dataChunk, - std::vector>& strValues, uint16_t factorizedTableIdx, - std::vector>& factorizedTables) { - dataChunk->state->currIdx = 0; - dataChunk->state->initOriginalAndSelectedSize(strValues[0].size()); - for (auto i = 0u; i < strValues.size(); i++) { - auto strValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - dataChunk->insert(i, strValueVector); - for (auto j = 0u; j < strValues[i].size(); j++) { - strValueVector->setValue(j, strValues[i][j]); - } - } - - // The first, second, fourth columns are keyColumns. - std::vector orderByVectors{dataChunk->getValueVector(0).get(), - dataChunk->getValueVector(1).get(), dataChunk->getValueVector(3).get()}; - - std::vector allVectors{dataChunk->getValueVector(0).get(), - dataChunk->getValueVector(1).get(), dataChunk->getValueVector(2).get(), - dataChunk->getValueVector(3).get()}; - - std::unique_ptr tableSchema = - std::make_unique(); - auto stringColumnSize = - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::STRING}); - tableSchema->appendColumn(std::make_unique( - false /* isUnflat */, 0 /* dataChunkPos */, stringColumnSize)); - tableSchema->appendColumn(std::make_unique( - false /* isUnflat */, 0 /* dataChunkPos */, stringColumnSize)); - tableSchema->appendColumn(std::make_unique( - false /* isUnflat */, 0 /* dataChunkPos */, stringColumnSize)); - tableSchema->appendColumn(std::make_unique( - false /* isUnflat */, 0 /* dataChunkPos */, stringColumnSize)); - auto factorizedTable = - std::make_unique(memoryManager.get(), std::move(tableSchema)); - - std::vector isAscOrder(strValues.size(), true); - auto orderByKeyEncoder = - OrderByKeyEncoder(orderByVectors, isAscOrder, memoryManager.get(), factorizedTableIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(orderByVectors)); - // Manually flatten the data chunk. - dataChunk->state->selVector->resetSelectorToValuePosBuffer(); - dataChunk->state->selVector->selectedSize = 1; - for (auto i = 0u; i < strValues[0].size(); i++) { - dataChunk->state->selVector->selectedPositions[0] = i; - factorizedTable->append(allVectors); - orderByKeyEncoder.encodeKeys(); - dataChunk->state->currIdx++; - } - dataChunk->state->selVector->resetSelectorToUnselected(); - - factorizedTables.emplace_back(std::move(factorizedTable)); - return orderByKeyEncoder; - } -}; - -TEST_F(KeyBlockMergerTest, singleOrderByColInt64Test) { - std::vector leftSortingData = {INT64_MIN, -8848, 1, 7, 13, INT64_MAX, 0 /* NULL */}; - std::vector rightSortingData = {INT64_MIN, -6, 4, 22, 32, 38, 0 /* NULL */}; - std::vector leftNullMasks = {false, false, false, false, false, false, true}; - std::vector rightNullMasks = {false, false, false, false, false, false, true}; - std::vector expectedBlockOffsetOrder = {0, 0, 1, 1, 2, 2, 3, 4, 3, 4, 5, 5, 6, 6}; - std::vector expectedFactorizedTableIdxOrder = { - 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1}; - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::INT64, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, singleOrderByColInt64NoNullTest) { - std::vector leftSortingData = {INT64_MIN, -512, -5, 22, INT64_MAX}; - std::vector rightSortingData = {INT64_MIN, -999, 31, INT64_MAX}; - std::vector leftNullMasks(leftSortingData.size(), false); - std::vector rightNullMasks(rightSortingData.size(), false); - std::vector expectedBlockOffsetOrder = {0, 0, 1, 1, 2, 3, 2, 4, 3}; - std::vector expectedFactorizedTableIdxOrder = {0, 1, 1, 0, 0, 0, 1, 0, 1}; - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::INT64, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, singleOrderByColInt64SameValueTest) { - std::vector leftSortingData = {4, 4, 4, 4, 4, 4}; - std::vector rightSortingData = {4, 4, 4, 4, 4, 4, 4, 4, 4}; - std::vector leftNullMasks(leftSortingData.size(), false); - std::vector rightNullMasks(rightSortingData.size(), false); - std::vector expectedBlockOffsetOrder = {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - std::vector expectedFactorizedTableIdxOrder = { - 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::INT64, - false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, singleOrderByColInt64LargeNumTuplesTest) { - std::vector leftSortingData, rightSortingData; - std::vector expectedBlockOffsetOrder( - leftSortingData.size() + rightSortingData.size()); - std::vector expectedFactorizedTableIdxOrder( - leftSortingData.size() + rightSortingData.size()); - // Each memory block can hold a maximum of 240 tuples (4096 / (8 + 9)). - // We fill the leftSortingData with the even numbers of 0-480 and the rightSortingData with - // the odd numbers of 0-480 so that each of them takes up exactly one memoryBlock. - for (auto i = 0u; i < 480; i++) { - if (i % 2) { - expectedBlockOffsetOrder.emplace_back(rightSortingData.size()); - expectedFactorizedTableIdxOrder.emplace_back(1); - rightSortingData.emplace_back(i); - } else { - expectedBlockOffsetOrder.emplace_back(leftSortingData.size()); - expectedFactorizedTableIdxOrder.emplace_back(0); - leftSortingData.emplace_back(i); - } - } - std::vector leftNullMasks(leftSortingData.size(), false); - std::vector rightNullMasks(rightSortingData.size(), false); - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::INT64, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, singleOrderByColStringTest) { - std::vector leftSortingData = {"" /* NULL */, "tiny str", "long std::string", - "common prefix string3", "common prefix string1"}; - std::vector rightSortingData = {"" /* NULL */, "" /* NULL */, "tiny str1", - "common prefix string4", "common prefix string2", "common prefix string1", - "" /* empty str */}; - std::vector leftNullMasks = {true, false, false, false, false}; - std::vector rightNullMasks = {true, true, false, false, false, false, false}; - std::vector expectedBlockOffsetOrder = {0, 0, 1, 2, 1, 2, 3, 3, 4, 4, 5, 6}; - std::vector expectedFactorizedTableIdxOrder = {0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1}; - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::STRING, - false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, singleOrderByColStringNoNullTest) { - std::vector leftSortingData = {"common prefix string1", "common prefix string2", - "common prefix string3", "long string", "tiny str"}; - std::vector rightSortingData = {"common prefix string1", "common prefix string2", - "common prefix string4", "tiny str", "tiny str1"}; - std::vector leftNullMasks(leftSortingData.size(), false); - std::vector rightNullMasks(rightSortingData.size(), false); - std::vector expectedBlockOffsetOrder = {0, 0, 1, 1, 2, 2, 3, 4, 3, 4}; - std::vector expectedFactorizedTableIdxOrder = {0, 1, 0, 1, 0, 1, 0, 0, 1, 1}; - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::STRING, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, singleOrderByColStringWithPayLoadTest) { - std::vector leftSortingData = { - "", "", "abcabc str", "long long string1", "short str2"}; - std::vector rightSortingData = { - "", "test str1", "this is a long string", "very short", "" /* NULL */}; - std::vector leftNullMasks(leftSortingData.size(), false); - std::vector rightNullMasks = {false, false, false, false, true}; - std::vector expectedBlockOffsetOrder = {0, 1, 0, 2, 3, 4, 1, 2, 3, 4}; - std::vector expectedFactorizedTableIdxOrder = {0, 0, 1, 0, 0, 0, 1, 1, 1, 1}; - singleOrderByColMergeTest(leftSortingData, leftNullMasks, rightSortingData, rightNullMasks, - expectedBlockOffsetOrder, expectedFactorizedTableIdxOrder, LogicalTypeID::STRING, - true /* isAsc */, true /* hasPayLoadCol */); -} - -TEST_F(KeyBlockMergerTest, multiple0rderByColNoStrTest) { - multipleOrderByColTest(false /* hasStrCol */); -} - -TEST_F(KeyBlockMergerTest, multiple0rderByColOneStrColTest) { - multipleOrderByColTest(true /* hasStrCol */); -} - -TEST_F(KeyBlockMergerTest, multipleStrKeyColsTest) { - auto dataChunk1 = std::make_shared(4); - auto dataChunk2 = std::make_shared(4); - auto dataChunk3 = std::make_shared(4); - std::vector> strValues1 = { - {"common str1", "common str1", "shorts1", "shorts2"}, - {"same str1", "same str1", "same str1", "same str1"}, - {"payload3", "payload1", "payload2", "payload4"}, - {"long long str4", "long long str6", "long long str3", "long long str2"}}; - std::vector> strValues2 = {{"common str1", "common str1", "shorts1"}, - {"same str1", "same str1", "same str1"}, {"payload3", "payload1", "payload2"}, - { - "", - "long long str5", - "long long str4", - }}; - - std::vector> strValues3 = {{"common str1", "common str1"}, - {"same str1", "same str1"}, {"payload3", "payload1"}, {"largerStr", "long long str4"}}; - std::vector> factorizedTables; - auto orderByKeyEncoder1 = - prepareMultipleStrKeyColsEncoder(dataChunk1, strValues1, 0 /* ftIdx */, factorizedTables); - auto orderByKeyEncoder2 = - prepareMultipleStrKeyColsEncoder(dataChunk2, strValues2, 1 /* ftIdx */, factorizedTables); - auto orderByKeyEncoder3 = - prepareMultipleStrKeyColsEncoder(dataChunk3, strValues3, 2 /* ftIdx */, factorizedTables); - - std::vector strKeyColsInfo = { - StrKeyColInfo(factorizedTables[0]->getTableSchema()->getColOffset(0 /* colIdx */), - 0 /* colOffsetInEncodedKeyBlock */, true /* isAscOrder */), - StrKeyColInfo(factorizedTables[0]->getTableSchema()->getColOffset(1 /* colIdx */), - orderByKeyEncoder1.getEncodingSize(LogicalType(LogicalTypeID::STRING)), - true /* isAscOrder */), - StrKeyColInfo(factorizedTables[0]->getTableSchema()->getColOffset(3 /* colIdx */), - orderByKeyEncoder1.getEncodingSize(LogicalType(LogicalTypeID::STRING)) * 2, - true /* isAscOrder */)}; - - KeyBlockMerger keyBlockMerger = - KeyBlockMerger(factorizedTables, strKeyColsInfo, orderByKeyEncoder1.getNumBytesPerTuple()); - - auto numBytesPerEntry = orderByKeyEncoder1.getNumBytesPerTuple(); - auto resultKeyBlock = - std::make_shared(numBytesPerEntry, 7ul, memoryManager.get()); - auto keyBlockMergeTask = std::make_shared( - std::make_shared(numBytesPerEntry, orderByKeyEncoder1.getKeyBlocks()[0]), - std::make_shared(numBytesPerEntry, orderByKeyEncoder2.getKeyBlocks()[0]), - resultKeyBlock, keyBlockMerger); - KeyBlockMergeMorsel keyBlockMergeMorsel(0, 4, 0, 3); - keyBlockMergeMorsel.keyBlockMergeTask = keyBlockMergeTask; - keyBlockMerger.mergeKeyBlocks(keyBlockMergeMorsel); - - auto resultMemBlock1 = - std::make_shared(numBytesPerEntry, 9ul, memoryManager.get()); - auto keyBlockMergeTask1 = std::make_shared(resultKeyBlock, - std::make_shared(numBytesPerEntry, orderByKeyEncoder3.getKeyBlocks()[0]), - resultMemBlock1, keyBlockMerger); - KeyBlockMergeMorsel keyBlockMergeMorsel1(0, 7, 0, 2); - keyBlockMergeMorsel1.keyBlockMergeTask = keyBlockMergeTask1; - keyBlockMerger.mergeKeyBlocks(keyBlockMergeMorsel1); - - std::vector expectedBlockOffsetOrder = {0, 0, 0, 1, 1, 1, 2, 2, 3}; - std::vector expectedFactorizedTableIdxOrder = {1, 2, 0, 2, 1, 0, 0, 1, 0}; - checkTupleIdxesAndFactorizedTableIdxes(resultMemBlock1->getTuple(0), - orderByKeyEncoder1.getNumBytesPerTuple(), expectedBlockOffsetOrder, - expectedFactorizedTableIdxOrder); -} diff --git a/test/processor/order_by/order_by_key_encoder_test.cpp b/test/processor/order_by/order_by_key_encoder_test.cpp deleted file mode 100644 index 8ea35fdd5f..0000000000 --- a/test/processor/order_by/order_by_key_encoder_test.cpp +++ /dev/null @@ -1,837 +0,0 @@ -#include - -#include "common/constants.h" -#include "common/data_chunk/data_chunk.h" -#include "common/string_utils.h" -#include "gtest/gtest.h" -#include "processor/operator/order_by/order_by_key_encoder.h" - -using ::testing::Test; -using namespace kuzu::common; -using namespace kuzu::processor; -using namespace kuzu::storage; - -class OrderByKeyEncoderTest : public Test { - -public: - void SetUp() override { - LoggerUtils::createLogger(LoggerConstants::LoggerEnum::BUFFER_MANAGER); - LoggerUtils::createLogger(LoggerConstants::LoggerEnum::STORAGE); - bufferManager = std::make_unique( - BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING); - memoryManager = std::make_unique(bufferManager.get()); - } - - void TearDown() override { - LoggerUtils::dropLogger(LoggerConstants::LoggerEnum::BUFFER_MANAGER); - LoggerUtils::dropLogger(LoggerConstants::LoggerEnum::STORAGE); - } - - void checkTupleIdxAndFactorizedTableIdx(uint64_t expectedBlockOffset, uint8_t*& keyBlockPtr) { - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTBlockIdx(keyBlockPtr), 0); - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTBlockOffset(keyBlockPtr), expectedBlockOffset); - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTIdx(keyBlockPtr), ftIdx); - keyBlockPtr += 8; - } - - // This method can only be used to check the null flag for a not null value. - // We should call checkNullVal directly to check a null value. - inline void checkNonNullFlag(uint8_t*& keyBlockPtr, bool isAsc) { - ASSERT_EQ(*(keyBlockPtr++), isAsc ? 0x00 : 0xFF); - } - - // If the col is in asc order, the encoding string is: - // 0xFF(null flag) + 0xFF...FF(padding) - // if the col is in desc order, the encoding string is: - // 0x00(null flag) + 0x00...00(padding) - inline void checkNullVal(uint8_t*& keyBlockPtr, LogicalTypeID dataTypeID, bool isAsc) { - for (auto i = 0u; i < OrderByKeyEncoder::getEncodingSize(LogicalType(dataTypeID)); i++) { - ASSERT_EQ(*(keyBlockPtr++), isAsc ? 0xFF : 0x00); - } - } - - // This function generates a ValueVector of int64 tuples that are all 5. - std::pair, std::shared_ptr> getInt64TestValueVector( - const uint64_t numOfElementsPerCol, const uint64_t numOfOrderByCols, bool flatCol) { - std::shared_ptr dataChunk = std::make_shared(numOfOrderByCols); - dataChunk->state->selVector->selectedSize = numOfElementsPerCol; - std::vector valueVectors; - for (auto i = 0u; i < numOfOrderByCols; i++) { - std::shared_ptr valueVector = - std::make_shared(LogicalTypeID::INT64, memoryManager.get()); - for (auto j = 0u; j < numOfElementsPerCol; j++) { - valueVector->setValue(j, (int64_t)5); - } - dataChunk->insert(i, valueVector); - valueVector->state->currIdx = flatCol ? 0 : -1; - valueVectors.emplace_back(valueVector.get()); - } - return {valueVectors, dataChunk}; - } - - // This function assumes that all columns have datatype: INT64, and each tuple is 5. - void checkKeyBlockForInt64TestValueVector(std::vector& valueVectors, - std::vector>& keyBlocks, uint64_t numOfElements, - std::vector& isAscOrder, uint64_t numTuplesPerBlock) { - for (auto i = 0u; i < keyBlocks.size(); i++) { - auto numOfElementsToCheck = std::min(numOfElements, numTuplesPerBlock); - numOfElements -= numOfElementsToCheck; - auto keyBlockPtr = keyBlocks[i]->getData(); - for (auto j = 0u; j < numOfElementsToCheck; j++) { - for (auto k = 0u; k < valueVectors.size(); k++) { - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - if (isAscOrder[0]) { - // Check encoding for: NULL FLAG(0x00) + 5=0x8000000000000005(big endian). - ASSERT_EQ(*(keyBlockPtr++), 0x80); - for (auto k = 0u; k < 6; k++) { - ASSERT_EQ(*(keyBlockPtr++), 0x00); - } - ASSERT_EQ(*(keyBlockPtr++), 0x05); - } else { - // Check encoding for: NULL FLAG(0xFF) + 5=0x7FFFFFFFFFFFFFFFFA(big endian). - // Note: we need to flip all bits since this column is in descending order. - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - for (auto k = 0u; k < 6; k++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - ASSERT_EQ(*(keyBlockPtr++), 0xFA); - } - } - checkTupleIdxAndFactorizedTableIdx(i * numTuplesPerBlock + j, keyBlockPtr); - } - } - } - - void singleOrderByColMultiBlockTest(bool isFlat) { - uint64_t numOfElements = 2000; - auto [valueVectors, dataChunk] = getInt64TestValueVector(numOfElements, 1, isFlat); - auto isAscOrder = std::vector(1, false); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), - ftIdx, numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - if (isFlat) { - valueVectors[0]->state->selVector->resetSelectorToValuePosBuffer(); - valueVectors[0]->state->selVector->selectedSize = 1; - for (auto i = 0u; i < numOfElements; i++) { - valueVectors[0]->state->selVector->selectedPositions[0] = i; - orderByKeyEncoder.encodeKeys(); - valueVectors[0]->state->currIdx++; - } - valueVectors[0]->state->selVector->resetSelectorToUnselected(); - } else { - orderByKeyEncoder.encodeKeys(); - } - checkKeyBlockForInt64TestValueVector(valueVectors, orderByKeyEncoder.getKeyBlocks(), - numOfElements, isAscOrder, orderByKeyEncoder.getMaxNumTuplesPerBlock()); - } - - static inline void checkLongStrFlag(uint8_t*& keyBlockPtr, bool isAscOrder, bool isLongString) { - ASSERT_EQ(*(keyBlockPtr++), isAscOrder == isLongString ? UINT8_MAX : 0); - } - -public: - std::unique_ptr bufferManager; - std::unique_ptr memoryManager; - const uint32_t ftIdx = 14; - const uint32_t numTuplesPerBlockInFT = BufferPoolConstants::PAGE_256KB_SIZE / 8; -}; - -TEST_F(OrderByKeyEncoderTest, singleOrderByColInt64UnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 6; - auto int64ValueVector = - std::make_shared(LogicalTypeID::INT64, memoryManager.get()); - int64ValueVector->setValue(0, (int64_t)73); // positive number - int64ValueVector->setNull(1, true); - int64ValueVector->setValue(2, (int64_t)-132); // negative 1 byte number - int64ValueVector->setValue(3, (int64_t)-5242); // negative 2 bytes number - int64ValueVector->setValue(4, (int64_t)INT64_MAX); - int64ValueVector->setValue(5, (int64_t)INT64_MIN); - dataChunk->insert(0, int64ValueVector); - std::vector valueVectors; - valueVectors.emplace_back(int64ValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + 73=0x8000000000000049(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - for (auto i = 0u; i < 6; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0x00); - } - ASSERT_EQ(*(keyBlockPtr++), 0x49); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::INT64, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -132=0x7FFFFFFFFFFFFF7C(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - for (auto i = 0u; i < 6; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - ASSERT_EQ(*(keyBlockPtr++), 0x7C); - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -5242=0x7FFFFFFFFFFFEB86(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - for (auto i = 0u; i < 5; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - ASSERT_EQ(*(keyBlockPtr++), 0xEB); - ASSERT_EQ(*(keyBlockPtr++), 0x86); - checkTupleIdxAndFactorizedTableIdx(3, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + INT64_MAX=0xFFFFFFFFFFFFFFFF(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - for (auto i = 0U; i < 8; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - checkTupleIdxAndFactorizedTableIdx(4, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + INT64_MIN=0x0000000000000000(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - for (auto i = 0u; i < 8; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0x00); - } - checkTupleIdxAndFactorizedTableIdx(5, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColInt64UnflatWithFilterTest) { - // This test is used to test whether the orderByKeyEncoder correctly encodes the filtered - // valueVector. - std::shared_ptr dataChunk = std::make_shared(1); - std::shared_ptr int64ValueVector = - std::make_shared(LogicalTypeID::INT64, memoryManager.get()); - int64ValueVector->setValue(0, (int64_t)73); - int64ValueVector->setValue(1, (int64_t)-52); - int64ValueVector->setValue(2, (int64_t)-132); - dataChunk->insert(0, int64ValueVector); - // Only the first and the third value is selected, so the encoder should - // not encode the second value. - int64ValueVector->state->selVector->resetSelectorToValuePosBuffer(); - int64ValueVector->state->selVector->selectedPositions[0] = 0; - int64ValueVector->state->selVector->selectedPositions[1] = 2; - int64ValueVector->state->selVector->selectedSize = 2; - std::vector valueVectors; - valueVectors.emplace_back(int64ValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + 73=0x8000000000000049(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - for (auto i = 0u; i < 6; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0x00); - } - ASSERT_EQ(*(keyBlockPtr++), 0x49); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -132=0x7FFFFFFFFFFFFF7C(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - for (auto i = 0u; i < 6; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - ASSERT_EQ(*(keyBlockPtr++), 0x7C); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColBoolUnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 3; - std::shared_ptr boolValueVector = - std::make_shared(LogicalTypeID::BOOL, memoryManager.get()); - boolValueVector->setValue(0, true); - boolValueVector->setValue(1, false); - boolValueVector->setNull(2, true); - dataChunk->insert(0, boolValueVector); - std::vector valueVectors; - valueVectors.emplace_back(boolValueVector.get()); - auto isAscOrder = std::vector(1, false); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + true=0xFE(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0xFE); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + false=0xFF(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::BOOL, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColDateUnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 3; - std::shared_ptr dateValueVector = - std::make_shared(LogicalTypeID::DATE, memoryManager.get()); - dateValueVector->setValue( - 0, Date::fromCString("2035-07-04", strlen("2035-07-04"))); // date after 1970-01-01 - dateValueVector->setNull(1, true); - dateValueVector->setValue( - 2, Date::fromCString("1949-10-01", strlen("1949-10-01"))); // date before 1970-01-01 - dataChunk->insert(0, dateValueVector); - std::vector valueVectors; - valueVectors.emplace_back(dateValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + "2035-07-04"=0x80005D75(23925 days in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x5D); - ASSERT_EQ(*(keyBlockPtr++), 0x75); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::DATE, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + "1949-10-01"=0x7FFFE31B(-7397 days in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - ASSERT_EQ(*(keyBlockPtr++), 0xE3); - ASSERT_EQ(*(keyBlockPtr++), 0x1B); - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColTimestampUnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 3; - std::shared_ptr timestampValueVector = - std::make_shared(LogicalTypeID::TIMESTAMP, memoryManager.get()); - // timestamp before 1970-01-01 - timestampValueVector->setValue( - 0, Timestamp::fromCString("1962-04-07 11:12:35.123", strlen("1962-04-07 11:12:35.123"))); - timestampValueVector->setNull(1, true); - // timestamp after 1970-01-01 - timestampValueVector->setValue( - 2, Timestamp::fromCString("2035-07-01 11:14:33", strlen("2035-07-01 11:14:33"))); - dataChunk->insert(0, timestampValueVector); - std::vector valueVectors; - valueVectors.emplace_back(timestampValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + "1962-04-07 11:12:35.123"=0x7FFF21F7F9D08F38 - // (-244126044877000 micros in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - ASSERT_EQ(*(keyBlockPtr++), 0x21); - ASSERT_EQ(*(keyBlockPtr++), 0xF7); - ASSERT_EQ(*(keyBlockPtr++), 0xF9); - ASSERT_EQ(*(keyBlockPtr++), 0xD0); - ASSERT_EQ(*(keyBlockPtr++), 0x8F); - ASSERT_EQ(*(keyBlockPtr++), 0x38); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::TIMESTAMP, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + "2035-07-01 11:14:33"=0x800757D5F429B840 - // (2066901273000000 micros in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x07); - ASSERT_EQ(*(keyBlockPtr++), 0x57); - ASSERT_EQ(*(keyBlockPtr++), 0xD5); - ASSERT_EQ(*(keyBlockPtr++), 0xF4); - ASSERT_EQ(*(keyBlockPtr++), 0x29); - ASSERT_EQ(*(keyBlockPtr++), 0xB8); - ASSERT_EQ(*(keyBlockPtr++), 0x40); - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColIntervalUnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 2; - std::shared_ptr intervalValueVector = - std::make_shared(LogicalTypeID::INTERVAL, memoryManager.get()); - intervalValueVector->setValue( - 0, Interval::fromCString("18 hours 55 days 13 years 8 milliseconds 3 months", - strlen("18 hours 55 days 13 years 8 milliseconds 3 months"))); - intervalValueVector->setNull(1, true); - dataChunk->insert(0, intervalValueVector); - std::vector valueVectors; - valueVectors.emplace_back(intervalValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + "18 hours 55 days 13 years 8 milliseconds 3 months" - // = NULL FLAG(0x00) + 160 months(0x800000A0) + 25 days(0x80000019) - // + 64800008000 micros(0x8000000F1661A740). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - // Check for months: 160 (0x800000A0 in big endian). - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0xA0); - // Check for days: 25 (0x80000019 in big endian). - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x19); - // Check for micros: 64800008000 (0x8000000F1661A740 in big endian). - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x0F); - ASSERT_EQ(*(keyBlockPtr++), 0x16); - ASSERT_EQ(*(keyBlockPtr++), 0x61); - ASSERT_EQ(*(keyBlockPtr++), 0xA7); - ASSERT_EQ(*(keyBlockPtr++), 0x40); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::INTERVAL, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColStringUnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 4; - std::shared_ptr stringValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - stringValueVector->setValue(0, "short str"); // short std::string - stringValueVector->setNull(1, true); - stringValueVector->setValue( - 2, "commonprefix string1"); // long string(encoding: commonprefix) - stringValueVector->setValue( - 3, "commonprefix string2"); // long string(encoding: commonprefix) - dataChunk->insert(0, stringValueVector); - std::vector valueVectors; - valueVectors.emplace_back(stringValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + "short str". - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 's'); - ASSERT_EQ(*(keyBlockPtr++), 'h'); - ASSERT_EQ(*(keyBlockPtr++), 'o'); - ASSERT_EQ(*(keyBlockPtr++), 'r'); - ASSERT_EQ(*(keyBlockPtr++), 't'); - ASSERT_EQ(*(keyBlockPtr++), ' '); - ASSERT_EQ(*(keyBlockPtr++), 's'); - ASSERT_EQ(*(keyBlockPtr++), 't'); - ASSERT_EQ(*(keyBlockPtr++), 'r'); - ASSERT_EQ(*(keyBlockPtr++), '\0'); - ASSERT_EQ(*(keyBlockPtr++), '\0'); - ASSERT_EQ(*(keyBlockPtr++), '\0'); - checkLongStrFlag(keyBlockPtr, isAscOrder[0], false /* isLongStr */); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::STRING, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + "commonprefix string1". - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 'c'); - ASSERT_EQ(*(keyBlockPtr++), 'o'); - ASSERT_EQ(*(keyBlockPtr++), 'm'); - ASSERT_EQ(*(keyBlockPtr++), 'm'); - ASSERT_EQ(*(keyBlockPtr++), 'o'); - ASSERT_EQ(*(keyBlockPtr++), 'n'); - ASSERT_EQ(*(keyBlockPtr++), 'p'); - ASSERT_EQ(*(keyBlockPtr++), 'r'); - ASSERT_EQ(*(keyBlockPtr++), 'e'); - ASSERT_EQ(*(keyBlockPtr++), 'f'); - ASSERT_EQ(*(keyBlockPtr++), 'i'); - ASSERT_EQ(*(keyBlockPtr++), 'x'); - checkLongStrFlag(keyBlockPtr, isAscOrder[0], true /* isLongStr */); - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); - - // Check encoding for val: NULL FLAG(0x00) + "commonprefix string2". - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 'c'); - ASSERT_EQ(*(keyBlockPtr++), 'o'); - ASSERT_EQ(*(keyBlockPtr++), 'm'); - ASSERT_EQ(*(keyBlockPtr++), 'm'); - ASSERT_EQ(*(keyBlockPtr++), 'o'); - ASSERT_EQ(*(keyBlockPtr++), 'n'); - ASSERT_EQ(*(keyBlockPtr++), 'p'); - ASSERT_EQ(*(keyBlockPtr++), 'r'); - ASSERT_EQ(*(keyBlockPtr++), 'e'); - ASSERT_EQ(*(keyBlockPtr++), 'f'); - ASSERT_EQ(*(keyBlockPtr++), 'i'); - ASSERT_EQ(*(keyBlockPtr++), 'x'); - checkLongStrFlag(keyBlockPtr, isAscOrder[0], true /* isLongStr */); - checkTupleIdxAndFactorizedTableIdx(3, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColDoubleUnflatTest) { - std::shared_ptr dataChunk = std::make_shared(1); - dataChunk->state->selVector->selectedSize = 6; - std::shared_ptr doubleValueVector = - std::make_shared(LogicalTypeID::DOUBLE, memoryManager.get()); - doubleValueVector->setValue(0, (double_t)3.452); // small positive number - doubleValueVector->setNull(1, true); - doubleValueVector->setValue(2, (double_t)-0.00031213); // very small negative number - doubleValueVector->setValue(3, (double_t)-5.42113); // small negative number - doubleValueVector->setValue(4, (double_t)92931312341415); // large positive number - doubleValueVector->setValue(5, (double_t)-31234142783434); // large negative number - dataChunk->insert(0, doubleValueVector); - std::vector valueVectors; - valueVectors.emplace_back(doubleValueVector.get()); - auto isAscOrder = std::vector(1, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - orderByKeyEncoder.encodeKeys(); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - // Check encoding for: NULL FLAG(0x00) + 3.452=0xC00B9DB22D0E5604(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0xC0); - ASSERT_EQ(*(keyBlockPtr++), 0x0B); - ASSERT_EQ(*(keyBlockPtr++), 0x9D); - ASSERT_EQ(*(keyBlockPtr++), 0xB2); - ASSERT_EQ(*(keyBlockPtr++), 0x2D); - ASSERT_EQ(*(keyBlockPtr++), 0x0E); - ASSERT_EQ(*(keyBlockPtr++), 0x56); - ASSERT_EQ(*(keyBlockPtr++), 0x04); - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - checkNullVal(keyBlockPtr, LogicalTypeID::INT64, isAscOrder[0]); - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -0.00031213=0x40CB8B53DB9F4D8D(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x40); - ASSERT_EQ(*(keyBlockPtr++), 0xCB); - ASSERT_EQ(*(keyBlockPtr++), 0x8B); - ASSERT_EQ(*(keyBlockPtr++), 0x53); - ASSERT_EQ(*(keyBlockPtr++), 0xDB); - ASSERT_EQ(*(keyBlockPtr++), 0x9F); - ASSERT_EQ(*(keyBlockPtr++), 0x4D); - ASSERT_EQ(*(keyBlockPtr++), 0x8D); - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -5.42113=0x3FEA50C34C1A8AC5(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x3F); - ASSERT_EQ(*(keyBlockPtr++), 0xEA); - ASSERT_EQ(*(keyBlockPtr++), 0x50); - ASSERT_EQ(*(keyBlockPtr++), 0xC3); - ASSERT_EQ(*(keyBlockPtr++), 0x4C); - ASSERT_EQ(*(keyBlockPtr++), 0x1A); - ASSERT_EQ(*(keyBlockPtr++), 0x8A); - ASSERT_EQ(*(keyBlockPtr++), 0xC5); - checkTupleIdxAndFactorizedTableIdx(3, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + 92931312341415=0xC2D52150771469C0(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0xC2); - ASSERT_EQ(*(keyBlockPtr++), 0xD5); - ASSERT_EQ(*(keyBlockPtr++), 0x21); - ASSERT_EQ(*(keyBlockPtr++), 0x50); - ASSERT_EQ(*(keyBlockPtr++), 0x77); - ASSERT_EQ(*(keyBlockPtr++), 0x14); - ASSERT_EQ(*(keyBlockPtr++), 0x69); - ASSERT_EQ(*(keyBlockPtr++), 0xC0); - checkTupleIdxAndFactorizedTableIdx(4, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -31234142783434=0x3D4397BC03B835FF(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x3D); - ASSERT_EQ(*(keyBlockPtr++), 0x43); - ASSERT_EQ(*(keyBlockPtr++), 0x97); - ASSERT_EQ(*(keyBlockPtr++), 0xBC); - ASSERT_EQ(*(keyBlockPtr++), 0x03); - ASSERT_EQ(*(keyBlockPtr++), 0xB8); - ASSERT_EQ(*(keyBlockPtr++), 0x35); - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - checkTupleIdxAndFactorizedTableIdx(5, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, largeNumBytesPerTupleErrorTest) { - // If the numBytesPerTuple is larger than 4096 bytes, the encoder will raise an encoding - // exception we need ((LARGE_PAGE_SIZE - 8) / 9 + 1 number of columns(with datatype INT) to - // trigger that exception. - auto numOfOrderByCols = (BufferPoolConstants::PAGE_256KB_SIZE - 8) / 9 + 1; - auto [valueVectors, dataChunk] = getInt64TestValueVector(1, numOfOrderByCols, true); - auto isAscOrder = std::vector(numOfOrderByCols, true); - try { - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), - ftIdx, numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - FAIL(); - } catch (Exception& e) { - ASSERT_STREQ(e.what(), - StringUtils::string_format("Runtime exception: TupleSize({} bytes) is larger than " - "the LARGE_PAGE_SIZE({} bytes)", - 9 * numOfOrderByCols + 8, BufferPoolConstants::PAGE_256KB_SIZE) - .c_str()); - } catch (std::exception& e) { FAIL(); } -} - -TEST_F(OrderByKeyEncoderTest, singleTuplePerBlockTest) { - uint32_t numOfOrderByCols = (BufferPoolConstants::PAGE_256KB_SIZE - 8) / 9; - uint32_t numOfElementsPerCol = 10; - auto [valueVectors, dataChunk] = - getInt64TestValueVector(numOfElementsPerCol, numOfOrderByCols, true); - auto isAscOrder = std::vector(numOfOrderByCols, false); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - valueVectors[0]->state->selVector->resetSelectorToValuePosBuffer(); - valueVectors[0]->state->selVector->selectedSize = 1; - for (auto i = 0u; i < numOfElementsPerCol; i++) { - valueVectors[0]->state->selVector->selectedPositions[0] = i; - orderByKeyEncoder.encodeKeys(); - valueVectors[0]->state->currIdx++; - } - valueVectors[0]->state->selVector->resetSelectorToUnselected(); - auto& keyBlocks = orderByKeyEncoder.getKeyBlocks(); - checkKeyBlockForInt64TestValueVector(valueVectors, keyBlocks, numOfElementsPerCol, isAscOrder, - orderByKeyEncoder.getMaxNumTuplesPerBlock()); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColMultiBlockUnflatTest) { - singleOrderByColMultiBlockTest(false); -} - -TEST_F(OrderByKeyEncoderTest, singleOrderByColMultiBlockFlatTest) { - singleOrderByColMultiBlockTest(true); -} - -TEST_F(OrderByKeyEncoderTest, multipleOrderByColSingleBlockTest) { - std::vector isAscOrder = {true, false, true, true, true}; - auto intFlatValueVector = - std::make_shared(LogicalTypeID::INT64, memoryManager.get()); - auto doubleFlatValueVector = - std::make_shared(LogicalTypeID::DOUBLE, memoryManager.get()); - auto stringFlatValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - auto timestampFlatValueVector = - std::make_shared(LogicalTypeID::TIMESTAMP, memoryManager.get()); - auto dateFlatValueVector = - std::make_shared(LogicalTypeID::DATE, memoryManager.get()); - - auto mockDataChunk = std::make_shared(5); - mockDataChunk->insert(0, intFlatValueVector); - mockDataChunk->insert(1, doubleFlatValueVector); - mockDataChunk->insert(2, stringFlatValueVector); - mockDataChunk->insert(3, timestampFlatValueVector); - mockDataChunk->insert(4, dateFlatValueVector); - - intFlatValueVector->state->currIdx = 0; - doubleFlatValueVector->state->currIdx = 0; - stringFlatValueVector->state->currIdx = 0; - timestampFlatValueVector->state->currIdx = 0; - dateFlatValueVector->state->currIdx = 0; - - std::vector valueVectors; - valueVectors.emplace_back(intFlatValueVector.get()); - valueVectors.emplace_back(doubleFlatValueVector.get()); - valueVectors.emplace_back(stringFlatValueVector.get()); - valueVectors.emplace_back(timestampFlatValueVector.get()); - valueVectors.emplace_back(dateFlatValueVector.get()); - - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - uint8_t* keyBlockPtr = orderByKeyEncoder.getKeyBlocks()[0]->getData(); - - intFlatValueVector->setValue(0, (int64_t)73); - intFlatValueVector->setValue(1, (int64_t)-132); - intFlatValueVector->setValue(2, (int64_t)-412414); - doubleFlatValueVector->setValue(0, (double_t)53.421); - doubleFlatValueVector->setValue(1, (double_t)-415.23); - doubleFlatValueVector->setNull(2, true); - stringFlatValueVector->setNull(0, true); - stringFlatValueVector->setValue(1, "this is a test string!!"); - stringFlatValueVector->setValue(2, "short str"); - timestampFlatValueVector->setValue( - 0, Timestamp::fromCString("2008-08-08 20:20:20", strlen("2008-08-08 20:20:20"))); - timestampFlatValueVector->setValue( - 1, Timestamp::fromCString("1962-04-07 11:12:35.123", strlen("1962-04-07 11:12:35.123"))); - timestampFlatValueVector->setNull(2, true); - dateFlatValueVector->setValue(0, Date::fromCString("1978-09-12", strlen("1978-09-12"))); - dateFlatValueVector->setValue(1, Date::fromCString("2035-07-04", strlen("2035-07-04"))); - dateFlatValueVector->setNull(2, true); - mockDataChunk->state->selVector->resetSelectorToValuePosBuffer(); - mockDataChunk->state->selVector->selectedSize = 1; - for (auto i = 0u; i < 3; i++) { - mockDataChunk->state->selVector->selectedPositions[0] = i; - orderByKeyEncoder.encodeKeys(); - mockDataChunk->state->currIdx++; - } - valueVectors[0]->state->selVector->resetSelectorToUnselected(); - - // Check encoding for: NULL FLAG(0x00) + 73=0x8000000000000049(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - for (auto i = 0u; i < 6; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0x00); - } - ASSERT_EQ(*(keyBlockPtr++), 0x49); - - // Check encoding for: NULL FLAG(0x00) + 53.421=0x3FB54A1CAC083126(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[1]); - ASSERT_EQ(*(keyBlockPtr++), 0x3F); - ASSERT_EQ(*(keyBlockPtr++), 0xB5); - ASSERT_EQ(*(keyBlockPtr++), 0x4A); - ASSERT_EQ(*(keyBlockPtr++), 0x1C); - ASSERT_EQ(*(keyBlockPtr++), 0xAC); - ASSERT_EQ(*(keyBlockPtr++), 0x08); - ASSERT_EQ(*(keyBlockPtr++), 0x31); - ASSERT_EQ(*(keyBlockPtr++), 0x26); - - checkNullVal(keyBlockPtr, LogicalTypeID::STRING, isAscOrder[2]); - - // Check encoding for: NULL FLAG(0x00) + "2008-08-08 20:20:20"=0x800453F888DCA900 - // (1218226820000000 micros in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[3]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x04); - ASSERT_EQ(*(keyBlockPtr++), 0x53); - ASSERT_EQ(*(keyBlockPtr++), 0xF8); - ASSERT_EQ(*(keyBlockPtr++), 0x88); - ASSERT_EQ(*(keyBlockPtr++), 0xDC); - ASSERT_EQ(*(keyBlockPtr++), 0xA9); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - - // Check encoding for: NULL FLAG(0x00) + "1978-09-12"=0x80000C68(3176 days in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[4]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x0C); - ASSERT_EQ(*(keyBlockPtr++), 0x68); - - checkTupleIdxAndFactorizedTableIdx(0, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -132=0x7FFFFFFFFFFFFF7C(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - for (auto i = 0u; i < 6; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - ASSERT_EQ(*(keyBlockPtr++), 0x7C); - - // Check encoding for: NULL FLAG(0x00) + -415.23=0xC079F3AE147AE148(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[1]); - ASSERT_EQ(*(keyBlockPtr++), 0xC0); - ASSERT_EQ(*(keyBlockPtr++), 0x79); - ASSERT_EQ(*(keyBlockPtr++), 0xF3); - ASSERT_EQ(*(keyBlockPtr++), 0xAE); - ASSERT_EQ(*(keyBlockPtr++), 0x14); - ASSERT_EQ(*(keyBlockPtr++), 0x7A); - ASSERT_EQ(*(keyBlockPtr++), 0xE1); - ASSERT_EQ(*(keyBlockPtr++), 0x48); - - // Check encoding for: "this is a test string!!". - checkNonNullFlag(keyBlockPtr, isAscOrder[2]); - ASSERT_EQ(*(keyBlockPtr++), 't'); - ASSERT_EQ(*(keyBlockPtr++), 'h'); - ASSERT_EQ(*(keyBlockPtr++), 'i'); - ASSERT_EQ(*(keyBlockPtr++), 's'); - ASSERT_EQ(*(keyBlockPtr++), ' '); - ASSERT_EQ(*(keyBlockPtr++), 'i'); - ASSERT_EQ(*(keyBlockPtr++), 's'); - ASSERT_EQ(*(keyBlockPtr++), ' '); - ASSERT_EQ(*(keyBlockPtr++), 'a'); - ASSERT_EQ(*(keyBlockPtr++), ' '); - ASSERT_EQ(*(keyBlockPtr++), 't'); - ASSERT_EQ(*(keyBlockPtr++), 'e'); - checkLongStrFlag(keyBlockPtr, isAscOrder[2], true /* isLongStr */); - - // Check encoding for: NULL FLAG(0x00) + "1962-04-07 11:12:35.123"=0x7FFF21F7F9D08F38 - // (-244126044877000 micros in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[3]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - ASSERT_EQ(*(keyBlockPtr++), 0x21); - ASSERT_EQ(*(keyBlockPtr++), 0xF7); - ASSERT_EQ(*(keyBlockPtr++), 0xF9); - ASSERT_EQ(*(keyBlockPtr++), 0xD0); - ASSERT_EQ(*(keyBlockPtr++), 0x8F); - ASSERT_EQ(*(keyBlockPtr++), 0x38); - - // Check encoding for: NULL FLAG(0x00) + "2035-07-04"=0x80005D75(23925 days in big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[4]); - ASSERT_EQ(*(keyBlockPtr++), 0x80); - ASSERT_EQ(*(keyBlockPtr++), 0x00); - ASSERT_EQ(*(keyBlockPtr++), 0x5D); - ASSERT_EQ(*(keyBlockPtr++), 0x75); - - checkTupleIdxAndFactorizedTableIdx(1, keyBlockPtr); - - // Check encoding for: NULL FLAG(0x00) + -412414=0x7FFFFFFFFFF9B502(big endian). - checkNonNullFlag(keyBlockPtr, isAscOrder[0]); - ASSERT_EQ(*(keyBlockPtr++), 0x7F); - for (auto i = 0u; i < 4; i++) { - ASSERT_EQ(*(keyBlockPtr++), 0xFF); - } - ASSERT_EQ(*(keyBlockPtr++), 0xF9); - ASSERT_EQ(*(keyBlockPtr++), 0xB5); - ASSERT_EQ(*(keyBlockPtr++), 0x02); - - checkNullVal(keyBlockPtr, LogicalTypeID::DOUBLE, isAscOrder[1]); - - // Check encoding for: "short str". - checkNonNullFlag(keyBlockPtr, isAscOrder[2]); - ASSERT_EQ(*(keyBlockPtr++), 's'); - ASSERT_EQ(*(keyBlockPtr++), 'h'); - ASSERT_EQ(*(keyBlockPtr++), 'o'); - ASSERT_EQ(*(keyBlockPtr++), 'r'); - ASSERT_EQ(*(keyBlockPtr++), 't'); - ASSERT_EQ(*(keyBlockPtr++), ' '); - ASSERT_EQ(*(keyBlockPtr++), 's'); - ASSERT_EQ(*(keyBlockPtr++), 't'); - ASSERT_EQ(*(keyBlockPtr++), 'r'); - ASSERT_EQ(*(keyBlockPtr++), '\0'); - ASSERT_EQ(*(keyBlockPtr++), '\0'); - ASSERT_EQ(*(keyBlockPtr++), '\0'); - checkLongStrFlag(keyBlockPtr, isAscOrder[2], false /* isLongStr */); - - checkNullVal(keyBlockPtr, LogicalTypeID::TIMESTAMP, isAscOrder[3]); - - checkNullVal(keyBlockPtr, LogicalTypeID::DATE, isAscOrder[4]); - - checkTupleIdxAndFactorizedTableIdx(2, keyBlockPtr); -} - -TEST_F(OrderByKeyEncoderTest, multipleOrderByColMultiBlockTest) { - const auto numOfOrderByCols = 10; - const auto numOfElementsPerCol = 2000; - auto [valueVectors, dataChunk] = - getInt64TestValueVector(numOfElementsPerCol, numOfOrderByCols, true); - auto isAscOrder = std::vector(numOfOrderByCols, true); - auto orderByKeyEncoder = OrderByKeyEncoder(valueVectors, isAscOrder, memoryManager.get(), ftIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(valueVectors)); - valueVectors[0]->state->selVector->resetSelectorToValuePosBuffer(); - valueVectors[0]->state->selVector->selectedSize = 1; - for (auto i = 0u; i < numOfElementsPerCol; i++) { - valueVectors[0]->state->selVector->selectedPositions[0] = i; - orderByKeyEncoder.encodeKeys(); - valueVectors[0]->state->currIdx++; - } - valueVectors[0]->state->selVector->resetSelectorToUnselected(); - checkKeyBlockForInt64TestValueVector(valueVectors, orderByKeyEncoder.getKeyBlocks(), - numOfElementsPerCol, isAscOrder, orderByKeyEncoder.getMaxNumTuplesPerBlock()); -} diff --git a/test/processor/order_by/radix_sort_test.cpp b/test/processor/order_by/radix_sort_test.cpp deleted file mode 100644 index f4ddda4ef6..0000000000 --- a/test/processor/order_by/radix_sort_test.cpp +++ /dev/null @@ -1,468 +0,0 @@ -#include -#include -#include - -#include "common/assert.h" -#include "common/constants.h" -#include "common/data_chunk/data_chunk.h" -#include "gtest/gtest.h" -#include "processor/operator/order_by/order_by_key_encoder.h" -#include "processor/operator/order_by/radix_sort.h" - -using ::testing::Test; -using namespace kuzu::common; -using namespace kuzu::processor; -using namespace kuzu::storage; - -class RadixSortTest : public Test { - -public: - void SetUp() override { - LoggerUtils::createLogger(LoggerConstants::LoggerEnum::BUFFER_MANAGER); - LoggerUtils::createLogger(LoggerConstants::LoggerEnum::STORAGE); - bufferManager = std::make_unique( - BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING); - memoryManager = std::make_unique(bufferManager.get()); - } - - void TearDown() override { - LoggerUtils::dropLogger(LoggerConstants::LoggerEnum::BUFFER_MANAGER); - LoggerUtils::dropLogger(LoggerConstants::LoggerEnum::STORAGE); - } - -public: - std::unique_ptr bufferManager; - std::unique_ptr memoryManager; - const uint8_t factorizedTableIdx = 9; - const uint32_t numTuplesPerBlockInFT = BufferPoolConstants::PAGE_256KB_SIZE / 8; - - void checkTupleIdxesAndFactorizedTableIdxes(uint8_t* keyBlockPtr, const uint64_t entrySize, - const std::vector& expectedFTBlockOffsetOrder) { - for (auto expectedFTBlockOffset : expectedFTBlockOffsetOrder) { - auto tupleInfoPtr = keyBlockPtr + entrySize - 8; - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTIdx(tupleInfoPtr), factorizedTableIdx); - ASSERT_EQ(OrderByKeyEncoder::getEncodedFTBlockIdx(tupleInfoPtr), 0); - auto encodedFTBlockOffset = OrderByKeyEncoder::getEncodedFTBlockOffset(tupleInfoPtr); - if (expectedFTBlockOffset != -1) { - ASSERT_EQ(encodedFTBlockOffset, expectedFTBlockOffset); - } else { - // For tuples with the same value, we just need to check the tuple id is valid and - // in the range of [0, expectedFTBlockOffsetOrder.size()). - ASSERT_EQ((0 <= encodedFTBlockOffset) && - (encodedFTBlockOffset < expectedFTBlockOffsetOrder.size()), - true); - } - keyBlockPtr += entrySize; - } - } - - void sortAllKeyBlocks(OrderByKeyEncoder& orderByKeyEncoder, RadixSort& radixSort) { - for (auto& keyBlock : orderByKeyEncoder.getKeyBlocks()) { - radixSort.sortSingleKeyBlock(*keyBlock); - } - } - - template - void singleOrderByColTest(const std::vector& sortingData, const std::vector& nullMasks, - const std::vector& expectedBlockOffsetOrder, const LogicalTypeID dataTypeID, - const bool isAsc, bool hasPayLoadCol) { - KU_ASSERT(sortingData.size() == nullMasks.size()); - KU_ASSERT(sortingData.size() == expectedBlockOffsetOrder.size()); - auto dataChunk = std::make_shared(hasPayLoadCol ? 2 : 1); - dataChunk->state->selVector->selectedSize = sortingData.size(); - auto valueVector = std::make_shared(dataTypeID, memoryManager.get()); - for (auto i = 0u; i < dataChunk->state->selVector->selectedSize; i++) { - if (nullMasks[i]) { - valueVector->setNull(i, true); - } else { - valueVector->setValue(i, sortingData[i]); - } - } - dataChunk->insert(0, valueVector); - std::vector orderByVectors{ - valueVector.get()}; // only contains order_by columns - std::vector allVectors{ - valueVector.get()}; // all columns including order_by and payload columns - std::vector isAscOrder{isAsc}; - - std::unique_ptr tableSchema = - std::make_unique(); - tableSchema->appendColumn(std::make_unique(false /* isUnflat */, - 0 /* dataChunkPos */, LogicalTypeUtils::getRowLayoutSize(LogicalType{dataTypeID}))); - std::vector strKeyColsInfo; - - if (hasPayLoadCol) { - // Create a new payloadValueVector for the payload column. - auto payloadValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - for (auto i = 0u; i < dataChunk->state->selVector->selectedSize; i++) { - payloadValueVector->setValue(i, std::to_string(i)); - } - dataChunk->insert(1, payloadValueVector); - // To test whether the orderByCol -> ftIdx works properly, we put the - // payload column at index 0, and the orderByCol at index 1. - allVectors.insert(allVectors.begin(), payloadValueVector.get()); - tableSchema->appendColumn(std::make_unique(false /* isUnflat */, - 0 /* dataChunkPos */, LogicalTypeUtils::getRowLayoutSize(LogicalType{dataTypeID}))); - strKeyColsInfo.emplace_back( - StrKeyColInfo(tableSchema->getColOffset(1) /* colOffsetInFT */, - 0 /* colOffsetInEncodedKeyBlock */, isAsc)); - } else if constexpr (std::is_same::value) { - // If this is a string column and has no payload column, then the - // factorizedTable offset is just 0. - strKeyColsInfo.emplace_back( - StrKeyColInfo(tableSchema->getColOffset(0) /* colOffsetInFT */, - 0 /* colOffsetInEncodedKeyBlock */, isAsc)); - } - - FactorizedTable factorizedTable(memoryManager.get(), std::move(tableSchema)); - factorizedTable.append(allVectors); - - auto orderByKeyEncoder = - OrderByKeyEncoder(orderByVectors, isAscOrder, memoryManager.get(), factorizedTableIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(orderByVectors)); - orderByKeyEncoder.encodeKeys(); - - RadixSort radixSort = - RadixSort(memoryManager.get(), factorizedTable, orderByKeyEncoder, strKeyColsInfo); - sortAllKeyBlocks(orderByKeyEncoder, radixSort); - - checkTupleIdxesAndFactorizedTableIdxes(orderByKeyEncoder.getKeyBlocks()[0]->getData(), - orderByKeyEncoder.getNumBytesPerTuple(), expectedBlockOffsetOrder); - } - - void multipleOrderByColSolveTieTest(std::vector& isAscOrder, - std::vector& expectedBlockOffsetOrder, - std::vector>& stringValues) { - std::vector orderByVectors; - auto mockDataChunk = std::make_shared(stringValues.size()); - mockDataChunk->state->currIdx = 0; - std::unique_ptr tableSchema = - std::make_unique(); - std::vector strKeyColsInfo; - for (auto i = 0; i < stringValues.size(); i++) { - auto stringValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - tableSchema->appendColumn(std::make_unique( - false /* isUnflat */, 0 /* dataChunkPos */, sizeof(ku_string_t))); - strKeyColsInfo.push_back(StrKeyColInfo(tableSchema->getColOffset(strKeyColsInfo.size()), - strKeyColsInfo.size() * - OrderByKeyEncoder::getEncodingSize(stringValueVector->dataType), - isAscOrder[i])); - mockDataChunk->insert(i, stringValueVector); - for (auto j = 0u; j < stringValues[i].size(); j++) { - stringValueVector->setValue(j, stringValues[i][j]); - } - orderByVectors.emplace_back(stringValueVector.get()); - } - - FactorizedTable factorizedTable(memoryManager.get(), std::move(tableSchema)); - - auto orderByKeyEncoder = - OrderByKeyEncoder(orderByVectors, isAscOrder, memoryManager.get(), factorizedTableIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(orderByVectors)); - mockDataChunk->state->selVector->resetSelectorToValuePosBuffer(); - mockDataChunk->state->selVector->selectedSize = 1; - for (auto i = 0u; i < expectedBlockOffsetOrder.size(); i++) { - mockDataChunk->state->selVector->selectedPositions[0] = i; - factorizedTable.append(orderByVectors); - orderByKeyEncoder.encodeKeys(); - mockDataChunk->state->currIdx++; - } - mockDataChunk->state->selVector->resetSelectorToUnselected(); - - auto radixSort = - RadixSort(memoryManager.get(), factorizedTable, orderByKeyEncoder, strKeyColsInfo); - sortAllKeyBlocks(orderByKeyEncoder, radixSort); - - checkTupleIdxesAndFactorizedTableIdxes(orderByKeyEncoder.getKeyBlocks()[0]->getData(), - orderByKeyEncoder.getNumBytesPerTuple(), expectedBlockOffsetOrder); - } -}; - -TEST_F(RadixSortTest, singleOrderByColInt64Test) { - std::vector sortingData = {73 /* positive 1 byte number */, 0 /* NULL */, - -132 /* negative 1 byte number */, -5242 /* negative 2 bytes number */, INT64_MAX, - INT64_MIN, 210042 /* positive 2 bytes number */}; - std::vector nullMasks = {false, true, false, false, false, false, false}; - std::vector expectedFTBlockOffsetOrder = {5, 3, 2, 0, 6, 4, 1}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::INT64, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColNoNullInt64Test) { - std::vector sortingData = {48 /* positive 1 byte number */, - 39842 /* positive 2 bytes number */, -1 /* negative 1 byte number */, - -819321 /* negative 2 bytes number */, INT64_MAX, INT64_MIN}; - std::vector nullMasks(6, false); - std::vector expectedFTBlockOffsetOrder = {4, 1, 0, 2, 3, 5}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::INT64, - false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColLargeInputInt64Test) { - // 240 is the maximum number of tuples we can put into a memory block - // since: 4096 / (9 + 8) = 240. - std::vector sortingData(240); - iota(sortingData.begin(), sortingData.end(), 0); - reverse(sortingData.begin(), sortingData.end()); - std::vector nullMasks(240, false); - std::vector expectedFTBlockOffsetOrder(240); - iota(expectedFTBlockOffsetOrder.begin(), expectedFTBlockOffsetOrder.end(), 0); - reverse(expectedFTBlockOffsetOrder.begin(), expectedFTBlockOffsetOrder.end()); - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::INT64, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColBoolTest) { - std::vector sortingData = {true, false, false /* NULL */}; - std::vector nullMasks = {false, false, true}; - std::vector expectedFTBlockOffsetOrder = {2, 0, 1}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::BOOL, - false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColDateTest) { - std::vector sortingData = { - Date::fromCString("1970-01-01", strlen("1970-01-01")) /* days=0 */, - Date::fromCString("1970-01-02", strlen("1970-01-02")) /* positive days */, - Date::fromCString("2003-10-12", strlen("2003-10-12")) /* large positive days */, - Date::fromCString("1968-12-21", strlen("1968-12-21")) /* negative days */, - date_t(0) /*NULL*/}; - std::vector nullMasks = {false, false, false, false, true}; - std::vector expectedFTBlockOffsetOrder = {3, 0, 1, 2, 4}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::DATE, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColTimestampTest) { - std::vector sortingData = { - Timestamp::fromCString("1970-01-01 00:00:00", strlen("1970-01-01 00:00:00")) /* micros=0 */, - Timestamp::fromCString( - "1970-01-02 14:21:11", strlen("1970-01-02 14:21:11")) /* positive micros */, - timestamp_t(0) /*NULL*/, - Timestamp::fromCString( - "2003-10-12 08:21:10", strlen("2003-10-12 08:21:10")) /* large positive micros */, - Timestamp::fromCString( - "1959-03-20 11:12:13.500", strlen("1959-03-20 11:12:13.500")) /* negative micros */ - }; - - std::vector nullMasks = {false, false, true, false, false}; - std::vector expectedFTBlockOffsetOrder = {2, 3, 1, 0, 4}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, - LogicalTypeID::TIMESTAMP, false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColIntervalTest) { - // We need to normalize days and micros in intervals. - std::vector sortingData = { - interval_t(0, 0, 0) /* NULL */, - Interval::fromCString( - "100 days 3 years 2 hours 178 minutes", strlen("100 days 3 years 2 hours 178 minutes")), - Interval::fromCString("2 years 466 days 20 minutes", - strlen("2 years 466 days 20 minutes")) /* =3 years 106 days 20 minutes */, - Interval::fromCString("3 years 99 days 200 hours 100 minutes", - strlen("3 years 99 days 100 hours 100 minutes")) /* =3 years 107 days 8 hours 100 - minutes */ - , - }; - - std::vector nullMasks = {true, false, false, false}; - std::vector expectedFTBlockOffsetOrder = {0, 3, 2, 1}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, - LogicalTypeID::INTERVAL, false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColDoubleTest) { - std::vector sortingData = {0.0123 /* small positive number */, - -0.90123 /* small negative number */, 95152 /* large positive number */, - -76123 /* large negative number */, 0, 0 /* NULL */}; - std::vector nullMasks = {false, false, false, false, false, true}; - std::vector expectedFTBlockOffsetOrder = {5, 2, 0, 4, 1, 3}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::DOUBLE, - false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColStringTest) { - // Multiple groups of string with the same prefix generates multiple groups of ties during radix - // sort. - std::vector sortingData = {"abcdef", "other common prefix test1", - "another common prefix2", "common prefix rank1", "common prefix rank3", - "common prefix rank2", "another common prefix1", "another short string", "" /*NULL*/}; - std::vector nullMasks = {false, false, false, false, false, false, false, false, true}; - std::vector expectedFTBlockOffsetOrder = {0, 6, 2, 7, 3, 5, 4, 1, 8}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::STRING, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColNoNullStringTest) { - // Multiple groups of string with the same prefix generates multiple groups of ties during radix - // sort. - std::vector sortingData = {"simple short", "other common prefix test2", - "another common prefix2", "common prefix rank1", "common prefix rank3", - "common prefix rank2", "other common prefix test3", "another short string"}; - std::vector nullMasks(8, false); - std::vector expectedFTBlockOffsetOrder = {0, 6, 1, 4, 5, 3, 7, 2}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::STRING, - false /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColAllTiesStringTest) { - // All the strings are the same, so there is a tie across all tuples and the tie can't be - // solved. The tuple ordering depends on the c++ std::sort, so we just need to check that the - // tupleIdx is valid and is in the range of [0~19). - std::vector sortingData(20, "same string for all tuples"); - std::vector nullMasks(20, false); - std::vector expectedFTBlockOffsetOrder(20, -1); - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::STRING, - true /* isAsc */, false /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, singleOrderByColWithPayloadTest) { - // The first column is a payload column and the second column is an order_by column. The radix - // sort needs to use the factorizedTableColIdx to correctly read the strings. - std::vector sortingData = {"string column with payload col test5", - "string column with payload col test3", "string 1", - "string column with payload col long long", "very long long long string"}; - std::vector nullMasks(5, false); - std::vector expectedFTBlockOffsetOrder = {2, 3, 1, 0, 4}; - singleOrderByColTest(sortingData, nullMasks, expectedFTBlockOffsetOrder, LogicalTypeID::STRING, - true /* isAsc */, true /* hasPayLoadCol */); -} - -TEST_F(RadixSortTest, multipleOrderByColNoTieTest) { - std::vector isAscOrder = {true, false, true, false, false}; - auto intFlatValueVector = - std::make_shared(LogicalTypeID::INT64, memoryManager.get()); - auto doubleFlatValueVector = - std::make_shared(LogicalTypeID::DOUBLE, memoryManager.get()); - auto stringFlatValueVector = - std::make_shared(LogicalTypeID::STRING, memoryManager.get()); - auto timestampFlatValueVector = - std::make_shared(LogicalTypeID::TIMESTAMP, memoryManager.get()); - auto dateFlatValueVector = - std::make_shared(LogicalTypeID::DATE, memoryManager.get()); - - auto mockDataChunk = std::make_shared(5); - mockDataChunk->insert(0, intFlatValueVector); - mockDataChunk->insert(1, doubleFlatValueVector); - mockDataChunk->insert(2, stringFlatValueVector); - mockDataChunk->insert(3, timestampFlatValueVector); - mockDataChunk->insert(4, dateFlatValueVector); - - intFlatValueVector->state->currIdx = 0; - doubleFlatValueVector->state->currIdx = 0; - stringFlatValueVector->state->currIdx = 0; - timestampFlatValueVector->state->currIdx = 0; - dateFlatValueVector->state->currIdx = 0; - - std::vector orderByVectors{intFlatValueVector.get(), doubleFlatValueVector.get(), - stringFlatValueVector.get(), timestampFlatValueVector.get(), dateFlatValueVector.get()}; - intFlatValueVector->setValue(0, (int64_t)41); - intFlatValueVector->setValue(1, (int64_t)-132); - intFlatValueVector->setValue(2, (int64_t)41); - intFlatValueVector->setNull(3, true); - intFlatValueVector->setValue(4, (int64_t)0); - doubleFlatValueVector->setValue(0, (double_t)453.421); - doubleFlatValueVector->setValue(1, (double_t)-415.23); - doubleFlatValueVector->setValue(2, (double_t)-0.00421); - doubleFlatValueVector->setValue(3, (double_t)0); - doubleFlatValueVector->setValue(4, (double_t)0.0121); - stringFlatValueVector->setValue(0, "common prefix2"); - stringFlatValueVector->setValue(1, "common prefix1"); - stringFlatValueVector->setValue(2, "common prefix"); - stringFlatValueVector->setNull(3, true); - stringFlatValueVector->setValue(4, "short str"); - timestampFlatValueVector->setValue( - 0, Timestamp::fromCString("1970-01-01 00:00:00", strlen("1970-01-01 00:00:00"))); - timestampFlatValueVector->setValue( - 1, Timestamp::fromCString("1962-04-07 14:11:23", strlen("1962-04-07 14:11:23"))); - timestampFlatValueVector->setValue( - 2, Timestamp::fromCString("1970-01-01 01:00:00", strlen("1970-01-01 01:00:00"))); - timestampFlatValueVector->setValue( - 3, Timestamp::fromCString("1953-01-12 21:12:00", strlen("2053-01-12 21:12:00"))); - timestampFlatValueVector->setNull(4, true); - dateFlatValueVector->setValue(0, Date::fromCString("1978-09-12", strlen("1978-09-12"))); - dateFlatValueVector->setValue(1, Date::fromCString("2035-07-04", strlen("2035-07-04"))); - dateFlatValueVector->setNull(2, true); - dateFlatValueVector->setValue(3, Date::fromCString("1964-01-21", strlen("1964-01-21"))); - dateFlatValueVector->setValue(4, Date::fromCString("2000-11-13", strlen("2000-11-13"))); - - std::unique_ptr tableSchema = std::make_unique(); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::INT64}))); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::DOUBLE}))); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::STRING}))); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::TIMESTAMP}))); - tableSchema->appendColumn( - std::make_unique(false /* isUnflat */, 0 /* dataChunkPos */, - LogicalTypeUtils::getRowLayoutSize(LogicalType{LogicalTypeID::DATE}))); - - FactorizedTable factorizedTable(memoryManager.get(), std::move(tableSchema)); - std::vector strKeyColsInfo = {StrKeyColInfo(16 /* colOffsetInFT */, - OrderByKeyEncoder::getEncodingSize(LogicalType(LogicalTypeID::INT64)) + - OrderByKeyEncoder::getEncodingSize(LogicalType(LogicalTypeID::DOUBLE)), - true /* isAscOrder */)}; - - auto orderByKeyEncoder = - OrderByKeyEncoder(orderByVectors, isAscOrder, memoryManager.get(), factorizedTableIdx, - numTuplesPerBlockInFT, OrderByKeyEncoder::getNumBytesPerTuple(orderByVectors)); - mockDataChunk->state->selVector->resetSelectorToValuePosBuffer(); - mockDataChunk->state->selVector->selectedSize = 1; - for (auto i = 0u; i < 5; i++) { - mockDataChunk->state->selVector->selectedPositions[0] = i; - orderByKeyEncoder.encodeKeys(); - factorizedTable.append(orderByVectors); - mockDataChunk->state->currIdx++; - } - mockDataChunk->state->selVector->resetSelectorToUnselected(); - - RadixSort radixSort = - RadixSort(memoryManager.get(), factorizedTable, orderByKeyEncoder, strKeyColsInfo); - sortAllKeyBlocks(orderByKeyEncoder, radixSort); - - std::vector expectedFTBlockOffsetOrder = {1, 4, 0, 2, 3}; - checkTupleIdxesAndFactorizedTableIdxes(orderByKeyEncoder.getKeyBlocks()[0]->getData(), - orderByKeyEncoder.getNumBytesPerTuple(), expectedFTBlockOffsetOrder); -} - -TEST_F(RadixSortTest, multipleOrderByColSolvableTieTest) { - std::vector isAscOrder = {false, true}; - std::vector expectedFTBlockOffsetOrder = { - 4, - 0, - 1, - 3, - 2, - }; - // The first column has ties, need to compare the second column to solve the tie. However there - // are still some ties that are not solvable. - std::vector> stringValues = { - {"same common prefix different1", "same common prefix different", - "same common prefix different", "same common prefix different", - "same common prefix different1"}, - {"second same common prefix2", "second same common prefix0", "second same common prefix3", - "second same common prefix2", "second same common prefix1"}}; - multipleOrderByColSolveTieTest(isAscOrder, expectedFTBlockOffsetOrder, stringValues); -} - -TEST_F(RadixSortTest, multipleOrderByColUnSolvableTieTest) { - std::vector isAscOrder = {true, true}; - std::vector expectedFTBlockOffsetOrder = {1, 3, 2, 0, 4}; - // The first column has ties, need to compare the second column to solve the tie. However there - // are still some ties that are not solvable. - std::vector> stringValues = { - {"same common prefix different1", "same common prefix different", - "same common prefix different", "same common prefix different", - "same common prefix different1"}, - {"second same common prefix2", "second same common prefix0", "second same common prefix3", - "second same common prefix0", "second same common prefix2"}}; - multipleOrderByColSolveTieTest(isAscOrder, expectedFTBlockOffsetOrder, stringValues); -} diff --git a/test/test_files/copy/copy_node_csv.test b/test/test_files/copy/copy_node_csv.test index 1ed9fd97a9..e4ee92b710 100644 --- a/test/test_files/copy/copy_node_csv.test +++ b/test/test_files/copy/copy_node_csv.test @@ -53,3 +53,35 @@ -STATEMENT MATCH (row:tableOfTypes) WHERE 0 <= row.doubleColumn AND row.doubleColumn <= 10 AND 0 <= row.int64Column AND row.int64Column <= 10 RETURN count(*); ---- 1 546 + +-LOG OrderByTest +-STATEMENT MATCH (row:tableOfTypes) RETURN row.id ORDER BY row.id DESC LIMIT 10; +-PARALLELISM 1 +-CHECK_ORDER +---- 10 +49998 +49997 +49996 +49995 +49994 +49993 +49992 +49991 +49990 +49989 + +-LOG OrderByMultiColTest +-STATEMENT MATCH (row:tableOfTypes) RETURN row.id ORDER BY row.id DESC, row.booleanColumn, row.doubleColumn LIMIT 10; +-PARALLELISM 1 +-CHECK_ORDER +---- 10 +49998 +49997 +49996 +49995 +49994 +49993 +49992 +49991 +49990 +49989 diff --git a/test/test_files/tinysnb/order_by/single_label.test b/test/test_files/tinysnb/order_by/single_label.test index 9182c23a82..5cfcd57c4c 100644 --- a/test/test_files/tinysnb/order_by/single_label.test +++ b/test/test_files/tinysnb/order_by/single_label.test @@ -220,6 +220,10 @@ Elizabeth -STATEMENT MATCH (p:person) WHERE p.age > 100 RETURN p.age ORDER BY p.age ---- 0 +-LOG OrderByLimitEmptyResult +-STATEMENT MATCH (p:person) WHERE p.age > 100 RETURN p.age ORDER BY p.age limit 10 +---- 0 + -LOG OrderByAggregateTest1 -STATEMENT MATCH (a:person)-[:knows]->(b:person) return a.age, COUNT(b) as c ORDER BY a.age -CHECK_ORDER @@ -237,3 +241,12 @@ Elizabeth ---- 2 1|8 2|6 + +-LOG OrderByMultipleCols +-STATEMENT MATCH (p:person)-[s:studyAt]->(o:organisation) return s.year ORDER BY s.length, p.lastJobDuration, o.mark, p.height limit 5 +-CHECK_ORDER +-ENUMERATE +---- 3 +2021 +2020 +2020