diff --git a/src/include/storage/buffer_manager/bm_file_handle.h b/src/include/storage/buffer_manager/bm_file_handle.h index 0350604740..13a85f36c3 100644 --- a/src/include/storage/buffer_manager/bm_file_handle.h +++ b/src/include/storage/buffer_manager/bm_file_handle.h @@ -6,44 +6,87 @@ namespace kuzu { namespace storage { -static constexpr uint64_t IS_IN_FRAME_MASK = 0x8000000000000000; -static constexpr uint64_t DIRTY_MASK = 0x4000000000000000; -static constexpr uint64_t PAGE_IDX_MASK = 0x3FFFFFFFFFFFFFFF; - -enum class LockMode : uint8_t { SPIN = 0, NON_BLOCKING = 1 }; - class BMFileHandle; class BufferManager; // Keeps the state information of a page in a file. class PageState { + static constexpr uint64_t DIRTY_MASK = 0x0080000000000000; + static constexpr uint64_t STATE_MASK = 0xFF00000000000000; + static constexpr uint64_t VERSION_MASK = 0x00FFFFFFFFFFFFFF; + static constexpr uint64_t NUM_BITS_TO_SHIFT_FOR_STATE = 56; + public: - inline bool isInFrame() const { return pageIdx & IS_IN_FRAME_MASK; } - inline void setDirty() { pageIdx |= DIRTY_MASK; } - inline void clearDirty() { pageIdx &= ~DIRTY_MASK; } - inline bool isDirty() const { return pageIdx & DIRTY_MASK; } - inline common::page_idx_t getPageIdx() const { - return (common::page_idx_t)(pageIdx & PAGE_IDX_MASK); - } - inline uint64_t incrementPinCount() { return pinCount.fetch_add(1); } - inline uint64_t decrementPinCount() { return pinCount.fetch_sub(1); } - inline void setPinCount(uint64_t newPinCount) { pinCount.store(newPinCount); } - inline uint64_t getPinCount() const { return pinCount.load(); } - inline uint64_t getEvictionTimestamp() const { return evictionTimestamp.load(); } - inline uint64_t incrementEvictionTimestamp() { return evictionTimestamp.fetch_add(1); } - inline void releaseLock() { lock.clear(); } - - bool acquireLock(LockMode lockMode); - void setInFrame(common::page_idx_t pageIdx); - void resetState(); + static constexpr uint64_t UNLOCKED = 0; + static constexpr uint64_t LOCKED = 1; + static constexpr uint64_t MARKED = 2; + static constexpr uint64_t EVICTED = 3; + + PageState() { + stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release); + } + + inline uint64_t getState() { return getState(stateAndVersion.load()); } + inline static uint64_t getState(uint64_t stateAndVersion) { + return (stateAndVersion & STATE_MASK) >> NUM_BITS_TO_SHIFT_FOR_STATE; + } + inline static uint64_t getVersion(uint64_t stateAndVersion) { + return stateAndVersion & VERSION_MASK; + } + inline static uint64_t updateStateWithSameVersion( + uint64_t oldStateAndVersion, uint64_t newState) { + return ((oldStateAndVersion << 8) >> 8) | (newState << NUM_BITS_TO_SHIFT_FOR_STATE); + } + inline static uint64_t updateStateAndIncrementVersion( + uint64_t oldStateAndVersion, uint64_t newState) { + return (((oldStateAndVersion << 8) >> 8) + 1) | (newState << NUM_BITS_TO_SHIFT_FOR_STATE); + } + inline void spinLock(uint64_t oldStateAndVersion) { + while (true) { + if (tryLock(oldStateAndVersion)) { + return; + } + } + } + inline bool tryLock(uint64_t oldStateAndVersion) { + return stateAndVersion.compare_exchange_strong( + oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, LOCKED)); + } + inline void unlock() { + assert(getState(stateAndVersion.load()) == LOCKED); + stateAndVersion.store(updateStateAndIncrementVersion(stateAndVersion.load(), UNLOCKED), + std::memory_order_release); + } + // Change page state from Mark to Unlocked. + inline bool tryClearMark(uint64_t oldStateAndVersion) { + assert(getState(oldStateAndVersion) == MARKED); + return stateAndVersion.compare_exchange_strong( + oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, UNLOCKED)); + } + inline bool tryMark(uint64_t oldStateAndVersion) { + return stateAndVersion.compare_exchange_strong( + oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, MARKED)); + } + + inline void setDirty() { + assert(getState(stateAndVersion.load()) == LOCKED); + stateAndVersion |= DIRTY_MASK; + } + inline void clearDirty() { + assert(getState(stateAndVersion.load()) == LOCKED); + stateAndVersion &= ~DIRTY_MASK; + } + inline bool isDirty() const { return stateAndVersion & DIRTY_MASK; } + uint64_t getStateAndVersion() const { return stateAndVersion.load(); } + + inline void resetToEvicted() { + stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release); + } private: - std::atomic_flag lock = ATOMIC_FLAG_INIT; - // Highest 1st bit indicates if this page is loaded or not, 2nd bit indicates if this - // page is dirty or not. The rest 62 bits records the page idx inside the file. - uint64_t pageIdx = 0; - std::atomic pinCount = 0; - std::atomic evictionTimestamp = 0; + // Highest 1 bit is dirty bit, and the rest are page state and version bits. + // In the rest bits, the lowest 1 byte is state, and the rest are version. + std::atomic stateAndVersion; }; // This class is used to keep the WAL page idxes of a page group in the original file handle. @@ -94,6 +137,12 @@ class BMFileHandle : public FileHandle { BMFileHandle(const std::string& path, uint8_t flags, BufferManager* bm, common::PageSizeClass pageSizeClass, FileVersionedType fileVersionedType); + // This function assumes the page is already LOCKED. + inline void setLockedPageDirty(common::page_idx_t pageIdx) { + assert(pageIdx < numPages); + pageStates[pageIdx]->setDirty(); + } + common::page_group_idx_t addWALPageIdxGroupIfNecessary(common::page_idx_t originalPageIdx); // This function is intended to be used after a fileInfo is created and we want the file // to have no pages and page locks. Should be called after ensuring that the buffer manager @@ -122,16 +171,6 @@ class BMFileHandle : public FileHandle { assert(pageIdx < numPages && pageStates[pageIdx]); return pageStates[pageIdx].get(); } - inline void clearPageState(common::page_idx_t pageIdx) { - assert(pageIdx < numPages && pageStates[pageIdx]); - pageStates[pageIdx]->resetState(); - } - inline bool acquirePageLock(common::page_idx_t pageIdx, LockMode lockMode) { - return getPageState(pageIdx)->acquireLock(lockMode); - } - inline void releasePageLock(common::page_idx_t pageIdx) { - getPageState(pageIdx)->releaseLock(); - } inline common::frame_idx_t getFrameIdx(common::page_idx_t pageIdx) { assert(pageIdx < pageCapacity); return (frameGroupIdxes[pageIdx >> common::StorageConstants::PAGE_GROUP_SIZE_LOG2] diff --git a/src/include/storage/buffer_manager/buffer_manager.h b/src/include/storage/buffer_manager/buffer_manager.h index 57e1b155f8..7b26fb4dbe 100644 --- a/src/include/storage/buffer_manager/buffer_manager.h +++ b/src/include/storage/buffer_manager/buffer_manager.h @@ -12,25 +12,38 @@ class logger; namespace kuzu { namespace storage { +// This class keeps state info for pages potentially can be evicted. +// The page state of a candidate is set to MARKED when it is first enqueued. After enqueued, if the +// candidate was recently accessed, it is no longer immediately evictable. See the state transition +// diagram above `BufferManager` class declaration for more details. struct EvictionCandidate { - bool isEvictable() const { - return pageState->getEvictionTimestamp() == evictionTimestamp && - pageState->getPinCount() == 0; + // If the candidate is Marked and its version is the same as the one kept inside the candidate, + // it is evictable. + inline bool isEvictable(uint64_t currPageStateAndVersion) const { + return PageState::getState(currPageStateAndVersion) == PageState::MARKED && + PageState::getVersion(currPageStateAndVersion) == pageVersion; + } + // If the candidate was recently read optimistically, it is second chance evictable. + inline bool isSecondChanceEvictable(uint64_t currPageStateAndVersion) const { + return PageState::getState(currPageStateAndVersion) == PageState::UNLOCKED && + PageState::getVersion(currPageStateAndVersion) == pageVersion; } - BMFileHandle* fileHandle; - PageState* pageState; - // The eviction timestamp of the corresponding page state at the time the candidate is enqueued. - uint64_t evictionTimestamp = -1u; + BMFileHandle* fileHandle = nullptr; + common::page_idx_t pageIdx = common::INVALID_PAGE_IDX; + PageState* pageState = nullptr; + // The version of the corresponding page at the time the candidate is enqueued. + uint64_t pageVersion = -1u; }; class EvictionQueue { public: EvictionQueue() { queue = std::make_unique>(); } - inline void enqueue( - BMFileHandle* fileHandle, PageState* frameHandle, uint64_t evictionTimestamp) { - queue->enqueue(EvictionCandidate{fileHandle, frameHandle, evictionTimestamp}); + inline void enqueue(EvictionCandidate& candidate) { queue->enqueue(candidate); } + inline void enqueue(BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState, + uint64_t pageVersion) { + queue->enqueue(EvictionCandidate{fileHandle, pageIdx, pageState, pageVersion}); } inline bool dequeue(EvictionCandidate& candidate) { return queue->try_dequeue(candidate); } void removeNonEvictableCandidates(); @@ -45,10 +58,12 @@ class EvictionQueue { * 1) it provides the high-level functionality to pin() and unpin() the pages of the database files * used by storage structures, such as the Column, Lists, or HashIndex in the storage layer, and * operates via their BMFileHandle to read/write the page data into/out of one of the frames. - * 2) it supports the MemoryManager (MM) to allocate memory buffers that are not backed by - * any disk files. Similar to disk files, MM provides BMFileHandles backed by temp in-mem files to - * the BM to pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin - * happens when MM tries to reclaim a memory buffer. + * 2) it provides optimistic read of pages, which optimistically read unlocked or marked pages + * without acquiring locks. + * 3) it supports the MemoryManager (MM) to allocate memory buffers that are not + * backed by any disk files. Similar to disk files, MM provides in-mem file handles to the BM to + * pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin happens + * when MM tries to reclaim a memory buffer. * * Specifically, in BM's context, page is the basic management unit of data in a file. The file can * be a disk file, such as a column file, or an in-mem file, such as an temp in-memory file kept by @@ -60,7 +75,7 @@ class EvictionQueue { * to be kept in frame and what can be spilled to disk is directly determined by the pin/unpin * calls of the users. * - * Also, BM provides some specialized functionalities: + * Also, BM provides some specialized functionalities for WAL files: * 1) it supports the caller to set pinned pages as dirty, which will be safely written back to disk * when the pages are evicted; * 2) it supports the caller to flush or remove pages from the BM; @@ -86,6 +101,45 @@ class EvictionQueue { * queue based replacement policy and the MADV_DONTNEED hint to explicitly control evictions. See * comments above `claimAFrame()` for more details. * + * Page states in BM: + * A page can be in one of the four states: a) LOCKED, b) UNLOCKED, c) MARKED, d) EVICTED. + * Every page is initialized as in the EVICTED state. + * The state transition diagram of page X is as follows (oRead refers to optimisticRead): + * Note: optimistic reads on UNLOCKED pages don't make any changes to pages' states. oRead on + * UNLOCKED is omitted in the diagram. + * + * 7.2. pin(pY): evict pX. 7.1. pin(pY): tryLock(pX) + * |<-------------------------|<------------------------------------------------------------| + * | | 4. pin(pX) | + * | |<------------------------------------------------------------| + * | 1. pin(pX) | 5. pin(pX) 6. pin(pY): 2nd chance eviction | + * EVICTED ------------------> LOCKED <-------------UNLOCKED ------------------------------> MARKED + * | | 3. oRead(pX) | + * | <--------------------------------------| + * | 2. unpin(pX): enqueue pX & increment version | + * -------------------------------------------------------------> + * + * 1. When page pX at EVICTED state and it is pinned, it transits to the Locked state. `pin` will + * first acquire the exclusive lock on the page, then read the page from disk into its frame. The + * caller can safely make changes to the page. + * 2. When the caller finishes changes to the page, it calls `unpin`, which releases the lock on the + * page, puts the page into the eviction queue, and increments its version. The page now transits to + * the MARKED state. Note that currently the page is still cached, but it is ready to be evicted. + * The page version number is used to identify any potential writes on the page. Each time a page + * transits from LOCKED to MARKED state, we will increment its version. This happens when a page is + * pinned, then unpinned. During the pin and unpin, we assume the page's content in its + * corresponding frame might have changed, thus, we increment the version number to forbid stale + * reads on it; + * 3. The MARKED page can be optimistically read by the caller, setting the page's state to + * UNLOCKED. For evicted pages, optimistic reads will trigger pin and unpin to read pages from disk + * into frames. + * 4. The MARKED page can be pinned again by the caller, setting the page's state to LOCKED. + * 5. The UNLOCKED page can also be pinned again by the caller, setting the page's state to LOCKED. + * 6. During eviction, UNLOCKED pages will be check if they are second chance evictable. If so, they + * will be set to MARKED, and their eviction candidates will be moved back to the eviction queue. + * 7. During eviction, if the page is in the MARKED state, it will be LOCKED first (7.1), then + * removed from its frame, and set to EVICTED (7.2). + * * The design is inspired by vmcache in the paper "Virtual-Memory Assisted Buffer Management" * (https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf). * We would also like to thank Fadhil Abubaker for doing the initial research and prototyping of @@ -102,12 +156,12 @@ class BufferManager { uint8_t* pin(BMFileHandle& fileHandle, common::page_idx_t pageIdx, PageReadPolicy pageReadPolicy = PageReadPolicy::READ_PAGE); - - void setPinnedPageDirty(BMFileHandle& fileHandle, common::page_idx_t pageIdx); - + void optimisticRead(BMFileHandle& fileHandle, common::page_idx_t pageIdx, + const std::function& func); // The function assumes that the requested page is already pinned. void unpin(BMFileHandle& fileHandle, common::page_idx_t pageIdx); + // Currently, these functions are specifically used only for WAL files. void removeFilePagesFromFrames(BMFileHandle& fileHandle); void flushAllDirtyPagesInFrames(BMFileHandle& fileHandle); void updateFrameIfPageIsInFrameWithoutLock( @@ -137,7 +191,8 @@ class BufferManager { void removePageFromFrame( BMFileHandle& fileHandle, common::page_idx_t pageIdx, bool shouldFlush); - void addToEvictionQueue(BMFileHandle* fileHandle, PageState* pageState); + void addToEvictionQueue( + BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState); inline uint64_t reserveUsedMemory(uint64_t size) { return usedMemory.fetch_add(size); } inline uint64_t freeUsedMemory(uint64_t size) { return usedMemory.fetch_sub(size); } diff --git a/src/include/storage/storage_structure/disk_overflow_file.h b/src/include/storage/storage_structure/disk_overflow_file.h index 07aed6d498..77a2ba0f25 100644 --- a/src/include/storage/storage_structure/disk_overflow_file.h +++ b/src/include/storage/storage_structure/disk_overflow_file.h @@ -89,6 +89,9 @@ class DiskOverflowFile : public StorageStructure { void setListRecursiveIfNestedWithoutLock(const common::ku_list_t& inMemSrcList, common::ku_list_t& diskDstList, const common::DataType& dataType); void logNewOverflowFileNextBytePosRecordIfNecessaryWithoutLock(); + void readValuesInList(transaction::TransactionType trxType, const common::DataType& dataType, + std::vector>& retValues, uint32_t numBytesOfSingleValue, + uint64_t numValuesInList, PageByteCursor& cursor, uint8_t* frame); void pinOverflowPageCache(BMFileHandle* bmFileHandleToPin, common::page_idx_t pageIdxToPin, OverflowPageCache& overflowPageCache); void unpinOverflowPageCache(OverflowPageCache& overflowPageCache); diff --git a/src/storage/buffer_manager/bm_file_handle.cpp b/src/storage/buffer_manager/bm_file_handle.cpp index 1f487f3a6d..7c9c3fb109 100644 --- a/src/storage/buffer_manager/bm_file_handle.cpp +++ b/src/storage/buffer_manager/bm_file_handle.cpp @@ -7,27 +7,6 @@ using namespace kuzu::common; namespace kuzu { namespace storage { -void PageState::setInFrame(common::page_idx_t pageIdx_) { - pageIdx = 0; - pageIdx = pageIdx_; - pageIdx |= IS_IN_FRAME_MASK; -} - -bool PageState::acquireLock(LockMode lockMode) { - if (lockMode == LockMode::SPIN) { - while (lock.test_and_set()) // spinning - ; - return true; - } - return !lock.test_and_set(); -} - -void PageState::resetState() { - pageIdx = 0; - pinCount = 0; - evictionTimestamp = 0; -} - WALPageIdxGroup::WALPageIdxGroup() { walPageIdxes.resize(common::StorageConstants::PAGE_GROUP_SIZE, common::INVALID_PAGE_IDX); walPageIdxLocks.resize(common::StorageConstants::PAGE_GROUP_SIZE); diff --git a/src/storage/buffer_manager/buffer_manager.cpp b/src/storage/buffer_manager/buffer_manager.cpp index c3cc86969c..6bb3854360 100644 --- a/src/storage/buffer_manager/buffer_manager.cpp +++ b/src/storage/buffer_manager/buffer_manager.cpp @@ -1,7 +1,5 @@ #include "storage/buffer_manager/buffer_manager.h" -#include - #include "common/constants.h" #include "common/exception.h" #include "spdlog/spdlog.h" @@ -11,37 +9,36 @@ using namespace kuzu::common; namespace kuzu { namespace storage { -// In this function, we try to remove as more as possible candidates that are not evictable from the +// In this function, we try to remove as many as possible candidates that are not evictable from the // eviction queue until we hit a candidate that is evictable. -// Two kinds of candidates are not evictable: 1) it is currently pinned; 2) it has been recently -// visited. -// To identify those recently accessed candidates, we use the eviction timestamp. If the -// eviction timestamp of a candidate is different from the timestamp in its corresponding pageState, -// it means that the candidate has been recently visited and we should not evict it. The idea is -// that eviction timestamp is a logical per-page timestamp starting from 0, and is incremented each -// time the page is pushed into the eviction queue. For example, the first time p5 is pushed into -// the eviction queue, it will end up with a timestamp 1. When we push a page into the queue, we -// create an EvictionCandidate object for the page. Let's call this object c0 when p5 is first -// pushed. c0 will consist of (ptr to p5, 1), where the latter is the eviction timestamp at the time -// c0 is put into the queue. Suppose p5 is put into the eviction queue again (e.g., because it was -// pinned and unpinned). At this point we create another EvictionCandidate object c1 (ptr to p5, 2) -// where the latter eviction timestamp is now incremented by 1, which makes c1 now not evictable. -// This idea is inspired by DuckDB's queue-based eviction implementation. +// 1) If the candidate page's version has changed, which means the page was pinned and unpinned, we +// remove the candidate from the queue. +// 2) If the candidate page's state is UNLOCKED, and its page version hasn't changed, which means +// the page was optimistically read, we give a second chance to evict the page by marking the page +// as MARKED, and moving the candidate to the back of the queue. +// 3) If the candidate page's state is LOCKED, we remove the candidate from the queue. void EvictionQueue::removeNonEvictableCandidates() { EvictionCandidate evictionCandidate; while (true) { if (!queue->try_dequeue(evictionCandidate)) { break; } - if (evictionCandidate.pageState->getPinCount() != 0 || - evictionCandidate.pageState->getEvictionTimestamp() != - evictionCandidate.evictionTimestamp) { - // Remove the candidate from the eviction queue if it is still pinned or if it has - // been recently visited. - continue; - } else { + auto pageStateAndVersion = evictionCandidate.pageState->getStateAndVersion(); + if (evictionCandidate.isEvictable(pageStateAndVersion)) { queue->enqueue(evictionCandidate); break; + } else if (evictionCandidate.isSecondChanceEvictable(pageStateAndVersion)) { + // The page was optimistically read, mark it as MARKED, and enqueue to be evicted later. + evictionCandidate.pageState->tryMark(pageStateAndVersion); + queue->enqueue(evictionCandidate); + continue; + } else { + // Cases to remove the candidate from the queue: + // 1) The page is currently LOCKED (it is currently pinned), remove the candidate from + // the queue. + // 2) The page's version number has changed (it was pinned and unpinned), another + // candidate exists for this page in the queue. remove the candidate from the queue. + continue; } } } @@ -70,47 +67,67 @@ BufferManager::~BufferManager() = default; // both get access to the same piece of memory. uint8_t* BufferManager::pin( BMFileHandle& fileHandle, common::page_idx_t pageIdx, PageReadPolicy pageReadPolicy) { - fileHandle.acquirePageLock(pageIdx, LockMode::SPIN); auto pageState = fileHandle.getPageState(pageIdx); - if (pageState->isInFrame()) { - pageState->incrementPinCount(); - } else { - if (!claimAFrame(fileHandle, pageIdx, pageReadPolicy)) { - pageState->releaseLock(); - throw BufferManagerException("Failed to claim a frame."); + while (true) { + auto currStateAndVersion = pageState->getStateAndVersion(); + switch (PageState::getState(currStateAndVersion)) { + case PageState::EVICTED: { + if (pageState->tryLock(currStateAndVersion)) { + if (!claimAFrame(fileHandle, pageIdx, pageReadPolicy)) { + pageState->unlock(); + throw BufferManagerException("Failed to claim a frame."); + } + return getFrame(fileHandle, pageIdx); + } + } break; + case PageState::UNLOCKED: + case PageState::MARKED: { + if (pageState->tryLock(currStateAndVersion)) { + return getFrame(fileHandle, pageIdx); + } + } break; + case PageState::LOCKED: { + continue; + } } } - auto retVal = getFrame(fileHandle, pageIdx); - fileHandle.releasePageLock(pageIdx); - return retVal; } -// Important Note: The caller should make sure that they have pinned the page before calling this. -void BufferManager::setPinnedPageDirty(BMFileHandle& fileHandle, page_idx_t pageIdx) { - fileHandle.acquirePageLock(pageIdx, LockMode::SPIN); +void BufferManager::optimisticRead(BMFileHandle& fileHandle, common::page_idx_t pageIdx, + const std::function& func) { auto pageState = fileHandle.getPageState(pageIdx); - if (pageState && pageState->getPinCount() >= 1) { - pageState->setDirty(); - fileHandle.releasePageLock(pageIdx); - } else { - fileHandle.releasePageLock(pageIdx); - throw BufferManagerException("If a page is not in memory or is not pinned, cannot set " - "it to isDirty = true. filePath: " + - fileHandle.getFileInfo()->path + - " pageIdx: " + std::to_string(pageIdx) + "."); + while (true) { + auto currStateAndVersion = pageState->getStateAndVersion(); + switch (PageState::getState(currStateAndVersion)) { + case PageState::UNLOCKED: { + func(getFrame(fileHandle, pageIdx)); + if (pageState->getStateAndVersion() == currStateAndVersion) { + return; + } + } break; + case PageState::MARKED: { + // If the page is marked, we try to switch to unlocked. If we succeed, we read the page. + if (pageState->tryClearMark(currStateAndVersion)) { + func(getFrame(fileHandle, pageIdx)); + return; + } + } break; + case PageState::EVICTED: { + pin(fileHandle, pageIdx, PageReadPolicy::READ_PAGE); + unpin(fileHandle, pageIdx); + } break; + default: { + // When locked, continue the spinning. + continue; + } + } } } void BufferManager::unpin(BMFileHandle& fileHandle, page_idx_t pageIdx) { - fileHandle.acquirePageLock(pageIdx, LockMode::SPIN); auto pageState = fileHandle.getPageState(pageIdx); - // `count` is the value of `pinCount` before sub. - auto count = pageState->decrementPinCount(); - assert(count >= 1); - if (count == 1) { - addToEvictionQueue(&fileHandle, pageState); - } - fileHandle.releasePageLock(pageIdx); + pageState->unlock(); + addToEvictionQueue(&fileHandle, pageIdx, pageState); } // This function tries to load the given page into a frame. Due to our design of mmap, each page is @@ -135,13 +152,17 @@ bool BufferManager::claimAFrame( freeUsedMemory(pageSizeToClaim); return false; } - if (!evictionCandidate.isEvictable()) { + auto pageStateAndVersion = evictionCandidate.pageState->getStateAndVersion(); + if (!evictionCandidate.isEvictable(pageStateAndVersion)) { + if (evictionCandidate.isSecondChanceEvictable(pageStateAndVersion)) { + evictionCandidate.pageState->tryMark(pageStateAndVersion); + evictionQueue->enqueue(evictionCandidate); + } continue; } - // We found a page whose pin count can be 0, and potentially haven't been accessed since - // enqueued. We try to evict the page from its frame by calling `tryEvictPage`, which will - // check if the page's pin count is actually 0 and the page has not been accessed recently, - // if so, we evict the page from its frame. + // We found a page that potentially hasn't been accessed since enqueued. We try to evict the + // page from its frame by calling `tryEvictPage`, which will check if the page's version has + // changed, if not, we evict the page from its frame. claimedMemory += tryEvictPage(evictionCandidate); currentUsedMem = usedMemory.load(); } @@ -156,47 +177,43 @@ bool BufferManager::claimAFrame( return true; } -void BufferManager::addToEvictionQueue(BMFileHandle* fileHandle, PageState* pageState) { - auto timestampBeforeEvict = pageState->incrementEvictionTimestamp(); +void BufferManager::addToEvictionQueue( + BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState) { + auto currStateAndVersion = pageState->getStateAndVersion(); if (++numEvictionQueueInsertions == BufferPoolConstants::EVICTION_QUEUE_PURGING_INTERVAL) { evictionQueue->removeNonEvictableCandidates(); numEvictionQueueInsertions = 0; } - evictionQueue->enqueue(fileHandle, pageState, timestampBeforeEvict + 1); + pageState->tryMark(currStateAndVersion); + evictionQueue->enqueue( + fileHandle, pageIdx, pageState, PageState::getVersion(currStateAndVersion)); } uint64_t BufferManager::tryEvictPage(EvictionCandidate& candidate) { auto& pageState = *candidate.pageState; - if (!pageState.acquireLock(LockMode::NON_BLOCKING)) { - return 0; - } - // We check pinCount and evictionTimestamp again after acquiring the lock on page currently - // residing in the frame. At this point in time, no other thread can change the pinCount and the - // evictionTimestamp. - if (!candidate.isEvictable()) { - pageState.releaseLock(); + auto currStateAndVersion = pageState.getStateAndVersion(); + // We check if the page is evictable again. Note that if the page's state or version has + // changed after the check, `tryLock` will fail, and we will abort the eviction of this page. + if (!candidate.isEvictable(currStateAndVersion) || !pageState.tryLock(currStateAndVersion)) { return 0; } - // Else, flush out the frame into the file page if the frame is dirty. Then remove the page - // from the frame and release the lock on it. - flushIfDirtyWithoutLock(*candidate.fileHandle, pageState.getPageIdx()); + // At this point, the page is LOCKED. Next, flush out the frame into the file page if the frame + // is dirty. Finally remove the page from the frame and reset the page to EVICTED. + flushIfDirtyWithoutLock(*candidate.fileHandle, candidate.pageIdx); auto numBytesFreed = candidate.fileHandle->getPageSize(); - releaseFrameForPage(*candidate.fileHandle, pageState.getPageIdx()); - pageState.resetState(); - pageState.releaseLock(); + releaseFrameForPage(*candidate.fileHandle, candidate.pageIdx); + pageState.resetToEvicted(); return numBytesFreed; } void BufferManager::cachePageIntoFrame( BMFileHandle& fileHandle, common::page_idx_t pageIdx, PageReadPolicy pageReadPolicy) { auto pageState = fileHandle.getPageState(pageIdx); - pageState->setPinCount(1); pageState->clearDirty(); if (pageReadPolicy == PageReadPolicy::READ_PAGE) { FileUtils::readFromFile(fileHandle.getFileInfo(), (void*)getFrame(fileHandle, pageIdx), fileHandle.getPageSize(), pageIdx * fileHandle.getPageSize()); } - pageState->setInFrame(pageIdx); } void BufferManager::flushIfDirtyWithoutLock(BMFileHandle& fileHandle, common::page_idx_t pageIdx) { @@ -234,20 +251,16 @@ void BufferManager::removePageFromFrameIfNecessary(BMFileHandle& fileHandle, pag removePageFromFrame(fileHandle, pageIdx, false /* do not flush */); } -// NOTE: We assume the page is not pinned here. +// NOTE: We assume the page is not pinned (locked) here. void BufferManager::removePageFromFrame( BMFileHandle& fileHandle, common::page_idx_t pageIdx, bool shouldFlush) { - fileHandle.acquirePageLock(pageIdx, LockMode::SPIN); auto pageState = fileHandle.getPageState(pageIdx); - if (pageState && pageState->isInFrame()) { - if (shouldFlush) { - flushIfDirtyWithoutLock(fileHandle, pageIdx); - } - fileHandle.clearPageState(pageIdx); - releaseFrameForPage(fileHandle, pageIdx); - freeUsedMemory(fileHandle.getPageSize()); + pageState->spinLock(pageState->getStateAndVersion()); + if (shouldFlush) { + flushIfDirtyWithoutLock(fileHandle, pageIdx); } - fileHandle.releasePageLock(pageIdx); + releaseFrameForPage(fileHandle, pageIdx); + pageState->resetToEvicted(); } } // namespace storage diff --git a/src/storage/storage_structure/column.cpp b/src/storage/storage_structure/column.cpp index 7aecf3ec3f..b0743780b1 100644 --- a/src/storage/storage_structure/column.cpp +++ b/src/storage/storage_structure/column.cpp @@ -61,9 +61,10 @@ void Column::writeValues( Value Column::readValueForTestingOnly(offset_t offset) { auto cursor = PageUtils::getPageElementCursorForPos(offset, numElementsPerPage); - auto frame = bufferManager.pin(*fileHandle, cursor.pageIdx); - auto retVal = Value(dataType, frame + mapElementPosToByteOffset(cursor.elemPosInPage)); - bufferManager.unpin(*fileHandle, cursor.pageIdx); + Value retVal = Value::createDefaultValue(dataType); + bufferManager.optimisticRead(*fileHandle, cursor.pageIdx, [&](uint8_t* frame) { + retVal.copyValueFrom(frame + mapElementPosToByteOffset(cursor.elemPosInPage)); + }); return retVal; } @@ -123,12 +124,12 @@ void Column::lookup(Transaction* transaction, common::ValueVector* resultVector, auto [fileHandleToPin, pageIdxToPin] = StorageStructureUtils::getFileHandleAndPhysicalPageIdxToPin( *fileHandle, cursor.pageIdx, *wal, transaction->getType()); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); - auto vectorBytesOffset = getElemByteOffset(vectorPos); - auto frameBytesOffset = getElemByteOffset(cursor.elemPosInPage); - memcpy(resultVector->getData() + vectorBytesOffset, frame + frameBytesOffset, elementSize); - readSingleNullBit(resultVector, frame, cursor.elemPosInPage, vectorPos); - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) -> void { + auto vectorBytesOffset = getElemByteOffset(vectorPos); + auto frameBytesOffset = getElemByteOffset(cursor.elemPosInPage); + memcpy(resultVector->getData() + vectorBytesOffset, frame + frameBytesOffset, elementSize); + readSingleNullBit(resultVector, frame, cursor.elemPosInPage, vectorPos); + }); } WALPageIdxPosInPageAndFrame Column::beginUpdatingPage( @@ -189,9 +190,10 @@ void StringPropertyColumn::writeValueForSingleNodeIDPosition( Value StringPropertyColumn::readValueForTestingOnly(offset_t offset) { auto cursor = PageUtils::getPageElementCursorForPos(offset, numElementsPerPage); ku_string_t kuString; - auto frame = bufferManager.pin(*fileHandle, cursor.pageIdx); - memcpy(&kuString, frame + mapElementPosToByteOffset(cursor.elemPosInPage), sizeof(ku_string_t)); - bufferManager.unpin(*fileHandle, cursor.pageIdx); + bufferManager.optimisticRead(*fileHandle, cursor.pageIdx, [&](uint8_t* frame) -> void { + memcpy(&kuString, frame + mapElementPosToByteOffset(cursor.elemPosInPage), + sizeof(ku_string_t)); + }); return Value(diskOverflowFile.readString(TransactionType::READ_ONLY, kuString)); } @@ -225,9 +227,9 @@ void ListPropertyColumn::writeValueForSingleNodeIDPosition( Value ListPropertyColumn::readValueForTestingOnly(offset_t offset) { auto cursor = PageUtils::getPageElementCursorForPos(offset, numElementsPerPage); ku_list_t kuList; - auto frame = bufferManager.pin(*fileHandle, cursor.pageIdx); - memcpy(&kuList, frame + mapElementPosToByteOffset(cursor.elemPosInPage), sizeof(ku_list_t)); - bufferManager.unpin(*fileHandle, cursor.pageIdx); + bufferManager.optimisticRead(*fileHandle, cursor.pageIdx, [&](uint8_t* frame) -> void { + memcpy(&kuList, frame + mapElementPosToByteOffset(cursor.elemPosInPage), sizeof(ku_list_t)); + }); return Value(dataType, diskOverflowFile.readList(TransactionType::READ_ONLY, kuList, dataType)); } diff --git a/src/storage/storage_structure/disk_array.cpp b/src/storage/storage_structure/disk_array.cpp index fa05e6c614..a2bee7198a 100644 --- a/src/storage/storage_structure/disk_array.cpp +++ b/src/storage/storage_structure/disk_array.cpp @@ -82,9 +82,9 @@ U BaseDiskArray::get(uint64_t idx, TransactionType trxType) { auto& bmFileHandle = (BMFileHandle&)fileHandle; if (trxType == TransactionType::READ_ONLY || !hasTransactionalUpdates || !bmFileHandle.hasWALPageVersionNoWALPageIdxLock(apPageIdx)) { - auto frame = bufferManager->pin(bmFileHandle, apPageIdx); - auto retVal = *(U*)(frame + apCursor.offsetInPage); - bufferManager->unpin(bmFileHandle, apPageIdx); + U retVal; + bufferManager->optimisticRead(bmFileHandle, apPageIdx, + [&](const uint8_t* frame) -> void { retVal = *(U*)(frame + apCursor.offsetInPage); }); return retVal; } else { U retVal; diff --git a/src/storage/storage_structure/disk_overflow_file.cpp b/src/storage/storage_structure/disk_overflow_file.cpp index 2165a8adcb..8f5cdb0e1a 100644 --- a/src/storage/storage_structure/disk_overflow_file.cpp +++ b/src/storage/storage_structure/disk_overflow_file.cpp @@ -48,10 +48,10 @@ void DiskOverflowFile::lookupString( auto [fileHandleToPin, pageIdxToPin] = StorageStructureUtils::getFileHandleAndPhysicalPageIdxToPin( *fileHandle, cursor.pageIdx, *wal, trxType); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); - InMemOverflowBufferUtils::copyString( - (char*)(frame + cursor.offsetInPage), kuStr.len, kuStr, inMemOverflowBuffer); - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) { + InMemOverflowBufferUtils::copyString( + (char*)(frame + cursor.offsetInPage), kuStr.len, kuStr, inMemOverflowBuffer); + }); } void DiskOverflowFile::lookupString(TransactionType trxType, ku_string_t& kuStr, @@ -90,10 +90,10 @@ void DiskOverflowFile::readListToVector(TransactionType trxType, ku_list_t& kuLi auto [fileHandleToPin, pageIdxToPin] = StorageStructureUtils::getFileHandleAndPhysicalPageIdxToPin( *fileHandle, cursor.pageIdx, *wal, trxType); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); - InMemOverflowBufferUtils::copyListNonRecursive( - frame + cursor.offsetInPage, kuList, dataType, inMemOverflowBuffer); - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) { + InMemOverflowBufferUtils::copyListNonRecursive( + frame + cursor.offsetInPage, kuList, dataType, inMemOverflowBuffer); + }); if (dataType.childType->typeID == STRING) { auto kuStrings = (ku_string_t*)(kuList.overflowPtr); OverflowPageCache overflowPageCache; @@ -118,9 +118,10 @@ std::string DiskOverflowFile::readString(TransactionType trxType, const ku_strin auto [fileHandleToPin, pageIdxToPin] = StorageStructureUtils::getFileHandleAndPhysicalPageIdxToPin( *fileHandle, cursor.pageIdx, *wal, trxType); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); - auto retVal = std::string((char*)(frame + cursor.offsetInPage), str.len); - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); + std::string retVal; + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) { + retVal = std::string((char*)(frame + cursor.offsetInPage), str.len); + }); return retVal; } } @@ -132,10 +133,20 @@ std::vector> DiskOverflowFile::readList( auto [fileHandleToPin, pageIdxToPin] = StorageStructureUtils::getFileHandleAndPhysicalPageIdxToPin( *fileHandle, cursor.pageIdx, *wal, trxType); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); auto numBytesOfSingleValue = Types::getDataTypeSize(*dataType.childType); auto numValuesInList = listVal.size; std::vector> retValues; + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) -> void { + readValuesInList( + trxType, dataType, retValues, numBytesOfSingleValue, numValuesInList, cursor, frame); + }); + return retValues; +} + +void DiskOverflowFile::readValuesInList(transaction::TransactionType trxType, + const common::DataType& dataType, std::vector>& retValues, + uint32_t numBytesOfSingleValue, uint64_t numValuesInList, PageByteCursor& cursor, + uint8_t* frame) { if (dataType.childType->typeID == STRING) { for (auto i = 0u; i < numValuesInList; i++) { auto kuListVal = *(ku_string_t*)(frame + cursor.offsetInPage); @@ -156,8 +167,6 @@ std::vector> DiskOverflowFile::readList( cursor.offsetInPage += numBytesOfSingleValue; } } - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); - return retValues; } void DiskOverflowFile::addNewPageIfNecessaryWithoutLock(uint32_t numBytesToAppend) { diff --git a/src/storage/storage_structure/lists/lists.cpp b/src/storage/storage_structure/lists/lists.cpp index a7b94807fb..5c6b2e55d4 100644 --- a/src/storage/storage_structure/lists/lists.cpp +++ b/src/storage/storage_structure/lists/lists.cpp @@ -134,11 +134,11 @@ void Lists::fillInMemListsFromPersistentStore(offset_t nodeOffset, auto numElementsToReadInCurPage = std::min(numElementsToRead - numElementsRead, (uint64_t)(numElementsPerPage - pageCursor.elemPosInPage)); auto physicalPageIdx = pageMapper(pageCursor.pageIdx); - auto frame = bufferManager.pin(*fileHandle, physicalPageIdx); - fillInMemListsFromFrame(inMemList, frame, pageCursor.elemPosInPage, - numElementsToReadInCurPage, deletedRelOffsetsInList, numElementsRead, - nextPosToWriteToInMemList, updatedPersistentListOffsets); - bufferManager.unpin(*fileHandle, physicalPageIdx); + bufferManager.optimisticRead(*fileHandle, physicalPageIdx, [&](uint8_t* frame) { + fillInMemListsFromFrame(inMemList, frame, pageCursor.elemPosInPage, + numElementsToReadInCurPage, deletedRelOffsetsInList, numElementsRead, + nextPosToWriteToInMemList, updatedPersistentListOffsets); + }); numElementsRead += numElementsToReadInCurPage; pageCursor.nextPage(); } @@ -265,10 +265,10 @@ std::unique_ptr> AdjLists::readAdjacencyListOfNode( auto sizeToCopyInPage = std::min(((uint64_t)(numElementsPerPage - pageCursor.elemPosInPage) * elementSize), sizeLeftToCopy); - auto frame = bufferManager.pin(*fileHandle, physicalPageIdx); - memcpy(bufferPtr, frame + mapElementPosToByteOffset(pageCursor.elemPosInPage), - sizeToCopyInPage); - bufferManager.unpin(*fileHandle, physicalPageIdx); + bufferManager.optimisticRead(*fileHandle, physicalPageIdx, [&](uint8_t* frame) { + memcpy(bufferPtr, frame + mapElementPosToByteOffset(pageCursor.elemPosInPage), + sizeToCopyInPage); + }); bufferPtr += sizeToCopyInPage; sizeLeftToCopy -= sizeToCopyInPage; pageCursor.nextPage(); @@ -397,18 +397,19 @@ std::unordered_set RelIDList::getDeletedRelOffsetsInListForNodeOffset( auto numElementsToReadInCurPage = std::min(numElementsInPersistentStore - numElementsRead, (uint64_t)(numElementsPerPage - pageCursor.elemPosInPage)); auto physicalPageIdx = pageMapper(pageCursor.pageIdx); - auto buffer = bufferManager.pin(*fileHandle, physicalPageIdx) + - getElemByteOffset(pageCursor.elemPosInPage); - for (auto i = 0u; i < numElementsToReadInCurPage; i++) { - auto relID = *(int64_t*)buffer; - if (listsUpdatesStore->isRelDeletedInPersistentStore( - storageStructureIDAndFName.storageStructureID.listFileID, nodeOffset, relID)) { - deletedRelOffsetsInList.emplace(numElementsRead); + bufferManager.optimisticRead(*fileHandle, physicalPageIdx, [&](uint8_t* frame) -> void { + auto buffer = frame + getElemByteOffset(pageCursor.elemPosInPage); + for (auto i = 0u; i < numElementsToReadInCurPage; i++) { + auto relID = *(int64_t*)buffer; + if (listsUpdatesStore->isRelDeletedInPersistentStore( + storageStructureIDAndFName.storageStructureID.listFileID, nodeOffset, + relID)) { + deletedRelOffsetsInList.emplace(numElementsRead); + } + numElementsRead++; + buffer += elementSize; } - numElementsRead++; - buffer += elementSize; - } - bufferManager.unpin(*fileHandle, physicalPageIdx); + }); pageCursor.nextPage(); } return deletedRelOffsetsInList; @@ -420,27 +421,28 @@ list_offset_t RelIDList::getListOffset(offset_t nodeOffset, offset_t relOffset) auto pageCursor = ListHandle::getPageCursor(listHeader, numElementsPerPage); auto numElementsInPersistentStore = getNumElementsFromListHeader(nodeOffset); uint64_t numElementsRead = 0; - while (numElementsRead < numElementsInPersistentStore) { + uint64_t retVal = UINT64_MAX; + while (numElementsRead < numElementsInPersistentStore && retVal == UINT64_MAX) { auto numElementsToReadInCurPage = std::min(numElementsInPersistentStore - numElementsRead, (uint64_t)(numElementsPerPage - pageCursor.elemPosInPage)); auto physicalPageIdx = pageMapper(pageCursor.pageIdx); - auto buffer = bufferManager.pin(*fileHandle, physicalPageIdx) + - getElemByteOffset(pageCursor.elemPosInPage); - for (auto i = 0u; i < numElementsToReadInCurPage; i++) { - auto relIDInList = *(int64_t*)buffer; - if (relIDInList == relOffset) { - bufferManager.unpin(*fileHandle, physicalPageIdx); - return numElementsRead; + bufferManager.optimisticRead(*fileHandle, physicalPageIdx, [&](uint8_t* frame) -> void { + auto buffer = frame + getElemByteOffset(pageCursor.elemPosInPage); + for (auto i = 0u; i < numElementsToReadInCurPage; i++) { + auto relIDInList = *(int64_t*)buffer; + if (relIDInList == relOffset) { + retVal = numElementsRead; + return; + } + numElementsRead++; + buffer += elementSize; } - numElementsRead++; - buffer += elementSize; - } - bufferManager.unpin(*fileHandle, physicalPageIdx); + }); pageCursor.nextPage(); } // If we don't find the associated listOffset for the given relID in persistent store list, it // means that this rel is stored in update store, and we return UINT64_MAX for this case. - return UINT64_MAX; + return retVal; } void RelIDList::readFromSmallList(ValueVector* valueVector, ListHandle& listHandle) { diff --git a/src/storage/storage_structure/storage_structure.cpp b/src/storage/storage_structure/storage_structure.cpp index 6ca1f2929b..4fa099f4d9 100644 --- a/src/storage/storage_structure/storage_structure.cpp +++ b/src/storage/storage_structure/storage_structure.cpp @@ -20,7 +20,7 @@ void StorageStructure::addNewPageToFileHandle() { *wal->fileHandle, pageIdxInWAL, BufferManager::PageReadPolicy::DONT_READ_PAGE); fileHandle->addWALPageIdxGroupIfNecessary(pageIdxInOriginalFile); fileHandle->setWALPageIdx(pageIdxInOriginalFile, pageIdxInWAL); - bufferManager.setPinnedPageDirty(*wal->fileHandle, pageIdxInWAL); + wal->fileHandle->setLockedPageDirty(pageIdxInWAL); bufferManager.unpin(*wal->fileHandle, pageIdxInWAL); } @@ -88,21 +88,21 @@ void BaseColumnOrList::readInternalIDsFromAPageBySequentialCopy(Transaction* tra auto [fileHandleToPin, pageIdxToPin] = StorageStructureUtils::getFileHandleAndPhysicalPageIdxToPin( *fileHandle, physicalPageIdx, *wal, transaction->getType()); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); - if (hasNoNullGuarantee) { - vector->setRangeNonNull(vectorStartPos, numValuesToRead); - } else { - readNullBitsFromAPage( - vector, frame, pagePosOfFirstElement, vectorStartPos, numValuesToRead); - } - auto currentFrameHead = frame + getElemByteOffset(pagePosOfFirstElement); - for (auto i = 0u; i < numValuesToRead; i++) { - internalID_t internalID{0, commonTableID}; - internalID.offset = *(offset_t*)currentFrameHead; - currentFrameHead += sizeof(offset_t); - vector->setValue(vectorStartPos + i, internalID); - } - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) { + if (hasNoNullGuarantee) { + vector->setRangeNonNull(vectorStartPos, numValuesToRead); + } else { + readNullBitsFromAPage( + vector, frame, pagePosOfFirstElement, vectorStartPos, numValuesToRead); + } + auto currentFrameHead = frame + getElemByteOffset(pagePosOfFirstElement); + for (auto i = 0u; i < numValuesToRead; i++) { + internalID_t internalID{0, commonTableID}; + internalID.offset = *(offset_t*)currentFrameHead; + currentFrameHead += sizeof(offset_t); + vector->setValue(vectorStartPos + i, internalID); + } + }); } void BaseColumnOrList::readInternalIDsBySequentialCopyWithSelState(Transaction* transaction, @@ -176,11 +176,12 @@ void BaseColumnOrList::readAPageBySequentialCopy(Transaction* transaction, Value *fileHandle, physicalPageIdx, *wal, transaction->getType()); auto vectorBytesOffset = getElemByteOffset(vectorStartPos); auto frameBytesOffset = getElemByteOffset(pagePosOfFirstElement); - auto frame = bufferManager.pin(*fileHandleToPin, pageIdxToPin); - memcpy(vector->getData() + vectorBytesOffset, frame + frameBytesOffset, - numValuesToRead * elementSize); - readNullBitsFromAPage(vector, frame, pagePosOfFirstElement, vectorStartPos, numValuesToRead); - bufferManager.unpin(*fileHandleToPin, pageIdxToPin); + bufferManager.optimisticRead(*fileHandleToPin, pageIdxToPin, [&](uint8_t* frame) { + memcpy(vector->getData() + vectorBytesOffset, frame + frameBytesOffset, + numValuesToRead * elementSize); + readNullBitsFromAPage( + vector, frame, pagePosOfFirstElement, vectorStartPos, numValuesToRead); + }); } void BaseColumnOrList::readNullBitsFromAPage(ValueVector* valueVector, const uint8_t* frame, diff --git a/src/storage/storage_structure/storage_structure_utils.cpp b/src/storage/storage_structure/storage_structure_utils.cpp index d8297e2fe3..c2fde10059 100644 --- a/src/storage/storage_structure/storage_structure_utils.cpp +++ b/src/storage/storage_structure/storage_structure_utils.cpp @@ -42,27 +42,26 @@ WALPageIdxAndFrame StorageStructureUtils::createWALVersionIfNecessaryAndPinPage( StorageStructureID storageStructureID, BufferManager& bufferManager, WAL& wal) { fileHandle.addWALPageIdxGroupIfNecessary(originalPageIdx); page_idx_t pageIdxInWAL; - uint8_t* frame; + uint8_t* walFrame; fileHandle.acquireWALPageIdxLock(originalPageIdx); if (fileHandle.hasWALPageVersionNoWALPageIdxLock(originalPageIdx)) { pageIdxInWAL = fileHandle.getWALPageIdxNoWALPageIdxLock(originalPageIdx); - frame = bufferManager.pin( + walFrame = bufferManager.pin( *wal.fileHandle, pageIdxInWAL, BufferManager::PageReadPolicy::READ_PAGE); } else { pageIdxInWAL = wal.logPageUpdateRecord( storageStructureID, originalPageIdx /* pageIdxInOriginalFile */); - frame = bufferManager.pin( + walFrame = bufferManager.pin( *wal.fileHandle, pageIdxInWAL, BufferManager::PageReadPolicy::DONT_READ_PAGE); - auto originalFrame = bufferManager.pin(fileHandle, originalPageIdx, - insertingNewPage ? BufferManager::PageReadPolicy::DONT_READ_PAGE : - BufferManager::PageReadPolicy::READ_PAGE); - // Note: This logic only works for db files with DEFAULT_PAGE_SIZEs. - memcpy(frame, originalFrame, BufferPoolConstants::PAGE_4KB_SIZE); - bufferManager.unpin(fileHandle, originalPageIdx); + if (!insertingNewPage) { + bufferManager.optimisticRead(fileHandle, originalPageIdx, [&](uint8_t* frame) -> void { + memcpy(walFrame, frame, BufferPoolConstants::PAGE_4KB_SIZE); + }); + } fileHandle.setWALPageIdxNoLock(originalPageIdx /* pageIdxInOriginalFile */, pageIdxInWAL); - bufferManager.setPinnedPageDirty(*wal.fileHandle, pageIdxInWAL); + wal.fileHandle->setLockedPageDirty(pageIdxInWAL); } - return {originalPageIdx, pageIdxInWAL, frame}; + return {originalPageIdx, pageIdxInWAL, walFrame}; } void StorageStructureUtils::unpinWALPageAndReleaseOriginalPageLock( diff --git a/test/runner/e2e_ddl_test.cpp b/test/runner/e2e_ddl_test.cpp index a87ff7fb0e..d2167c0569 100644 --- a/test/runner/e2e_ddl_test.cpp +++ b/test/runner/e2e_ddl_test.cpp @@ -696,9 +696,9 @@ TEST_F(TinySnbDDLTest, DropNodeTablePropertyNormalExecution) { dropNodeTableProperty(TransactionTestType::NORMAL_EXECUTION); } -// TEST_F(TinySnbDDLTest, DropNodeTablePropertyRecovery) { -// dropNodeTableProperty(TransactionTestType::RECOVERY); -//} +TEST_F(TinySnbDDLTest, DropNodeTablePropertyRecovery) { + dropNodeTableProperty(TransactionTestType::RECOVERY); +} TEST_F(TinySnbDDLTest, DropRelTablePropertyNormalExecution) { dropRelTableProperty(TransactionTestType::NORMAL_EXECUTION);