Skip to content

Commit

Permalink
bm optimistic reads
Browse files Browse the repository at this point in the history
  • Loading branch information
ray6080 committed Mar 27, 2023
1 parent 3f574a4 commit 3d782ad
Show file tree
Hide file tree
Showing 12 changed files with 370 additions and 268 deletions.
119 changes: 79 additions & 40 deletions src/include/storage/buffer_manager/bm_file_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,44 +6,87 @@
namespace kuzu {
namespace storage {

static constexpr uint64_t IS_IN_FRAME_MASK = 0x8000000000000000;
static constexpr uint64_t DIRTY_MASK = 0x4000000000000000;
static constexpr uint64_t PAGE_IDX_MASK = 0x3FFFFFFFFFFFFFFF;

enum class LockMode : uint8_t { SPIN = 0, NON_BLOCKING = 1 };

class BMFileHandle;
class BufferManager;

// Keeps the state information of a page in a file.
class PageState {
static constexpr uint64_t DIRTY_MASK = 0x0080000000000000;
static constexpr uint64_t STATE_MASK = 0xFF00000000000000;
static constexpr uint64_t VERSION_MASK = 0x00FFFFFFFFFFFFFF;
static constexpr uint64_t NUM_BITS_TO_SHIFT_FOR_STATE = 56;

public:
// True when the IS_IN_FRAME_MASK bit of `pageIdx` is set.
inline bool isInFrame() const {
    return (pageIdx & IS_IN_FRAME_MASK) != 0;
}
// Sets the DIRTY_MASK bit in `pageIdx`; all other bits are left as they are.
inline void setDirty() {
    pageIdx = pageIdx | DIRTY_MASK;
}
// Clears the DIRTY_MASK bit in `pageIdx`; all other bits are left as they are.
inline void clearDirty() {
    pageIdx = pageIdx & ~DIRTY_MASK;
}
// True when the DIRTY_MASK bit of `pageIdx` is set.
inline bool isDirty() const {
    return (pageIdx & DIRTY_MASK) != 0;
}
// Returns the bits of `pageIdx` selected by PAGE_IDX_MASK (i.e. without the flag bits),
// converted to common::page_idx_t.
inline common::page_idx_t getPageIdx() const {
    const uint64_t idxBits = pageIdx & PAGE_IDX_MASK;
    return static_cast<common::page_idx_t>(idxBits);
}
// Atomically adds 1 to the pin count and returns the value it held before the increment.
inline uint64_t incrementPinCount() {
    const auto previousPinCount = pinCount.fetch_add(1);
    return previousPinCount;
}
// Atomically subtracts 1 from the pin count and returns the value it held before the decrement.
inline uint64_t decrementPinCount() {
    const auto previousPinCount = pinCount.fetch_sub(1);
    return previousPinCount;
}
// Atomically overwrites the pin count with `newPinCount`.
inline void setPinCount(uint64_t newPinCount) {
    pinCount.store(newPinCount);
}
// Atomically reads the current pin count.
inline uint64_t getPinCount() const {
    const auto currentPinCount = pinCount.load();
    return currentPinCount;
}
// Atomically reads the current eviction timestamp.
inline uint64_t getEvictionTimestamp() const {
    const uint64_t currentTimestamp = evictionTimestamp.load();
    return currentTimestamp;
}
inline uint64_t incrementEvictionTimestamp() { return evictionTimestamp.fetch_add(1); }
// Releases the page lock by clearing the `lock` flag.
inline void releaseLock() {
    lock.clear();
}

bool acquireLock(LockMode lockMode);
void setInFrame(common::page_idx_t pageIdx);
void resetState();
static constexpr uint64_t UNLOCKED = 0;
static constexpr uint64_t LOCKED = 1;
static constexpr uint64_t MARKED = 2;
static constexpr uint64_t EVICTED = 3;

PageState() {
stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release);
}

inline uint64_t getState() { return getState(stateAndVersion.load()); }
// Extracts the state from a stateAndVersion word: the bits selected by STATE_MASK, shifted down
// so the result can be compared against UNLOCKED / LOCKED / MARKED / EVICTED.
inline static uint64_t getState(uint64_t stateAndVersion) {
    const uint64_t stateBits = stateAndVersion & STATE_MASK;
    return stateBits >> NUM_BITS_TO_SHIFT_FOR_STATE;
}
// Extracts the version from a stateAndVersion word: the bits selected by VERSION_MASK.
inline static uint64_t getVersion(uint64_t stateAndVersion) {
    const uint64_t versionBits = VERSION_MASK & stateAndVersion;
    return versionBits;
}
// Builds a new stateAndVersion word that keeps the low 56 bits of `oldStateAndVersion`
// unchanged and carries `newState` in the highest byte.
inline static uint64_t updateStateWithSameVersion(
    uint64_t oldStateAndVersion, uint64_t newState) {
    // Shifting left then right by 8 drops the old state byte.
    const uint64_t keptLowBits = (oldStateAndVersion << 8) >> 8;
    const uint64_t newStateBits = newState << NUM_BITS_TO_SHIFT_FOR_STATE;
    return keptLowBits | newStateBits;
}
// Builds a new stateAndVersion word whose low 56 bits are those of `oldStateAndVersion` plus
// one, and whose highest byte carries `newState`.
// NOTE(review): if all low 56 bits are already set, the +1 carries into the state byte and is
// OR-ed with `newState` — presumably unreachable in practice; confirm.
inline static uint64_t updateStateAndIncrementVersion(
    uint64_t oldStateAndVersion, uint64_t newState) {
    // Shifting left then right by 8 drops the old state byte.
    const uint64_t oldLowBits = (oldStateAndVersion << 8) >> 8;
    const uint64_t bumpedLowBits = oldLowBits + 1;
    const uint64_t newStateBits = newState << NUM_BITS_TO_SHIFT_FOR_STATE;
    return bumpedLowBits | newStateBits;
}
inline void spinLock(uint64_t oldStateAndVersion) {
while (true) {
if (tryLock(oldStateAndVersion)) {
return;
}
}
}
// Makes a single attempt to switch the page to LOCKED (same version) via compare-exchange,
// expecting the atomic to still equal `oldStateAndVersion`. Returns true on success.
// The expected value is a by-value parameter, so the caller's copy is not updated on failure.
inline bool tryLock(uint64_t oldStateAndVersion) {
    const uint64_t lockedStateAndVersion = updateStateWithSameVersion(oldStateAndVersion, LOCKED);
    return stateAndVersion.compare_exchange_strong(oldStateAndVersion, lockedStateAndVersion);
}
inline void unlock() {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion.store(updateStateAndIncrementVersion(stateAndVersion.load(), UNLOCKED),
std::memory_order_release);
}
// Makes a single attempt to move the page from MARKED to UNLOCKED, keeping its version.
// `oldStateAndVersion` must be a MARKED snapshot (asserted). Returns true if the atomic still
// held that snapshot and was updated.
inline bool tryClearMark(uint64_t oldStateAndVersion) {
    assert(getState(oldStateAndVersion) == MARKED);
    const uint64_t unmarkedStateAndVersion =
        updateStateWithSameVersion(oldStateAndVersion, UNLOCKED);
    return stateAndVersion.compare_exchange_strong(oldStateAndVersion, unmarkedStateAndVersion);
}
// Makes a single attempt to switch the page to MARKED (same version) via compare-exchange,
// expecting the atomic to still equal `oldStateAndVersion`. Returns true on success.
inline bool tryMark(uint64_t oldStateAndVersion) {
    const uint64_t markedStateAndVersion = updateStateWithSameVersion(oldStateAndVersion, MARKED);
    return stateAndVersion.compare_exchange_strong(oldStateAndVersion, markedStateAndVersion);
}

// Atomically sets the DIRTY_MASK bit. The page is expected to be LOCKED (asserted).
inline void setDirty() {
    assert(getState(stateAndVersion.load()) == LOCKED);
    stateAndVersion.fetch_or(DIRTY_MASK);
}
// Atomically clears the DIRTY_MASK bit. The page is expected to be LOCKED (asserted).
inline void clearDirty() {
    assert(getState(stateAndVersion.load()) == LOCKED);
    stateAndVersion.fetch_and(~DIRTY_MASK);
}
// True when the DIRTY_MASK bit of the current stateAndVersion word is set.
inline bool isDirty() const {
    return (stateAndVersion.load() & DIRTY_MASK) != 0;
}
// Atomically reads the raw stateAndVersion word (state byte plus low 56 bits).
uint64_t getStateAndVersion() const {
    const uint64_t current = stateAndVersion.load();
    return current;
}

inline void resetToEvicted() {
stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release);
}

private:
std::atomic_flag lock = ATOMIC_FLAG_INIT;
// Highest 1st bit indicates if this page is loaded or not, 2nd bit indicates if this
// page is dirty or not. The rest 62 bits records the page idx inside the file.
uint64_t pageIdx = 0;
std::atomic<uint32_t> pinCount = 0;
std::atomic<uint64_t> evictionTimestamp = 0;
// The highest byte (bits 56-63, STATE_MASK) holds the page state, and the low 56 bits
// (VERSION_MASK) hold the version. DIRTY_MASK selects bit 55, i.e. the top bit of the low 56 bits.
std::atomic<uint64_t> stateAndVersion;
};

// This class is used to keep the WAL page idxes of a page group in the original file handle.
Expand Down Expand Up @@ -94,6 +137,12 @@ class BMFileHandle : public FileHandle {
BMFileHandle(const std::string& path, uint8_t flags, BufferManager* bm,
common::PageSizeClass pageSizeClass, FileVersionedType fileVersionedType);

// Marks the page at `pageIdx` as dirty.
// This function assumes the page is already LOCKED (PageState::setDirty asserts it).
// Like the other per-page accessors in this class, it also asserts that the page index is in
// range and that a page state exists for it before dereferencing.
inline void setLockedPageDirty(common::page_idx_t pageIdx) {
    assert(pageIdx < numPages && pageStates[pageIdx]);
    pageStates[pageIdx]->setDirty();
}

common::page_group_idx_t addWALPageIdxGroupIfNecessary(common::page_idx_t originalPageIdx);
// This function is intended to be used after a fileInfo is created and we want the file
// to have no pages and page locks. Should be called after ensuring that the buffer manager
Expand Down Expand Up @@ -122,16 +171,6 @@ class BMFileHandle : public FileHandle {
assert(pageIdx < numPages && pageStates[pageIdx]);
return pageStates[pageIdx].get();
}
// Resets the state of the page at `pageIdx` via PageState::resetState().
// The index must be in range and a page state must exist for it (asserted).
inline void clearPageState(common::page_idx_t pageIdx) {
    assert(pageIdx < numPages && pageStates[pageIdx]);
    auto& pageState = pageStates[pageIdx];
    pageState->resetState();
}
// Forwards to PageState::acquireLock for the page at `pageIdx` and returns its result.
inline bool acquirePageLock(common::page_idx_t pageIdx, LockMode lockMode) {
    auto* pageState = getPageState(pageIdx);
    return pageState->acquireLock(lockMode);
}
// Forwards to PageState::releaseLock for the page at `pageIdx`.
inline void releasePageLock(common::page_idx_t pageIdx) {
    auto* pageState = getPageState(pageIdx);
    pageState->releaseLock();
}
inline common::frame_idx_t getFrameIdx(common::page_idx_t pageIdx) {
assert(pageIdx < pageCapacity);
return (frameGroupIdxes[pageIdx >> common::StorageConstants::PAGE_GROUP_SIZE_LOG2]
Expand Down
93 changes: 74 additions & 19 deletions src/include/storage/buffer_manager/buffer_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,38 @@ class logger;
namespace kuzu {
namespace storage {

// This class keeps state info for pages that can potentially be evicted.
// The page state of a candidate is set to MARKED when it is first enqueued. After enqueued, if the
// candidate was recently accessed, it is no longer immediately evictable. See the state transition
// diagram above `BufferManager` class declaration for more details.
struct EvictionCandidate {
bool isEvictable() const {
return pageState->getEvictionTimestamp() == evictionTimestamp &&
pageState->getPinCount() == 0;
// A candidate is evictable when the page is currently MARKED and its version still equals the
// version recorded in this candidate.
inline bool isEvictable(uint64_t currPageStateAndVersion) const {
    if (PageState::getState(currPageStateAndVersion) != PageState::MARKED) {
        return false;
    }
    return PageState::getVersion(currPageStateAndVersion) == pageVersion;
}
// A candidate is second chance evictable when the page is currently UNLOCKED (e.g. it was
// recently read optimistically) and its version still equals the version recorded in this
// candidate.
inline bool isSecondChanceEvictable(uint64_t currPageStateAndVersion) const {
    if (PageState::getState(currPageStateAndVersion) != PageState::UNLOCKED) {
        return false;
    }
    return PageState::getVersion(currPageStateAndVersion) == pageVersion;
}

BMFileHandle* fileHandle;
PageState* pageState;
// The eviction timestamp of the corresponding page state at the time the candidate is enqueued.
uint64_t evictionTimestamp = -1u;
BMFileHandle* fileHandle = nullptr;
common::page_idx_t pageIdx = common::INVALID_PAGE_IDX;
PageState* pageState = nullptr;
// The version of the corresponding page at the time the candidate is enqueued.
uint64_t pageVersion = -1u;
};

class EvictionQueue {
public:
EvictionQueue() { queue = std::make_unique<moodycamel::ConcurrentQueue<EvictionCandidate>>(); }

inline void enqueue(
BMFileHandle* fileHandle, PageState* frameHandle, uint64_t evictionTimestamp) {
queue->enqueue(EvictionCandidate{fileHandle, frameHandle, evictionTimestamp});
// Enqueues an existing candidate into the underlying queue.
inline void enqueue(EvictionCandidate& candidate) {
    queue->enqueue(candidate);
}
// Builds a candidate from its four fields (file handle, page index, page state, and the page
// version to record) and enqueues it into the underlying queue.
inline void enqueue(BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState,
    uint64_t pageVersion) {
    EvictionCandidate candidate{fileHandle, pageIdx, pageState, pageVersion};
    queue->enqueue(candidate);
}
// Tries to pop one candidate into `candidate`; forwards the result of the queue's try_dequeue.
inline bool dequeue(EvictionCandidate& candidate) {
    const bool dequeued = queue->try_dequeue(candidate);
    return dequeued;
}
void removeNonEvictableCandidates();
Expand All @@ -45,10 +58,12 @@ class EvictionQueue {
* 1) it provides the high-level functionality to pin() and unpin() the pages of the database files
* used by storage structures, such as the Column, Lists, or HashIndex in the storage layer, and
* operates via their BMFileHandle to read/write the page data into/out of one of the frames.
* 2) it supports the MemoryManager (MM) to allocate memory buffers that are not backed by
* any disk files. Similar to disk files, MM provides BMFileHandles backed by temp in-mem files to
* the BM to pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin
* happens when MM tries to reclaim a memory buffer.
* 2) it provides optimistic reads of pages, which read unlocked or marked pages without
* acquiring locks.
* 3) it supports the MemoryManager (MM) to allocate memory buffers that are not
* backed by any disk files. Similar to disk files, MM provides in-mem file handles to the BM to
* pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin happens
* when MM tries to reclaim a memory buffer.
*
* Specifically, in BM's context, page is the basic management unit of data in a file. The file can
* be a disk file, such as a column file, or an in-mem file, such as a temp in-memory file kept by
Expand All @@ -60,7 +75,7 @@ class EvictionQueue {
* to be kept in frame and what can be spilled to disk is directly determined by the pin/unpin
* calls of the users.
*
* Also, BM provides some specialized functionalities:
* Also, BM provides some specialized functionalities for WAL files:
* 1) it supports the caller to set pinned pages as dirty, which will be safely written back to disk
* when the pages are evicted;
* 2) it supports the caller to flush or remove pages from the BM;
Expand All @@ -86,6 +101,45 @@ class EvictionQueue {
* queue based replacement policy and the MADV_DONTNEED hint to explicitly control evictions. See
* comments above `claimAFrame()` for more details.
*
* Page states in BM:
* A page can be in one of the four states: a) LOCKED, b) UNLOCKED, c) MARKED, d) EVICTED.
* Every page is initialized as in the EVICTED state.
* The state transition diagram of page X is as follows (oRead refers to optimisticRead):
* Note: optimistic reads on UNLOCKED pages don't make any changes to pages' states. oRead on
* UNLOCKED is omitted in the diagram.
*
* 7.2. pin(pY): evict pX. 7.1. pin(pY): tryLock(pX)
* |<-------------------------|<------------------------------------------------------------|
* | | 4. pin(pX) |
* | |<------------------------------------------------------------|
* | 1. pin(pX) | 5. pin(pX) 6. pin(pY): 2nd chance eviction |
* EVICTED ------------------> LOCKED <-------------UNLOCKED ------------------------------> MARKED
* | | 3. oRead(pX) |
* | <--------------------------------------|
* | 2. unpin(pX): enqueue pX & increment version |
* ------------------------------------------------------------->
*
* 1. When page pX is in the EVICTED state and is pinned, it transitions to the LOCKED state. `pin`
* first acquires the exclusive lock on the page, then reads the page from disk into its frame. The
* caller can then safely make changes to the page.
* 2. When the caller finishes its changes to the page, it calls `unpin`, which releases the lock on
* the page, puts the page into the eviction queue, and increments its version. The page now
* transitions to the MARKED state. Note that the page is still cached at this point, but it is
* ready to be evicted. The page version number is used to identify any potential writes on the
* page. Each time a page transitions from the LOCKED to the MARKED state, we increment its version.
* This happens when a page is pinned and then unpinned. Between the pin and the unpin, we assume
* the page's content in its corresponding frame might have changed; thus, we increment the version
* number to forbid stale reads on it.
* 3. The MARKED page can be optimistically read by the caller, setting the page's state to
* UNLOCKED. For evicted pages, optimistic reads will trigger pin and unpin to read pages from disk
* into frames.
* 4. The MARKED page can be pinned again by the caller, setting the page's state to LOCKED.
* 5. The UNLOCKED page can also be pinned again by the caller, setting the page's state to LOCKED.
* 6. During eviction, UNLOCKED pages are checked to see whether they are second chance evictable.
* If so, they are set to MARKED, and their eviction candidates are moved back to the eviction queue.
* 7. During eviction, if the page is in the MARKED state, it will be LOCKED first (7.1), then
* removed from its frame, and set to EVICTED (7.2).
*
* The design is inspired by vmcache in the paper "Virtual-Memory Assisted Buffer Management"
* (https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf).
* We would also like to thank Fadhil Abubaker for doing the initial research and prototyping of
Expand All @@ -102,12 +156,12 @@ class BufferManager {

uint8_t* pin(BMFileHandle& fileHandle, common::page_idx_t pageIdx,
PageReadPolicy pageReadPolicy = PageReadPolicy::READ_PAGE);

void setPinnedPageDirty(BMFileHandle& fileHandle, common::page_idx_t pageIdx);

void optimisticRead(BMFileHandle& fileHandle, common::page_idx_t pageIdx,
const std::function<void(uint8_t*)>& func);
// The function assumes that the requested page is already pinned.
void unpin(BMFileHandle& fileHandle, common::page_idx_t pageIdx);

// Currently, these functions are specifically used only for WAL files.
void removeFilePagesFromFrames(BMFileHandle& fileHandle);
void flushAllDirtyPagesInFrames(BMFileHandle& fileHandle);
void updateFrameIfPageIsInFrameWithoutLock(
Expand Down Expand Up @@ -137,7 +191,8 @@ class BufferManager {
void removePageFromFrame(
BMFileHandle& fileHandle, common::page_idx_t pageIdx, bool shouldFlush);

void addToEvictionQueue(BMFileHandle* fileHandle, PageState* pageState);
void addToEvictionQueue(
BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState);

// Adds `size` to `usedMemory` via fetch_add and returns that call's result
// (presumably the value before the addition, if usedMemory is a std::atomic — confirm).
inline uint64_t reserveUsedMemory(uint64_t size) {
    const uint64_t fetched = usedMemory.fetch_add(size);
    return fetched;
}
// Subtracts `size` from `usedMemory` via fetch_sub and returns that call's result
// (presumably the value before the subtraction, if usedMemory is a std::atomic — confirm).
inline uint64_t freeUsedMemory(uint64_t size) {
    const uint64_t fetched = usedMemory.fetch_sub(size);
    return fetched;
}
Expand Down
3 changes: 3 additions & 0 deletions src/include/storage/storage_structure/disk_overflow_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ class DiskOverflowFile : public StorageStructure {
void setListRecursiveIfNestedWithoutLock(const common::ku_list_t& inMemSrcList,
common::ku_list_t& diskDstList, const common::DataType& dataType);
void logNewOverflowFileNextBytePosRecordIfNecessaryWithoutLock();
void readValuesInList(transaction::TransactionType trxType, const common::DataType& dataType,
std::vector<std::unique_ptr<common::Value>>& retValues, uint32_t numBytesOfSingleValue,
uint64_t numValuesInList, PageByteCursor& cursor, uint8_t* frame);
void pinOverflowPageCache(BMFileHandle* bmFileHandleToPin, common::page_idx_t pageIdxToPin,
OverflowPageCache& overflowPageCache);
void unpinOverflowPageCache(OverflowPageCache& overflowPageCache);
Expand Down
21 changes: 0 additions & 21 deletions src/storage/buffer_manager/bm_file_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,6 @@ using namespace kuzu::common;
namespace kuzu {
namespace storage {

// Records that the page is loaded: `pageIdx` becomes the given page index with the
// IS_IN_FRAME_MASK bit set. Any previously stored bits (including the dirty bit) are discarded.
void PageState::setInFrame(common::page_idx_t pageIdx_) {
    pageIdx = static_cast<uint64_t>(pageIdx_) | IS_IN_FRAME_MASK;
}

// Acquires the page lock (an atomic flag).
// - LockMode::SPIN: busy-waits until the flag is obtained, then returns true.
// - any other mode: makes a single attempt and returns whether the flag was obtained.
bool PageState::acquireLock(LockMode lockMode) {
    if (lockMode != LockMode::SPIN) {
        // test_and_set() returns the previous value: false means the flag was free and is now ours.
        return !lock.test_and_set();
    }
    while (lock.test_and_set()) {
        // spinning
    }
    return true;
}

// Resets the page index word (including its flag bits), the pin count and the eviction
// timestamp to zero. The lock flag is not touched here.
void PageState::resetState() {
    pageIdx = 0;
    pinCount.store(0);
    evictionTimestamp.store(0);
}

WALPageIdxGroup::WALPageIdxGroup() {
walPageIdxes.resize(common::StorageConstants::PAGE_GROUP_SIZE, common::INVALID_PAGE_IDX);
walPageIdxLocks.resize(common::StorageConstants::PAGE_GROUP_SIZE);
Expand Down
Loading

0 comments on commit 3d782ad

Please sign in to comment.