Skip to content

Commit

Permalink
bm optimistic reads
Browse files Browse the repository at this point in the history
  • Loading branch information
ray6080 committed Mar 26, 2023
1 parent 3f574a4 commit 57934bd
Show file tree
Hide file tree
Showing 13 changed files with 367 additions and 261 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ jobs:

benchmark:
name: benchmark
needs: [gcc-build-test, clang-build-test]
needs: [ gcc-build-test, clang-build-test ]
runs-on: kuzu-self-hosted-benchmarking
steps:
- uses: actions/checkout@v3
Expand Down
112 changes: 72 additions & 40 deletions src/include/storage/buffer_manager/bm_file_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,44 +6,86 @@
namespace kuzu {
namespace storage {

static constexpr uint64_t IS_IN_FRAME_MASK = 0x8000000000000000;
static constexpr uint64_t DIRTY_MASK = 0x4000000000000000;
static constexpr uint64_t PAGE_IDX_MASK = 0x3FFFFFFFFFFFFFFF;

enum class LockMode : uint8_t { SPIN = 0, NON_BLOCKING = 1 };

class BMFileHandle;
class BufferManager;

// Keeps the state information of a page in a file.
class PageState {
static constexpr uint64_t DIRTY_MASK = 0x0080000000000000;
static constexpr uint64_t STATE_MASK = 0xFF00000000000000;
static constexpr uint64_t VERSION_MASK = 0x00FFFFFFFFFFFFFF;
static constexpr uint64_t NUM_BITS_TO_SHIFT_FOR_STATE = 56;

public:
inline bool isInFrame() const { return pageIdx & IS_IN_FRAME_MASK; }
inline void setDirty() { pageIdx |= DIRTY_MASK; }
inline void clearDirty() { pageIdx &= ~DIRTY_MASK; }
inline bool isDirty() const { return pageIdx & DIRTY_MASK; }
inline common::page_idx_t getPageIdx() const {
return (common::page_idx_t)(pageIdx & PAGE_IDX_MASK);
}
inline uint64_t incrementPinCount() { return pinCount.fetch_add(1); }
inline uint64_t decrementPinCount() { return pinCount.fetch_sub(1); }
inline void setPinCount(uint64_t newPinCount) { pinCount.store(newPinCount); }
inline uint64_t getPinCount() const { return pinCount.load(); }
inline uint64_t getEvictionTimestamp() const { return evictionTimestamp.load(); }
inline uint64_t incrementEvictionTimestamp() { return evictionTimestamp.fetch_add(1); }
inline void releaseLock() { lock.clear(); }

bool acquireLock(LockMode lockMode);
void setInFrame(common::page_idx_t pageIdx);
void resetState();
static constexpr uint64_t UNLOCKED = 0;
static constexpr uint64_t LOCKED = 1;
static constexpr uint64_t MARKED = 2;
static constexpr uint64_t EVICTED = 3;

PageState() {
stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release);
}

inline uint64_t getState() { return getState(stateAndVersion.load()); }
inline static uint64_t getState(uint64_t stateAndVersion) {
return (stateAndVersion & STATE_MASK) >> NUM_BITS_TO_SHIFT_FOR_STATE;
}
inline static uint64_t getVersion(uint64_t stateAndVersion) {
return stateAndVersion & VERSION_MASK;
}
inline static uint64_t updateStateWithSameVersion(
uint64_t oldStateAndVersion, uint64_t newState) {
return ((oldStateAndVersion << 8) >> 8) | (newState << NUM_BITS_TO_SHIFT_FOR_STATE);
}
inline static uint64_t updateStateAndIncrementVersion(
uint64_t oldStateAndVersion, uint64_t newState) {
return (((oldStateAndVersion << 8) >> 8) + 1) | (newState << NUM_BITS_TO_SHIFT_FOR_STATE);
}
inline void spinLock(uint64_t oldStateAndVersion) {
while (true) {
if (tryLock(oldStateAndVersion)) {
return;
}
}
}
inline bool tryLock(uint64_t oldStateAndVersion) {
return stateAndVersion.compare_exchange_strong(
oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, LOCKED));
}
inline void unlock() {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion.store(updateStateAndIncrementVersion(stateAndVersion.load(), UNLOCKED),
std::memory_order_release);
}
// Change page state from Mark to Unlocked.
inline bool tryClearMark(uint64_t oldStateAndVersion) {
return stateAndVersion.compare_exchange_strong(
oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, UNLOCKED));
}
inline bool tryMark(uint64_t oldStateAndVersion) {
return stateAndVersion.compare_exchange_strong(
oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, MARKED));
}

inline void setDirty(uint64_t oldStateAndVersion) {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion = oldStateAndVersion | DIRTY_MASK;
}
inline void clearDirty() {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion &= ~DIRTY_MASK;
}
inline bool isDirty() const { return stateAndVersion & DIRTY_MASK; }
uint64_t getStateAndVersion() const { return stateAndVersion.load(); }

inline void resetToEvicted() {
stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release);
}

private:
std::atomic_flag lock = ATOMIC_FLAG_INIT;
// Highest 1st bit indicates if this page is loaded or not, 2nd bit indicates if this
// page is dirty or not. The rest 62 bits records the page idx inside the file.
uint64_t pageIdx = 0;
std::atomic<uint32_t> pinCount = 0;
std::atomic<uint64_t> evictionTimestamp = 0;
// Highest 1 bit is dirty bit, and the rest are page state and version bits.
// In the rest bits, the lowest 1 byte is state, and the rest are version.
std::atomic<uint64_t> stateAndVersion;
};

// This class is used to keep the WAL page idxes of a page group in the original file handle.
Expand Down Expand Up @@ -122,16 +164,6 @@ class BMFileHandle : public FileHandle {
assert(pageIdx < numPages && pageStates[pageIdx]);
return pageStates[pageIdx].get();
}
inline void clearPageState(common::page_idx_t pageIdx) {
assert(pageIdx < numPages && pageStates[pageIdx]);
pageStates[pageIdx]->resetState();
}
inline bool acquirePageLock(common::page_idx_t pageIdx, LockMode lockMode) {
return getPageState(pageIdx)->acquireLock(lockMode);
}
inline void releasePageLock(common::page_idx_t pageIdx) {
getPageState(pageIdx)->releaseLock();
}
inline common::frame_idx_t getFrameIdx(common::page_idx_t pageIdx) {
assert(pageIdx < pageCapacity);
return (frameGroupIdxes[pageIdx >> common::StorageConstants::PAGE_GROUP_SIZE_LOG2]
Expand Down
88 changes: 69 additions & 19 deletions src/include/storage/buffer_manager/buffer_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,42 @@ namespace kuzu {
namespace storage {

struct EvictionCandidate {
bool isEvictable() const {
return pageState->getEvictionTimestamp() == evictionTimestamp &&
pageState->getPinCount() == 0;
// If the candidate was recently accessed, it is no longer evictable.
// The page state of a candidate is set to MARKED when it is first enqueued.
// Possible actions to the page, which makes it un-evictable:
// 1) When the page was pinned and unpinned, the page state is set to UNLOCKED, and its version
// had been incremented. Remove the candidate from the queue.
// 2) When the page was optimistically read, the page state is also set to UNLOCKED. We call
// this candidate second chance evictable. Set the candidate to MARKED, and put it back to the
// end of the queue, so it can be evicted later.
// 3) When the page was pinned, but not unpinned yet, the page state is set to LOCKED. Remove
// the candidate from the queue. Thus, if the page state is not MARKED, or the page version is
// not the same as the one kept inside the candidate, the candidate is no longer evictable.
inline bool isEvictable(uint64_t currPageStateAndVersion) const {
return PageState::getState(currPageStateAndVersion) == PageState::MARKED &&
PageState::getVersion(currPageStateAndVersion) == pageVersion;
}
// If the candidate was recently read optimistically, it is second chance evictable.
inline bool isSecondChanceEvictable(uint64_t currPageStateAndVersion) const {
return PageState::getState(currPageStateAndVersion) == PageState::UNLOCKED &&
PageState::getVersion(currPageStateAndVersion) == pageVersion;
}

BMFileHandle* fileHandle;
PageState* pageState;
// The eviction timestamp of the corresponding page state at the time the candidate is enqueued.
uint64_t evictionTimestamp = -1u;
BMFileHandle* fileHandle = nullptr;
common::page_idx_t pageIdx = common::INVALID_PAGE_IDX;
PageState* pageState = nullptr;
// The version of the corresponding page at the time the candidate is enqueued.
uint64_t pageVersion = -1u;
};

class EvictionQueue {
public:
EvictionQueue() { queue = std::make_unique<moodycamel::ConcurrentQueue<EvictionCandidate>>(); }

inline void enqueue(
BMFileHandle* fileHandle, PageState* frameHandle, uint64_t evictionTimestamp) {
queue->enqueue(EvictionCandidate{fileHandle, frameHandle, evictionTimestamp});
inline void enqueue(EvictionCandidate& candidate) { queue->enqueue(candidate); }
inline void enqueue(BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState,
uint64_t pageVersion) {
queue->enqueue(EvictionCandidate{fileHandle, pageIdx, pageState, pageVersion});
}
inline bool dequeue(EvictionCandidate& candidate) { return queue->try_dequeue(candidate); }
void removeNonEvictableCandidates();
Expand All @@ -45,10 +63,12 @@ class EvictionQueue {
* 1) it provides the high-level functionality to pin() and unpin() the pages of the database files
* used by storage structures, such as the Column, Lists, or HashIndex in the storage layer, and
* operates via their BMFileHandle to read/write the page data into/out of one of the frames.
* 2) it supports the MemoryManager (MM) to allocate memory buffers that are not backed by
* any disk files. Similar to disk files, MM provides BMFileHandles backed by temp in-mem files to
* the BM to pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin
* happens when MM tries to reclaim a memory buffer.
* 2) it provides optimistic read of pages, which optimistically read unlocked or marked pages
* without acquiring locks.
* 3) it supports the MemoryManager (MM) to allocate memory buffers that are not
* backed by any disk files. Similar to disk files, MM provides in-mem file handles to the BM to
* pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin happens
* when MM tries to reclaim a memory buffer.
*
* Specifically, in BM's context, page is the basic management unit of data in a file. The file can
* be a disk file, such as a column file, or an in-mem file, such as an temp in-memory file kept by
Expand All @@ -60,7 +80,7 @@ class EvictionQueue {
* to be kept in frame and what can be spilled to disk is directly determined by the pin/unpin
* calls of the users.
*
* Also, BM provides some specialized functionalities:
* Also, BM provides some specialized functionalities for WAL files:
* 1) it supports the caller to set pinned pages as dirty, which will be safely written back to disk
* when the pages are evicted;
* 2) it supports the caller to flush or remove pages from the BM;
Expand All @@ -86,6 +106,34 @@ class EvictionQueue {
* queue based replacement policy and the MADV_DONTNEED hint to explicitly control evictions. See
* comments above `claimAFrame()` for more details.
*
* Page states in BM:
* A page can be in one of the four states: a) Locked, b) Unlocked, c) Marked, d) Evicted.
* Every page is initialized as in the Evicted state. When a page is pinned, it transits to the
* Locked state. This happens through the `pin` call, which will first acquire the exclusive lock on
* the page to be pinned, and then the caller can safely make changes to the page.
* When the caller finishes changes to the page, it calls `unpin`, which releases the lock on the
* page, thus, the page transits to the Unlocked state. Note that currently the page is still
* cached, but potentially it can be evicted later. To track pages that can potentially be evicted,
* we push unlocked pages into the eviction queue, and set their states to Marked, which means they
* are ready to be evicted.
* Any optimistic reads on unlocked pages should not make any changes to the page, and reads on
* marked pages will first set their states to Unlocked, and then read the page. For evicted pages,
* optimistic reads will trigger pin and unpin to read pages from disk into frames.
*
* The page state transition diagram is as follows (oRead refers to optimisticRead):
* <------evict------<-------pin----------------pin---------
* | | | |
* Evicted --pin--> Locked --unpin--> Unlocked ---enqueue-> Marked
* | |
* <-------oRead--------
*
* Besides the page state, each page also has a version number. The version number is used to
* identify any potential writes on the page. Each time a page transits from Locked to Unlocked
* state, it will increment its version. When a page is pinned, then unpinned, it will transit from
* Locked to the Unlocked state, and its version will be incremented. During the pin and unpin, we
* assume the page's content in its corresponding frame might have changed, thus, we increment the
* version number to forbid stale reads on it;
*
* The design is inspired by vmcache in the paper "Virtual-Memory Assisted Buffer Management"
* (https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf).
* We would also like to thank Fadhil Abubaker for doing the initial research and prototyping of
Expand All @@ -102,12 +150,13 @@ class BufferManager {

uint8_t* pin(BMFileHandle& fileHandle, common::page_idx_t pageIdx,
PageReadPolicy pageReadPolicy = PageReadPolicy::READ_PAGE);

void setPinnedPageDirty(BMFileHandle& fileHandle, common::page_idx_t pageIdx);

void optimisticRead(BMFileHandle& fileHandle, common::page_idx_t pageIdx,
const std::function<void(uint8_t*)>& func);
// The function assumes that the requested page is already pinned.
void unpin(BMFileHandle& fileHandle, common::page_idx_t pageIdx);

// Currently, these functions are specifically used only for WAL files.
void setPinnedPageDirty(BMFileHandle& fileHandle, common::page_idx_t pageIdx);
void removeFilePagesFromFrames(BMFileHandle& fileHandle);
void flushAllDirtyPagesInFrames(BMFileHandle& fileHandle);
void updateFrameIfPageIsInFrameWithoutLock(
Expand Down Expand Up @@ -137,7 +186,8 @@ class BufferManager {
void removePageFromFrame(
BMFileHandle& fileHandle, common::page_idx_t pageIdx, bool shouldFlush);

void addToEvictionQueue(BMFileHandle* fileHandle, PageState* pageState);
void addToEvictionQueue(
BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState);

inline uint64_t reserveUsedMemory(uint64_t size) { return usedMemory.fetch_add(size); }
inline uint64_t freeUsedMemory(uint64_t size) { return usedMemory.fetch_sub(size); }
Expand Down
3 changes: 3 additions & 0 deletions src/include/storage/storage_structure/disk_overflow_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ class DiskOverflowFile : public StorageStructure {
void setListRecursiveIfNestedWithoutLock(const common::ku_list_t& inMemSrcList,
common::ku_list_t& diskDstList, const common::DataType& dataType);
void logNewOverflowFileNextBytePosRecordIfNecessaryWithoutLock();
void readValuesInList(transaction::TransactionType trxType, const common::DataType& dataType,
std::vector<std::unique_ptr<common::Value>>& retValues, uint32_t numBytesOfSingleValue,
uint64_t numValuesInList, PageByteCursor& cursor, uint8_t* frame);
void pinOverflowPageCache(BMFileHandle* bmFileHandleToPin, common::page_idx_t pageIdxToPin,
OverflowPageCache& overflowPageCache);
void unpinOverflowPageCache(OverflowPageCache& overflowPageCache);
Expand Down
21 changes: 0 additions & 21 deletions src/storage/buffer_manager/bm_file_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,6 @@ using namespace kuzu::common;
namespace kuzu {
namespace storage {

void PageState::setInFrame(common::page_idx_t pageIdx_) {
pageIdx = 0;
pageIdx = pageIdx_;
pageIdx |= IS_IN_FRAME_MASK;
}

bool PageState::acquireLock(LockMode lockMode) {
if (lockMode == LockMode::SPIN) {
while (lock.test_and_set()) // spinning
;
return true;
}
return !lock.test_and_set();
}

void PageState::resetState() {
pageIdx = 0;
pinCount = 0;
evictionTimestamp = 0;
}

WALPageIdxGroup::WALPageIdxGroup() {
walPageIdxes.resize(common::StorageConstants::PAGE_GROUP_SIZE, common::INVALID_PAGE_IDX);
walPageIdxLocks.resize(common::StorageConstants::PAGE_GROUP_SIZE);
Expand Down
Loading

0 comments on commit 57934bd

Please sign in to comment.