Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BM optimistic reads #1409

Merged
merged 1 commit into from
Mar 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 79 additions & 40 deletions src/include/storage/buffer_manager/bm_file_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,44 +6,87 @@
namespace kuzu {
namespace storage {

static constexpr uint64_t IS_IN_FRAME_MASK = 0x8000000000000000;
static constexpr uint64_t DIRTY_MASK = 0x4000000000000000;
static constexpr uint64_t PAGE_IDX_MASK = 0x3FFFFFFFFFFFFFFF;

enum class LockMode : uint8_t { SPIN = 0, NON_BLOCKING = 1 };

class BMFileHandle;
class BufferManager;

// Keeps the state information of a page in a file.
class PageState {
static constexpr uint64_t DIRTY_MASK = 0x0080000000000000;
static constexpr uint64_t STATE_MASK = 0xFF00000000000000;
static constexpr uint64_t VERSION_MASK = 0x00FFFFFFFFFFFFFF;
static constexpr uint64_t NUM_BITS_TO_SHIFT_FOR_STATE = 56;

public:
inline bool isInFrame() const { return pageIdx & IS_IN_FRAME_MASK; }
inline void setDirty() { pageIdx |= DIRTY_MASK; }
inline void clearDirty() { pageIdx &= ~DIRTY_MASK; }
inline bool isDirty() const { return pageIdx & DIRTY_MASK; }
inline common::page_idx_t getPageIdx() const {
return (common::page_idx_t)(pageIdx & PAGE_IDX_MASK);
}
inline uint64_t incrementPinCount() { return pinCount.fetch_add(1); }
inline uint64_t decrementPinCount() { return pinCount.fetch_sub(1); }
inline void setPinCount(uint64_t newPinCount) { pinCount.store(newPinCount); }
inline uint64_t getPinCount() const { return pinCount.load(); }
inline uint64_t getEvictionTimestamp() const { return evictionTimestamp.load(); }
inline uint64_t incrementEvictionTimestamp() { return evictionTimestamp.fetch_add(1); }
inline void releaseLock() { lock.clear(); }

bool acquireLock(LockMode lockMode);
void setInFrame(common::page_idx_t pageIdx);
void resetState();
static constexpr uint64_t UNLOCKED = 0;
static constexpr uint64_t LOCKED = 1;
static constexpr uint64_t MARKED = 2;
static constexpr uint64_t EVICTED = 3;

PageState() {
stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release);
}

inline uint64_t getState() { return getState(stateAndVersion.load()); }
inline static uint64_t getState(uint64_t stateAndVersion) {
return (stateAndVersion & STATE_MASK) >> NUM_BITS_TO_SHIFT_FOR_STATE;
}
inline static uint64_t getVersion(uint64_t stateAndVersion) {
return stateAndVersion & VERSION_MASK;
}
inline static uint64_t updateStateWithSameVersion(
uint64_t oldStateAndVersion, uint64_t newState) {
return ((oldStateAndVersion << 8) >> 8) | (newState << NUM_BITS_TO_SHIFT_FOR_STATE);
}
inline static uint64_t updateStateAndIncrementVersion(
ray6080 marked this conversation as resolved.
Show resolved Hide resolved
uint64_t oldStateAndVersion, uint64_t newState) {
return (((oldStateAndVersion << 8) >> 8) + 1) | (newState << NUM_BITS_TO_SHIFT_FOR_STATE);
}
inline void spinLock(uint64_t oldStateAndVersion) {
while (true) {
if (tryLock(oldStateAndVersion)) {
return;
}
}
}
inline bool tryLock(uint64_t oldStateAndVersion) {
return stateAndVersion.compare_exchange_strong(
oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, LOCKED));
}
inline void unlock() {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion.store(updateStateAndIncrementVersion(stateAndVersion.load(), UNLOCKED),
std::memory_order_release);
}
// Change page state from Mark to Unlocked.
inline bool tryClearMark(uint64_t oldStateAndVersion) {
assert(getState(oldStateAndVersion) == MARKED);
return stateAndVersion.compare_exchange_strong(
oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, UNLOCKED));
}
inline bool tryMark(uint64_t oldStateAndVersion) {
return stateAndVersion.compare_exchange_strong(
oldStateAndVersion, updateStateWithSameVersion(oldStateAndVersion, MARKED));
}

inline void setDirty() {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion |= DIRTY_MASK;
}
inline void clearDirty() {
assert(getState(stateAndVersion.load()) == LOCKED);
stateAndVersion &= ~DIRTY_MASK;
}
inline bool isDirty() const { return stateAndVersion & DIRTY_MASK; }
uint64_t getStateAndVersion() const { return stateAndVersion.load(); }

inline void resetToEvicted() {
stateAndVersion.store(EVICTED << NUM_BITS_TO_SHIFT_FOR_STATE, std::memory_order_release);
}

private:
std::atomic_flag lock = ATOMIC_FLAG_INIT;
// Highest 1st bit indicates if this page is loaded or not, 2nd bit indicates if this
// page is dirty or not. The rest 62 bits records the page idx inside the file.
uint64_t pageIdx = 0;
std::atomic<uint32_t> pinCount = 0;
std::atomic<uint64_t> evictionTimestamp = 0;
// Highest 1 bit is dirty bit, and the rest are page state and version bits.
// In the rest bits, the lowest 1 byte is state, and the rest are version.
std::atomic<uint64_t> stateAndVersion;
};

// This class is used to keep the WAL page idxes of a page group in the original file handle.
Expand Down Expand Up @@ -94,6 +137,12 @@ class BMFileHandle : public FileHandle {
BMFileHandle(const std::string& path, uint8_t flags, BufferManager* bm,
common::PageSizeClass pageSizeClass, FileVersionedType fileVersionedType);

// This function assumes the page is already LOCKED.
inline void setLockedPageDirty(common::page_idx_t pageIdx) {
assert(pageIdx < numPages);
pageStates[pageIdx]->setDirty();
}

common::page_group_idx_t addWALPageIdxGroupIfNecessary(common::page_idx_t originalPageIdx);
// This function is intended to be used after a fileInfo is created and we want the file
// to have no pages and page locks. Should be called after ensuring that the buffer manager
Expand Down Expand Up @@ -122,16 +171,6 @@ class BMFileHandle : public FileHandle {
assert(pageIdx < numPages && pageStates[pageIdx]);
return pageStates[pageIdx].get();
}
inline void clearPageState(common::page_idx_t pageIdx) {
assert(pageIdx < numPages && pageStates[pageIdx]);
pageStates[pageIdx]->resetState();
}
inline bool acquirePageLock(common::page_idx_t pageIdx, LockMode lockMode) {
return getPageState(pageIdx)->acquireLock(lockMode);
}
inline void releasePageLock(common::page_idx_t pageIdx) {
getPageState(pageIdx)->releaseLock();
}
inline common::frame_idx_t getFrameIdx(common::page_idx_t pageIdx) {
assert(pageIdx < pageCapacity);
return (frameGroupIdxes[pageIdx >> common::StorageConstants::PAGE_GROUP_SIZE_LOG2]
Expand Down
93 changes: 74 additions & 19 deletions src/include/storage/buffer_manager/buffer_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,38 @@ class logger;
namespace kuzu {
namespace storage {

// This class keeps state info for pages potentially can be evicted.
// The page state of a candidate is set to MARKED when it is first enqueued. After enqueued, if the
// candidate was recently accessed, it is no longer immediately evictable. See the state transition
// diagram above `BufferManager` class declaration for more details.
struct EvictionCandidate {
bool isEvictable() const {
return pageState->getEvictionTimestamp() == evictionTimestamp &&
pageState->getPinCount() == 0;
// If the candidate is Marked and its version is the same as the one kept inside the candidate,
// it is evictable.
inline bool isEvictable(uint64_t currPageStateAndVersion) const {
return PageState::getState(currPageStateAndVersion) == PageState::MARKED &&
PageState::getVersion(currPageStateAndVersion) == pageVersion;
}
// If the candidate was recently read optimistically, it is second chance evictable.
inline bool isSecondChanceEvictable(uint64_t currPageStateAndVersion) const {
return PageState::getState(currPageStateAndVersion) == PageState::UNLOCKED &&
PageState::getVersion(currPageStateAndVersion) == pageVersion;
}

BMFileHandle* fileHandle;
PageState* pageState;
// The eviction timestamp of the corresponding page state at the time the candidate is enqueued.
uint64_t evictionTimestamp = -1u;
BMFileHandle* fileHandle = nullptr;
common::page_idx_t pageIdx = common::INVALID_PAGE_IDX;
PageState* pageState = nullptr;
// The version of the corresponding page at the time the candidate is enqueued.
uint64_t pageVersion = -1u;
};

class EvictionQueue {
public:
EvictionQueue() { queue = std::make_unique<moodycamel::ConcurrentQueue<EvictionCandidate>>(); }

inline void enqueue(
BMFileHandle* fileHandle, PageState* frameHandle, uint64_t evictionTimestamp) {
queue->enqueue(EvictionCandidate{fileHandle, frameHandle, evictionTimestamp});
inline void enqueue(EvictionCandidate& candidate) { queue->enqueue(candidate); }
inline void enqueue(BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState,
uint64_t pageVersion) {
queue->enqueue(EvictionCandidate{fileHandle, pageIdx, pageState, pageVersion});
}
inline bool dequeue(EvictionCandidate& candidate) { return queue->try_dequeue(candidate); }
void removeNonEvictableCandidates();
Expand All @@ -45,10 +58,12 @@ class EvictionQueue {
* 1) it provides the high-level functionality to pin() and unpin() the pages of the database files
* used by storage structures, such as the Column, Lists, or HashIndex in the storage layer, and
* operates via their BMFileHandle to read/write the page data into/out of one of the frames.
* 2) it supports the MemoryManager (MM) to allocate memory buffers that are not backed by
* any disk files. Similar to disk files, MM provides BMFileHandles backed by temp in-mem files to
* the BM to pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin
* happens when MM tries to reclaim a memory buffer.
* 2) it provides optimistic read of pages, which optimistically read unlocked or marked pages
* without acquiring locks.
* 3) it supports the MemoryManager (MM) to allocate memory buffers that are not
* backed by any disk files. Similar to disk files, MM provides in-mem file handles to the BM to
* pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin happens
* when MM tries to reclaim a memory buffer.
*
* Specifically, in BM's context, page is the basic management unit of data in a file. The file can
* be a disk file, such as a column file, or an in-mem file, such as an temp in-memory file kept by
Expand All @@ -60,7 +75,7 @@ class EvictionQueue {
* to be kept in frame and what can be spilled to disk is directly determined by the pin/unpin
* calls of the users.
*
* Also, BM provides some specialized functionalities:
* Also, BM provides some specialized functionalities for WAL files:
* 1) it supports the caller to set pinned pages as dirty, which will be safely written back to disk
* when the pages are evicted;
* 2) it supports the caller to flush or remove pages from the BM;
Expand All @@ -86,6 +101,45 @@ class EvictionQueue {
* queue based replacement policy and the MADV_DONTNEED hint to explicitly control evictions. See
* comments above `claimAFrame()` for more details.
*
* Page states in BM:
* A page can be in one of the four states: a) LOCKED, b) UNLOCKED, c) MARKED, d) EVICTED.
* Every page is initialized as in the EVICTED state.
* The state transition diagram of page X is as follows (oRead refers to optimisticRead):
* Note: optimistic reads on UNLOCKED pages don't make any changes to pages' states. oRead on
* UNLOCKED is omitted in the diagram.
*
* 7.2. pin(pY): evict pX. 7.1. pin(pY): tryLock(pX)
* |<-------------------------|<------------------------------------------------------------|
* | | 4. pin(pX) |
* | |<------------------------------------------------------------|
* | 1. pin(pX) | 5. pin(pX) 6. pin(pY): 2nd chance eviction |
* EVICTED ------------------> LOCKED <-------------UNLOCKED ------------------------------> MARKED
* | | 3. oRead(pX) |
* | <--------------------------------------|
* | 2. unpin(pX): enqueue pX & increment version |
* ------------------------------------------------------------->
*
* 1. When page pX at EVICTED state and it is pinned, it transits to the Locked state. `pin` will
* first acquire the exclusive lock on the page, then read the page from disk into its frame. The
* caller can safely make changes to the page.
* 2. When the caller finishes changes to the page, it calls `unpin`, which releases the lock on the
* page, puts the page into the eviction queue, and increments its version. The page now transits to
* the MARKED state. Note that currently the page is still cached, but it is ready to be evicted.
* The page version number is used to identify any potential writes on the page. Each time a page
* transits from LOCKED to MARKED state, we will increment its version. This happens when a page is
* pinned, then unpinned. During the pin and unpin, we assume the page's content in its
* corresponding frame might have changed, thus, we increment the version number to forbid stale
* reads on it;
* 3. The MARKED page can be optimistically read by the caller, setting the page's state to
* UNLOCKED. For evicted pages, optimistic reads will trigger pin and unpin to read pages from disk
* into frames.
* 4. The MARKED page can be pinned again by the caller, setting the page's state to LOCKED.
* 5. The UNLOCKED page can also be pinned again by the caller, setting the page's state to LOCKED.
* 6. During eviction, UNLOCKED pages will be check if they are second chance evictable. If so, they
* will be set to MARKED, and their eviction candidates will be moved back to the eviction queue.
* 7. During eviction, if the page is in the MARKED state, it will be LOCKED first (7.1), then
* removed from its frame, and set to EVICTED (7.2).
*
* The design is inspired by vmcache in the paper "Virtual-Memory Assisted Buffer Management"
* (https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf).
* We would also like to thank Fadhil Abubaker for doing the initial research and prototyping of
Expand All @@ -102,12 +156,12 @@ class BufferManager {

uint8_t* pin(BMFileHandle& fileHandle, common::page_idx_t pageIdx,
PageReadPolicy pageReadPolicy = PageReadPolicy::READ_PAGE);

void setPinnedPageDirty(BMFileHandle& fileHandle, common::page_idx_t pageIdx);

void optimisticRead(BMFileHandle& fileHandle, common::page_idx_t pageIdx,
const std::function<void(uint8_t*)>& func);
// The function assumes that the requested page is already pinned.
void unpin(BMFileHandle& fileHandle, common::page_idx_t pageIdx);

// Currently, these functions are specifically used only for WAL files.
void removeFilePagesFromFrames(BMFileHandle& fileHandle);
void flushAllDirtyPagesInFrames(BMFileHandle& fileHandle);
void updateFrameIfPageIsInFrameWithoutLock(
Expand Down Expand Up @@ -137,7 +191,8 @@ class BufferManager {
void removePageFromFrame(
BMFileHandle& fileHandle, common::page_idx_t pageIdx, bool shouldFlush);

void addToEvictionQueue(BMFileHandle* fileHandle, PageState* pageState);
void addToEvictionQueue(
BMFileHandle* fileHandle, common::page_idx_t pageIdx, PageState* pageState);

inline uint64_t reserveUsedMemory(uint64_t size) { return usedMemory.fetch_add(size); }
inline uint64_t freeUsedMemory(uint64_t size) { return usedMemory.fetch_sub(size); }
Expand Down
3 changes: 3 additions & 0 deletions src/include/storage/storage_structure/disk_overflow_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ class DiskOverflowFile : public StorageStructure {
void setListRecursiveIfNestedWithoutLock(const common::ku_list_t& inMemSrcList,
common::ku_list_t& diskDstList, const common::DataType& dataType);
void logNewOverflowFileNextBytePosRecordIfNecessaryWithoutLock();
void readValuesInList(transaction::TransactionType trxType, const common::DataType& dataType,
std::vector<std::unique_ptr<common::Value>>& retValues, uint32_t numBytesOfSingleValue,
uint64_t numValuesInList, PageByteCursor& cursor, uint8_t* frame);
void pinOverflowPageCache(BMFileHandle* bmFileHandleToPin, common::page_idx_t pageIdxToPin,
OverflowPageCache& overflowPageCache);
void unpinOverflowPageCache(OverflowPageCache& overflowPageCache);
Expand Down
21 changes: 0 additions & 21 deletions src/storage/buffer_manager/bm_file_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,6 @@ using namespace kuzu::common;
namespace kuzu {
namespace storage {

void PageState::setInFrame(common::page_idx_t pageIdx_) {
pageIdx = 0;
pageIdx = pageIdx_;
pageIdx |= IS_IN_FRAME_MASK;
}

bool PageState::acquireLock(LockMode lockMode) {
if (lockMode == LockMode::SPIN) {
while (lock.test_and_set()) // spinning
;
return true;
}
return !lock.test_and_set();
}

void PageState::resetState() {
pageIdx = 0;
pinCount = 0;
evictionTimestamp = 0;
}

WALPageIdxGroup::WALPageIdxGroup() {
walPageIdxes.resize(common::StorageConstants::PAGE_GROUP_SIZE, common::INVALID_PAGE_IDX);
walPageIdxLocks.resize(common::StorageConstants::PAGE_GROUP_SIZE);
Expand Down
Loading