From ba2dde5448f90b9406999d77f2c7e74f2a19bd8c Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Sat, 9 Jul 2022 16:30:41 +0300
Subject: [PATCH] Large last written lsn cache (#177)

Maintain a cache of the last written LSN for each relation segment (8 MB).
---
 contrib/neon/pagestore_smgr.c            |  31 ++--
 src/backend/access/gin/gininsert.c       |   2 +-
 src/backend/access/gist/gistbuild.c      |   8 +-
 src/backend/access/spgist/spginsert.c    |   3 +-
 src/backend/access/transam/xlog.c        | 186 ++++++++++++++++++++---
 src/backend/commands/dbcommands.c        |   4 +-
 src/backend/replication/walsender.c      |   7 +
 src/backend/storage/lmgr/lwlocknames.txt |   1 +
 src/backend/utils/misc/guc.c             |  10 ++
 src/include/access/xlog.h                |   5 +-
 10 files changed, 221 insertions(+), 36 deletions(-)

diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c
index 5fdfea5e487..a8b73c5f342 100644
--- a/contrib/neon/pagestore_smgr.c
+++ b/contrib/neon/pagestore_smgr.c
@@ -84,6 +84,11 @@ static char *hexdump_page(char *page);
 
 const int	SmgrTrace = DEBUG5;
 
+/*
+ * Pseudo block number used to associate an LSN with relation metadata (relation size)
+ */
+#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber
+
 page_server_api *page_server;
 
 /* GUCs */
@@ -558,7 +563,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	 * Remember the LSN on this page. When we read the page again, we must
 	 * read the same or newer version of it.
 	 */
-	SetLastWrittenPageLSN(lsn);
+	SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, blocknum, blocknum);
 }
 
 
@@ -603,7 +608,7 @@ zm_adjust_lsn(XLogRecPtr lsn)
  * Return LSN for requesting pages and number of blocks from page server
  */
 static XLogRecPtr
-zenith_get_request_lsn(bool *latest)
+zenith_get_request_lsn(bool *latest, Oid rnode, BlockNumber blkno)
 {
 	XLogRecPtr	lsn;
 
@@ -630,9 +635,9 @@ zenith_get_request_lsn(bool *latest, Oid rnode, BlockNumber blkno)
 		 * so our request cannot concern those.
 		 */
 		*latest = true;
-		lsn = GetLastWrittenPageLSN();
+		lsn = GetLastWrittenLSN(rnode, blkno);
 		Assert(lsn != InvalidXLogRecPtr);
-		elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ",
+		elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));
 
 		lsn = zm_adjust_lsn(lsn);
@@ -716,7 +721,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum)
 			return false;
 	}
 
-	request_lsn = zenith_get_request_lsn(&latest);
+	request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		ZenithExistsRequest request = {
 			.req.tag = T_ZenithExistsRequest,
@@ -791,7 +796,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 *
 	 * FIXME: This is currently not just an optimization, but required for
 	 * correctness. Postgres can call smgrnblocks() on the newly-created
-	 * relation. Currently, we don't call SetLastWrittenPageLSN() when a new
+	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
 	 * relation is created, so if we didn't remember the size in the relsize
 	 * cache, we might call smgrnblocks() on the newly-created relation before
	 * the creation WAL record has been received by the page server.
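
The new zenith_get_request_lsn() signature makes each caller choose a cache key. A minimal usage sketch, not part of the patch, where "reln" and "blkno" stand for the usual smgr arguments: data-page reads key the lookup by the page's own block number, while size and existence probes use the metadata pseudo block.

    /* Sketch only: how callers pick the block key for zenith_get_request_lsn(). */
    bool        latest;
    XLogRecPtr  request_lsn;

    /* Reading a data page: use the page's own block number. */
    request_lsn = zenith_get_request_lsn(&latest,
                                         reln->smgr_rnode.node.relNode, blkno);

    /* Probing relation size or existence: use the metadata pseudo block. */
    request_lsn = zenith_get_request_lsn(&latest,
                                         reln->smgr_rnode.node.relNode,
                                         REL_METADATA_PSEUDO_BLOCKNO);
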
@@ -904,6 +909,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
 #endif
+
+	SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO);
 }
 
 /*
@@ -1079,7 +1086,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	request_lsn = zenith_get_request_lsn(&latest);
+	request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, blkno);
 	zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
 
 #ifdef DEBUG_COMPARE_LOCAL
@@ -1284,7 +1291,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}
 
-	request_lsn = zenith_get_request_lsn(&latest);
+	request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		ZenithNblocksRequest request = {
 			.req.tag = T_ZenithNblocksRequest,
@@ -1344,7 +1351,7 @@ zenith_dbsize(Oid dbNode)
 	XLogRecPtr	request_lsn;
 	bool		latest;
 
-	request_lsn = zenith_get_request_lsn(&latest);
+	request_lsn = zenith_get_request_lsn(&latest, InvalidOid, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		ZenithDbSizeRequest request = {
 			.req.tag = T_ZenithDbSizeRequest,
@@ -1431,7 +1438,11 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	 */
 	XLogFlush(lsn);
 
-	SetLastWrittenPageLSN(lsn);
+	/*
+	 * Truncate may affect several chunks of the relation, so we should either
+	 * update the last written LSN for all of them, or update the LSN of the
+	 * "dummy" metadata block. The second approach seems more efficient.
+	 */
+	SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index dfad28d1f61..ea358d2038e 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -421,8 +421,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		log_newpage_range(index, MAIN_FORKNUM, 0,
 						  RelationGetNumberOfBlocks(index),
 						  true);
+		SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index));
 	}
-	SetLastWrittenPageLSN(XactLastRecEnd);
 
 	smgr_end_unlogged_build(index->rd_smgr);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 78cf9e06391..d4f0086d38b 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -335,9 +335,10 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 			log_newpage_range(index, MAIN_FORKNUM, 0,
 							  RelationGetNumberOfBlocks(index),
 							  true);
+			SetLastWrittenLSN(XactLastRecEnd,
+							  index->rd_smgr->smgr_rnode.node.relNode,
+							  0, RelationGetNumberOfBlocks(index));
 		}
-		SetLastWrittenPageLSN(XactLastRecEnd);
-
 		smgr_end_unlogged_build(index->rd_smgr);
 	}
 
@@ -467,7 +468,8 @@ gist_indexsortbuild(GISTBuildState *state)
 	lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM,
 					  GIST_ROOT_BLKNO, pagestate->page, true);
 
-	SetLastWrittenPageLSN(lsn);
+	SetLastWrittenLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode,
+					  GIST_ROOT_BLKNO, GIST_ROOT_BLKNO);
 
 	pfree(pagestate->page);
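
The gin and gist hunks above, and the spgist hunk that follows, all replace the relation-wide SetLastWrittenPageLSN() call with the same range-based pattern. A consolidated sketch, assuming "index" is the Relation of a just-finished unlogged index build:

    /* Shared pattern: after WAL-logging every page of the freshly built
     * index, record XactLastRecEnd as the last written LSN for the whole
     * block range [0, nblocks). */
    BlockNumber nblocks = RelationGetNumberOfBlocks(index);

    log_newpage_range(index, MAIN_FORKNUM, 0, nblocks, true);
    SetLastWrittenLSN(XactLastRecEnd,
                      index->rd_smgr->smgr_rnode.node.relNode,
                      0, nblocks);
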
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index a7608f4d54c..fec27816765 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -143,8 +143,9 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		log_newpage_range(index, MAIN_FORKNUM, 0,
 						  RelationGetNumberOfBlocks(index),
 						  true);
+		SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode,
+						  0, RelationGetNumberOfBlocks(index));
 	}
-	SetLastWrittenPageLSN(XactLastRecEnd);
 
 	smgr_end_unlogged_build(index->rd_smgr);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 4f9776c49db..ec91d513767 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -113,6 +113,7 @@ int			wal_retrieve_retry_interval = 5000;
 int			max_slot_wal_keep_size_mb = -1;
 bool		track_wal_io_timing = false;
 uint64		predefined_sysidentifier;
+int			lastWrittenLsnCacheSize;
 
 #ifdef WAL_DEBUG
 bool		XLOG_DEBUG = false;
@@ -182,6 +183,28 @@ const struct config_enum_entry recovery_target_action_options[] = {
 	{NULL, 0, false}
 };
 
+
+/*
+ * We do not take into account the dbnode, spcnode and forknum fields of the
+ * relation tag, because the probability of a collision is assumed to be small
+ * and should not affect performance. Reducing the cache key size also speeds
+ * up hash calculation and comparison.
+ */
+typedef struct LastWrittenLsnCacheKey
+{
+	Oid			relid;
+	BlockNumber bucket;
+} LastWrittenLsnCacheKey;
+
+typedef struct LastWrittenLsnCacheEntry
+{
+	LastWrittenLsnCacheKey key;
+	XLogRecPtr	lsn;
+	/* doubly linked list for the LRU replacement algorithm */
+	struct LastWrittenLsnCacheEntry* next;
+	struct LastWrittenLsnCacheEntry* prev;
+} LastWrittenLsnCacheEntry;
+
 /*
  * Statistics for current checkpoint are collected in this global struct.
  * Because only the checkpointer or a stand-alone backend can perform
@@ -751,6 +774,17 @@ typedef struct XLogCtlData
 	XLogRecPtr	lastFpwDisableRecPtr;
 
 	XLogRecPtr	lastWrittenPageLSN;
+	/*
+	 * Maximal last written LSN for pages not present in lastWrittenLsnCache
+	 */
+	XLogRecPtr	maxLastWrittenLsn;
+
+	/*
+	 * Doubly linked list implementing the LRU replacement policy for the last
+	 * written LSN cache. Access to this list, as well as to the cache itself,
+	 * is protected by 'LastWrittenLsnLock'.
+	 */
+	LastWrittenLsnCacheEntry lastWrittenLsnLRU;
 
 	/* neon: copy of startup's RedoStartLSN for walproposer's use */
 	XLogRecPtr	RedoStartLSN;
@@ -762,6 +796,7 @@ typedef struct XLogCtlData
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
 
+
 static XLogCtlData *XLogCtl = NULL;
 
 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
@@ -772,6 +807,19 @@ static WALInsertLockPadded *WALInsertLocks = NULL;
  */
 static ControlFileData *ControlFile = NULL;
 
+#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024	/* blocks = 8MB */
+
+
+/*
+ * Cache of the last written LSN for each relation chunk (hash bucket).
+ * Also, to provide a request LSN for smgrnblocks and smgrexists, there is a
+ * pseudokey=InvalidBlockNumber which stores the LSN of the last relation
+ * metadata update.
+ * The size of the cache is limited by the GUC variable lastWrittenLsnCacheSize
+ * ("lsn_cache_size"); entries are replaced using an LRU algorithm based on a
+ * doubly linked list.
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
+ */
+static HTAB *lastWrittenLsnCache;
+
 /*
  * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!
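
To make the declarations above concrete, here is a sketch of the bucket arithmetic, assuming the default 8KB BLCKSZ (the sketch is illustrative, not patch code):

    /* One bucket spans LAST_WRITTEN_LSN_CACHE_BUCKET = 1024 blocks,
     * i.e. 1024 * 8KB = 8MB of relation data, so a single cache entry
     * summarizes the last written LSN of one 8MB segment. */
    BlockNumber blkno  = 150000;                                 /* hypothetical block */
    BlockNumber bucket = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET;  /* = 146 */
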
@@ -5141,11 +5189,8 @@ LocalProcessControlFile(bool reset)
 	ReadControlFile();
 }
 
-/*
- * Initialization of shared memory for XLOG
- */
-Size
-XLOGShmemSize(void)
+static Size
+XLOGCtlShmemSize(void)
 {
 	Size		size;
 
@@ -5185,6 +5230,16 @@ XLOGShmemSize(void)
 	return size;
 }
 
+/*
+ * Initialization of shared memory for XLOG
+ */
+Size
+XLOGShmemSize(void)
+{
+	return XLOGCtlShmemSize()
+		+ hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry));
+}
+
 void
 XLOGShmemInit(void)
 {
@@ -5214,6 +5269,15 @@ XLOGShmemInit(void)
 	XLogCtl = (XLogCtlData *)
 		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
 
+	{
+		static HASHCTL info;
+		info.keysize = sizeof(LastWrittenLsnCacheKey);
+		info.entrysize = sizeof(LastWrittenLsnCacheEntry);
+		lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache",
+											lastWrittenLsnCacheSize, lastWrittenLsnCacheSize,
+											&info,
+											HASH_ELEM | HASH_BLOBS);
+	}
 	localControlFile = ControlFile;
 	ControlFile = (ControlFileData *)
 		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
@@ -8111,7 +8175,8 @@ StartupXLOG(void)
 	XLogCtl->LogwrtRqst.Write = EndOfLog;
 	XLogCtl->LogwrtRqst.Flush = EndOfLog;
-	XLogCtl->lastWrittenPageLSN = EndOfLog;
+	XLogCtl->maxLastWrittenLsn = EndOfLog;
+	XLogCtl->lastWrittenLsnLRU.next = XLogCtl->lastWrittenLsnLRU.prev = &XLogCtl->lastWrittenLsnLRU;
 
 	LocalSetXLogInsertAllowed();
 
@@ -8883,29 +8948,116 @@ GetInsertRecPtr(void)
 }
 
 /*
- * GetLastWrittenPageLSN -- Returns maximal LSN of written page
+ * GetLastWrittenLSN -- Returns the maximal LSN of a written page.
+ * It returns an upper bound for the last written LSN of a given page,
+ * either from a cached last written LSN or the global maximum last written
+ * LSN. If rnode is InvalidOid then we calculate the maximum among all cached
+ * LSNs and maxLastWrittenLsn. If the cache is large, iterating through all
+ * hash items may be rather expensive, but GetLastWrittenLSN(InvalidOid) is
+ * used only by zenith_dbsize, which is not performance critical.
  */
 XLogRecPtr
-GetLastWrittenPageLSN(void)
+GetLastWrittenLSN(Oid rnode, BlockNumber blkno)
 {
 	XLogRecPtr	lsn;
-	SpinLockAcquire(&XLogCtl->info_lck);
-	lsn = XLogCtl->lastWrittenPageLSN;
-	SpinLockRelease(&XLogCtl->info_lck);
+	LastWrittenLsnCacheEntry* entry;
+
+	LWLockAcquire(LastWrittenLsnLock, LW_SHARED);
+
+	/* Maximal last written LSN among all non-cached pages */
+	lsn = XLogCtl->maxLastWrittenLsn;
+
+	if (rnode != InvalidOid)
+	{
+		LastWrittenLsnCacheKey key;
+		key.relid = rnode;
+		key.bucket = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET;
+		entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
+		if (entry != NULL)
+			lsn = entry->lsn;
+	}
+	else
+	{
+		HASH_SEQ_STATUS seq;
+		/* Find the maximum of all cached LSNs */
+		hash_seq_init(&seq, lastWrittenLsnCache);
+		while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL)
+		{
+			if (entry->lsn > lsn)
+				lsn = entry->lsn;
+		}
+	}
+	LWLockRelease(LastWrittenLsnLock);
 
	return lsn;
 }
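
Before the setter side below, it is worth spelling out why a cache miss in GetLastWrittenLSN() is safe. A sketch of the guarantee (reasoning in comments, not patch code; rnode and blkno are assumed inputs):

    /* GetLastWrittenLSN() never under-estimates. For any page P, the
     * returned value is >= the LSN last passed to SetLastWrittenLSN()
     * for P, because either
     *   (a) P's bucket is still cached and holds that LSN or a newer one, or
     *   (b) the bucket was evicted, and eviction raised maxLastWrittenLsn
     *       to at least the evicted LSN (see SetLastWrittenLSN below).
     * A reader requesting the page at this LSN is therefore guaranteed to
     * see its latest written version. */
    XLogRecPtr lsn = GetLastWrittenLSN(rnode, blkno);
    Assert(lsn != InvalidXLogRecPtr);   /* maxLastWrittenLsn starts at EndOfLog */
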
 /*
- * SetLastWrittenPageLSN -- Set maximal LSN of written page
+ * SetLastWrittenLSN -- Set maximal LSN of written page.
+ * We maintain a cache of last written LSNs with limited size and an LRU
+ * replacement policy. To reduce the cache size, we store the max LSN not for
+ * each page but per bucket (1024 blocks). This cache allows using an older
+ * LSN when requesting pages of unchanged or append-only relations.
+ *
+ * rnode can be InvalidOid, in which case maxLastWrittenLsn is updated.
+ * SetLastWrittenLSN with InvalidOid is used by the createdb and dbase_redo
+ * functions.
  */
 void
-SetLastWrittenPageLSN(XLogRecPtr lsn)
+SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till)
 {
-	SpinLockAcquire(&XLogCtl->info_lck);
-	if (lsn > XLogCtl->lastWrittenPageLSN)
-		XLogCtl->lastWrittenPageLSN = lsn;
-	SpinLockRelease(&XLogCtl->info_lck);
+	if (lsn == InvalidXLogRecPtr)
+		return;
+
+	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
+	if (rnode == InvalidOid)
+	{
+		if (lsn > XLogCtl->maxLastWrittenLsn)
+			XLogCtl->maxLastWrittenLsn = lsn;
+	}
+	else
+	{
+		LastWrittenLsnCacheEntry* entry;
+		LastWrittenLsnCacheKey key;
+		bool		found;
+		BlockNumber bucket;
+
+		key.relid = rnode;
+		for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET;
+			 bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET;
+			 bucket++)
+		{
+			key.bucket = bucket;
+			entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
+			if (found)
+			{
+				if (lsn > entry->lsn)
+					entry->lsn = lsn;
+				/* Unlink from the LRU list */
+				entry->next->prev = entry->prev;
+				entry->prev->next = entry->next;
+			}
+			else
+			{
+				entry->lsn = lsn;
+				if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize)
+				{
+					/* Replace the least recently used entry */
+					LastWrittenLsnCacheEntry* victim = XLogCtl->lastWrittenLsnLRU.prev;
+
+					/* Adjust the max LSN for non-cached relations/chunks if needed */
+					if (victim->lsn > XLogCtl->maxLastWrittenLsn)
+						XLogCtl->maxLastWrittenLsn = victim->lsn;
+
+					victim->next->prev = victim->prev;
+					victim->prev->next = victim->next;
+					hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL);
+				}
+			}
+			/* Link at the head of the LRU list */
+			entry->next = XLogCtl->lastWrittenLsnLRU.next;
+			entry->prev = &XLogCtl->lastWrittenLsnLRU;
+			XLogCtl->lastWrittenLsnLRU.next = entry->next->prev = entry;
+		}
+	}
+	LWLockRelease(LastWrittenLsnLock);
 }
 
 /*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 090bcf817d8..d9fbb9511dd 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -675,7 +675,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 			lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE);
 
-			SetLastWrittenPageLSN(lsn);
+			SetLastWrittenLSN(lsn, InvalidOid, 0, 0);
 		}
 	}
 	table_endscan(scan);
@@ -2294,7 +2294,7 @@ dbase_redo(XLogReaderState *record)
 		{
 			XLogRecPtr	lsn = record->EndRecPtr;
 
-			SetLastWrittenPageLSN(lsn);
+			SetLastWrittenLSN(lsn, InvalidOid, 0, 0);
 		}
 	}
 	else if (info == XLOG_DBASE_DROP)
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index a841beebf3f..8f9de58d158 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -2058,6 +2058,13 @@ ProcessStandbyReply(XLogRecPtr writePtr,
 	if (!am_cascading_walsender)
 		SyncRepReleaseWaiters();
 
+	/*
+	 * walproposer uses truncateLsn instead of flushPtr as the confirmed
+	 * received location, so we shouldn't update restart_lsn here.
+	 */
+	if (am_wal_proposer)
+		return;
+
 	/*
 	 * walproposer use trunclateLsn instead of flushPtr for confirmed
	 * received location, so we shouldn't update restart_lsn here.
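
Returning to SetLastWrittenLSN() above: the LRU list is intrusive, with XLogCtl->lastWrittenLsnLRU acting as a dummy head whose next/prev point to itself when the list is empty, which keeps unlink and relink branch-free. A standalone sketch of the two list operations it performs inline (the helper names are hypothetical, not in the patch):

    /* Sketch: the intrusive LRU operations used by SetLastWrittenLSN().
     * "lru" stands in for XLogCtl->lastWrittenLsnLRU. */
    static void
    lru_unlink(LastWrittenLsnCacheEntry *entry)
    {
        entry->next->prev = entry->prev;
        entry->prev->next = entry->next;
    }

    static void
    lru_push_head(LastWrittenLsnCacheEntry *lru, LastWrittenLsnCacheEntry *entry)
    {
        entry->next = lru->next;
        entry->prev = lru;
        lru->next = entry->next->prev = entry;
    }

    /* The least recently used entry is then always lru->prev. */
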
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c2956..b4652c33ff6 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock					44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock				46
 NotifyQueueTailLock					47
+LastWrittenLsnLock					48
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 91bdfd40d88..a77045df21f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2379,6 +2379,16 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"lsn_cache_size", PGC_POSTMASTER, UNGROUPED,
+			gettext_noop("Size of the last written LSN cache used by Neon."),
+			NULL
+		},
+		&lastWrittenLsnCacheSize,
+		1024, 10, 1000000,		/* 1024 entries are enough for an 8GB database with 8MB buckets */
+		NULL, NULL, NULL
+	},
+
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
 			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 66fe9dfcd9e..cd4e6c7f876 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -132,6 +132,7 @@ extern char *PrimaryConnInfo;
 extern char *PrimarySlotName;
 extern bool wal_receiver_create_temp_slot;
 extern bool track_wal_io_timing;
+extern int	lastWrittenLsnCacheSize;
 
 /* indirectly set via GUC system */
 extern TransactionId recoveryTargetXid;
@@ -351,8 +352,8 @@ extern XLogRecPtr GetFlushRecPtr(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
 extern void RemovePromoteSignalFiles(void);
 
-extern void SetLastWrittenPageLSN(XLogRecPtr lsn);
-extern XLogRecPtr GetLastWrittenPageLSN(void);
+extern void SetLastWrittenLSN(XLogRecPtr lsn, Oid relfilenode, BlockNumber from, BlockNumber till);
+extern XLogRecPtr GetLastWrittenLSN(Oid relfilenode, BlockNumber blkno);
 
 extern XLogRecPtr GetRedoStartLsn(void);
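
Finally, a worked sizing example for the new GUC (a sketch; assumes the default 8KB BLCKSZ):

    /* Data coverage of the cache: lsn_cache_size entries, each tracking an
     * 8MB bucket (1024 blocks * 8KB). With the default of 1024 entries:
     *   1024 entries * 1024 blocks * 8192 bytes = 8 GB of distinct data,
     * plus one pseudo entry per relation for metadata (size/exists) LSNs. */
    long   entries  = 1024;                              /* lsn_cache_size default */
    double coverage = (double) entries * 1024 * 8192;    /* bytes, i.e. 8.0 GB */
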